diff options
Diffstat (limited to 'moses/TranslationModel/UG')
105 files changed, 2195 insertions, 2195 deletions
diff --git a/moses/TranslationModel/UG/TargetPhraseCollectionCache.cc b/moses/TranslationModel/UG/TargetPhraseCollectionCache.cc index bf449247e..1217b9711 100644 --- a/moses/TranslationModel/UG/TargetPhraseCollectionCache.cc +++ b/moses/TranslationModel/UG/TargetPhraseCollectionCache.cc @@ -16,7 +16,7 @@ namespace Moses if (a.tv_sec != b.tv_sec) return a.tv_sec > b.tv_sec; return (a.tv_nsec >= b.tv_nsec); } -#endif +#endif bool operator<(timeval const& a, timeval const& b) { @@ -30,10 +30,10 @@ namespace Moses return (a.tv_usec >= b.tv_usec); } - void + void bubble_up(std::vector<TPCollWrapper*>& v, size_t k) { - if (k >= v.size()) return; + if (k >= v.size()) return; for (;k && (v[k]->tstamp < v[k/2]->tstamp); k /=2) { std::swap(v[k],v[k/2]); @@ -41,7 +41,7 @@ namespace Moses } } - void + void bubble_down(std::vector<TPCollWrapper*>& v, size_t k) { for (size_t j = 2*(k+1); j <= v.size(); j = 2*((k=j)+1)) @@ -62,7 +62,7 @@ namespace Moses TPCollWrapper* TPCollCache - ::encache(TPCollWrapper* const& ptr) + ::encache(TPCollWrapper* const& ptr) { using namespace boost; // update time stamp: @@ -76,7 +76,7 @@ namespace Moses { vector<TPCollWrapper*>& v = m_history; if (ptr->idx >= 0) // ptr is already in history - { + { assert(ptr == v[ptr->idx]); size_t k = 2 * (ptr->idx + 1); if (k < v.size()) bubble_up(v,k--); @@ -88,7 +88,7 @@ namespace Moses v.push_back(ptr); bubble_up(v,k); } - else // someone else needs to go + else // someone else needs to go { v[0]->idx = -1; release(v[0]); @@ -98,28 +98,28 @@ namespace Moses } return ptr; } // TPCollCache::encache(...) - - TPCollWrapper* + + TPCollWrapper* TPCollCache - ::get(uint64_t key, size_t revision) + ::get(uint64_t key, size_t revision) { using namespace boost; cache_t::iterator m; - { + { shared_lock<shared_mutex> lock(m_cache_lock); m = m_cache.find(key); - if (m == m_cache.end() || m->second->revision != revision) + if (m == m_cache.end() || m->second->revision != revision) return NULL; ++m->second->refCount; } - + encache(m->second); return NULL; } // TPCollCache::get(...) - + void TPCollCache - ::add(uint64_t key, TPCollWrapper* ptr) + ::add(uint64_t key, TPCollWrapper* ptr) { { boost::unique_lock<boost::shared_mutex> lock(m_cache_lock); @@ -129,7 +129,7 @@ namespace Moses } encache(ptr); } // TPCollCache::add(...) - + void TPCollCache ::release(TPCollWrapper*& ptr) @@ -137,25 +137,25 @@ namespace Moses if (!ptr) return; if (--ptr->refCount || ptr->idx >= 0) // tpc is still in use - { - ptr = NULL; - return; + { + ptr = NULL; + return; } - + #if 0 timespec t; clock_gettime(CLOCK_MONOTONIC,&t); timespec r; clock_getres(CLOCK_MONOTONIC,&r); float delta = t.tv_sec - ptr->tstamp.tv_sec; cerr << "deleting old cache entry after " << delta << " seconds." - << " clock resolution is " << r.tv_sec << ":" << r.tv_nsec + << " clock resolution is " << r.tv_sec << ":" << r.tv_nsec << " at " << __FILE__ << ":" << __LINE__ << endl; #endif - + boost::upgrade_lock<boost::shared_mutex> lock(m_cache_lock); cache_t::iterator m = m_cache.find(ptr->key); if (m != m_cache.end() && m->second == ptr) - { // the cache could have been updated with a new pointer - // for the same phrase already, so we need to check + { // the cache could have been updated with a new pointer + // for the same phrase already, so we need to check // if the pointer we cound is the one we want to get rid of, // hence the second check boost::upgrade_to_unique_lock<boost::shared_mutex> xlock(lock); @@ -163,7 +163,7 @@ namespace Moses } delete ptr; ptr = NULL; - } // TPCollCache::release(...) + } // TPCollCache::release(...) TPCollWrapper:: TPCollWrapper(size_t r, uint64_t k) @@ -175,5 +175,5 @@ namespace Moses { assert(this->refCount == 0); } - + } // namespace diff --git a/moses/TranslationModel/UG/TargetPhraseCollectionCache.h b/moses/TranslationModel/UG/TargetPhraseCollectionCache.h index fc9ce8921..269200647 100644 --- a/moses/TranslationModel/UG/TargetPhraseCollectionCache.h +++ b/moses/TranslationModel/UG/TargetPhraseCollectionCache.h @@ -5,15 +5,15 @@ namespace Moses { - class TPCollWrapper + class TPCollWrapper // wrapper around TargetPhraseCollection that includes reference counts // and a time stamp for least-recently-used caching of TargetPhraseCollection-s : public TargetPhraseCollection { public: - size_t const revision; + size_t const revision; // revison; gets changed when the underlying corpus in Mmsapt is updated - + uint64_t const key; // phrase key uint32_t refCount; // reference count #if defined(timespec) // timespec is better, but not available everywhere @@ -32,12 +32,12 @@ namespace Moses typedef std::vector<TPCollWrapper*> history_t; cache_t m_cache; // maps from phrase ids to target phrase collections mutable history_t m_history; // heap of live items, least recently used one on top - + mutable boost::shared_mutex m_cache_lock; // locks m_cache mutable boost::shared_mutex m_history_lock; // locks m_history #if 0 - // mutable size_t m_tpc_ctr; + // mutable size_t m_tpc_ctr; // counter of all live item, for debugging. probably obsolete; was used // to track memory leaks #endif @@ -47,14 +47,14 @@ namespace Moses public: TPCollCache(size_t capacity=1000); - - TPCollWrapper* + + TPCollWrapper* get(uint64_t key, size_t revision); - void + void add(uint64_t key, TPCollWrapper* ptr); - void + void release(TPCollWrapper*& tpc); }; diff --git a/moses/TranslationModel/UG/bitext-find.cc b/moses/TranslationModel/UG/bitext-find.cc index 46978d16e..18cc6e0fa 100644 --- a/moses/TranslationModel/UG/bitext-find.cc +++ b/moses/TranslationModel/UG/bitext-find.cc @@ -30,15 +30,15 @@ write_sentence } } -bool -fill(string const& query, TSA<Token> const& tsa, +bool +fill(string const& query, TSA<Token> const& tsa, TokenIndex const& V, bitvector& v) { v.resize(tsa.getCorpus()->size()); Bitext<Token>::iter m(&tsa); - istringstream buf(query); string w; - while (buf >> w) - if (!m.extend(V[w])) + istringstream buf(query); string w; + while (buf >> w) + if (!m.extend(V[w])) return false; m.markSentences(v); return true; @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) { interpret_args(argc, argv); if (Q1.empty() && Q2.empty()) exit(0); - + mmbitext B; string w; B.open(bname, L1, L2); @@ -64,13 +64,13 @@ int main(int argc, char* argv[]) bitvector check(B.T1->size()); if (Q1.size() == 0 || Q2.size() == 0) check.set(); else (m2.markSentences(check)); - + Bitext<Token>::iter& m = m1.size() ? m1 : m2; char const* x = m.lower_bound(-1); char const* stop = m.upper_bound(-1); uint64_t sid; ushort off; - boost::taus88 rnd; + boost::taus88 rnd; size_t N = m.approxOccurrenceCount(); maxhits = min(N, maxhits); size_t k = 0; // selected @@ -80,7 +80,7 @@ int main(int argc, char* argv[]) x = m.root->readOffset(x,stop,off); if (!check[sid]) continue; - size_t r = (N - i) * rnd()/(rnd.max()+1.) + k; + size_t r = (N - i) * rnd()/(rnd.max()+1.) + k; if (maxhits != N && r >= maxhits) continue; ++k; @@ -94,20 +94,20 @@ int main(int argc, char* argv[]) // cout << "alignment failure" << endl; } - cout << sid << " " << B.docname(sid) + cout << sid << " " << B.docname(sid) << " dfwd=" << po_fwd << " dbwd=" << po_bwd << "\n"; write_sentence(*B.T1, sid, *B.V1, cout); cout << "\n"; write_sentence(*B.T2, sid, *B.V2, cout); cout << "\n"; - B.write_yawat_alignment(sid, - m1.size() ? &m1 : NULL, - m2.size() ? &m2 : NULL, cout); + B.write_yawat_alignment(sid, + m1.size() ? &m1 : NULL, + m2.size() ? &m2 : NULL, cout); cout << endl; - + } } -void +void interpret_args(int ac, char* av[]) { po::variables_map vm; @@ -120,7 +120,7 @@ interpret_args(int ac, char* av[]) ("q1", po::value<string>(&Q1), "query in L1") ("q2", po::value<string>(&Q2), "query in L2") ; - + po::options_description h("Hidden Options"); h.add_options() ("bname", po::value<string>(&bname), "base name of corpus") @@ -133,7 +133,7 @@ interpret_args(int ac, char* av[]) a.add("bname",1); a.add("L1",1); a.add("L2",1); - + po::store(po::command_line_parser(ac,av) .options(h) .positional(a) @@ -141,7 +141,7 @@ interpret_args(int ac, char* av[]) po::notify(vm); if (vm.count("help")) { - cout << "\nusage:\n\t" << av[0] + cout << "\nusage:\n\t" << av[0] << " [options] [--q1=<L1string>] [--q2=<L2string>]" << endl; cout << o << endl; exit(0); diff --git a/moses/TranslationModel/UG/count-ptable-features.cc b/moses/TranslationModel/UG/count-ptable-features.cc index b4d2cb4dd..4c9022075 100644 --- a/moses/TranslationModel/UG/count-ptable-features.cc +++ b/moses/TranslationModel/UG/count-ptable-features.cc @@ -21,6 +21,6 @@ int main() cout << PT.GetFeatureNames().size() << endl; exit(0); } - - + + diff --git a/moses/TranslationModel/UG/generic/file_io/ug_stream.cpp b/moses/TranslationModel/UG/generic/file_io/ug_stream.cpp index 073b64dfc..b87aa1d0c 100644 --- a/moses/TranslationModel/UG/generic/file_io/ug_stream.cpp +++ b/moses/TranslationModel/UG/generic/file_io/ug_stream.cpp @@ -11,7 +11,7 @@ namespace ugdiss using namespace boost::algorithm; using namespace boost::iostreams; - filtering_istream* + filtering_istream* open_input_stream(string fname) { filtering_istream* ret = new filtering_istream(); @@ -19,7 +19,7 @@ namespace ugdiss return ret; } - filtering_ostream* + filtering_ostream* open_output_stream(string fname) { filtering_ostream* ret = new filtering_ostream(); @@ -27,7 +27,7 @@ namespace ugdiss return ret; } - void + void open_input_stream(string fname, filtering_istream& in) { if (ends_with(fname, ".gz")) @@ -41,7 +41,7 @@ namespace ugdiss in.push(file_source(fname.c_str())); } - void + void open_output_stream(string fname, filtering_ostream& out) { if (ends_with(fname, ".gz") || ends_with(fname, ".gz_")) diff --git a/moses/TranslationModel/UG/generic/file_io/ug_stream.h b/moses/TranslationModel/UG/generic/file_io/ug_stream.h index e2c9e4764..5555e36f8 100644 --- a/moses/TranslationModel/UG/generic/file_io/ug_stream.h +++ b/moses/TranslationModel/UG/generic/file_io/ug_stream.h @@ -23,7 +23,7 @@ using namespace boost::iostreams; /** open input file that is possibly compressed * decompression filters are automatically added based on the file name - * gzip for .gz; bzip2 for bz2. + * gzip for .gz; bzip2 for bz2. */ filtering_istream* open_input_stream(string fname); void open_input_stream(string fname, filtering_istream& in); diff --git a/moses/TranslationModel/UG/generic/program_options/ug_get_options.cpp b/moses/TranslationModel/UG/generic/program_options/ug_get_options.cpp index 31927ac84..6c1644837 100644 --- a/moses/TranslationModel/UG/generic/program_options/ug_get_options.cpp +++ b/moses/TranslationModel/UG/generic/program_options/ug_get_options.cpp @@ -11,7 +11,7 @@ namespace ugdiss { using namespace std; - void + void get_options(int ac, char* av[], progopts& o, posopts& a, optsmap& vm, char const* cfgFileParam) { @@ -30,17 +30,17 @@ namespace ugdiss } else { - cerr << "Error: cannot find config file '" + cerr << "Error: cannot find config file '" << cfgFile << "'!" << endl; exit(1); } } } - + // process positional args, ignoring those set in the config file if (a.max_total_count()) po::store(po::command_line_parser(ac,av) - .options(o).positional(a).run(),vm); + .options(o).positional(a).run(),vm); po::notify(vm); // IMPORTANT } } diff --git a/moses/TranslationModel/UG/generic/program_options/ug_get_options.h b/moses/TranslationModel/UG/generic/program_options/ug_get_options.h index 79b626ef5..636b11302 100644 --- a/moses/TranslationModel/UG/generic/program_options/ug_get_options.h +++ b/moses/TranslationModel/UG/generic/program_options/ug_get_options.h @@ -6,18 +6,18 @@ #include <boost/program_options.hpp> -namespace ugdiss +namespace ugdiss { namespace po=boost::program_options; typedef po::options_description progopts; typedef po::positional_options_description posopts; typedef po::variables_map optsmap; - void - get_options(int ac, char* av[], - progopts & o, - posopts & a, - optsmap & vm, + void + get_options(int ac, char* av[], + progopts & o, + posopts & a, + optsmap & vm, char const* cfgFileParam=NULL); } diff --git a/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc index 7dc2cd18f..f30d91acc 100644 --- a/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc +++ b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc @@ -5,15 +5,15 @@ #include <boost/foreach.hpp> namespace Moses { - - void + + void filter_arguments(int const argc_in, char const* const* const argv_in, - int & argc_moses, char*** argv_moses, + int & argc_moses, char*** argv_moses, int & argc_other, char*** argv_other, vector<pair<string,int> > const& filter) { *argv_moses = new char*[argc_in]; - *argv_other = new char*[argc_in]; + *argv_other = new char*[argc_in]; (*argv_moses)[0] = new char[strlen(argv_in[0])+1]; strcpy((*argv_moses)[0], argv_in[0]); argc_moses = 1; @@ -30,7 +30,7 @@ namespace Moses { strcpy((*argv_other)[argc_other++],argv_in[i]); for (int k = 0; k < o.second; ++k) { - UTIL_THROW_IF2(++i >= argc_in || argv_in[i][0] == '-', + UTIL_THROW_IF2(++i >= argc_in || argv_in[i][0] == '-', "[" << HERE << "] Missing argument for " << "parameter " << o.first << "!"); (*argv_other)[argc_other] = new char[strlen(argv_in[i])+1]; @@ -44,7 +44,7 @@ namespace Moses { strcpy((*argv_moses)[argc_moses++], argv_in[i++]); } } - + } // namespace Moses diff --git a/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h index e56585e8a..605acee6c 100644 --- a/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h +++ b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h @@ -5,12 +5,12 @@ namespace Moses { using namespace std; - // Function to splice the argument list (e.g. before handing it over to + // Function to splice the argument list (e.g. before handing it over to // Moses LoadParam() function. /filter/ is a vector of argument names - // and the number of arguments after each of them - void + // and the number of arguments after each of them + void filter_arguments(int const argc_in, char const* const* const argv_in, - int & argc_moses, char*** argv_moses, + int & argc_moses, char*** argv_moses, int & argc_other, char*** argv_other, vector<pair<string,int> > const& filter); diff --git a/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h b/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h index f26e28c52..31132c63c 100644 --- a/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h +++ b/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h @@ -17,40 +17,40 @@ namespace Moses { using namespace std; - template<typename VAL, + template<typename VAL, typename COMP = greater<VAL>, typename IDX_T=size_t> class - VectorIndexSorter + VectorIndexSorter : public binary_function<IDX_T const&, IDX_T const&, bool> { vector<VAL> const& m_vecref; boost::shared_ptr<COMP> m_comp; public: - + COMP const& Compare; VectorIndexSorter(vector<VAL> const& v, COMP const& comp) : m_vecref(v), Compare(comp) { } - + VectorIndexSorter(vector<VAL> const& v) : m_vecref(v), m_comp(new COMP()), Compare(*m_comp) { } - + bool operator()(IDX_T const & a, IDX_T const & b) const { bool fwd = Compare(m_vecref.at(a) ,m_vecref.at(b)); bool bwd = Compare(m_vecref[b], m_vecref[a]); return (fwd == bwd ? a < b : fwd); } - + boost::shared_ptr<vector<IDX_T> > GetOrder() const; - + void GetOrder(vector<IDX_T> & order) const; - + }; - + template<typename VAL, typename COMP, typename IDX_T> boost::shared_ptr<vector<IDX_T> > VectorIndexSorter<VAL,COMP,IDX_T>:: @@ -60,7 +60,7 @@ namespace Moses get_order(*ret); return ret; } - + template<typename VAL, typename COMP, typename IDX_T> void VectorIndexSorter<VAL,COMP,IDX_T>:: @@ -70,6 +70,6 @@ namespace Moses for (IDX_T i = 0; i < IDX_T(m_vecref.size()); ++i) order[i] = i; sort(order.begin(), order.end(), *this); } - + } #endif diff --git a/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc b/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc index 4b61ecd60..877b7a816 100644 --- a/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc +++ b/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc @@ -6,14 +6,14 @@ // string distance measures // Code by Ulrich Germann -namespace stringdist +namespace stringdist { - UErrorCode strip_accents(UnicodeString & trg) + UErrorCode strip_accents(UnicodeString & trg) { UErrorCode status = U_ZERO_ERROR; - static Transliterator *stripper - = Transliterator::createInstance("NFD; [:M:] Remove; NFC", + static Transliterator *stripper + = Transliterator::createInstance("NFD; [:M:] Remove; NFC", UTRANS_FORWARD, status); stripper->transliterate(trg); return status; @@ -22,9 +22,9 @@ namespace stringdist char const* StringDiff:: Segment:: - elabel[] = { "same", "cap", "flip", "permutation", - "accent", "duplication", - "insertion", "deletion", + elabel[] = { "same", "cap", "flip", "permutation", + "accent", "duplication", + "insertion", "deletion", "mismatch", "noinit" }; StringDiff:: @@ -44,7 +44,7 @@ namespace stringdist Segment() : start_a(-1), end_a(-1), start_b(-1), end_b(-1), match(noinit), dist(0) {} - + UnicodeString const& StringDiff:: set_a(string const& a) @@ -74,8 +74,8 @@ namespace stringdist { return this->b; } - - size_t + + size_t StringDiff:: size() { @@ -94,7 +94,7 @@ namespace stringdist // if (s.match == same) continue; // else if (s.match == insertion) ret += s.end_b - s.start_b; // else if (s.match == deletion) ret += s.end_a - s.start_a; - + // } // } @@ -138,7 +138,7 @@ namespace stringdist #endif } - float + float fillAlignmentMatrix(UChar const* a, size_t const lenA, UChar const* b, size_t const lenB, vector<vector<float> > & M) @@ -164,7 +164,7 @@ namespace stringdist return M.back().back(); } - float + float levenshtein(UChar const* a, size_t const lenA, UChar const* b, size_t const lenB) { @@ -180,7 +180,7 @@ namespace stringdist cout << endl; } cout << string(25,'-') << endl; -#endif +#endif int i = M.size() -1; int j = M.back().size() -1; @@ -207,29 +207,29 @@ namespace stringdist return ret; } - + StringDiff:: Segment:: - Segment(size_t const as, size_t const ae, + Segment(size_t const as, size_t const ae, size_t const bs, size_t const be, - UnicodeString const& a, - UnicodeString const& b) + UnicodeString const& a, + UnicodeString const& b) { dist = 0; - start_a = as; end_a = ae; + start_a = as; end_a = ae; start_b = bs; end_b = be; if (as == ae) match = bs == be ? same : insertion; - else if (bs == be) + else if (bs == be) match = deletion; - else if (be-bs != ae-as) + else if (be-bs != ae-as) { match = mismatch; dist = stringdist::levenshtein(a.getBuffer() + as, ae - as, b.getBuffer() + bs, be - bs); } - else + else { match = same; size_t stop = ae-as; @@ -251,11 +251,11 @@ namespace stringdist } } } - if (match == insertion) + if (match == insertion) { dist = be-bs; } - else if (match == deletion) + else if (match == deletion) { dist = ae-as; } @@ -309,18 +309,18 @@ namespace stringdist if (i) --i; if (j) --j; } - for (size_t k = 0; k < A.size(); ++k) + for (size_t k = 0; k < A.size(); ++k) A[k] = min(A[k],A2[k]); - for (size_t k = 0; k < B.size(); ++k) + for (size_t k = 0; k < B.size(); ++k) B[k] = min(B[k],B2[k]); - + if (a[i] == b[j]) { A[i] = j; B[j] = i; } i = 0; j = 0; size_t I, J; while (i < a.length() and j < b.length()) { - if (A[i] < 0) + if (A[i] < 0) { I = i + 1; while (I < A.size() and A[I] < 0) ++I; @@ -338,24 +338,24 @@ namespace stringdist difflist.push_back(Segment(i,i,j,J,a,b)); j = J; } - else + else { - I = i; + I = i; J = j; - while(I < A.size() && A[I] >= 0 && J < B.size() && B[J] >= 0) + while(I < A.size() && A[I] >= 0 && J < B.size() && B[J] >= 0) { ++I; ++J; } difflist.push_back(Segment(i,I,j,J,a,b)); i = I; j = J; } } - if (i < a.length() || j < b.length()) + if (i < a.length() || j < b.length()) difflist.push_back(Segment(i,a.length(),j,b.length(),a,b)); diffcnt.assign(noinit,0); for (size_t i = 0; i < difflist.size(); ++i) { Segment & s = difflist[i]; - if (s.match == insertion and + if (s.match == insertion and ((s.start_a and a[s.start_a - 1] == b[s.start_b]) or (s.end_a < a.length() and a[s.end_a] == b[s.start_b]))) { @@ -364,7 +364,7 @@ namespace stringdist sameletter = b[i] == b[i-1]; if (sameletter) s.match = duplication; } - else if (s.match == deletion and + else if (s.match == deletion and ((s.start_b and b[s.start_b - 1] == a[s.start_a]) or (s.end_b < b.length() and b[s.end_b] == a[s.start_a]))) { @@ -380,15 +380,15 @@ namespace stringdist void StringDiff:: - showDiff(std::ostream& out) + showDiff(std::ostream& out) { if (difflist.size() == 0) align(); vector<size_t> fromEnd(difflist.size(),0); for (int d = difflist.size()-1; d-- > 0;) { fromEnd[d] = a.length() - difflist[d].end_a; - // cout << d << " " << fromEnd[d] << " " - // << difflist[d].start_a << "-" + // cout << d << " " << fromEnd[d] << " " + // << difflist[d].start_a << "-" // << difflist[d].end_a << endl; } for (size_t d = 0; d < difflist.size(); ++d) @@ -402,7 +402,7 @@ namespace stringdist bseg.toUTF8String(bbuf); out << abuf << " "; out << bbuf << " "; - out << s.label() << " " + out << s.label() << " " << s.dist << " " << fromEnd[d] << endl; @@ -423,7 +423,7 @@ namespace stringdist { return difflist.at(i); } - + vector<int> const& StringDiff:: getFeatures() const diff --git a/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h b/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h index 43fb089f1..8dfcfb58a 100644 --- a/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h +++ b/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h @@ -21,15 +21,15 @@ using namespace std; //using namespace boost; using namespace ugdiss; -namespace stringdist +namespace stringdist { - float + float levenshtein(UChar const* a, size_t const lenA, UChar const* b, size_t const lenB); UErrorCode strip_accents(UnicodeString & trg); - float + float fillAlignmentMatrix(UChar const* a, size_t const lenA, UChar const* b, size_t const lenB, vector<vector<float> > & M); @@ -37,9 +37,9 @@ namespace stringdist class StringDiff { public: - enum MATCHTYPE + enum MATCHTYPE { - same, // a and b are identical + same, // a and b are identical cap, // a and b differ only in capitalization flip, // two-letter flip permutation, // a and b have same letters but in different order @@ -48,7 +48,7 @@ namespace stringdist insertion, // a is empty deletion, // b is empty mismatch, // none of the above - noinit // not initialized + noinit // not initialized }; struct Segment @@ -59,9 +59,9 @@ namespace stringdist MATCHTYPE match; float dist; Segment(); - Segment(size_t const as, size_t const ae, + Segment(size_t const as, size_t const ae, size_t const bs, size_t const be, - UnicodeString const& a, + UnicodeString const& a, UnicodeString const& b); char const* label() const; }; diff --git a/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc b/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc index 662493e18..b4565f99d 100644 --- a/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc +++ b/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc @@ -3,10 +3,10 @@ namespace Moses { ThreadSafeCounter:: ThreadSafeCounter() - : ctr(0) + : ctr(0) { } - size_t + size_t ThreadSafeCounter:: operator++() { @@ -14,21 +14,21 @@ namespace Moses return ++ctr; } - size_t + size_t ThreadSafeCounter:: operator++(int foo) { boost::lock_guard<boost::mutex> guard(this->lock); return ctr++; } - + ThreadSafeCounter:: operator size_t() const { return ctr; } - size_t + size_t ThreadSafeCounter:: operator--() { @@ -36,13 +36,13 @@ namespace Moses return --ctr; } - size_t + size_t ThreadSafeCounter:: operator--(int foo) { boost::lock_guard<boost::mutex> guard(this->lock); return ctr--; } - - + + } diff --git a/moses/TranslationModel/UG/mm/calc-coverage.cc b/moses/TranslationModel/UG/mm/calc-coverage.cc index ef17656d9..83f67220d 100644 --- a/moses/TranslationModel/UG/mm/calc-coverage.cc +++ b/moses/TranslationModel/UG/mm/calc-coverage.cc @@ -16,7 +16,7 @@ using namespace ugdiss; typedef L2R_Token<SimpleWordId> Token; TokenIndex V; sptr<vector<vector<Token> > > C(new vector<vector<Token> >()); -void +void add_file(string fname) { filtering_istream in; diff --git a/moses/TranslationModel/UG/mm/custom-pt.cc b/moses/TranslationModel/UG/mm/custom-pt.cc index 93c8c0eb0..1a51aa8a4 100644 --- a/moses/TranslationModel/UG/mm/custom-pt.cc +++ b/moses/TranslationModel/UG/mm/custom-pt.cc @@ -31,7 +31,7 @@ using namespace Moses; using namespace Moses::bitext; #define CACHING_THRESHOLD 1000 -#define lbop boost::math::binomial_distribution<>::find_lower_bound_on_p +#define lbop boost::math::binomial_distribution<>::find_lower_bound_on_p size_t mctr=0,xctr=0; typedef L2R_Token<SimpleWordId> Token; @@ -49,15 +49,15 @@ PScoreWC<Token> apply_wp; vector<float> fweights; void -nbest_phrasepairs(uint64_t const pid1, - pstats const& ps, +nbest_phrasepairs(uint64_t const pid1, + pstats const& ps, vector<PhrasePair> & nbest) { pstats::trg_map_t::const_iterator m; vector<size_t> idx(nbest.size()); size_t i=0; - for (m = ps.trg.begin(); - m != ps.trg.end() && i < nbest.size(); + for (m = ps.trg.begin(); + m != ps.trg.end() && i < nbest.size(); ++m) { // cout << m->second.rcnt() << " " << ps.good << endl; @@ -74,17 +74,17 @@ nbest_phrasepairs(uint64_t const pid1, ++i; } // cout << i << " " << nbest.size() << endl; - if (i < nbest.size()) + if (i < nbest.size()) { // cout << "Resizing from " << nbest.size() << " to " << i << endl; nbest.resize(i); idx.resize(i); } VectorIndexSorter<PhrasePair> sorter(nbest,greater<PhrasePair>()); - if (m != ps.trg.end()) + if (m != ps.trg.end()) { make_heap(idx.begin(),idx.end(),sorter); - PhrasePair cand; + PhrasePair cand; cand.init(pid1,ps,5); for (; m != ps.trg.end(); ++m) { @@ -104,7 +104,7 @@ nbest_phrasepairs(uint64_t const pid1, } sort(nbest.begin(),nbest.end(),greater<PhrasePair>()); } - + int main(int argc, char* argv[]) { // assert(argc == 4); @@ -120,8 +120,8 @@ int main(int argc, char* argv[]) string L2 = "en"; size_t max_samples = argc > 1 ? atoi(argv[1]) : 1000; #endif - char c = *base.rbegin(); - if (c != '/' && c != '.') + char c = *base.rbegin(); + if (c != '/' && c != '.') base += "."; fweights.resize(5,.25); @@ -138,7 +138,7 @@ int main(int argc, char* argv[]) string line; while (getline(cin,line)) { - vector<id_type> snt; + vector<id_type> snt; bt.V1->fillIdSeq(line,snt); for (size_t i = 0; i < snt.size(); ++i) { @@ -156,8 +156,8 @@ int main(int argc, char* argv[]) sptr<pstats> s = bt.lookup(m); for (size_t j = i; j <= k; ++j) cout << (*bt.V1)[snt[j]] << " "; - cout << s->good << "/" - << s->sample_cnt << "/" + cout << s->good << "/" + << s->sample_cnt << "/" << s->raw_cnt << endl; // vector<PhrasePair> nbest(min(s->trg.size(),size_t(20))); vector<PhrasePair> nbest(s->trg.size()); @@ -172,17 +172,17 @@ int main(int argc, char* argv[]) cout << " " << setw(6) << pp.score << " "; for (uint32_t i = off; i < stop; ++i) cout << (*bt.V2)[o[i].id()] << " "; - cout << pp.joint << "/" + cout << pp.joint << "/" << pp.raw1 << "/" << pp.raw2 << " |"; - BOOST_FOREACH(float f, pp.fvals) + BOOST_FOREACH(float f, pp.fvals) cout << " " << f; cout << endl; } } } } -#endif +#endif exit(0); } #endif diff --git a/moses/TranslationModel/UG/mm/mam2symal.cc b/moses/TranslationModel/UG/mm/mam2symal.cc index 9610e6f56..eb5034aab 100644 --- a/moses/TranslationModel/UG/mm/mam2symal.cc +++ b/moses/TranslationModel/UG/mm/mam2symal.cc @@ -22,7 +22,7 @@ typedef L2R_Token<Conll_Sform> Token; mmTtrack<char> MAM; bool with_sids; -void +void interpret_args(int ac, char* av[]) { po::variables_map vm; @@ -31,7 +31,7 @@ interpret_args(int ac, char* av[]) ("help,h", "print this message") ("numbers,n", po::bool_switch(&with_sids), "print sentence ids as first token") ; - + po::options_description h("Hidden Options"); h.add_options() ("mamfile", po::value<string>(&mamfile), "mamfile") @@ -40,7 +40,7 @@ interpret_args(int ac, char* av[]) po::positional_options_description a; a.add("mamfile",1); a.add("range",-1); - + po::store(po::command_line_parser(ac,av) .options(h.add(o)) .positional(a) @@ -56,11 +56,11 @@ interpret_args(int ac, char* av[]) } } -void +void printRangeMAM(size_t start, size_t stop) { for (;start < stop; start++) - { + { // size_t i = 0; char const* p = MAM.sntStart(start); char const* q = MAM.sntEnd(start); @@ -76,7 +76,7 @@ printRangeMAM(size_t start, size_t stop) } } -int +int main(int argc, char*argv[]) { interpret_args(argc,argv); @@ -91,7 +91,7 @@ main(int argc, char*argv[]) buf>>first; if (buf.peek() == '-') buf>>c>>last; else last = first; - if (last < MAM.size()) + if (last < MAM.size()) printRangeMAM(first,last+1); } } diff --git a/moses/TranslationModel/UG/mm/mam_verify.cc b/moses/TranslationModel/UG/mm/mam_verify.cc index d43539742..798baa947 100644 --- a/moses/TranslationModel/UG/mm/mam_verify.cc +++ b/moses/TranslationModel/UG/mm/mam_verify.cc @@ -21,7 +21,7 @@ mmTtrack<char> MAM; mmTtrack<Token> T1,T2; bool inv; vector<string> range; -void +void interpret_args(int ac, char* av[]) { po::variables_map vm; @@ -30,7 +30,7 @@ interpret_args(int ac, char* av[]) ("help,h", "print this message") ("inv,i", po::bool_switch(&inv), "inverse") ; - + po::options_description h("Hidden Options"); h.add_options() ("bname", po::value<string>(&bname), "base name") @@ -43,7 +43,7 @@ interpret_args(int ac, char* av[]) a.add("L1",1); a.add("L2",1); a.add("range",-1); - + po::store(po::command_line_parser(ac,av) .options(h.add(o)) .positional(a) @@ -87,7 +87,7 @@ check_range(size_t start, size_t stop) return noAln; } -int +int main(int argc, char*argv[]) { interpret_args(argc,argv); @@ -100,7 +100,7 @@ main(int argc, char*argv[]) exit(1); } size_t noAln; - if (!range.size()) + if (!range.size()) noAln = check_range(0, MAM.size()); else { @@ -112,7 +112,7 @@ main(int argc, char*argv[]) buf>>first; if (buf.peek() == '-') buf>>c>>last; else last = first; - if (last < MAM.size()) + if (last < MAM.size()) noAln += check_range(first,last+1); } } diff --git a/moses/TranslationModel/UG/mm/mmlex-build.cc b/moses/TranslationModel/UG/mm/mmlex-build.cc index 5e5ea194c..1e7bee5cb 100644 --- a/moses/TranslationModel/UG/mm/mmlex-build.cc +++ b/moses/TranslationModel/UG/mm/mmlex-build.cc @@ -1,8 +1,8 @@ // -*- c++ -*- // Program to extract word cooccurrence counts from a memory-mapped // word-aligned bitext stores the counts lexicon in the format for -// mm2dTable<uint32_t> (ug_mm_2d_table.h) -// +// mm2dTable<uint32_t> (ug_mm_2d_table.h) +// // (c) 2010-2012 Ulrich Germann // to do: multi-threading @@ -20,8 +20,8 @@ #include <boost/foreach.hpp> #include <boost/thread.hpp> #include <boost/math/distributions/binomial.hpp> -#include <boost/unordered_map.hpp> -#include <boost/unordered_set.hpp> +#include <boost/unordered_map.hpp> +#include <boost/unordered_set.hpp> #include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h" #include "moses/Util.h" @@ -36,7 +36,7 @@ using namespace boost::math; typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> LEX_t; typedef SimpleWordId Token; -// DECLARATIONS +// DECLARATIONS void interpret_args(int ac, char* av[]); mmTtrack<Token> T1,T2; @@ -52,7 +52,7 @@ struct Count Count(uint32_t ax, uint32_t cx) : a(ax), c(cx) {} }; -bool +bool operator<(pair<id_type,Count> const& a, pair<id_type,Count> const& b) { @@ -72,7 +72,7 @@ public: countlist_t & LEX; size_t offset; size_t skip; - Counter(countlist_t& lex, size_t o, size_t s) + Counter(countlist_t& lex, size_t o, size_t s) : LEX(lex), offset(o), skip(s) {} void processSentence(id_type sid); void operator()(); @@ -83,7 +83,7 @@ int verbose; size_t truncat; size_t num_threads; -void +void Counter:: operator()() { @@ -105,17 +105,17 @@ struct lexsorter { vector<countlist_t> const& v; id_type wid; - lexsorter(vector<countlist_t> const& vx, id_type widx) + lexsorter(vector<countlist_t> const& vx, id_type widx) : v(vx),wid(widx) {} bool operator()(pair<uint32_t,uint32_t> const& a, pair<uint32_t,uint32_t> const& b) const { - return (v.at(a.first).at(wid).at(a.second).first > + return (v.at(a.first).at(wid).at(a.second).first > v.at(b.first).at(wid).at(b.second).first); } }; -void +void writeTableHeader(ostream& out) { filepos_type idxOffset=0; @@ -159,7 +159,7 @@ void writeTable(ostream* aln_out, ostream* coc_out) H.pop_back(); else push_heap(H.begin(),H.end(),sorter); - while (H.size() && + while (H.size() && XLEX[H[0].first][id1].at(H[0].second).first == id2) { aln += XLEX[H[0].first][id1][H[0].second].second.a; @@ -178,7 +178,7 @@ void writeTable(ostream* aln_out, ostream* coc_out) numwrite(*aln_out,aln); m1a[id1] += aln; m2a[id2] += aln; - } + } if (coc_out && coc) { ++CellCountC; @@ -191,7 +191,7 @@ void writeTable(ostream* aln_out, ostream* coc_out) } idxa.back() = CellCountA; idxc.back() = CellCountC; - if (aln_out) + if (aln_out) { filepos_type idxOffsetA = aln_out->tellp(); BOOST_FOREACH(id_type foo, idxa) @@ -201,7 +201,7 @@ void writeTable(ostream* aln_out, ostream* coc_out) aln_out->seekp(0); numwrite(*aln_out,idxOffsetA); } - if (coc_out) + if (coc_out) { filepos_type idxOffsetC = coc_out->tellp(); BOOST_FOREACH(id_type foo, idxc) @@ -223,9 +223,9 @@ processSentence(id_type sid) Token const* e2 = T2.sntEnd(sid); vector<ushort> cnt1(V1.ksize(),0); vector<ushort> cnt2(V2.ksize(),0); - for (Token const* x = s1; x < e1; ++x) + for (Token const* x = s1; x < e1; ++x) ++cnt1.at(x->id()); - for (Token const* x = s2; x < e2; ++x) + for (Token const* x = s2; x < e2; ++x) ++cnt2.at(x->id()); boost::unordered_set<wpair> seen; @@ -257,21 +257,21 @@ processSentence(id_type sid) wpair k(id1,id2); Count& cnt = CNT[k]; cnt.a++; - if (seen.insert(k).second) + if (seen.insert(k).second) cnt.c += cnt1[id1] * cnt2[id2]; } // count unaliged words - for (size_t i = check1.find_first(); - i < check1.size(); + for (size_t i = check1.find_first(); + i < check1.size(); i = check1.find_next(i)) CNT[wpair((s1+i)->id(),0)].a++; - for (size_t i = check2.find_first(); - i < check2.size(); + for (size_t i = check2.find_first(); + i < check2.size(); i = check2.find_next(i)) CNT[wpair(0,(s2+i)->id())].a++; } -int +int main(int argc, char* argv[]) { interpret_args(argc,argv); @@ -299,7 +299,7 @@ main(int argc, char* argv[]) if (cooc.size()) coc_out.close(); } -void +void interpret_args(int ac, char* av[]) { namespace po=boost::program_options; @@ -321,7 +321,7 @@ interpret_args(int ac, char* av[]) ("truncate,n", po::value<size_t>(&truncat)->default_value(0), "truncate corpus to <N> sentences (for debugging)") ; - + h.add_options() ("bname", po::value<string>(&bname), "base name") ("L1", po::value<string>(&L1),"L1 tag") diff --git a/moses/TranslationModel/UG/mm/mmlex-lookup.cc b/moses/TranslationModel/UG/mm/mmlex-lookup.cc index fbdceeaa0..3ba9ef492 100644 --- a/moses/TranslationModel/UG/mm/mmlex-lookup.cc +++ b/moses/TranslationModel/UG/mm/mmlex-lookup.cc @@ -1,8 +1,8 @@ // -*- c++ -*- // Program to extract word cooccurrence counts from a memory-mapped // word-aligned bitext stores the counts lexicon in the format for -// mm2dTable<uint32_t> (ug_mm_2d_table.h) -// +// mm2dTable<uint32_t> (ug_mm_2d_table.h) +// // (c) 2010-2012 Ulrich Germann // to do: multi-threading @@ -20,8 +20,8 @@ #include <boost/foreach.hpp> #include <boost/thread.hpp> #include <boost/math/distributions/binomial.hpp> -#include <boost/unordered_map.hpp> -#include <boost/unordered_set.hpp> +#include <boost/unordered_map.hpp> +#include <boost/unordered_set.hpp> #include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h" #include "ug_mm_2d_table.h" @@ -35,7 +35,7 @@ using namespace boost::math; typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> LEX_t; typedef SimpleWordId Token; -// DECLARATIONS +// DECLARATIONS void interpret_args(int ac, char* av[]); string swrd,twrd,L1,L2,bname; @@ -43,7 +43,7 @@ TokenIndex V1,V2; LEX_t LEX; -void +void lookup_source(ostream& out, id_type r) { vector<LEX_t::Cell> foo(LEX[r].start,LEX[r].stop); @@ -57,7 +57,7 @@ lookup_source(ostream& out, id_type r) } } -void +void lookup_target(ostream& out, id_type c) { vector<LEX_t::Cell> foo; @@ -65,7 +65,7 @@ lookup_target(ostream& out, id_type c) for (size_t r = 0; r < LEX.numRows; ++r) { size_t j = LEX[r][c]; - if (j) + if (j) { cell.id = r; cell.val = j; @@ -82,7 +82,7 @@ lookup_target(ostream& out, id_type c) } } -void +void dump(ostream& out) { for (size_t r = 0; r < LEX.numRows; ++r) @@ -91,7 +91,7 @@ dump(ostream& out) } -int +int main(int argc, char* argv[]) { interpret_args(argc,argv); @@ -100,14 +100,14 @@ main(int argc, char* argv[]) V1.open(bname+L1+".tdx"); V2.open(bname+L2+".tdx"); LEX.open(bname+L1+"-"+L2+".lex"); - + cout.precision(2); id_type swid = V1[swrd]; id_type twid = V2[twrd]; if (swid != 1 && twid != 1) { - cout << swrd << " " << twrd << " " - << LEX.m1(swid) << " / " + cout << swrd << " " << twrd << " " + << LEX.m1(swid) << " / " << LEX[swid][twid] << " / " << LEX.m2(twid) << endl; } @@ -119,7 +119,7 @@ main(int argc, char* argv[]) dump(cout); } -void +void interpret_args(int ac, char* av[]) { namespace po=boost::program_options; @@ -133,7 +133,7 @@ interpret_args(int ac, char* av[]) ("source,s",po::value<string>(&swrd),"source word") ("target,t",po::value<string>(&twrd),"target word") ; - + h.add_options() ("bname", po::value<string>(&bname), "base name") ("L1", po::value<string>(&L1),"L1 tag") diff --git a/moses/TranslationModel/UG/mm/mtt-build.cc b/moses/TranslationModel/UG/mm/mtt-build.cc index f49895ebf..a61cbac3f 100644 --- a/moses/TranslationModel/UG/mm/mtt-build.cc +++ b/moses/TranslationModel/UG/mm/mtt-build.cc @@ -46,8 +46,8 @@ bool quiet = false; // no progress reporting string vocabBase; // base name for existing vocabs that should be used string baseName; // base name for all files -string tmpFile, mttFile; /* name of temporary / actual track file - * (.mtt for Conll format, .mct for plain text) +string tmpFile, mttFile; /* name of temporary / actual track file + * (.mtt for Conll format, .mct for plain text) */ string UNK; @@ -60,7 +60,7 @@ void interpret_args(int ac, char* av[]); inline uchar rangeCheck(int p, int limit) { return p < limit ? p : 1; } -id_type +id_type get_id(TokenIndex const& T, string const& w) { id_type ret = T[w]; @@ -73,21 +73,21 @@ get_id(TokenIndex const& T, string const& w) return ret; } -void +void open_vocab(TokenIndex& T, string fname) { - if (!access(fname.c_str(), F_OK)) - { - T.open(fname,UNK); - assert(T[UNK] == 1); + if (!access(fname.c_str(), F_OK)) + { + T.open(fname,UNK); + assert(T[UNK] == 1); } else T.setUnkLabel(UNK); if (incremental) T.setDynamic(true); - assert(T["NULL"] == 0); + assert(T["NULL"] == 0); assert(T[UNK] == 1); } -void +void ini_cnt_vec(TokenIndex const& T, vector<pair<string,size_t> > & v) { v.resize(T.totalVocabSize()); @@ -142,7 +142,7 @@ void fill_rec(Conll_Record& rec, vector<string> const& w) else if (w.size() >= 8) // CONLL format { int id = atoi(w[0].c_str()); - int gov = atoi(w[6].c_str()); + int gov = atoi(w[6].c_str()); rec.sform = get_id(SF, w[1]); rec.lemma = get_id(LM, w[2]); rec.majpos = rangeCheck(get_id(PS, w[3]), 256); @@ -161,12 +161,12 @@ void log_progress(size_t ctr) } else if (ctr % 10000 == 0) { - cerr << "."; + cerr << "."; } } -size_t +size_t process_plain_input(ostream& out, vector<id_type> & s_index) { id_type totalWords = 0; @@ -176,7 +176,7 @@ process_plain_input(ostream& out, vector<id_type> & s_index) istringstream buf(line); if (!quiet) log_progress(s_index.size()); s_index.push_back(totalWords); - while (buf>>w) + while (buf>>w) { numwrite(out,get_id(SF,w)); ++totalWords; @@ -186,9 +186,9 @@ process_plain_input(ostream& out, vector<id_type> & s_index) return totalWords; } -size_t -process_tagged_input(ostream& out, - vector<id_type> & s_index, +size_t +process_tagged_input(ostream& out, + vector<id_type> & s_index, vector<id_type> & p_index) { string line; @@ -196,7 +196,7 @@ process_tagged_input(ostream& out, bool new_sent = true; bool new_par = true; id_type totalWords = 0; - + while (getline(cin,line)) { vector<string> w; string f; istringstream buf(line); @@ -205,7 +205,7 @@ process_tagged_input(ostream& out, if (w.size() == 0 || starts_with(w[0], "SID=")) new_sent = true; - else if (w.size() == 1 && w[0] == "<P>") + else if (w.size() == 1 && w[0] == "<P>") new_par = new_sent = true; if (w.size() < 3) continue; @@ -244,7 +244,7 @@ numberize() index = &p_index; } - if (!quiet) + if (!quiet) cerr << endl << "Writing index ... (" << index->size() << " chunks) "; startIdx = out.tellp(); @@ -261,7 +261,7 @@ numberize() vector<id_type> smap,lmap,pmap,dmap; -void +void invert(vector<id_type> const& from, vector<id_type> & to) { to.resize(from.size()); @@ -269,11 +269,11 @@ invert(vector<id_type> const& from, vector<id_type> & to) to[from[i]] = i; } -// sorts new items based on occurrence counts but won't reassign +// sorts new items based on occurrence counts but won't reassign // existing token ids -void -conservative_sort(TokenIndex const & V, - vector<size_t> const & cnt, +void +conservative_sort(TokenIndex const & V, + vector<size_t> const & cnt, vector<id_type> & xmap) { xmap.resize(V.totalVocabSize()); @@ -344,21 +344,21 @@ void save_vocabs() string vbase = baseName; if (is_conll) { - if (SF.totalVocabSize() > SF.knownVocabSize()) + if (SF.totalVocabSize() > SF.knownVocabSize()) write_tokenindex(vbase+".tdx.sfo",SF,smap); - if (LM.totalVocabSize() > LM.knownVocabSize()) + if (LM.totalVocabSize() > LM.knownVocabSize()) write_tokenindex(vbase+".tdx.lem",LM,lmap); - if (PS.totalVocabSize() > PS.knownVocabSize()) + if (PS.totalVocabSize() > PS.knownVocabSize()) write_tokenindex(vbase+".tdx.pos",PS,pmap); - if (DT.totalVocabSize() > DT.knownVocabSize()) + if (DT.totalVocabSize() > DT.knownVocabSize()) write_tokenindex(vbase+".tdx.drl",DT,dmap); } - else if (SF.totalVocabSize() > SF.knownVocabSize()) + else if (SF.totalVocabSize() > SF.knownVocabSize()) write_tokenindex(vbase+".tdx",SF,smap); } template<typename Token> -size_t +size_t build_mmTSA(string infile, string outfile) { size_t mypid = fork(); @@ -371,14 +371,14 @@ build_mmTSA(string infile, string outfile) exit(0); } -bool +bool build_plaintext_tsas() { typedef L2R_Token<SimpleWordId> L2R; typedef R2L_Token<SimpleWordId> R2L; size_t c = with_sfas + with_pfas; - if (with_sfas) build_mmTSA<L2R>(tmpFile, baseName + ".sfa"); - if (with_pfas) build_mmTSA<R2L>(tmpFile, baseName + ".pfa"); + if (with_sfas) build_mmTSA<L2R>(tmpFile, baseName + ".sfa"); + if (with_pfas) build_mmTSA<R2L>(tmpFile, baseName + ".pfa"); while (c--) wait(NULL); return true; } @@ -388,27 +388,27 @@ void build_conll_tsas() string bn = baseName; string mtt = tmpFile; size_t c = 3 * (with_sfas + with_pfas + with_dcas); - if (with_sfas) + if (with_sfas) { build_mmTSA<L2R_Token<Conll_Sform> >(mtt,bn+".sfa-sform"); build_mmTSA<L2R_Token<Conll_Lemma> >(mtt,bn+".sfa-lemma"); build_mmTSA<L2R_Token<Conll_MinPos> >(mtt,bn+".sfa-minpos"); } - if (with_pfas) + if (with_pfas) { build_mmTSA<R2L_Token<Conll_Sform> >(mtt,bn+".pfa-sform"); build_mmTSA<R2L_Token<Conll_Lemma> >(mtt,bn+".pfa-lemma"); build_mmTSA<R2L_Token<Conll_MinPos> >(mtt,bn+".pfa-minpos"); } - if (with_dcas) + if (with_dcas) { - build_mmTSA<ConllBottomUpToken<Conll_Sform> >(mtt,bn+".dca-sform"); - build_mmTSA<ConllBottomUpToken<Conll_Lemma> >(mtt,bn+".dca-lemma"); + build_mmTSA<ConllBottomUpToken<Conll_Sform> >(mtt,bn+".dca-sform"); + build_mmTSA<ConllBottomUpToken<Conll_Lemma> >(mtt,bn+".dca-lemma"); build_mmTSA<ConllBottomUpToken<Conll_MinPos> >(mtt,bn+".dca-minpos"); } - while (c--) wait(NULL); + while (c--) wait(NULL); } @@ -430,7 +430,7 @@ int main(int argc, char* argv[]) rename(tmpFile.c_str(),mttFile.c_str()); } -void +void interpret_args(int ac, char* av[]) { po::variables_map vm; @@ -439,10 +439,10 @@ interpret_args(int ac, char* av[]) ("help,h", "print this message") - ("quiet,q", po::bool_switch(&quiet), + ("quiet,q", po::bool_switch(&quiet), "don't print progress information") - ("incremental,i", po::bool_switch(&incremental), + ("incremental,i", po::bool_switch(&incremental), "incremental mode; rewrites vocab files!") ("vocab-base,v", po::value<string>(&vocabBase), @@ -451,15 +451,15 @@ interpret_args(int ac, char* av[]) ("output,o", po::value<string>(&baseName), "base file name of the resulting file(s)") - ("sfa,s", po::value<int>(&with_sfas)->default_value(1), + ("sfa,s", po::value<int>(&with_sfas)->default_value(1), "also build suffix arrays") ("pfa,p", po::value<int>(&with_pfas) - ->default_value(0)->implicit_value(1), + ->default_value(0)->implicit_value(1), "also build prefix arrays") ("dca,d", po::value<int>(&with_dcas) - ->default_value(0)->implicit_value(1), + ->default_value(0)->implicit_value(1), "also build dependency chain arrays") ("conll,c", po::bool_switch(&is_conll), @@ -468,18 +468,18 @@ interpret_args(int ac, char* av[]) ("unk,u", po::value<string>(&UNK)->default_value("UNK"), "label for unknown tokens") - // ("map,m", po::value<string>(&vmap), + // ("map,m", po::value<string>(&vmap), // "map words to word classes for indexing") - + ; - + po::options_description h("Hidden Options"); h.add_options() ; h.add(o); po::positional_options_description a; a.add("output",1); - + po::store(po::command_line_parser(ac,av) .options(h) .positional(a) @@ -487,7 +487,7 @@ interpret_args(int ac, char* av[]) po::notify(vm); if (vm.count("help") || !vm.count("output")) { - cout << "\nusage:\n\t cat <corpus> | " << av[0] + cout << "\nusage:\n\t cat <corpus> | " << av[0] << " [options] <output .mtt file>" << endl; cout << o << endl; exit(0); diff --git a/moses/TranslationModel/UG/mm/mtt-count-words.cc b/moses/TranslationModel/UG/mm/mtt-count-words.cc index c9b435477..223ba2090 100644 --- a/moses/TranslationModel/UG/mm/mtt-count-words.cc +++ b/moses/TranslationModel/UG/mm/mtt-count-words.cc @@ -36,7 +36,7 @@ int main(int argc, char* argv[]) { interpret_args(argc,argv); T.open(bname+".mct"); - V.open(bname+".tdx"); + V.open(bname+".tdx"); vector<size_t> cnt(V.ksize(),0); for (size_t sid = 0; sid < T.size(); ++sid) { @@ -48,7 +48,7 @@ int main(int argc, char* argv[]) exit(0); } -void +void interpret_args(int ac, char* av[]) { namespace po=boost::program_options; @@ -60,7 +60,7 @@ interpret_args(int ac, char* av[]) o.add_options() ("help,h", "print this message") ; - + h.add_options() ("bname", po::value<string>(&bname), "base name") ; diff --git a/moses/TranslationModel/UG/mm/mtt-demo1.cc b/moses/TranslationModel/UG/mm/mtt-demo1.cc index a253e9ed3..d3506fa0f 100644 --- a/moses/TranslationModel/UG/mm/mtt-demo1.cc +++ b/moses/TranslationModel/UG/mm/mtt-demo1.cc @@ -21,17 +21,17 @@ int main(int argc, char* argv[]) using namespace std; if (argc < 3) { - cerr << "usage: " << argv[0] << " <track base name> lookup word sequence" + cerr << "usage: " << argv[0] << " <track base name> lookup word sequence" << endl; } string base = argv[1]; - TokenIndex V; + TokenIndex V; V.open(base+".tdx"); - boost::shared_ptr<mmTtrack<Token> > T(new mmTtrack<Token>()); + boost::shared_ptr<mmTtrack<Token> > T(new mmTtrack<Token>()); T->open(base+".mct"); mmTSA<Token> I; I.open(base+".sfa",T); mmTSA<Token>::tree_iterator m(&I); - + // look up the search string m.extend() returns true upon success for (int i = 2; i < argc && m.extend(V[argv[i]]); ++i); if (int(m.size() + 2) < argc) @@ -39,7 +39,7 @@ int main(int argc, char* argv[]) cerr << "NOT FOUND" << endl; exit(1); } - + tsa::ArrayEntry e(m.lower_bound(-1)); char const* stop = m.upper_bound(-1); do diff --git a/moses/TranslationModel/UG/mm/mtt-dump.cc b/moses/TranslationModel/UG/mm/mtt-dump.cc index b7d85d623..eea1bb400 100644 --- a/moses/TranslationModel/UG/mm/mtt-dump.cc +++ b/moses/TranslationModel/UG/mm/mtt-dump.cc @@ -25,7 +25,7 @@ bool sform; bool have_mtt, have_mct; bool with_sids; bool with_positions; -void +void interpret_args(int ac, char* av[]) { po::variables_map vm; @@ -36,7 +36,7 @@ interpret_args(int ac, char* av[]) ("sform,s", po::bool_switch(&sform), "sform only") ("with-positions,p", po::bool_switch(&with_positions), "show word positions") ; - + po::options_description h("Hidden Options"); h.add_options() ("bname", po::value<string>(&bname), "base name") @@ -45,7 +45,7 @@ interpret_args(int ac, char* av[]) po::positional_options_description a; a.add("bname",1); a.add("range",-1); - + po::store(po::command_line_parser(ac,av) .options(h.add(o)) .positional(a) @@ -63,11 +63,11 @@ interpret_args(int ac, char* av[]) mct = bname+".mct"; } -void +void printRangeMTT(size_t start, size_t stop) { for (;start < stop; start++) - { + { size_t i = 0; Token const* s = MTT.sntStart(start); Token const* e = MTT.sntEnd(start); @@ -92,7 +92,7 @@ printRangeMTT(size_t start, size_t stop) cout << i+t->parent << " "; cout << DT[t->dtype] << endl; } - else + else { if (with_positions) cout << t-s << ":"; cout << SF[t->id()] << " "; @@ -102,16 +102,16 @@ printRangeMTT(size_t start, size_t stop) } } -void +void printRangeMCT(size_t start, size_t stop) { for (;start < stop; start++) - { + { SimpleWordId const* s = MCT.sntStart(start); SimpleWordId const* t = s; SimpleWordId const* e = MCT.sntEnd(start); if (with_sids) cout << start << " "; - while (t < e) + while (t < e) { if (with_positions) cout << t-s << ":"; cout << SF[(t++)->id()] << " "; @@ -120,7 +120,7 @@ printRangeMCT(size_t start, size_t stop) } } -int +int main(int argc, char*argv[]) { interpret_args(argc,argv); @@ -139,14 +139,14 @@ main(int argc, char*argv[]) DT.open(bname+".tdx.drl"); DT.iniReverseIndex(); MTT.open(mtt); } - else + else { sform = true; SF.open(bname+".tdx"); SF.iniReverseIndex(); MCT.open(mct); } - - if (!range.size()) + + if (!range.size()) have_mtt ? printRangeMTT(0, MTT.size()) : printRangeMCT(0, MCT.size()); else { @@ -157,9 +157,9 @@ main(int argc, char*argv[]) buf>>first; if (buf.peek() == '-') buf>>c>>last; else last = first; - if (have_mtt && last < MTT.size()) + if (have_mtt && last < MTT.size()) printRangeMTT(first,last+1); - else if (last < MCT.size()) + else if (last < MCT.size()) printRangeMCT(first,last+1); } } diff --git a/moses/TranslationModel/UG/mm/mtt.count.cc b/moses/TranslationModel/UG/mm/mtt.count.cc index 423c12ec7..1e2382f67 100644 --- a/moses/TranslationModel/UG/mm/mtt.count.cc +++ b/moses/TranslationModel/UG/mm/mtt.count.cc @@ -36,14 +36,14 @@ bool echo; int main(int argc, char* argv[]) { interpret_args(argc,argv); - + T.open(bname+".mct"); V.open(bname+".tdx"); V.iniReverseIndex(); I.open(bname+".sfa",&T); string line; while (getline(cin,line)) { - vector<id_type> phr; + vector<id_type> phr; V.fillIdSeq(line,phr); TSA<Token>::tree_iterator m(&I); size_t i = 0; @@ -55,7 +55,7 @@ int main(int argc, char* argv[]) exit(0); } -void +void interpret_args(int ac, char* av[]) { namespace po=boost::program_options; @@ -68,7 +68,7 @@ interpret_args(int ac, char* av[]) ("help,h", "print this message") ("echo,e", po::bool_switch(&echo), "repeat lookup phrases") ; - + h.add_options() ("bname", po::value<string>(&bname), "base name") ; diff --git a/moses/TranslationModel/UG/mm/num_read_write.cc b/moses/TranslationModel/UG/mm/num_read_write.cc index 403f7d300..5c281d9dd 100644 --- a/moses/TranslationModel/UG/mm/num_read_write.cc +++ b/moses/TranslationModel/UG/mm/num_read_write.cc @@ -2,7 +2,7 @@ namespace ugdiss { typedef unsigned char uchar; - void + void numwrite(std::ostream& out, uint16_t const& x) { char buf[2]; @@ -11,7 +11,7 @@ namespace ugdiss { out.write(buf,2); } - void + void numwrite(std::ostream& out, uint32_t const& x) { char buf[4]; @@ -22,7 +22,7 @@ namespace ugdiss { out.write(buf,4); } - void + void numwrite(std::ostream& out, uint64_t const& x) { char buf[8]; @@ -37,7 +37,7 @@ namespace ugdiss { out.write(buf,8); } - char const* + char const* numread(char const* src, uint16_t & x) { uchar const* d = reinterpret_cast<uchar const*>(src); @@ -45,28 +45,28 @@ namespace ugdiss { return src+2; } - char const* + char const* numread(char const* src, uint32_t & x) { uchar const* d = reinterpret_cast<uchar const*>(src); - x = ((uint32_t(d[0])<<0) | - (uint32_t(d[1])<<8) | - (uint32_t(d[2])<<16)| + x = ((uint32_t(d[0])<<0) | + (uint32_t(d[1])<<8) | + (uint32_t(d[2])<<16)| (uint32_t(d[3])<<24)); return src+4; } - char const* + char const* numread(char const* src, uint64_t & x) { uchar const* d = reinterpret_cast<uchar const*>(src); - x = ((uint64_t(d[0])<<0) | - (uint64_t(d[1])<<8) | - (uint64_t(d[2])<<16) | + x = ((uint64_t(d[0])<<0) | + (uint64_t(d[1])<<8) | + (uint64_t(d[2])<<16) | (uint64_t(d[3])<<24) | - (uint64_t(d[4])<<32) | - (uint64_t(d[5])<<40) | - (uint64_t(d[6])<<48) | + (uint64_t(d[4])<<32) | + (uint64_t(d[5])<<40) | + (uint64_t(d[6])<<48) | (uint64_t(d[7])<<56)); return src+8; } diff --git a/moses/TranslationModel/UG/mm/num_read_write.h b/moses/TranslationModel/UG/mm/num_read_write.h index 6fdcecc81..f83e1c982 100644 --- a/moses/TranslationModel/UG/mm/num_read_write.h +++ b/moses/TranslationModel/UG/mm/num_read_write.h @@ -14,11 +14,11 @@ namespace ugdiss { void numwrite(std::ostream& out, uint16_t const& x); void numwrite(std::ostream& out, uint32_t const& x); void numwrite(std::ostream& out, uint64_t const& x); - + char const* numread(char const* src, uint16_t & x); char const* numread(char const* src, uint32_t & x); char const* numread(char const* src, uint64_t & x); - + // template<typename uintNumber> // void // numwrite(std::ostream& out, uintNumber const& x) @@ -54,7 +54,7 @@ namespace ugdiss { // case 8: x = bswap_64(x); break; // default: break; // } -// #endif +// #endif // } // template<typename uintNumber> @@ -71,7 +71,7 @@ namespace ugdiss { // case 8: x = bswap_64(x); break; // default: break; // } -// #endif +// #endif // return src+sizeof(uintNumber); // } } // end of namespace ugdiss diff --git a/moses/TranslationModel/UG/mm/obsolete/ug_bitext_base.h b/moses/TranslationModel/UG/mm/obsolete/ug_bitext_base.h index 1810027af..e5e9ca88c 100644 --- a/moses/TranslationModel/UG/mm/obsolete/ug_bitext_base.h +++ b/moses/TranslationModel/UG/mm/obsolete/ug_bitext_base.h @@ -39,8 +39,8 @@ namespace Moses { class jstats; // phrase pair ("joint") statistics class agenda { - boost::mutex lock; - boost::condition_variable ready; + boost::mutex lock; + boost::condition_variable ready; class job; class worker; list<job> joblist; @@ -52,9 +52,9 @@ namespace Moses { agenda(bitext_base const& bitext); ~agenda(); void add_workers(int n); - sptr<pstats> add_job(mmbitext::iter const& phrase, + sptr<pstats> add_job(mmbitext::iter const& phrase, size_t const max_samples); - bool get_task(uint64_t & sid, uint64_t & offset, uint64_t & len, + bool get_task(uint64_t & sid, uint64_t & offset, uint64_t & len, bool & fwd, sptr<bitext_base::pstats> & stats); }; @@ -65,22 +65,22 @@ namespace Moses { mmTtrack<char> Tx; // word alignments mmTtrack<Token> T1,T2; // token tracks TokenIndex V1,V2; // vocabs - mmTSA<Token> I1,I2; // suffix arrays + mmTSA<Token> I1,I2; // suffix arrays /// given the source phrase sid[start:stop] - // find the possible start (s1 .. s2) and end (e1 .. e2) + // find the possible start (s1 .. s2) and end (e1 .. e2) // points of the target phrase; if non-NULL, store word - // alignments in *core_alignment. If /flip/, source phrase is + // alignments in *core_alignment. If /flip/, source phrase is // L2. - bool + bool find_trg_phr_bounds - (size_t const sid, size_t const start, size_t const stop, - size_t & s1, size_t & s2, size_t & e1, size_t & e2, + (size_t const sid, size_t const start, size_t const stop, + size_t & s1, size_t & s2, size_t & e1, size_t & e2, vector<uchar> * core_alignment, bool const flip) const; boost::unordered_map<uint64_t,sptr<pstats> > cache1,cache2; private: - sptr<pstats> + sptr<pstats> prep2(iter const& phrase); public: mmbitext(); @@ -98,8 +98,8 @@ namespace Moses { jstats { uint32_t my_rcnt; // unweighted count - float my_wcnt; // weighted count - vector<pair<size_t, vector<uchar> > > my_aln; + float my_wcnt; // weighted count + vector<pair<size_t, vector<uchar> > > my_aln; boost::mutex lock; public: jstats(); @@ -110,22 +110,22 @@ namespace Moses { void add(float w, vector<uchar> const& a); }; - struct + struct mmbitext:: pstats { boost::mutex lock; // for parallel gathering of stats boost::condition_variable ready; // consumers can wait for this data structure to be ready. - size_t raw_cnt; // (approximate) raw occurrence count + size_t raw_cnt; // (approximate) raw occurrence count size_t sample_cnt; // number of instances selected during sampling size_t good; // number of selected instances with valid word alignments size_t sum_pairs; - // size_t snt_cnt; + // size_t snt_cnt; // size_t sample_snt; size_t in_progress; // keeps track of how many threads are currently working on this boost::unordered_map<uint64_t, jstats> trg; - pstats(); + pstats(); // vector<phrase> nbest; // void select_nbest(size_t const N=10); void release(); @@ -142,7 +142,7 @@ namespace Moses { public: worker(agenda& a); void operator()(); - + }; class diff --git a/moses/TranslationModel/UG/mm/symal2mam.cc b/moses/TranslationModel/UG/mm/symal2mam.cc index 631d4ae07..6d0af57b0 100644 --- a/moses/TranslationModel/UG/mm/symal2mam.cc +++ b/moses/TranslationModel/UG/mm/symal2mam.cc @@ -2,9 +2,9 @@ // program to convert GIZA-style alignments into memory-mapped format // (c) 2010 Ulrich Germann -// Reads from stdin a file with alternating lines: sentence lengths and symal output. -// We need the sentence lenghts for sanity checks, because GIZA alignment might skip -// sentences. If --skip, we skip such sentence pairs, otherwise, we leave the word +// Reads from stdin a file with alternating lines: sentence lengths and symal output. +// We need the sentence lenghts for sanity checks, because GIZA alignment might skip +// sentences. If --skip, we skip such sentence pairs, otherwise, we leave the word // alignment matrix blank. #include "ug_mm_ttrack.h" @@ -24,7 +24,7 @@ #include "util/exception.hh" // #include "headers-base/util/check.hh" -// NOTE TO SELF: +// NOTE TO SELF: /* Program to filter out sentences that GIZA will skip or truncate, * i.e. sentences longer than 100 words or sentence pairs with a length */ @@ -42,7 +42,7 @@ TokenIndex V1; string mtt1name,mtt2name,o1name,o2name,mamname,cfgFile; string dataFormat,A3filename; -void +void interpret_args(int ac, char* av[]) { namespace po=boost::program_options; @@ -63,7 +63,7 @@ interpret_args(int ac, char* av[]) ("t2", po::value<string>(&mtt2name), "file name of L2 mapped token track") ("format,F", po::value<string>(&dataFormat)->default_value("plain"), "data format (plain or conll)") ; - + h.add_options() ("mamname", po::value<string>(&mamname), "name of output file for mam") ; @@ -76,8 +76,8 @@ interpret_args(int ac, char* av[]) if (vm.count("help") || mamname.empty()) { cout << "usage:\n" - << "\t\n" - << "\t ... | " << av[0] + << "\t\n" + << "\t ... | " << av[0] << " <.mam file> \n" << endl; cout << o << endl; cout << "If an A3 file is given (as produced by (m)giza), symal2mam performs\n" @@ -117,8 +117,8 @@ procSymalLine(string const& line, ostream& out) { cerr << a << "-" << b << " " << len1 << "/" << len2 << endl; } - assert(len1 == 0 || a<len1); - assert(len2 == 0 || b<len2); + assert(len1 == 0 || a<len1); + assert(len2 == 0 || b<len2); binwrite(out,a); binwrite(out,b); } @@ -138,7 +138,7 @@ void finiMAM(ofstream& out, vector<id_type>& idx, id_type numTok) out.close(); } -void +void finalize(ofstream& out, vector<id_type> const& idx, id_type tokenCount) { id_type idxSize = idx.size(); @@ -184,7 +184,7 @@ go() while(getline(cin,line)) { idxm.push_back(procSymalLine(line,mam)); - if (debug && ++ctr%100000==0) + if (debug && ++ctr%100000==0) cerr << ctr/1000 << "K lines processed" << endl; } finiMAM(mam,idxm,0); @@ -208,20 +208,20 @@ go(string t1name, string t2name, string A3filename) for (sid = 0; sid < T1.size(); ++sid) { - len1 = T1.sntLen(sid); + len1 = T1.sntLen(sid); len2 = T2.sntLen(sid); - if (debug) - cerr << "[" << lineCtr << "] " - << len1 << " (" << check1 << ") / " + if (debug) + cerr << "[" << lineCtr << "] " + << len1 << " (" << check1 << ") / " << len2 << " (" << check2 << ")" << endl; - if ((check1 >=0 && check1!=len1) || + if ((check1 >=0 && check1!=len1) || (check2 >=0 && check2!=len2)) { if (skip) { - cerr << "[" << ++skipCtr << "] skipping " - << check1 << "/" << check2 << " vs. " - << len1 << "/" << len2 + cerr << "[" << ++skipCtr << "] skipping " + << check1 << "/" << check2 << " vs. " + << len1 << "/" << len2 << " at line " << lineCtr << endl; } else @@ -238,9 +238,9 @@ go(string t1name, string t2name, string A3filename) } if (skip) { - idx1.push_back(tokenCount1 += len1); + idx1.push_back(tokenCount1 += len1); copySentence(T1,sid,t1out); - idx2.push_back(tokenCount2 += len2); + idx2.push_back(tokenCount2 += len2); copySentence(T2,sid,t2out); } @@ -250,7 +250,7 @@ go(string t1name, string t2name, string A3filename) lineCtr++; idxm.push_back(procSymalLine(line,mam)); if (debug) cerr << "[" << lineCtr << "] " - << check1 << " (" << len1 <<") " + << check1 << " (" << len1 <<") " << check2 << " (" << len2 <<") " << line << endl; getCheckValues(A3file,check1,check2); @@ -264,7 +264,7 @@ go(string t1name, string t2name, string A3filename) cout << idxm.size() << endl; } -void +void initialize(ofstream& out, string const& fname) { out.open(fname.c_str()); diff --git a/moses/TranslationModel/UG/mm/tpt_pickler.cc b/moses/TranslationModel/UG/mm/tpt_pickler.cc index c23913fc2..353e5b901 100644 --- a/moses/TranslationModel/UG/mm/tpt_pickler.cc +++ b/moses/TranslationModel/UG/mm/tpt_pickler.cc @@ -73,45 +73,45 @@ namespace ugdiss data += T(c&mask) << 63; } - void - binwrite(std::ostream& out, unsigned char data) - { + void + binwrite(std::ostream& out, unsigned char data) + { binwrite_unsigned_integer(out, data); } - void + void binwrite(std::ostream& out, unsigned short data) - { + { binwrite_unsigned_integer(out, data); } - void + void binwrite(std::ostream& out, unsigned long data) - { + { binwrite_unsigned_integer(out, data); } - void + void binwrite(std::ostream& out, unsigned long long data) - { + { binwrite_unsigned_integer(out, data); } #if __WORDSIZE == 64 - void + void binwrite(std::ostream& out, unsigned int data) - { + { binwrite_unsigned_integer(out, data); } -#else - void +#else + void binwrite(std::ostream& out, size_t data) - { + { binwrite_unsigned_integer(out, data); } #endif - void + void binread(std::istream& in, unsigned short& data) { assert(sizeof(data)==2); @@ -127,7 +127,7 @@ namespace ugdiss data += uint16_t(c&mask) << 14; } - void + void binread(std::istream& in, unsigned int& data) { assert(sizeof(data) == 4); @@ -149,7 +149,7 @@ namespace ugdiss data += uint32_t(c&mask) << 28; } - void + void binread(std::istream& in, unsigned long& data) { #if __WORDSIZE == 32 @@ -185,16 +185,16 @@ namespace ugdiss data += static_cast<unsigned long long>(c&mask) << 49; if (c < 0) return; in.get(c); - + data += static_cast<unsigned long long>(c&mask) << 56; if (c < 0) return; in.get(c); - + data += static_cast<unsigned long long>(c&mask) << 63; #endif } - void + void binread(std::istream& in, unsigned long long& data) { assert(sizeof(unsigned long long)==8); @@ -231,14 +231,14 @@ namespace ugdiss } // writing and reading strings ... - void + void binwrite(std::ostream& out, std::string const& s) { size_t len = s.size(); ugdiss::binwrite(out,len); out.write(s.c_str(),len); } - + void binread(std::istream& in, std::string& s) { @@ -250,28 +250,28 @@ namespace ugdiss buf[len] = 0; s = buf; } - + void binwrite(std::ostream& out, float x) - { - // IMPORTANT: this is not robust against the big/little endian - // issue. - out.write(reinterpret_cast<char*>(&x),sizeof(float)); + { + // IMPORTANT: this is not robust against the big/little endian + // issue. + out.write(reinterpret_cast<char*>(&x),sizeof(float)); } - + void binread(std::istream& in, float& x) - { - // IMPORTANT: this is not robust against the big/little endian - // issue. - in.read(reinterpret_cast<char*>(&x),sizeof(x)); + { + // IMPORTANT: this is not robust against the big/little endian + // issue. + in.read(reinterpret_cast<char*>(&x),sizeof(x)); } - + char const *binread(char const* p, uint16_t& buf) { static char mask = 127; - buf = (*p)&mask; + buf = (*p)&mask; if (*p++ < 0) return p; buf += uint16_t((*p)&mask)<<7; if (*p++ < 0) return p; @@ -294,26 +294,26 @@ namespace ugdiss char const *binread(char const* p, uint32_t& buf) { static char mask = 127; - - if (*p < 0) - { - buf = (*p)&mask; - return ++p; + + if (*p < 0) + { + buf = (*p)&mask; + return ++p; } buf = *p; - if (*(++p) < 0) + if (*(++p) < 0) { buf += uint32_t((*p)&mask)<<7; return ++p; } buf += uint32_t(*p)<<7; - if (*(++p) < 0) + if (*(++p) < 0) { buf += uint32_t((*p)&mask)<<14; return ++p; } buf += uint32_t(*p)<<14; - if (*(++p) < 0) + if (*(++p) < 0) { buf += uint32_t((*p)&mask)<<21; return ++p; @@ -331,56 +331,56 @@ namespace ugdiss char const *binread(char const* p, filepos_type& buf) { static char mask = 127; - - if (*p < 0) - { - buf = (*p)&mask; - return ++p; + + if (*p < 0) + { + buf = (*p)&mask; + return ++p; } buf = *p; - if (*(++p) < 0) + if (*(++p) < 0) { buf += filepos_type((*p)&mask)<<7; return ++p; } buf += filepos_type(*p)<<7; - if (*(++p) < 0) + if (*(++p) < 0) { buf += filepos_type((*p)&mask)<<14; return ++p; } buf += filepos_type(*p)<<14; - if (*(++p) < 0) + if (*(++p) < 0) { buf += filepos_type((*p)&mask)<<21; return ++p; } buf += filepos_type(*p)<<21; - if (*(++p) < 0) + if (*(++p) < 0) { buf += filepos_type((*p)&mask)<<28; return ++p; } buf += filepos_type(*p)<<28; - if (*(++p) < 0) + if (*(++p) < 0) { buf += filepos_type((*p)&mask)<<35; return ++p; } buf += filepos_type(*p)<<35; - if (*(++p) < 0) + if (*(++p) < 0) { buf += filepos_type((*p)&mask)<<42; return ++p; } buf += filepos_type(*p)<<42; - if (*(++p) < 0) + if (*(++p) < 0) { buf += filepos_type((*p)&mask)<<49; return ++p; } buf += filepos_type(*p)<<49; - if (*(++p) < 0) + if (*(++p) < 0) { buf += filepos_type((*p)&mask)<<56; return ++p; diff --git a/moses/TranslationModel/UG/mm/tpt_pickler.h b/moses/TranslationModel/UG/mm/tpt_pickler.h index 7305a858e..5ac71c16d 100644 --- a/moses/TranslationModel/UG/mm/tpt_pickler.h +++ b/moses/TranslationModel/UG/mm/tpt_pickler.h @@ -17,30 +17,30 @@ namespace ugdiss /// @return the size of file fname. ::uint64_t getFileSize(const std::string& fname); - /** - * The following functions write and read data in a compact binary + /** + * The following functions write and read data in a compact binary * representation. Write and read errors can be checked directly * on the ostream object after the function call, so no return value is * necessary.*/ - void binwrite(std::ostream& out, char data); - void binwrite(std::ostream& out, unsigned char data); + void binwrite(std::ostream& out, char data); + void binwrite(std::ostream& out, unsigned char data); void binwrite(std::ostream& out, unsigned short data); void binwrite(std::ostream& out, unsigned int data); void binwrite(std::ostream& out, unsigned long data); void binwrite(std::ostream& out, size_t data); void binwrite(std::ostream& out, unsigned long long data); void binwrite(std::ostream& out, std::string const& data); - void binwrite(std::ostream& out, float data); + void binwrite(std::ostream& out, float data); - void binread(std::istream& in, char &data); - void binread(std::istream& in, unsigned char &data); + void binread(std::istream& in, char &data); + void binread(std::istream& in, unsigned char &data); void binread(std::istream& in, unsigned short &data); void binread(std::istream& in, unsigned int &data); void binread(std::istream& in, unsigned long &data); void binread(std::istream& in, size_t &data); void binread(std::istream& in, unsigned long long &data); void binread(std::istream& in, std::string &data); - void binread(std::istream& in, float &data); + void binread(std::istream& in, float &data); char const *binread(char const* p, uint16_t& buf); char const *binread(char const* p, uint32_t& buf); @@ -68,11 +68,11 @@ namespace ugdiss /* template<typename WHATEVER> - char const* + char const* binread(char const* p, WHATEVER* buf); template<typename numtype> - char const* + char const* binread(char const* p, numtype& buf); */ @@ -113,7 +113,7 @@ namespace ugdiss p = binread(p,v[i]); return p; } - + template<typename T> T read(std::istream& in) { @@ -132,7 +132,7 @@ namespace ugdiss template<typename T> - void + void binwrite(std::ostream& out, std::vector<T> const& data) { binwrite(out,data.size()); @@ -141,7 +141,7 @@ namespace ugdiss } template<typename T> - void + void binread(std::istream& in, std::vector<T>& data) { size_t s; @@ -157,8 +157,8 @@ namespace ugdiss { size_t s; K k; V v; binread(in,s); - data.clear(); - // I have no idea why this is necessary, but it is, even when + data.clear(); + // I have no idea why this is necessary, but it is, even when // /data/ is supposed to be empty for (size_t i = 0; i < s; i++) { @@ -174,7 +174,7 @@ namespace ugdiss binwrite(std::ostream& out, std::map<K,V> const& data) { binwrite(out,data.size()); - for (typename std::map<K,V>::const_iterator m = data.begin(); + for (typename std::map<K,V>::const_iterator m = data.begin(); m != data.end(); m++) { binwrite(out,m->first); @@ -200,7 +200,7 @@ namespace ugdiss template<typename WHATEVER> - char const* + char const* binread(char const* p, WHATEVER* buf) { #ifdef VERIFY_TIGHT_PACKING @@ -209,6 +209,6 @@ namespace ugdiss return binread(p,*buf); } - + } // end namespace ugdiss #endif diff --git a/moses/TranslationModel/UG/mm/tpt_tightindex.cc b/moses/TranslationModel/UG/mm/tpt_tightindex.cc index da28c6d93..72cf0c183 100644 --- a/moses/TranslationModel/UG/mm/tpt_tightindex.cc +++ b/moses/TranslationModel/UG/mm/tpt_tightindex.cc @@ -8,10 +8,10 @@ */ // // ugTightIndex.cc -// +// // Made by Ulrich Germann // Login <germann@germann-laptop> -// +// // Started on Tue Jul 17 15:09:33 2007 Ulrich Germann // Started on Tue Jul 17 15:09:33 2007 Ulrich Germann // @@ -63,7 +63,7 @@ namespace ugdiss // } // #define LOG_WRITE_ACTIVITY - + // write a key or value into a tight index // flag indicates wheter it's a key or a value void tightwrite(std::ostream& out, uint64_t data, bool flag) @@ -80,10 +80,10 @@ namespace ugdiss std::cerr << " with flag 1 "; #endif while (data >= 128) - { + { char c = char(data%128)|char(-128); - out.put(c); - data >>= 7; + out.put(c); + data >>= 7; #ifdef LOG_WRITE_ACTIVITY bytes_written++; #endif @@ -99,7 +99,7 @@ namespace ugdiss while (data >= 128) { char c = data&127; - out.put(c); + out.put(c); data >>= 7; #ifdef LOG_WRITE_ACTIVITY bytes_written++; @@ -112,16 +112,16 @@ namespace ugdiss std::cerr << " in " << bytes_written << " bytes" << std::endl; #endif } - -// For the code below: does it make a difference if I hard-code the + +// For the code below: does it make a difference if I hard-code the // unraveled loop or does code optimization by the compiler take care // of that? #define DEBUG_TIGHTREAD 0 - // read a key value from a tight index; filepos_type must be at least as + // read a key value from a tight index; filepos_type must be at least as // large as count_type - filepos_type + filepos_type tightread(std::istream& in, std::ios::pos_type stop) { // debug=true; @@ -131,8 +131,8 @@ namespace ugdiss short int bitshift = 7; int pos = in.tellg(); #if DEBUG_TIGHTREAD - if (debug) - cerr << bitpattern(uint(in.peek())) << " " << in.peek() + if (debug) + cerr << bitpattern(uint(in.peek())) << " " << in.peek() << " pos=" << in.tellg() << "\n"; #endif int buf = in.get(); @@ -141,24 +141,24 @@ namespace ugdiss else stop = std::min(size_t(stop),size_t(in.tellg())+in.rdbuf()->in_avail()); if (buf < 0) - std::cerr << "number read: " << buf << " " << pos << " " + std::cerr << "number read: " << buf << " " << pos << " " << in.tellg() << std::endl; assert (buf>=0); - + if (buf >= 128) // continuation bit is 1 { data = buf-128; // unset the bit while (in.tellg() < stop && in.peek() >= 128) { #if DEBUG_TIGHTREAD - if (debug) + if (debug) cerr << bitpattern(uint(in.peek())) << " " << in.peek(); #endif // cerr << bitpattern(size_t(in.peek())) << std::endl; data += size_t(in.get()-128)<<bitshift; bitshift += 7; #if DEBUG_TIGHTREAD - if (debug) + if (debug) cerr << " " << data << " pos=" << in.tellg() << std::endl; #endif } @@ -170,14 +170,14 @@ namespace ugdiss { // cerr << bitpattern(size_t(in.peek())) << std::endl; #if DEBUG_TIGHTREAD - if (debug) + if (debug) cerr << bitpattern(uint(in.peek())) << " " << in.peek(); - + #endif data += size_t(in.get())<<bitshift; bitshift += 7; #if DEBUG_TIGHTREAD - if (debug) + if (debug) cerr << " " << data << " pos=" << in.tellg() << "\n"; #endif } @@ -189,16 +189,16 @@ namespace ugdiss #if DEBUG_TIGHTFIND bool debug=true; #endif - bool + bool tightfind_midpoint(std::istream& in, filepos_type start, filepos_type stop) { - in.seekg((start+stop)/2); - // Jump approximately to the middle. Since we might land in the - // middle of a number, we need to find the start of the next + in.seekg((start+stop)/2); + // Jump approximately to the middle. Since we might land in the + // middle of a number, we need to find the start of the next // [index key/file offset] pair first. Bytes belonging to an index - // key have the leftmost bit set to 0, bytes belonging to a file + // key have the leftmost bit set to 0, bytes belonging to a file // offset have it set to 1 - + // if we landed in the middle of an index key, skip to the end of it while (static_cast<filepos_type>(in.tellg()) < stop && in.get() < 128) { @@ -216,9 +216,9 @@ bool debug=true; while (static_cast<filepos_type>(in.tellg()) < stop && in.peek() >= 128) { #if DEBUG_TIGHTFIND - int r = in.get(); + int r = in.get(); if (debug) - std::cerr << in.tellg() << " skipped value byte " << r + std::cerr << in.tellg() << " skipped value byte " << r << " next is " << in.peek() << std::endl; #else @@ -227,9 +227,9 @@ bool debug=true; } return true; } - - char const* - tightfind_midpoint(char const* const start, + + char const* + tightfind_midpoint(char const* const start, char const* const stop) { char const* mp = start + (stop - start)/2; @@ -238,46 +238,46 @@ bool debug=true; return (*mp < 0) ? ++mp : mp; } - bool - linear_search(std::istream& in, filepos_type start, filepos_type stop, + bool + linear_search(std::istream& in, filepos_type start, filepos_type stop, id_type key, unsigned char& flags) { // performs a linear search in the range in.seekg(start); - + #if DEBUG_TIGHTFIND if (debug) std::cerr << in.tellg() << " "; #endif - - // ATTENTION! The bitshift operations below are important: - // We use some of the bits in the key value to store additional + + // ATTENTION! The bitshift operations below are important: + // We use some of the bits in the key value to store additional // information about what and where node iformation is stored. - + id_type foo; - for(foo = tightread(in,stop); - (foo>>FLAGBITS) < key; - foo = tightread(in,stop)) + for(foo = tightread(in,stop); + (foo>>FLAGBITS) < key; + foo = tightread(in,stop)) { // skip the value associated with key /foo/ - while (static_cast<filepos_type>(in.tellg()) < stop - && in.peek() >= 128) in.get(); - + while (static_cast<filepos_type>(in.tellg()) < stop + && in.peek() >= 128) in.get(); + #if DEBUG_TIGHTFIND - if (debug) - std::cerr << (foo>>FLAGBITS) << " [" << key << "] " + if (debug) + std::cerr << (foo>>FLAGBITS) << " [" << key << "] " << in.tellg() << std::endl; #endif - + if (in.tellg() == std::ios::pos_type(stop)) return false; // not found } - + #if DEBUG_TIGHTFIND - if (debug && (foo>>FLAGBITS)==key) + if (debug && (foo>>FLAGBITS)==key) std::cerr << "found entry for " << key << std::endl; - std::cerr << "current file position is " << in.tellg() + std::cerr << "current file position is " << in.tellg() << " (value read: " << key << std::endl; #endif - + assert(static_cast<filepos_type>(in.tellg()) < stop); if ((foo>>FLAGBITS)==key) { @@ -288,51 +288,51 @@ bool debug=true; else return false; } - + bool - tightfind(std::istream& in, filepos_type start, filepos_type stop, + tightfind(std::istream& in, filepos_type start, filepos_type stop, id_type key, unsigned char& flags) { - // returns true if the value is found + // returns true if the value is found #if DEBUG_TIGHTFIND if (debug) - std::cerr << "looking for " << key + std::cerr << "looking for " << key << " in range [" << start << ":" << stop << "]" << std::endl; #endif if (start==stop) return false; assert(stop>start); if ((start+1)==stop) return false; // list is empty - - unsigned int const granularity = sizeof(filepos_type)*5; + + unsigned int const granularity = sizeof(filepos_type)*5; // granularity: point where we should switch to linear search, // because otherwise we might skip over the entry we are looking for // because we land right in the middle of it. - + if (stop > start + granularity) - if (!tightfind_midpoint(in,start,stop)) + if (!tightfind_midpoint(in,start,stop)) return false; // something went wrong (empty index) - + if (stop <= start + granularity || in.tellg() == std::ios::pos_type(stop)) { // If the search range is very short, tightfind_midpoint might skip the // entry we are loking for. In this case, we can afford a linear // search return linear_search(in,start,stop,key,flags); } - + // perform binary search filepos_type curpos = in.tellg(); id_type foo = tightread(in,stop); id_type tmpid = foo>>FLAGBITS; - if (tmpid == key) + if (tmpid == key) { - flags = foo%256; + flags = foo%256; flags &= FLAGMASK; #if DEBUG_TIGHTFIND if (debug) std::cerr << "found entry for " << key << std::endl; #endif - return true; // done, found + return true; // done, found } - else if (tmpid > key) + else if (tmpid > key) { // look in the lower half #if DEBUG_TIGHTFIND if (debug) std::cerr << foo << " > " << key << std::endl; @@ -343,7 +343,7 @@ bool debug=true; { // look in the upper half while (static_cast<filepos_type>(in.tellg()) < stop && in.rdbuf()->in_avail() > 0 // is that still necessary??? - && in.peek() >= 128) + && in.peek() >= 128) in.get(); // skip associated value if (in.rdbuf()->in_avail() == 0 || in.tellg() == std::ios::pos_type(stop)) return false; @@ -353,16 +353,16 @@ bool debug=true; return tightfind(in,in.tellg(),stop,key,flags); } } - + char const* - tightfind(char const* const start, + tightfind(char const* const start, char const* const stop, - id_type key, + id_type key, unsigned char& flags) { - // returns true if the value is found - + // returns true if the value is found + if (start==stop) return NULL; assert(stop>start); if ((start+1)==stop) return NULL; // list is empty @@ -374,11 +374,11 @@ bool debug=true; id_type tmpId = foo>>FLAGBITS; if (tmpId == key) { - flags = foo%256; + flags = foo%256; flags &= FLAGMASK; return after; } - else if (tmpId > key) + else if (tmpId > key) { // look in the lower half return tightfind(start,p,key,flags); } @@ -389,14 +389,14 @@ bool debug=true; return tightfind(after,stop,key,flags); } } - + char const* - tightfind_noflags(char const* const start, + tightfind_noflags(char const* const start, char const* const stop, id_type key) { - // returns true if the value is found - + // returns true if the value is found + if (start==stop) return NULL; assert(stop>start); if ((start+1)==stop) return NULL; // list is empty @@ -407,7 +407,7 @@ bool debug=true; char const* after = tightread(p,stop,foo); if (foo == key) return after; - else if (foo > key) + else if (foo > key) { // look in the lower half return tightfind_noflags(start,p,key); } @@ -419,19 +419,19 @@ bool debug=true; } } - bool - linear_search_noflags(std::istream& in, filepos_type start, + bool + linear_search_noflags(std::istream& in, filepos_type start, filepos_type stop, id_type key) { // performs a linear search in the range - std::ios::pos_type mystop = stop; + std::ios::pos_type mystop = stop; in.seekg(start); id_type foo; - for(foo = tightread(in,stop); foo < key; foo = tightread(in,stop)) + for(foo = tightread(in,stop); foo < key; foo = tightread(in,stop)) { // skip the value associated with key /foo/ - while (in.tellg() < mystop && in.peek() >= 128) - in.get(); + while (in.tellg() < mystop && in.peek() >= 128) + in.get(); if (in.tellg() == mystop) return false; // not found } @@ -441,45 +441,45 @@ bool debug=true; bool - tightfind_noflags(std::istream& in, filepos_type start, + tightfind_noflags(std::istream& in, filepos_type start, filepos_type stop, id_type key) { - // returns true if the value is found + // returns true if the value is found if (start==stop) return false; assert(stop>start); if ((start+1)==stop) return false; // list is empty - + // granularity: point where we should switch to linear search, // because otherwise we might skip over the entry we are looking for // because we land right in the middle of it. - unsigned int const granularity = sizeof(filepos_type)*5; + unsigned int const granularity = sizeof(filepos_type)*5; // UG: why 5? we should be able to get away with less! - + if (stop > start + granularity) - if (!tightfind_midpoint(in,start,stop)) + if (!tightfind_midpoint(in,start,stop)) return false; // something went wrong (empty index) - + // If the search range is very short, tightfind_midpoint might skip the // entry we are loking for. In this case, we can afford a linear // search if (stop <= start + granularity || in.tellg() == std::ios::pos_type(stop)) return linear_search_noflags(in,start,stop,key); - + // Otherwise, perform binary search filepos_type curpos = in.tellg(); id_type foo = tightread(in,stop); - if (foo == key) - return true; // done, found + if (foo == key) + return true; // done, found else if (foo > key) // search first half return tightfind_noflags(in,start,curpos,key); else // search second half - { - std::ios::pos_type mystop = stop; + { + std::ios::pos_type mystop = stop; while (in.tellg() < mystop && in.rdbuf()->in_avail() > 0 // is that still necessary??? - && in.peek() >= 128) + && in.peek() >= 128) in.get(); // skip associated value if (in.rdbuf()->in_avail() == 0 || in.tellg() == mystop) return false; @@ -496,9 +496,9 @@ bool debug=true; { foo += 32768; // set first bit while (data >= 32768) // = 2^15 - { + { out.write(reinterpret_cast<char*>(&foo),2); - data >>= 15; + data >>= 15; foo = (data%32768)+32768; } } @@ -507,7 +507,7 @@ bool debug=true; while (data >= 32768) // = 2^15 { out.write(reinterpret_cast<char*>(&foo),2); - data >>= 15; + data >>= 15; foo = data%32768; } } @@ -515,8 +515,8 @@ bool debug=true; } char const* - tightread8(char const* start, - char const* stop, + tightread8(char const* start, + char const* stop, uint64_t& dest) { static char bitmask=127; @@ -570,8 +570,8 @@ bool debug=true; } char const* - tightread4(char const* start, - char const* stop, + tightread4(char const* start, + char const* stop, uint32_t& dest) { static char bitmask=127; @@ -605,8 +605,8 @@ bool debug=true; } char const* - tightread2(char const* start, - char const* stop, + tightread2(char const* start, + char const* stop, uint16_t& dest) { static char bitmask=127; diff --git a/moses/TranslationModel/UG/mm/tpt_tightindex.h b/moses/TranslationModel/UG/mm/tpt_tightindex.h index 66594bc0a..967215aeb 100644 --- a/moses/TranslationModel/UG/mm/tpt_tightindex.h +++ b/moses/TranslationModel/UG/mm/tpt_tightindex.h @@ -28,46 +28,46 @@ extern bool debug; namespace ugdiss { // void tightwritex(iostream& out, size_t data, bool flag); - void + void tightwrite(std::ostream& out, ::uint64_t data, bool flag); - filepos_type + filepos_type tightread(std::istream& in, std::ios::pos_type stop); bool - tightfind(std::istream& in, - filepos_type start, - filepos_type stop, + tightfind(std::istream& in, + filepos_type start, + filepos_type stop, id_type key, unsigned char& flags); bool - tightfind_noflags(std::istream& in, - filepos_type start, - filepos_type stop, + tightfind_noflags(std::istream& in, + filepos_type start, + filepos_type stop, id_type key); char const* - tightfind(char const* const start, + tightfind(char const* const start, char const* const stop, - id_type key, + id_type key, unsigned char& flags); char const* - tightfind_noflags(char const* const start, + tightfind_noflags(char const* const start, char const* const stop, id_type key); - /** move read header in istream /in/ to the first entry after the midpoint of - * file position range [start,stop) in in a 'tight' index + /** move read header in istream /in/ to the first entry after the midpoint of + * file position range [start,stop) in in a 'tight' index * @param in the data input stream * @param start start of the search range * @param stop end of the search range - * @return true if no errors occurred - */ - bool + * @return true if no errors occurred + */ + bool tightfind_midpoint(std::istream& in, filepos_type start, filepos_type stop); // the bitpattern functions below are for debugging @@ -115,8 +115,8 @@ namespace ugdiss #if 0 template<typename dtype> - char const* - tightread(char const* start, + char const* + tightread(char const* start, char const* stop, dtype& dest) { diff --git a/moses/TranslationModel/UG/mm/tpt_tokenindex.cc b/moses/TranslationModel/UG/mm/tpt_tokenindex.cc index c6704beac..5fc6a6acc 100644 --- a/moses/TranslationModel/UG/mm/tpt_tokenindex.cc +++ b/moses/TranslationModel/UG/mm/tpt_tokenindex.cc @@ -15,15 +15,15 @@ namespace ugdiss { TokenIndex:: - TokenIndex(string unkToken) + TokenIndex(string unkToken) : ridx(0),unkLabel(unkToken),unkId(1),numTokens(0) - { + { lock.reset(new boost::mutex()); }; - + #if 0 TokenIndex:: - TokenIndex(string fname, string unkToken,bool dyna) + TokenIndex(string fname, string unkToken,bool dyna) : ridx(0),unkLabel(unkToken) { this->open(fname,unkToken,dyna); @@ -58,8 +58,8 @@ namespace ugdiss if (!unkToken.empty()) { Entry const* bla = lower_bound(startIdx,endIdx,unkToken.c_str(),comp); - unkId = ((bla < endIdx && unkToken == comp.base+bla->offset) - ? bla->id + unkId = ((bla < endIdx && unkToken == comp.base+bla->offset) + ? bla->id : numTokens); } this->dynamic=dyna; @@ -69,7 +69,7 @@ namespace ugdiss this->newWords.reset(new vector<string>()); } } - + void TokenIndex:: close() @@ -79,9 +79,9 @@ namespace ugdiss TokenIndex:: CompFunc:: - CompFunc() + CompFunc() {}; - + bool TokenIndex:: CompFunc:: @@ -90,7 +90,7 @@ namespace ugdiss return strcmp(base+A.offset,w) < 0; }; - id_type + id_type TokenIndex:: operator[](char const* p) const { @@ -101,7 +101,7 @@ namespace ugdiss if (!dynamic) return unkId; boost::lock_guard<boost::mutex> lk(*this->lock); // stuff below is new as of 2011-01-30, for dynamic adding of unknown items - // IMPORTANT: numTokens is not currently not changed, it is the number of + // IMPORTANT: numTokens is not currently not changed, it is the number of // PRE-EXISING TOKENS, not including dynamically added Items map<string,id_type>::value_type newItem(p,str2idExtra->size()+numTokens); pair<map<string,id_type>::iterator,bool> foo = str2idExtra->insert(newItem); @@ -110,14 +110,14 @@ namespace ugdiss return foo.first->second; } - id_type + id_type TokenIndex:: operator[](string const& w) const { return (*this)[w.c_str()]; } - vector<char const*> + vector<char const*> TokenIndex:: reverseIndex() const { @@ -125,11 +125,11 @@ namespace ugdiss // cout << "tokenindex has " << numToks << " tokens" << endl; - vector<char const*> v(numToks,NULL); + vector<char const*> v(numToks,NULL); // v.reserve(endIdx-startIdx); for (Entry const* x = startIdx; x != endIdx; x++) { - if (x->id >= v.size()) + if (x->id >= v.size()) v.resize(x->id+1); v[x->id] = comp.base+x->offset; } @@ -141,12 +141,12 @@ namespace ugdiss TokenIndex:: operator[](id_type id) const { - if (!ridx.size()) + if (!ridx.size()) { boost::lock_guard<boost::mutex> lk(*this->lock); if (!ridx.size()) ridx = reverseIndex(); } - if (id < ridx.size()) + if (id < ridx.size()) return ridx[id]; boost::lock_guard<boost::mutex> lk(*this->lock); if (dynamic && id < ridx.size()+newWords->size()) @@ -156,26 +156,26 @@ namespace ugdiss void TokenIndex:: - iniReverseIndex() + iniReverseIndex() { - if (!ridx.size()) + if (!ridx.size()) { boost::lock_guard<boost::mutex> lk(*this->lock); if (!ridx.size()) ridx = reverseIndex(); } } - + char const* const TokenIndex:: - operator[](id_type id) + operator[](id_type id) { - if (!ridx.size()) + if (!ridx.size()) { boost::lock_guard<boost::mutex> lk(*this->lock); if (!ridx.size()) ridx = reverseIndex(); } - if (id < ridx.size()) + if (id < ridx.size()) return ridx[id]; boost::lock_guard<boost::mutex> lk(*this->lock); if (dynamic && id < ridx.size()+newWords->size()) @@ -183,11 +183,11 @@ namespace ugdiss return unkLabel.c_str(); } - string + string TokenIndex:: - toString(vector<id_type> const& v) + toString(vector<id_type> const& v) { - if (!ridx.size()) + if (!ridx.size()) { boost::lock_guard<boost::mutex> lk(*this->lock); if (!ridx.size()) ridx = reverseIndex(); @@ -198,11 +198,11 @@ namespace ugdiss return buf.str(); } - string + string TokenIndex:: toString(vector<id_type> const& v) const { - if (!ridx.size()) + if (!ridx.size()) { boost::lock_guard<boost::mutex> lk(*this->lock); if (!ridx.size()) ridx = reverseIndex(); @@ -213,11 +213,11 @@ namespace ugdiss return buf.str(); } - string + string TokenIndex:: - toString(id_type const* start, id_type const* const stop) + toString(id_type const* start, id_type const* const stop) { - if (!ridx.size()) + if (!ridx.size()) { boost::lock_guard<boost::mutex> lk(*this->lock); if (!ridx.size()) ridx = reverseIndex(); @@ -230,11 +230,11 @@ namespace ugdiss return buf.str(); } - string + string TokenIndex:: toString(id_type const* start, id_type const* const stop) const { - if (!ridx.size()) + if (!ridx.size()) { boost::lock_guard<boost::mutex> lk(*this->lock); if (!ridx.size()) ridx = reverseIndex(); @@ -266,7 +266,7 @@ namespace ugdiss { bool allgood = true; string w; v.clear(); - for (istringstream buf(line); buf>>w;) + for (istringstream buf(line); buf>>w;) { v.push_back((*this)[w]); allgood = allgood && v.back() > 1; @@ -325,15 +325,15 @@ namespace ugdiss } void - write_tokenindex_to_disk(vector<pair<string,uint32_t> > const& tok, + write_tokenindex_to_disk(vector<pair<string,uint32_t> > const& tok, string const& ofile, string const& unkToken) { typedef pair<uint32_t,id_type> IndexEntry; // offset and id // Write token strings to a buffer, keep track of offsets - vector<IndexEntry> index(tok.size()); + vector<IndexEntry> index(tok.size()); ostringstream data; - id_type unkId = tok.size(); + id_type unkId = tok.size(); for (size_t i = 0; i < tok.size(); i++) { if (tok[i].first == unkToken) @@ -342,7 +342,7 @@ namespace ugdiss index[i].second = tok[i].second; // respective ID data<<tok[i].first<<char(0); // write string to buffer } - + // Now write the actual file ofstream out(ofile.c_str()); uint32_t vsize = index.size(); // how many vocab items? @@ -356,26 +356,26 @@ namespace ugdiss out<<data.str(); } - void + void TokenIndex:: write(string fname) { typedef pair<string,uint32_t> Token; // token and id - vector<Token> tok(totalVocabSize()); + vector<Token> tok(totalVocabSize()); for (id_type i = 0; i < tok.size(); ++i) tok[i] = Token((*this)[i],i); sort(tok.begin(),tok.end()); write_tokenindex_to_disk(tok,fname,unkLabel); } - - bool + + bool TokenIndex:: - isDynamic() const + isDynamic() const { return dynamic; } - bool + bool TokenIndex:: setDynamic(bool on) { @@ -393,7 +393,7 @@ namespace ugdiss } return ret; } - + void TokenIndex:: setUnkLabel(string unk) diff --git a/moses/TranslationModel/UG/mm/tpt_tokenindex.h b/moses/TranslationModel/UG/mm/tpt_tokenindex.h index 3051f07a5..9f7c69b3e 100644 --- a/moses/TranslationModel/UG/mm/tpt_tokenindex.h +++ b/moses/TranslationModel/UG/mm/tpt_tokenindex.h @@ -3,7 +3,7 @@ // // - Vocab items should be stored in order of ids, so that we can determine their length // by taking computing V[id+1] - V[id] instead of using strlen. -// +// // (c) 2007,2008 Ulrich Germann #ifndef __ugTokenIndex_hh @@ -30,7 +30,7 @@ namespace ugdiss /** Reverse index: maps from ID to char const* */ mutable vector<char const*> ridx; /** Label for the UNK token */ - string unkLabel; + string unkLabel; id_type unkId,numTokens; /// New 2013-09-02: thread-safe @@ -42,9 +42,9 @@ namespace ugdiss boost::shared_ptr<vector<string> > newWords; // The use of pointers to external items is a bit of a bad hack // in terms of the semantic of TokenIndex const: since external items - // are changed, the TokenIndex instance remains unchanged and const works, - // even though in reality the underlying object on the coceptual level - // *IS* changed. This means that dynamic TokenIndex instances are not + // are changed, the TokenIndex instance remains unchanged and const works, + // even though in reality the underlying object on the coceptual level + // *IS* changed. This means that dynamic TokenIndex instances are not // thread-safe! public: @@ -53,7 +53,7 @@ namespace ugdiss { public: uint32_t offset; - id_type id; + id_type id; }; /** Comparison function object used for Entry instances */ @@ -111,19 +111,19 @@ namespace ugdiss void setUnkLabel(string unk); }; - void - write_tokenindex_to_disk(vector<pair<string,uint32_t> > const& tok, + void + write_tokenindex_to_disk(vector<pair<string,uint32_t> > const& tok, string const& ofile, string const& unkToken); /** for sorting words by frequency */ class compWords { string unk; - public: + public: compWords(string _unk) : unk(_unk) {}; - + bool - operator()(pair<string,size_t> const& A, + operator()(pair<string,size_t> const& A, pair<string,size_t> const& B) const { if (A.first == unk) return false;// do we still need this special treatment? @@ -142,7 +142,7 @@ namespace ugdiss typedef pair<string,uint32_t> Token; // token and id - // first, sort the word list in decreasing order of frequency, so that we + // first, sort the word list in decreasing order of frequency, so that we // can assign IDs in an encoding-efficient manner (high frequency. low ID) vector<pair<string,size_t> > wcounts(M.size()); // for sorting by frequency typedef typename MYMAP::const_iterator myIter; @@ -156,16 +156,16 @@ namespace ugdiss sort(wcounts.begin(),wcounts.end(),compFunc); // Assign IDs ... - vector<Token> tok(wcounts.size()); + vector<Token> tok(wcounts.size()); for (size_t i = 0; i < wcounts.size(); i++) tok[i] = Token(wcounts[i].first,i); // and re-sort in alphabetical order - sort(tok.begin(),tok.end()); + sort(tok.begin(),tok.end()); write_tokenindex_to_disk(tok,ofile,unkToken); } template<typename Token> - void + void fill_token_seq(TokenIndex& V, string const& line, vector<Token>& dest) { istringstream buf(line); string w; diff --git a/moses/TranslationModel/UG/mm/tpt_typedefs.h b/moses/TranslationModel/UG/mm/tpt_typedefs.h index fea221d61..d2d2932de 100644 --- a/moses/TranslationModel/UG/mm/tpt_typedefs.h +++ b/moses/TranslationModel/UG/mm/tpt_typedefs.h @@ -12,4 +12,4 @@ namespace ugdiss typedef uint64_t filepos_type; typedef unsigned char uchar; } -#endif +#endif diff --git a/moses/TranslationModel/UG/mm/ug_bitext.cc b/moses/TranslationModel/UG/mm/ug_bitext.cc index d2899e677..809476aa9 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext.cc @@ -8,18 +8,18 @@ using namespace ugdiss; using namespace std; namespace Moses { - namespace bitext + namespace bitext { - float + float lbop(size_t const tries, size_t const succ, float const confidence) { - return (confidence == 0 - ? float(succ)/tries + return (confidence == 0 + ? float(succ)/tries : (boost::math::binomial_distribution<>:: find_lower_bound_on_p(tries, succ, confidence))); } - + // template<> void @@ -42,37 +42,37 @@ namespace Moses else index.reset(new imTSA<tkn>(track,NULL,NULL)); } - + snt_adder<L2R_Token<SimpleWordId> >:: - snt_adder(vector<string> const& s, TokenIndex& v, - sptr<imTtrack<L2R_Token<SimpleWordId> > >& t, + snt_adder(vector<string> const& s, TokenIndex& v, + sptr<imTtrack<L2R_Token<SimpleWordId> > >& t, sptr<imTSA<L2R_Token<SimpleWordId> > >& i) - : snt(s), V(v), track(t), index(i) + : snt(s), V(v), track(t), index(i) { } - bool + bool expand_phrase_pair - (vector<vector<ushort> >& a1, + (vector<vector<ushort> >& a1, vector<vector<ushort> >& a2, ushort const s2, // next word on in target side ushort const L1, ushort const R1, // limits of previous phrase ushort & s1, ushort & e1, ushort& e2) // start/end src; end trg { - if (a2[s2].size() == 0) + if (a2[s2].size() == 0) { cout << __FILE__ << ":" << __LINE__ << endl; return false; } bitvector done1(a1.size()); bitvector done2(a2.size()); - vector <pair<ushort,ushort> > agenda; + vector <pair<ushort,ushort> > agenda; // x.first: side (1 or 2) // x.second: word position agenda.reserve(a1.size() + a2.size()); agenda.push_back(pair<ushort,ushort>(2,s2)); e2 = s2; s1 = e1 = a2[s2].front(); - if (s1 >= L1 && s1 < R1) + if (s1 >= L1 && s1 < R1) { cout << __FILE__ << ":" << __LINE__ << endl; return false; @@ -88,14 +88,14 @@ namespace Moses done1.set(p); BOOST_FOREACH(ushort i, a1[p]) { - if (i < s2) + if (i < s2) { // cout << __FILE__ << ":" << __LINE__ << endl; return false; } if (done2[i]) continue; for (;e2 <= i;++e2) - if (!done2[e2]) + if (!done2[e2]) agenda.push_back(pair<ushort,ushort>(2,e2)); } } @@ -104,16 +104,16 @@ namespace Moses done2.set(p); BOOST_FOREACH(ushort i, a2[p]) { - if ((e1 < L1 && i >= L1) || - (s1 >= R1 && i < R1) || + if ((e1 < L1 && i >= L1) || + (s1 >= R1 && i < R1) || (i >= L1 && i < R1)) { - // cout << __FILE__ << ":" << __LINE__ << " " - // << L1 << "-" << R1 << " " << i << " " + // cout << __FILE__ << ":" << __LINE__ << " " + // << L1 << "-" << R1 << " " << i << " " // << s1 << "-" << e1<< endl; return false; } - + if (e1 < i) { for (; e1 <= i; ++e1) @@ -134,7 +134,7 @@ namespace Moses return true; } - void + void print_amatrix(vector<vector<ushort> > a1, uint32_t len2, ushort b1, ushort e1, ushort b2, ushort e2) { @@ -163,7 +163,7 @@ namespace Moses cout << string(90,'-') << endl; } - void + void write_bitvector(bitvector const& v, ostream& out) { for (size_t i = v.find_first(); i < v.size();) diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index 7fb07fc26..ab5f2a24f 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -2,18 +2,18 @@ #pragma once // Implementations of word-aligned bitext. // Written by Ulrich Germann -// +// // mmBitext: static, memory-mapped bitext // imBitext: dynamic, in-memory bitext // // things we can do to speed up things: -// - set up threads at startup time that force the +// - set up threads at startup time that force the // data in to memory sequentially // -// - use multiple agendas for better load balancing and to avoid +// - use multiple agendas for better load balancing and to avoid // competition for locks -// +// #define UG_BITEXT_TRACK_ACTIVE_THREADS 0 @@ -70,7 +70,7 @@ namespace Moses { float lbop(size_t const tries, size_t const succ, float const confidence); void write_bitvector(bitvector const& v, ostream& out); - struct + struct ContextForQuery { // needs to be made thread-safe @@ -85,7 +85,7 @@ namespace Moses { template<typename TKN> - class Bitext + class Bitext { public: typedef TKN Token; @@ -98,19 +98,19 @@ namespace Moses { mutable boost::shared_mutex m_lock; // for thread-safe operation class agenda; // for parallel sampling see ug_bitext_agenda.h - mutable sptr<agenda> ag; + mutable sptr<agenda> ag; size_t m_num_workers; // number of workers available to the agenda - size_t m_default_sample_size; + size_t m_default_sample_size; size_t m_pstats_cache_threshold; // threshold for caching sampling results sptr<pstats::cache_t> m_cache1, m_cache2; // caches for sampling results - + vector<string> m_docname; map<string,id_type> m_docname2docid; // maps from doc names to ids sptr<std::vector<id_type> > m_sid2docid; // maps from sentences to docs (ids) mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2; - // caches for unbiased sampling; biased sampling uses the caches that + // caches for unbiased sampling; biased sampling uses the caches that // are stored locally on the translation task public: @@ -123,9 +123,9 @@ namespace Moses { sptr<TSA<Token> > I2; // indices /// given the source phrase sid[start:stop] - // find the possible start (s1 .. s2) and end (e1 .. e2) + // find the possible start (s1 .. s2) and end (e1 .. e2) // points of the target phrase; if non-NULL, store word - // alignments in *core_alignment. If /flip/, source phrase is + // alignments in *core_alignment. If /flip/, source phrase is // L2. bool find_trg_phr_bounds ( size_t const sid, // sentence to investigate @@ -136,27 +136,27 @@ namespace Moses { int& po_fwd, int& po_bwd, // phrase orientations std::vector<uchar> * core_alignment, // stores the core alignment bitvector* full_alignment, // stores full word alignment for this sent. - bool const flip) const; // flip source and target (reverse lookup) - - // prep2 launches sampling and returns immediately. + bool const flip) const; // flip source and target (reverse lookup) + + // prep2 launches sampling and returns immediately. // lookup (below) waits for the job to finish before it returns - sptr<pstats> + sptr<pstats> prep2(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const; - + public: Bitext(size_t const max_sample = 1000, size_t const xnum_workers = 16); - Bitext(Ttrack<Token>* const t1, Ttrack<Token>* const t2, - Ttrack<char>* const tx, + Bitext(Ttrack<Token>* const t1, Ttrack<Token>* const t2, + Ttrack<char>* const tx, TokenIndex* const v1, TokenIndex* const v2, TSA<Token>* const i1, TSA<Token>* const i2, - size_t const max_sample=1000, + size_t const max_sample=1000, size_t const xnum_workers=16); - - virtual void + + virtual void open(string const base, string const L1, string const L2) = 0; - - sptr<pstats> + + sptr<pstats> lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const; void prep(ttasksptr const& ttask, iter const& phrase) const; @@ -176,7 +176,7 @@ namespace Moses { void - mark_match(Token const* start, Token const* end, iter const& m, + mark_match(Token const* start, Token const* end, iter const& m, bitvector& check) const; void write_yawat_alignment @@ -184,10 +184,10 @@ namespace Moses { #if 0 // needs to be adapted to the new API void - lookup(std::vector<Token> const& snt, TSA<Token>& idx, + lookup(std::vector<Token> const& snt, TSA<Token>& idx, std::vector<std::vector<sptr<std::vector<PhrasePair<Token> > > > >& dest, std::vector<std::vector<uint64_t> >* pidmap = NULL, - typename PhrasePair<Token>::Scorer* scorer=NULL, + typename PhrasePair<Token>::Scorer* scorer=NULL, sptr<SamplingBias const> const bias, bool multithread=true) const; #endif @@ -233,32 +233,32 @@ namespace Moses { Token const* t = (isL2 ? T2 : T1)->sntStart(sid) + off; Token const* x = t + len; TokenIndex const& V = isL2 ? *V2 : *V1; - while (t < x) + while (t < x) { buf << V[t->id()]; if (++t < x) buf << " "; } return buf.str(); } - + template<typename Token> - size_t + size_t Bitext<Token>:: - getDefaultSampleSize() const - { - return m_default_sample_size; + getDefaultSampleSize() const + { + return m_default_sample_size; } template<typename Token> - void + void Bitext<Token>:: setDefaultSampleSize(size_t const max_samples) - { + { boost::unique_lock<boost::shared_mutex> guard(m_lock); - if (max_samples != m_default_sample_size) + if (max_samples != m_default_sample_size) { m_cache1.reset(new pstats::cache_t); m_cache2.reset(new pstats::cache_t); - m_default_sample_size = max_samples; + m_default_sample_size = max_samples; } } @@ -274,12 +274,12 @@ namespace Moses { template<typename Token> Bitext<Token>:: - Bitext(Ttrack<Token>* const t1, - Ttrack<Token>* const t2, + Bitext(Ttrack<Token>* const t1, + Ttrack<Token>* const t2, Ttrack<char>* const tx, - TokenIndex* const v1, + TokenIndex* const v1, TokenIndex* const v2, - TSA<Token>* const i1, + TSA<Token>* const i1, TSA<Token>* const i2, size_t const max_sample, size_t const xnum_workers) @@ -294,7 +294,7 @@ namespace Moses { template<typename TKN> class snt_adder; template<> class snt_adder<L2R_Token<SimpleWordId> >; - template<> + template<> class snt_adder<L2R_Token<SimpleWordId> > { typedef L2R_Token<SimpleWordId> TKN; @@ -303,9 +303,9 @@ namespace Moses { sptr<imTtrack<TKN> > & track; sptr<imTSA<TKN > > & index; public: - snt_adder(std::vector<string> const& s, TokenIndex& v, + snt_adder(std::vector<string> const& s, TokenIndex& v, sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i); - + void operator()(); }; @@ -313,17 +313,17 @@ namespace Moses { bool Bitext<Token>:: find_trg_phr_bounds - (size_t const sid, + (size_t const sid, size_t const start, size_t const stop, size_t & s1, size_t & s2, size_t & e1, size_t & e2, int & po_fwd, int & po_bwd, - std::vector<uchar>* core_alignment, bitvector* full_alignment, + std::vector<uchar>* core_alignment, bitvector* full_alignment, bool const flip) const { // if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl; // a word on the core_alignment: - // + // // since fringe words ([s1,...,s2),[e1,..,e2) if s1 < s2, or e1 // < e2, respectively) are be definition unaligned, we store // only the core alignment in *core_alignment it is up to the @@ -364,18 +364,18 @@ namespace Moses { else { p = binread(p,src); assert(p<x); p = binread(p,trg); } UTIL_THROW_IF2((src >= slen1 || trg >= slen2), - "Alignment range error at sentence " << sid << "!\n" - << src << "/" << slen1 << " " << + "Alignment range error at sentence " << sid << "!\n" + << src << "/" << slen1 << " " << trg << "/" << slen2); - - if (src < start || src >= stop) + + if (src < start || src >= stop) forbidden.set(trg); else { lft = min(lft,trg); rgt = max(rgt,trg); } - if (core_alignment) + if (core_alignment) { aln1[src].push_back(trg); aln2[trg].push_back(src); @@ -383,16 +383,16 @@ namespace Moses { if (full_alignment) full_alignment->set(src*slen2 + trg); } - + for (size_t i = lft; i <= rgt; ++i) - if (forbidden[i]) + if (forbidden[i]) return false; - + s2 = lft; for (s1 = s2; s1 && !forbidden[s1-1]; --s1); e1 = rgt+1; for (e2 = e1; e2 < forbidden.size() && !forbidden[e2]; ++e2); - + if (lft > rgt) return false; - if (core_alignment) + if (core_alignment) { core_alignment->clear(); for (size_t i = start; i < stop; ++i) @@ -417,7 +417,7 @@ namespace Moses { ( string const& bserver, string const& text, ostream* log ) const { sptr<DocumentBias> ret; - ret.reset(new DocumentBias(*m_sid2docid, m_docname2docid, + ret.reset(new DocumentBias(*m_sid2docid, m_docname2docid, bserver, text, log)); return ret; } @@ -435,15 +435,15 @@ namespace Moses { // and waits until the sampling is finished before it returns. // This allows sampling in the background template<typename Token> - sptr<pstats> + sptr<pstats> Bitext<Token> - ::prep2 + ::prep2 ( ttasksptr const& ttask, iter const& phrase, int max_sample) const { if (max_sample < 0) max_sample = m_default_sample_size; sptr<ContextScope> scope = ttask->GetScope(); sptr<ContextForQuery> context = scope->get<ContextForQuery>(this); - sptr<SamplingBias> bias; + sptr<SamplingBias> bias; if (context) bias = context->bias; sptr<pstats::cache_t> cache; @@ -451,9 +451,9 @@ namespace Moses { // (still need to test what a good caching threshold is ...) // - use the task-specific cache when there is a sampling bias if (max_sample == int(m_default_sample_size) - && phrase.approxOccurrenceCount() > m_pstats_cache_threshold) + && phrase.approxOccurrenceCount() > m_pstats_cache_threshold) { - cache = (phrase.root == I1.get() + cache = (phrase.root == I1.get() ? (bias ? context->cache1 : m_cache1) : (bias ? context->cache2 : m_cache2)); // if (bias) cerr << "Using bias." << endl; @@ -461,17 +461,17 @@ namespace Moses { sptr<pstats> ret; sptr<pstats> const* cached; - if (cache && (cached = cache->get(phrase.getPid(), ret)) && *cached) + if (cache && (cached = cache->get(phrase.getPid(), ret)) && *cached) return *cached; boost::unique_lock<boost::shared_mutex> guard(m_lock); - if (!ag) + if (!ag) { ag.reset(new agenda(*this)); if (m_num_workers > 1) ag->add_workers(m_num_workers); } // cerr << "NEW FREQUENT PHRASE: " - // << phrase.str(V1.get()) << " " << phrase.approxOccurrenceCount() + // << phrase.str(V1.get()) << " " << phrase.approxOccurrenceCount() // << " at " << __FILE__ << ":" << __LINE__ << endl; ret = ag->add_job(this, phrase, max_sample, bias); if (cache) cache->set(phrase.getPid(),ret); @@ -497,8 +497,8 @@ namespace Moses { // CONSTRUCTOR pstats2pplist(typename TSA<Token>::tree_iterator const& m, Ttrack<Token> const& other, - sptr<pstats> const& ps, - std::vector<PhrasePair<Token> >& dest, + sptr<pstats> const& ps, + std::vector<PhrasePair<Token> >& dest, typename PhrasePair<Token>::Scorer const* scorer) : m_other(other) , m_pstats(ps) @@ -509,17 +509,17 @@ namespace Moses { , m_pid1(m.getPid()) , m_is_inverse(false) { } - + // WORKER - void - operator()() + void + operator()() { // wait till all statistics have been collected boost::unique_lock<boost::mutex> lock(m_pstats->lock); while (m_pstats->in_progress) m_pstats->ready.wait(lock); - m_pp.init(m_pid1, m_is_inverse, m_token,m_len,m_pstats.get(),0); + m_pp.init(m_pid1, m_is_inverse, m_token,m_len,m_pstats.get(),0); // convert pstats entries to phrase pairs pstats::trg_map_t::iterator a; @@ -531,8 +531,8 @@ namespace Moses { m_pp.good2 = max(uint32_t(m_pp.raw2 * float(m_pp.good1)/m_pp.raw1), m_pp.joint); size_t J = m_pp.joint<<7; // hard coded threshold of 1/128 - if (m_pp.good1 > J || m_pp.good2 > J) continue; - if (m_scorer) + if (m_pp.good1 > J || m_pp.good2 > J) continue; + if (m_scorer) { (*m_scorer)(m_pp); } @@ -543,23 +543,23 @@ namespace Moses { } }; -#if 0 +#if 0 template<typename Token> void Bitext<Token>:: - lookup(std::vector<Token> const& snt, TSA<Token>& idx, + lookup(std::vector<Token> const& snt, TSA<Token>& idx, std::vector<std::vector<sptr<std::vector<PhrasePair<Token> > > > >& dest, std::vector<std::vector<uint64_t> >* pidmap, typename PhrasePair<Token>::Scorer* scorer, sptr<SamplingBias const> const& bias, bool multithread) const { // typedef std::vector<std::vector<sptr<std::vector<PhrasePair<Token> > > > > ret_t; - - dest.clear(); + + dest.clear(); dest.resize(snt.size()); if (pidmap) { pidmap->clear(); pidmap->resize(snt.size()); } - // collect statistics in parallel, then build PT entries as + // collect statistics in parallel, then build PT entries as // the sampling finishes bool fwd = &idx == I1.get(); std::vector<boost::thread*> workers; // background threads doing the lookup @@ -574,16 +574,16 @@ namespace Moses { uint64_t key = m.getPid(); if (pidmap) (*pidmap)[i].push_back(key); sptr<std::vector<PhrasePair<Token> > > pp = C.get(key); - if (pp) + if (pp) dest[i].push_back(pp); - else + else { pp.reset(new std::vector<PhrasePair<Token> >()); C.set(key,pp); dest[i].push_back(pp); sptr<pstats> x = prep2(m, this->default_sample_size,bias); pstats2pplist<Token> w(m,*(fwd?T2:T1),x,*pp,scorer); - if (multithread) + if (multithread) { boost::thread* t = new boost::thread(w); workers.push_back(t); @@ -592,16 +592,16 @@ namespace Moses { } } } - for (size_t w = 0; w < workers.size(); ++w) + for (size_t w = 0; w < workers.size(); ++w) { - workers[w]->join(); + workers[w]->join(); delete workers[w]; } } -#endif +#endif template<typename Token> - sptr<pstats> + sptr<pstats> Bitext<Token>:: lookup(ttasksptr const& ttask, iter const& phrase, int max_sample) const { @@ -615,7 +615,7 @@ namespace Moses { boost::unique_lock<boost::shared_mutex> guard(m_lock); typename agenda::worker(*this->ag)(); } - else + else { boost::unique_lock<boost::mutex> lock(ret->lock); while (ret->in_progress) @@ -639,7 +639,7 @@ namespace Moses { Token const* a = x; Token const* b = s; size_t i = 0; - while (a && b && a->id() == b->id() && i < m.size()) + while (a && b && a->id() == b->id() && i < m.size()) { ++i; a = a->next(); @@ -669,7 +669,7 @@ namespace Moses { pair<bitvector,bitvector> ag; ag.first.resize(a1.size()); ag.second.resize(a2.size()); - char const* x = Tx->sntStart(sid); + char const* x = Tx->sntStart(sid); size_t a, b; while (x < Tx->sntEnd(sid)) { @@ -677,11 +677,11 @@ namespace Moses { x = binread(x,b); if (a1.at(a) < 0 && a2.at(b) < 0) { - a1[a] = a2[b] = agroups.size(); - ag.first.reset(); - ag.second.reset(); - ag.first.set(a); - ag.second.set(b); + a1[a] = a2[b] = agroups.size(); + ag.first.reset(); + ag.second.reset(); + ag.first.set(a); + ag.second.set(b); agroups.push_back(ag); grouplabel.push_back(f1[a] || f2[b] ? "infocusbi" : "unspec"); } @@ -697,7 +697,7 @@ namespace Moses { agroups[a1[a]].second.set(b); if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi"; } - else + else { agroups[a1[a]].first |= agroups[a2[b]].first; agroups[a1[a]].second |= agroups[a2[b]].second; @@ -705,10 +705,10 @@ namespace Moses { if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi"; } } - + for (a = 0; a < a1.size(); ++a) { - if (a1[a] < 0) + if (a1[a] < 0) { if (f1[a]) out << a << "::" << "infocusmono "; continue; @@ -729,7 +729,7 @@ namespace Moses { #if 0 template<typename Token> - sptr<pstats> + sptr<pstats> Bitext<Token>:: lookup(siter const& phrase, size_t const max_sample, sptr<SamplingBias const> const& bias) const @@ -738,7 +738,7 @@ namespace Moses { boost::unique_lock<boost::shared_mutex> guard(m_lock); if (this->num_workers <= 1) typename agenda::worker(*this->ag)(); - else + else { boost::unique_lock<boost::mutex> lock(ret->lock); while (ret->in_progress) @@ -747,25 +747,25 @@ namespace Moses { return ret; } #endif - + template<typename Token> - void - expand(typename Bitext<Token>::iter const& m, - Bitext<Token> const& bt, pstats const& ps, + void + expand(typename Bitext<Token>::iter const& m, + Bitext<Token> const& bt, pstats const& ps, std::vector<PhrasePair<Token> >& dest, ostream* log) { bool fwd = m.root == bt.I1.get(); dest.reserve(ps.trg.size()); PhrasePair<Token> pp; pp.init(m.getPid(), !fwd, m.getToken(0), m.size(), &ps, 0); - // cout << HERE << " " + // cout << HERE << " " // << toString(*(fwd ? bt.V1 : bt.V2), pp.start1,pp.len1) << endl; pstats::trg_map_t::const_iterator a; for (a = ps.trg.begin(); a != ps.trg.end(); ++a) { uint32_t sid,off,len; parse_pid(a->first, sid, off, len); - pp.update(a->first, (fwd ? bt.T2 : bt.T1)->sntStart(sid)+off, + pp.update(a->first, (fwd ? bt.T2 : bt.T1)->sntStart(sid)+off, len, a->second); dest.push_back(pp); } @@ -773,24 +773,24 @@ namespace Moses { #if 0 template<typename Token> - class + class PStatsCache { typedef boost::unordered_map<uint64_t, sptr<pstats> > my_cache_t; boost::shared_mutex m_lock; - my_cache_t m_cache; - + my_cache_t m_cache; + public: sptr<pstats> get(Bitext<Token>::iter const& phrase) const; - sptr<pstats> + sptr<pstats> add(Bitext<Token>::iter const& phrase) const { uint64_t pid = phrase.getPid(); - std::pair<my_cache_t::iterator,bool> + std::pair<my_cache_t::iterator,bool> } - + }; #endif } // end of namespace bitext diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h index a9632c056..d07fba6aa 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h @@ -1,8 +1,8 @@ // -*- c++ -*- // to be included from ug_bitext.h -// The agenda handles parallel sampling. -// It maintains a queue of unfinished sampling jobs and +// The agenda handles parallel sampling. +// It maintains a queue of unfinished sampling jobs and // assigns them to a pool of workers. // template<typename Token> @@ -13,7 +13,7 @@ public: class job; class worker; private: - boost::mutex lock; + boost::mutex lock; std::list<sptr<job> > joblist; std::vector<sptr<boost::thread> > workers; bool shutdown; @@ -27,23 +27,23 @@ public: agenda(Bitext<Token> const& bitext); ~agenda(); - void + void add_workers(int n); - sptr<pstats> + sptr<pstats> add_job(Bitext<Token> const* const theBitext, - typename TSA<Token>::tree_iterator const& phrase, + typename TSA<Token>::tree_iterator const& phrase, size_t const max_samples, sptr<SamplingBias const> const& bias); // add_job(Bitext<Token> const* const theBitext, - // typename TSA<Token>::tree_iterator const& phrase, + // typename TSA<Token>::tree_iterator const& phrase, // size_t const max_samples, SamplingBias const* const bias); - sptr<job> + sptr<job> get_job(); }; - + template<typename Token> -class +class Bitext<Token>::agenda:: worker { @@ -61,9 +61,9 @@ void Bitext<Token> ::agenda ::add_workers(int n) { - static boost::posix_time::time_duration nodelay(0,0,0,0); + static boost::posix_time::time_duration nodelay(0,0,0,0); boost::lock_guard<boost::mutex> guard(this->lock); - + int target = max(1, int(n + workers.size() - this->doomed)); // house keeping: remove all workers that have finished for (size_t i = 0; i < workers.size(); ) @@ -79,7 +79,7 @@ void Bitext<Token> // cerr << workers.size() << "/" << target << " active" << endl; if (int(workers.size()) > target) this->doomed = workers.size() - target; - else + else while (int(workers.size()) < target) { sptr<boost::thread> w(new boost::thread(worker(*this))); @@ -92,16 +92,16 @@ template<typename Token> sptr<pstats> Bitext<Token> ::agenda ::add_job(Bitext<Token> const* const theBitext, - typename TSA<Token>::tree_iterator const& phrase, + typename TSA<Token>::tree_iterator const& phrase, size_t const max_samples, sptr<SamplingBias const> const& bias) { boost::unique_lock<boost::mutex> lk(this->lock); - static boost::posix_time::time_duration nodelay(0,0,0,0); + static boost::posix_time::time_duration nodelay(0,0,0,0); bool fwd = phrase.root == bt.I1.get(); - sptr<job> j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2, + sptr<job> j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2, max_samples, fwd, bias)); j->stats->register_worker(); - + joblist.push_back(j); if (joblist.size() == 1) { @@ -136,7 +136,7 @@ Bitext<Token> sptr<job> ret; if (this->shutdown) return ret; boost::unique_lock<boost::mutex> lock(this->lock); - if (this->doomed) + if (this->doomed) { // the number of workers has been reduced, tell the redundant once to quit --this->doomed; return ret; @@ -145,15 +145,15 @@ Bitext<Token> typename list<sptr<job> >::iterator j = joblist.begin(); while (j != joblist.end()) { - if ((*j)->done()) + if ((*j)->done()) { (*j)->stats->release(); joblist.erase(j++); - } + } else if ((*j)->workers >= 4) ++j; // no more than 4 workers per job else break; // found one } - if (joblist.size()) + if (joblist.size()) { ret = j == joblist.end() ? joblist.front() : *j; // if we've reached the end of the queue (all jobs have 4 workers on them), @@ -175,12 +175,12 @@ agenda:: for (size_t i = 0; i < workers.size(); ++i) workers[i]->join(); } - + template<typename Token> Bitext<Token>:: agenda:: agenda(Bitext<Token> const& thebitext) : shutdown(false), doomed(0), bt(thebitext) { } - + diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h index 0e26b6182..0e0624351 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h @@ -4,48 +4,48 @@ // todo: add check to enforce this template<typename Token> -class +class Bitext<Token>::agenda:: -job +job { #if UG_BITEXT_TRACK_ACTIVE_THREADS static ThreadSafeCounter active; #endif Bitext<Token> const* const m_bitext; - boost::mutex lock; + boost::mutex lock; friend class agenda; - boost::taus88 rnd; // every job has its own pseudo random generator + boost::taus88 rnd; // every job has its own pseudo random generator double rnddenom; // denominator for scaling random sampling size_t min_diverse; // minimum number of distinct translations - bool flip_coin(uint64_t & sid, uint64_t & offset); + bool flip_coin(uint64_t & sid, uint64_t & offset); bool step(uint64_t & sid, uint64_t & offset); // proceed to next occurrence public: size_t workers; // how many workers are working on this job? sptr<TSA<Token> const> root; // root of the underlying suffix array - char const* next; // next position to read from + char const* next; // next position to read from char const* stop; // end of index range size_t max_samples; // how many samples to extract at most size_t ctr; /* # of phrase occurrences considered so far - * # of samples chosen is stored in stats->good + * # of samples chosen is stored in stats->good */ size_t len; // phrase length - bool fwd; // if true, source phrase is L1 + bool fwd; // if true, source phrase is L1 sptr<pstats> stats; // stores statistics collected during sampling sptr<SamplingBias const> const m_bias; // sentence-level bias for sampling float bias_total; bool nextSample(uint64_t & sid, uint64_t & offset); // select next occurrence - - int + + int check_sample_distribution(uint64_t const& sid, uint64_t const& offset); - // for biased sampling: ensure the distribution approximately matches + // for biased sampling: ensure the distribution approximately matches // the bias - + bool done() const; - job(Bitext<Token> const* const theBitext, - typename TSA<Token>::tree_iterator const& m, - sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd, + job(Bitext<Token> const* const theBitext, + typename TSA<Token>::tree_iterator const& m, + sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd, sptr<SamplingBias const> const& bias); ~job(); }; @@ -57,15 +57,15 @@ Bitext<Token>::agenda::job if (stats) stats.reset(); #if UG_BITEXT_TRACK_ACTIVE_THREADS // counter may not exist any more at destruction time, hence try .. catch ... - try { --active; } catch (...) {} + try { --active; } catch (...) {} #endif } template<typename Token> Bitext<Token>::agenda::job ::job(Bitext<Token> const* const theBitext, - typename TSA<Token>::tree_iterator const& m, - sptr<TSA<Token> > const& r, size_t maxsmpl, + typename TSA<Token>::tree_iterator const& m, + sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd, sptr<SamplingBias const> const& bias) : m_bitext(theBitext) , rnd(0) @@ -83,9 +83,9 @@ Bitext<Token>::agenda::job { stats.reset(new pstats()); stats->raw_cnt = m.approxOccurrenceCount(); - bias_total = 0; - - // we need to renormalize on the fly, as the summ of all sentence probs over + bias_total = 0; + + // we need to renormalize on the fly, as the summ of all sentence probs over // all candidates (not all sentences in the corpus) needs to add to 1. // Profiling question: how much does that cost us? if (m_bias) @@ -98,8 +98,8 @@ Bitext<Token>::agenda::job x = root->readSid(x,stop,sid); x = root->readOffset(x,stop,offset); #if 0 - cerr << ctr++ << " " << m.str(m_bitext->V1.get()) - << " " << sid << "/" << root->getCorpusSize() + cerr << ctr++ << " " << m.str(m_bitext->V1.get()) + << " " << sid << "/" << root->getCorpusSize() << " " << offset << " " << stop-x << endl; #endif bias_total += (*m_bias)[sid]; @@ -108,7 +108,7 @@ Bitext<Token>::agenda::job } #if UG_BITEXT_TRACK_ACTIVE_THREADS ++active; - // if (active%5 == 0) + // if (active%5 == 0) // cerr << size_t(active) << " active jobs at " << __FILE__ << ":" << __LINE__ << endl; #endif } @@ -116,8 +116,8 @@ Bitext<Token>::agenda::job template<typename Token> bool Bitext<Token>::agenda::job ::done() const -{ - return (max_samples && stats->good >= max_samples) || next == stop; +{ + return (max_samples && stats->good >= max_samples) || next == stop; } template<typename Token> @@ -125,39 +125,39 @@ int Bitext<Token>::agenda::job ::check_sample_distribution(uint64_t const& sid, uint64_t const& offset) { // ensure that the sampled distribution approximately matches the bias // @return 0: SKIP this occurrence - // @return 1: consider this occurrence for sampling + // @return 1: consider this occurrence for sampling // @return 2: include this occurrence in the sample by all means if (!m_bias) return 1; - + using namespace boost::math; typedef boost::math::binomial_distribution<> binomial; - + ostream* log = m_bias->loglevel > 1 ? m_bias->log : NULL; - - float p = (*m_bias)[sid]; - id_type docid = m_bias->GetClass(sid); - uint32_t k = docid < stats->indoc.size() ? stats->indoc[docid] : 0; + + float p = (*m_bias)[sid]; + id_type docid = m_bias->GetClass(sid); + uint32_t k = docid < stats->indoc.size() ? stats->indoc[docid] : 0; // always consider candidates from dominating documents and // from documents that have not been considered at all yet bool ret = (p > .5 || k == 0); - + if (ret && !log) return 1; - + uint32_t N = stats->good; // number of trials - float d = cdf(complement(binomial(N, p), k)); + float d = cdf(complement(binomial(N, p), k)); // d: probability that samples contains k or more instances from doc #docid - ret = ret || d >= .05; - + ret = ret || d >= .05; + if (log) { Token const* t = root->getCorpus()->sntStart(sid)+offset; Token const* x = t - min(offset,uint64_t(3)); - Token const* e = t+4; + Token const* e = t+4; if (e > root->getCorpus()->sntEnd(sid)) e = root->getCorpus()->sntEnd(sid); - *log << docid << ":" << sid << " " << size_t(k) << "/" << N + *log << docid << ":" << sid << " " << size_t(k) << "/" << N << " @" << p << " => " << d << " ["; for (size_t i = 0; i < stats->indoc.size(); ++i) { @@ -170,8 +170,8 @@ int Bitext<Token>::agenda::job else if (p < .5 && d > .9) *log << "FORCE"; *log << endl; } - - return (ret ? (p < .5 && d > .9) ? 2 : 1 : 0); + + return (ret ? (p < .5 && d > .9) ? 2 : 1 : 0); } template<typename Token> @@ -186,7 +186,7 @@ bool Bitext<Token>::agenda::job size_t options_total = max(stats->raw_cnt, this->ctr); size_t options_left = (options_total - this->ctr); size_t random_number = options_left * (rnd()/(rnd.max()+1.)); - size_t threshold; + size_t threshold; if (bias_total) // we have a bias and there are candidates with non-zero prob threshold = ((*m_bias)[sid]/bias_total * options_total * max_samples); else // no bias, or all have prob 0 (can happen with a very opinionated bias) @@ -199,7 +199,7 @@ bool Bitext<Token>::agenda::job ::step(uint64_t & sid, uint64_t & offset) { // caller must lock! if (next == stop) return false; - UTIL_THROW_IF2 + UTIL_THROW_IF2 ( next > stop, "Fatal error at " << HERE << ". How did that happen?" ); // boost::lock_guard<boost::mutex> jguard(lock); // caller must lock! next = root->readSid(next, stop, sid); @@ -214,21 +214,21 @@ bool Bitext<Token>::agenda::job { boost::lock_guard<boost::mutex> jguard(lock); if (max_samples == 0) // no sampling, consider all occurrences - return step(sid, offset); + return step(sid, offset); - while (step(sid,offset)) + while (step(sid,offset)) { size_t good = stats->good; size_t diversity = stats->trg.size(); - if (good >= max_samples && diversity >= min_diverse) + if (good >= max_samples && diversity >= min_diverse) return false; // done - // flip_coin softly enforces approximation of the sampling to the + // flip_coin softly enforces approximation of the sampling to the // bias (occurrences that would steer the sample too far from the bias // are ruled out), and flips a biased coin otherwise. if (!flip_coin(sid,offset)) continue; return true; - } + } return false; } diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h index 92ed3d36a..5ff39312c 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h @@ -7,13 +7,13 @@ Bitext<Token>::agenda ::operator()() { // things to do: - // + // // - have each worker maintain their own pstats object and merge // results at the end (to minimize mutex locking); - // + // // - use a non-locked, monotonically increasing counter to // ensure the minimum size of samples considered --- it's OK if - // we look at more samples than required. This way, we can + // we look at more samples than required. This way, we can // reduce the number of lock / unlock operations we need to do // during sampling. @@ -38,13 +38,13 @@ Bitext<Token>::agenda s1, s2, e1, e2, po_fwd, po_bwd, // bounds & orientation &aln, full_aln, !j->fwd)); // aln info / flip sides? - if (!good) + if (!good) { // no good, probably because phrase is not coherent j->stats->count_sample(docid, 0, po_fwd, po_bwd); continue; } - // all good: register this sample as valid + // all good: register this sample as valid size_t num_pairs = (s2-s1+1) * (e2-e1+1); j->stats->count_sample(docid, num_pairs, po_fwd, po_bwd); @@ -52,14 +52,14 @@ Bitext<Token>::agenda Token const* t = ag.bt.T2->sntStart(sid); Token const* eos = ag.bt.T2->sntEnd(sid); cerr << "[" << j->stats->good + 1 << "] "; - while (t != eos) cerr << (*ag.bt.V2)[(t++)->id()] << " "; + while (t != eos) cerr << (*ag.bt.V2)[(t++)->id()] << " "; cerr << "[" << docid << "]" << endl; #endif float sample_weight = 1./num_pairs; Token const* o = (j->fwd ? ag.bt.T2 : ag.bt.T1)->sntStart(sid); - // adjust offsets in phrase-internal aligment + // adjust offsets in phrase-internal aligment for (size_t k = 1; k < aln.size(); k += 2) aln[k] += s2 - s1; vector<uint64_t> seen; seen.reserve(10); @@ -93,7 +93,7 @@ Bitext<Token>::agenda UTIL_THROW_IF2(!ok, "Could not extend target phrase."); } if (s < s2) // shift phrase-internal alignments - for (size_t k = 1; k < aln.size(); k += 2) + for (size_t k = 1; k < aln.size(); k += 2) --aln[k]; } } diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc index cb3804edc..bcda9ebf3 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc @@ -16,12 +16,12 @@ namespace Moses jstats:: jstats() : my_rcnt(0), my_cnt2(0), my_wcnt(0) - { - for (int i = 0; i <= Moses::LRModel::NONE; ++i) + { + for (int i = 0; i <= Moses::LRModel::NONE; ++i) ofwd[i] = obwd[i] = 0; my_aln.reserve(1); } - + jstats:: jstats(jstats const& other) { @@ -35,8 +35,8 @@ namespace Moses obwd[i] = other.obwd[i]; } } - - uint32_t + + uint32_t jstats:: dcnt_fwd(PhraseOrientation const idx) const { @@ -44,15 +44,15 @@ namespace Moses return ofwd[idx]; } - uint32_t + uint32_t jstats:: dcnt_bwd(PhraseOrientation const idx) const { assert(idx <= Moses::LRModel::NONE); return obwd[idx]; } - - void + + void jstats:: add(float w, vector<uchar> const& a, uint32_t const cnt2, uint32_t fwd_orient, uint32_t bwd_orient, int const docid) @@ -65,7 +65,7 @@ namespace Moses { size_t i = 0; while (i < my_aln.size() && my_aln[i].second != a) ++i; - if (i == my_aln.size()) + if (i == my_aln.size()) my_aln.push_back(pair<size_t,vector<uchar> >(1,a)); else my_aln[i].first++; @@ -83,7 +83,7 @@ namespace Moses vector<pair<size_t, vector<uchar> > > const& jstats:: - aln() const + aln() const { return my_aln; } } // namespace bitext diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h index ce2e89438..dade27649 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h @@ -4,20 +4,20 @@ #include "ug_lexical_reordering.h" #include <boost/thread.hpp> -namespace Moses +namespace Moses { namespace bitext { using namespace ugdiss; - // "joint" (i.e., phrase pair) statistics + // "joint" (i.e., phrase pair) statistics class jstats { boost::mutex lock; uint32_t my_rcnt; // unweighted joint count uint32_t my_cnt2; // raw counts L2 - float my_wcnt; // weighted joint count + float my_wcnt; // weighted joint count // to do: use a static alignment pattern store that stores each pattern only // once, so that we don't have to store so many alignment vectors @@ -33,18 +33,18 @@ namespace Moses uint32_t rcnt() const; // raw joint counts uint32_t cnt2() const; // raw target phrase occurrence count float wcnt() const; // weighted joint counts - + vector<pair<size_t, vector<uchar> > > const & aln() const; void add(float w, vector<uchar> const& a, uint32_t const cnt2, - uint32_t fwd_orient, uint32_t bwd_orient, + uint32_t fwd_orient, uint32_t bwd_orient, int const docid); void invalidate(); void validate(); bool valid(); uint32_t dcnt_fwd(PhraseOrientation const idx) const; uint32_t dcnt_bwd(PhraseOrientation const idx) const; - void fill_lr_vec(Moses::LRModel::Direction const& dir, - Moses::LRModel::ModelType const& mdl, + void fill_lr_vec(Moses::LRModel::Direction const& dir, + Moses::LRModel::ModelType const& mdl, vector<float>& v); }; } diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc index 482957508..580d7669b 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc @@ -8,11 +8,11 @@ namespace Moses #if UG_BITEXT_TRACK_ACTIVE_THREADS ThreadSafeCounter pstats::active; #endif - + pstats:: pstats() : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0) { - for (int i = 0; i <= Moses::LRModel::NONE; ++i) + for (int i = 0; i <= Moses::LRModel::NONE; ++i) ofwd[i] = obwd[i] = 0; } @@ -21,7 +21,7 @@ namespace Moses { #if UG_BITEXT_TRACK_ACTIVE_THREADS // counter may not exist any more at destruction time, so try ... catch - try { --active; } catch (...) {} + try { --active; } catch (...) {} #endif } @@ -33,7 +33,7 @@ namespace Moses ++this->in_progress; this->lock.unlock(); } - + void pstats:: release() @@ -44,9 +44,9 @@ namespace Moses this->lock.unlock(); } - void + void pstats - ::count_sample(int const docid, size_t const num_pairs, + ::count_sample(int const docid, size_t const num_pairs, int const po_fwd, int const po_bwd) { boost::lock_guard<boost::mutex> guard(lock); @@ -65,10 +65,10 @@ namespace Moses bool pstats:: - add(uint64_t pid, float const w, - vector<uchar> const& a, - uint32_t const cnt2, - uint32_t fwd_o, + add(uint64_t pid, float const w, + vector<uchar> const& a, + uint32_t const cnt2, + uint32_t fwd_o, uint32_t bwd_o, int const docid) { boost::lock_guard<boost::mutex> guard(this->lock); @@ -76,7 +76,7 @@ namespace Moses entry.add(w, a, cnt2, fwd_o, bwd_o, docid); if (this->good < entry.rcnt()) { - UTIL_THROW(util::Exception, "more joint counts than good counts:" + UTIL_THROW(util::Exception, "more joint counts than good counts:" << entry.rcnt() << "/" << this->good << "!"); } return true; diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h index c5b6c0152..9a14e378b 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h @@ -12,7 +12,7 @@ namespace Moses { namespace bitext { - struct + struct pstats { typedef boost::unordered_map<uint64_t, sptr<pstats> > map_t; @@ -23,8 +23,8 @@ namespace Moses #endif boost::mutex lock; // for parallel gathering of stats boost::condition_variable ready; // consumers can wait for me to be ready - - size_t raw_cnt; // (approximate) raw occurrence count + + size_t raw_cnt; // (approximate) raw occurrence count size_t sample_cnt; // number of instances selected during sampling size_t good; // number of selected instances with valid word alignments size_t sum_pairs; // total number of target phrases extracted (can be > raw_cnt) @@ -34,25 +34,25 @@ namespace Moses uint32_t obwd[Moses::LRModel::NONE+1]; // distribution of bwd phrase orientations std::vector<uint32_t> indoc; // distribution over where samples came from - + typedef std::map<uint64_t, jstats> trg_map_t; trg_map_t trg; pstats(); ~pstats(); void release(); void register_worker(); - size_t count_workers() { return in_progress; } + size_t count_workers() { return in_progress; } - bool + bool add(uint64_t const pid, // target phrase id float const w, // sample weight (1./(# of phrases extractable)) alnvec const& a, // local alignment uint32_t const cnt2, // raw target phrase count uint32_t fwd_o, // fwd. phrase orientation uint32_t bwd_o, // bwd. phrase orientation - int const docid); // document where sample was found + int const docid); // document where sample was found - void + void count_sample(int const docid, // document where sample was found size_t const num_pairs, // # of phrases extractable here int const po_fwd, // fwd phrase orientation diff --git a/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h b/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h index 845fe374e..89dc93ad1 100644 --- a/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h +++ b/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h @@ -25,13 +25,13 @@ namespace ugdiss return NULL; }; - ConllBottomUpToken const* - stop(ConllBottomUpToken const* seqStart, + ConllBottomUpToken const* + stop(ConllBottomUpToken const* seqStart, ConllBottomUpToken const* seqEnd) const { return NULL; }; - + bool operator<(T const& other) const { return this->cmp(other) < 0; } bool operator>(T const& other) const { return this->cmp(other) > 0; } bool operator==(T const& other) const { return this->cmp(other) == 0; } @@ -44,9 +44,9 @@ namespace ugdiss return false; } }; - + template<typename T> - ConllBottomUpToken<T> const* + ConllBottomUpToken<T> const* ConllBottomUpToken<T>:: next(int length) const { diff --git a/moses/TranslationModel/UG/mm/ug_conll_record.h b/moses/TranslationModel/UG/mm/ug_conll_record.h index ea2cda29e..e52a4974b 100644 --- a/moses/TranslationModel/UG/mm/ug_conll_record.h +++ b/moses/TranslationModel/UG/mm/ug_conll_record.h @@ -3,22 +3,22 @@ #include "ug_typedefs.h" // Base class for dependency tree corpora with POS and Lemma annotations -namespace ugdiss +namespace ugdiss { using namespace std; - class - Conll_Record + class + Conll_Record { public: id_type sform; // surface form id_type lemma; // lemma uchar majpos; // major part of speech uchar minpos; // minor part of speech - short parent; // id of parent + short parent; // id of parent uchar dtype; // dependency type uchar info[3]; /* additional information (depends on the part of speech) - * a place holder for the time being, to ensure proper + * a place holder for the time being, to ensure proper * alignment in memory */ Conll_Record(); Conll_Record const* up(int length=1) const; @@ -38,8 +38,8 @@ namespace ugdiss * @parameter PS Vocabulary for part-of-speech * @parameter DT Vocabulary for dependency type */ - Conll_Record(string const& line, - TokenIndex const& SF, TokenIndex const& LM, + Conll_Record(string const& line, + TokenIndex const& SF, TokenIndex const& LM, TokenIndex const& PS, TokenIndex const& DT); /** store the record as-is to disk (for memory-mapped reading later) */ @@ -62,7 +62,7 @@ namespace ugdiss // this is for contigous word sequences extracted from longer sequences // adjust parent pointers to 0 (no parent) if they point out of the // subsequence - void + void fixParse(Conll_Record* start, Conll_Record* stop); } // end of namespace ugdiss diff --git a/moses/TranslationModel/UG/mm/ug_corpus_token.cc b/moses/TranslationModel/UG/mm/ug_corpus_token.cc index 742c17ace..4be8cbd95 100644 --- a/moses/TranslationModel/UG/mm/ug_corpus_token.cc +++ b/moses/TranslationModel/UG/mm/ug_corpus_token.cc @@ -6,9 +6,9 @@ namespace ugdiss { id_type const& SimpleWordId:: - id() const - { - return theID; + id() const + { + return theID; } int diff --git a/moses/TranslationModel/UG/mm/ug_corpus_token.h b/moses/TranslationModel/UG/mm/ug_corpus_token.h index c1baaf21e..b9693cbf2 100644 --- a/moses/TranslationModel/UG/mm/ug_corpus_token.h +++ b/moses/TranslationModel/UG/mm/ug_corpus_token.h @@ -19,7 +19,7 @@ namespace ugdiss { /** Simple wrapper around id_type for use with the Ttrack/TSA template classes */ - class SimpleWordId + class SimpleWordId { id_type theID; public: @@ -29,7 +29,7 @@ namespace ugdiss bool operator==(SimpleWordId const& other) const; id_type remap(vector<id_type const*> const& m) const; }; - + /** Token class for suffix arrays */ template<typename T> class @@ -43,16 +43,16 @@ namespace ugdiss L2R_Token const* next(int n=1) const { return this+n; } - /** return a pointer to the end of a sentence; used as a stopping criterion during + /** return a pointer to the end of a sentence; used as a stopping criterion during * comparison of suffixes; see Ttrack::cmp() */ template<typename TTRACK_TYPE> - L2R_Token const* stop(TTRACK_TYPE const& C, id_type sid) const - { - return reinterpret_cast<L2R_Token<T> const*>(C.sntEnd(sid)); + L2R_Token const* stop(TTRACK_TYPE const& C, id_type sid) const + { + return reinterpret_cast<L2R_Token<T> const*>(C.sntEnd(sid)); } - L2R_Token const* stop(L2R_Token const* seqStart, L2R_Token const* seqEnd) const - { + L2R_Token const* stop(L2R_Token const* seqStart, L2R_Token const* seqEnd) const + { return seqEnd; } @@ -69,20 +69,20 @@ namespace ugdiss { public: typedef T Token; - + R2L_Token() : T() {}; R2L_Token(id_type id) : T(id) {}; R2L_Token const* next(int n = 1) const { return this - n; } template<typename TTRACK_TYPE> - R2L_Token const* stop(TTRACK_TYPE const& C, id_type sid) const - { - return reinterpret_cast<R2L_Token<T> const*>(C.sntStart(sid) - 1); + R2L_Token const* stop(TTRACK_TYPE const& C, id_type sid) const + { + return reinterpret_cast<R2L_Token<T> const*>(C.sntStart(sid) - 1); } - R2L_Token const* stop(R2L_Token const* seqStart, R2L_Token const* seqEnd) const - { + R2L_Token const* stop(R2L_Token const* seqStart, R2L_Token const* seqEnd) const + { assert(seqStart); return seqStart - 1; } diff --git a/moses/TranslationModel/UG/mm/ug_deptree.cc b/moses/TranslationModel/UG/mm/ug_deptree.cc index 545268e04..003d9b35e 100644 --- a/moses/TranslationModel/UG/mm/ug_deptree.cc +++ b/moses/TranslationModel/UG/mm/ug_deptree.cc @@ -7,14 +7,14 @@ using namespace std; namespace ugdiss { - bool + bool Conll_Record:: isDescendentOf(Conll_Record const* other) const { Conll_Record const* a = this; - while (a != other && a->parent) + while (a != other && a->parent) a += a->parent; - return a==other; + return a==other; } Conll_Record& @@ -43,7 +43,7 @@ namespace ugdiss } Conll_AllFields:: - Conll_AllFields() + Conll_AllFields() : Conll_Record::Conll_Record() {}; @@ -64,7 +64,7 @@ namespace ugdiss } Conll_WildCard:: - Conll_WildCard() + Conll_WildCard() : Conll_Record::Conll_Record() {}; @@ -95,8 +95,8 @@ namespace ugdiss #if 0 Conll_Record:: - Conll_Record(string const& line, - TokenIndex const& SF, TokenIndex const& LM, + Conll_Record(string const& line, + TokenIndex const& SF, TokenIndex const& LM, TokenIndex const& PS, TokenIndex const& DT) { @@ -140,35 +140,35 @@ namespace ugdiss #endif Conll_Sform:: - Conll_Sform() - : Conll_Record::Conll_Record() + Conll_Sform() + : Conll_Record::Conll_Record() {}; Conll_MinPos:: - Conll_MinPos() - : Conll_Record::Conll_Record() + Conll_MinPos() + : Conll_Record::Conll_Record() {}; - + Conll_MinPos_Lemma:: - Conll_MinPos_Lemma() - : Conll_Record::Conll_Record() + Conll_MinPos_Lemma() + : Conll_Record::Conll_Record() {}; Conll_Lemma:: Conll_Lemma() - : Conll_Record::Conll_Record() + : Conll_Record::Conll_Record() {}; Conll_Lemma:: Conll_Lemma(id_type _id) - : Conll_Record::Conll_Record() + : Conll_Record::Conll_Record() { this->lemma = _id; }; Conll_MinPos:: Conll_MinPos(id_type _id) - : Conll_Record::Conll_Record() + : Conll_Record::Conll_Record() { this->minpos = _id; }; @@ -182,7 +182,7 @@ namespace ugdiss Conll_MajPos:: Conll_MajPos(id_type _id) - : Conll_Record::Conll_Record() + : Conll_Record::Conll_Record() { this->majpos = _id; }; @@ -219,21 +219,21 @@ namespace ugdiss Conll_MinPos_Lemma:: cmp(Conll_Record const& other) const { - if (this->minpos != 0 && other.minpos != 0 && this->minpos != other.minpos) + if (this->minpos != 0 && other.minpos != 0 && this->minpos != other.minpos) return this->minpos < other.minpos ? -1 : 1; if (this->lemma != 0 && other.lemma != 0 && this->lemma != other.lemma) return this->lemma < other.lemma ? -1 : 1; return 0; } - id_type + id_type Conll_Lemma:: - id() const - { - return this->lemma; + id() const + { + return this->lemma; } - int + int Conll_Lemma:: cmp(Conll_Record const& other) const { @@ -251,16 +251,16 @@ namespace ugdiss Conll_Sform:: Conll_Sform(id_type _id) - : Conll_Record::Conll_Record() + : Conll_Record::Conll_Record() { this->sform = _id; }; - id_type + id_type Conll_Sform - ::id() const - { - return this->sform; + ::id() const + { + return this->sform; } int @@ -282,7 +282,7 @@ namespace ugdiss short p = w[i].rec->parent; if (p != 0) { - if (p > 0) assert(i+p < w.size()); + if (p > 0) assert(i+p < w.size()); else assert(i >= size_t(-p)); w[i].parent = &(w[i+p]); w[i].parent->children.push_back(&(w[i])); @@ -291,7 +291,7 @@ namespace ugdiss } #endif - /** @return true if the linear sequence of /Conll_Record/s is coherent, + /** @return true if the linear sequence of /Conll_Record/s is coherent, * i.e., a proper connected tree structure */ bool isCoherent(Conll_Record const* const start, Conll_Record const* const stop) @@ -300,16 +300,16 @@ namespace ugdiss for (Conll_Record const* x = start; outOfRange <= 1 && x < stop; ++x) { Conll_Record const* n = x->up(); - if (!n || n < start || n >= stop) + if (!n || n < start || n >= stop) outOfRange++; } return outOfRange<=1; } - + // this is for contigous word sequences extracted from longer sequences // adjust parent pointers to 0 (no parent) if they point out of the // subsequence - void + void fixParse(Conll_Record* start, Conll_Record* stop) { int len = stop-start; diff --git a/moses/TranslationModel/UG/mm/ug_deptree.h b/moses/TranslationModel/UG/mm/ug_deptree.h index 0d393aa33..b28a4bbe8 100644 --- a/moses/TranslationModel/UG/mm/ug_deptree.h +++ b/moses/TranslationModel/UG/mm/ug_deptree.h @@ -19,8 +19,8 @@ using namespace std; namespace ugdiss { - // Fills the vector v with pointers to the internal root r_x for the - // stretch [start,x] for all x: start <= x < stop. If the stretch + // Fills the vector v with pointers to the internal root r_x for the + // stretch [start,x] for all x: start <= x < stop. If the stretch // is incoherent, r_x is NULL template<typename T> void @@ -37,8 +37,8 @@ namespace ugdiss { size_t p = x-start; root[p] = x+x->parent; - for (size_t i = isR.find_first(); i < isR.size(); i = isR.find_next(i)) - if (root[i]==x) + for (size_t i = isR.find_first(); i < isR.size(); i = isR.find_next(i)) + if (root[i]==x) isR.reset(i); if (root[p] < start || root[p] >= stop) isR.set(x-start); @@ -46,7 +46,7 @@ namespace ugdiss } } - // return the root of the tree if the span [start,stop) constitutes a + // return the root of the tree if the span [start,stop) constitutes a // tree, NULL otherwise template<typename T> T const* @@ -66,7 +66,7 @@ namespace ugdiss assert(outOfRange); return outOfRange == 1 ? root : NULL; } - + // return the governor of the tree given by [start,stop) if the span // constitutes a tree, NULL otherwise template<typename T> @@ -82,7 +82,7 @@ namespace ugdiss { if (root && n != root) numRoots++; - else + else { root = n; if (!numRoots) numRoots++; @@ -101,7 +101,7 @@ namespace ugdiss T const* b = as<T>(&(*v.end())); return (a==b) ? NULL : findInternalRoot<T>(a,b); } - + #if 1 class DTNode { @@ -113,7 +113,7 @@ namespace ugdiss }; /** A parsed sentence */ - class + class DependencyTree { public: @@ -189,13 +189,13 @@ namespace ugdiss int cmp(Conll_Record const& other) const; }; - /** @return true if the linear sequence of /Conll_Record/s is coherent, + /** @return true if the linear sequence of /Conll_Record/s is coherent, * i.e., a proper connected tree structure */ bool isCoherent(Conll_Record const* start, Conll_Record const* const stop); - /** @return the root node of the tree covering the span [start,stop), if the span is coherent; + /** @return the root node of the tree covering the span [start,stop), if the span is coherent; * NULL otherwise */ template<typename T> T const* topNode(T const* start , T const* stop) @@ -204,9 +204,9 @@ namespace ugdiss for (T const* x = start; x < stop; ++x) { T const* n = reinterpret_cast<T const*>(x->up()); - if (!n || n < start || n >= stop) + if (!n || n < start || n >= stop) { - if (ret) return NULL; + if (ret) return NULL; else ret = x; } } diff --git a/moses/TranslationModel/UG/mm/ug_im_bitext.cc b/moses/TranslationModel/UG/mm/ug_im_bitext.cc index 9f26a181b..b411cc7dc 100644 --- a/moses/TranslationModel/UG/mm/ug_im_bitext.cc +++ b/moses/TranslationModel/UG/mm/ug_im_bitext.cc @@ -6,15 +6,15 @@ namespace Moses { template<> - sptr<imBitext<L2R_Token<SimpleWordId> > > + sptr<imBitext<L2R_Token<SimpleWordId> > > imBitext<L2R_Token<SimpleWordId> >:: - add(vector<string> const& s1, - vector<string> const& s2, + add(vector<string> const& s1, + vector<string> const& s2, vector<string> const& aln) const { typedef L2R_Token<SimpleWordId> TKN; assert(s1.size() == s2.size() && s1.size() == aln.size()); - + #ifndef NDEBUG size_t first_new_snt = this->T1 ? this->T1->size() : 0; #endif @@ -24,7 +24,7 @@ namespace Moses boost::unique_lock<boost::shared_mutex> guard(m_lock); ret.reset(new imBitext<TKN>(*this)); } - + // we add the sentences in separate threads (so it's faster) boost::thread thread1(snt_adder<TKN>(s1,*ret->V1,ret->myT1,ret->myI1)); // thread1.join(); // for debugging @@ -41,10 +41,10 @@ namespace Moses binwrite(obuf,row); binwrite(obuf,col); } - // important: DO NOT replace the two lines below this comment by - // char const* x = obuf.str().c_str(), as the memory x is pointing + // important: DO NOT replace the two lines below this comment by + // char const* x = obuf.str().c_str(), as the memory x is pointing // to is freed immediately upon deconstruction of the string object. - string foo = obuf.str(); + string foo = obuf.str(); char const* x = foo.c_str(); vector<char> v(x,x+foo.size()); ret->myTx = append(ret->myTx, v); diff --git a/moses/TranslationModel/UG/mm/ug_im_bitext.h b/moses/TranslationModel/UG/mm/ug_im_bitext.h index a620b7219..63e44f1b9 100644 --- a/moses/TranslationModel/UG/mm/ug_im_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_im_bitext.h @@ -4,7 +4,7 @@ namespace Moses { - namespace bitext + namespace bitext { template<typename TKN> class imBitext : public Bitext<TKN> @@ -12,7 +12,7 @@ namespace Moses sptr<imTtrack<char> > myTx; sptr<imTtrack<TKN> > myT1; sptr<imTtrack<TKN> > myT2; - sptr<imTSA<TKN> > myI1; + sptr<imTSA<TKN> > myI1; sptr<imTSA<TKN> > myI2; static ThreadSafeCounter my_revision; public: @@ -23,26 +23,26 @@ namespace Moses size_t max_sample = 5000, size_t num_workers=4); imBitext(size_t max_sample = 5000, size_t num_workers=4); imBitext(imBitext const& other); - - // sptr<imBitext<TKN> > + + // sptr<imBitext<TKN> > // add(vector<TKN> const& s1, vector<TKN> const& s2, vector<ushort> & a); - sptr<imBitext<TKN> > - add(vector<string> const& s1, - vector<string> const& s2, + sptr<imBitext<TKN> > + add(vector<string> const& s1, + vector<string> const& s2, vector<string> const& a) const; }; template<typename TKN> - ThreadSafeCounter + ThreadSafeCounter imBitext<TKN>::my_revision; template<typename TKN> imBitext<TKN>:: imBitext(size_t max_sample, size_t num_workers) : Bitext<TKN>(max_sample, num_workers) - { + { this->m_default_sample_size = max_sample; this->V1.reset(new TokenIndex()); this->V2.reset(new TokenIndex()); @@ -50,14 +50,14 @@ namespace Moses this->V2->setDynamic(true); ++my_revision; } - + template<typename TKN> imBitext<TKN>:: imBitext(sptr<TokenIndex> const& v1, sptr<TokenIndex> const& v2, size_t max_sample, size_t num_workers) : Bitext<TKN>(max_sample, num_workers) - { + { // this->default_sample_size = max_sample; this->V1 = v1; this->V2 = v2; @@ -65,12 +65,12 @@ namespace Moses this->V2->setDynamic(true); ++my_revision; } - + template<typename TKN> imBitext<TKN>:: imBitext(imBitext<TKN> const& other) - { + { this->myTx = other.myTx; this->myT1 = other.myT1; this->myT2 = other.myT2; @@ -89,17 +89,17 @@ namespace Moses } template<> - sptr<imBitext<L2R_Token<SimpleWordId> > > + sptr<imBitext<L2R_Token<SimpleWordId> > > imBitext<L2R_Token<SimpleWordId> >:: - add(vector<string> const& s1, - vector<string> const& s2, + add(vector<string> const& s1, + vector<string> const& s2, vector<string> const& aln) const; template<typename TKN> - sptr<imBitext<TKN> > + sptr<imBitext<TKN> > imBitext<TKN>:: - add(vector<string> const& s1, - vector<string> const& s2, + add(vector<string> const& s1, + vector<string> const& s2, vector<string> const& aln) const { throw "Not yet implemented"; diff --git a/moses/TranslationModel/UG/mm/ug_im_tsa.h b/moses/TranslationModel/UG/mm/ug_im_tsa.h index f7256ba2d..e920d9f96 100644 --- a/moses/TranslationModel/UG/mm/ug_im_tsa.h +++ b/moses/TranslationModel/UG/mm/ug_im_tsa.h @@ -23,7 +23,7 @@ namespace ugdiss using namespace std; using namespace boost; namespace bio=boost::iostreams; - + // template<typename TOKEN> class imBitext<TOKEN>; //----------------------------------------------------------------------- @@ -35,61 +35,61 @@ namespace ugdiss public: class tree_iterator; friend class tree_iterator; - + private: vector<cpos> sufa; // stores the actual array - vector<filepos_type> index; /* top-level index into regions in sufa + vector<filepos_type> index; /* top-level index into regions in sufa * (for faster access) */ private: - char const* + char const* index_jump(char const* a, char const* z, float ratio) const; - char const* + char const* getLowerBound(id_type id) const; - char const* + char const* getUpperBound(id_type id) const; - + public: imTSA(); - imTSA(boost::shared_ptr<Ttrack<TOKEN> const> c, - bdBitset const* filt, + imTSA(boost::shared_ptr<Ttrack<TOKEN> const> c, + bdBitset const* filt, ostream* log = NULL); - imTSA(imTSA<TOKEN> const& prior, + imTSA(imTSA<TOKEN> const& prior, boost::shared_ptr<imTtrack<TOKEN> const> const& crp, vector<id_type> const& newsids, size_t const vsize); - count_type - sntCnt(char const* p, char const * const q) const; + count_type + sntCnt(char const* p, char const * const q) const; - count_type + count_type rawCnt(char const* p, char const * const q) const; - - void - getCounts(char const* p, char const * const q, + + void + getCounts(char const* p, char const * const q, count_type& sids, count_type& raw) const; - - char const* + + char const* readSid(char const* p, char const* q, id_type& sid) const; - - char const* + + char const* readSid(char const* p, char const* q, ::uint64_t& sid) const; - char const* + char const* readOffset(char const* p, char const* q, uint16_t& offset) const; - char const* + char const* readOffset(char const* p, char const* q, ::uint64_t& offset) const; - - void + + void sanityCheck() const; - - void + + void save_as_mm_tsa(string fname) const; - + /// add a sentence to the database - // shared_ptr<imTSA<TOKEN> > add(vector<TOKEN> const& snt) const; + // shared_ptr<imTSA<TOKEN> > add(vector<TOKEN> const& snt) const; }; @@ -108,12 +108,12 @@ namespace ugdiss tree_iterator(imTSA<TOKEN> const* s) : TSA<TOKEN>::tree_iterator::tree_iterator(reinterpret_cast<TSA<TOKEN> const*>(s)) {}; - + /** jump to the point 1/ratio in a tightly packed index * assumes that keys are flagged with '1', values with '0' */ template<typename TOKEN> - char const* + char const* imTSA<TOKEN>:: index_jump(char const* a, char const* z, float ratio) const { @@ -123,10 +123,10 @@ namespace ugdiss cpos const* xz = reinterpret_cast<cpos const*>(z); return reinterpret_cast<char const*>(xa+int(ratio*(xz-xa))); } - + template<typename TOKEN> imTSA<TOKEN>:: - imTSA() + imTSA() { this->indexSize = 0; // this->data = NULL; @@ -135,7 +135,7 @@ namespace ugdiss this->corpusSize = 0; this->BitSetCachingThreshold=4096; }; - + // build an array from all the tokens in the sentences in *c that are // specified in filter template<typename TOKEN> @@ -153,12 +153,12 @@ namespace ugdiss } assert(filter); // In the first iteration over the corpus, we obtain word counts. - // They allows us to + // They allows us to // a. allocate the exact amount of memory we need - // b. place tokens into the right 'section' in the array, based on + // b. place tokens into the right 'section' in the array, based on // the ID of the first token in the sequence. We can then sort // each section separately. - + if (log) *log << "counting tokens ... "; int slimit = 65536; // slimit=65536 is the upper bound of what we can fit into a ushort which @@ -176,7 +176,7 @@ namespace ugdiss vector<count_type> tmp(wcnt.size(),0); for (size_t i = 1; i < wcnt.size(); ++i) tmp[i] = tmp[i-1] + wcnt[i-1]; - + // Now dump all token positions into the right place in sufa this->corpusSize = 0; for (id_type sid = filter->find_first(); @@ -204,7 +204,7 @@ namespace ugdiss for (size_t i = 0; i < wcnt.size(); i++) { if (log && wcnt[i] > 5000) - *log << "sorting " << wcnt[i] + *log << "sorting " << wcnt[i] << " entries starting with id " << i << "." << endl; index[i+1] = index[i]+wcnt[i]; assert(index[i+1]==tmp[i]); // sanity check @@ -247,7 +247,7 @@ namespace ugdiss imTSA<TOKEN>:: getUpperBound(id_type id) const { - if (++id >= this->index.size()) + if (++id >= this->index.size()) return NULL; assert(index[id] <= this->sufa.size()); return reinterpret_cast<char const*>(&(this->sufa.front()) + index[id]); @@ -263,7 +263,7 @@ namespace ugdiss sid = reinterpret_cast<cpos const*>(p)->sid; return p; } - + template<typename TOKEN> char const* imTSA<TOKEN>:: @@ -306,11 +306,11 @@ namespace ugdiss cpos const* xq = reinterpret_cast<cpos const*>(q); return xq-xp; } - + template<typename TOKEN> - void + void imTSA<TOKEN>:: - getCounts(char const* p, char const* const q, + getCounts(char const* p, char const* const q, count_type& sids, count_type& raw) const { id_type sid; // uint16_t off; @@ -328,7 +328,7 @@ namespace ugdiss } template<typename TOKEN> - void + void imTSA<TOKEN>:: save_as_mm_tsa(string fname) const { @@ -352,34 +352,34 @@ namespace ugdiss for (size_t i = 0; i < mmIndex.size(); i++) numwrite(out,mmIndex[i]-mmIndex[0]); out.seekp(0); - numwrite(out,idxStart); + numwrite(out,idxStart); out.close(); } template<typename TOKEN> imTSA<TOKEN>:: - imTSA(imTSA<TOKEN> const& prior, + imTSA(imTSA<TOKEN> const& prior, boost::shared_ptr<imTtrack<TOKEN> const> const& crp, vector<id_type> const& newsids, size_t const vsize) { typename ttrack::Position::LESS<Ttrack<TOKEN> > sorter(crp.get()); - + // count how many tokens will be added to the TSA // and index the new additions to the corpus size_t newToks = 0; - BOOST_FOREACH(id_type sid, newsids) + BOOST_FOREACH(id_type sid, newsids) newToks += crp->sntLen(sid); vector<cpos> nidx(newToks); // new array entries - + size_t n = 0; - BOOST_FOREACH(id_type sid, newsids) + BOOST_FOREACH(id_type sid, newsids) { assert(sid < crp->size()); for (size_t o = 0; o < (*crp)[sid].size(); ++o, ++n) { nidx[n].offset = o; nidx[n].sid = sid; } } sort(nidx.begin(),nidx.end(),sorter); - + // create the new suffix array this->numTokens = newToks + prior.sufa.size(); this->sufa.resize(this->numTokens); @@ -388,10 +388,10 @@ namespace ugdiss this->corpusSize = crp->size(); this->corpus = crp; this->index.resize(vsize+1); - + size_t i = 0; typename vector<cpos>::iterator k = this->sufa.begin(); - // cerr << newToks << " new items at " + // cerr << newToks << " new items at " // << __FILE__ << ":" << __LINE__ << endl; for (size_t n = 0; n < nidx.size();) { @@ -402,7 +402,7 @@ namespace ugdiss this->index[i] = k - this->sufa.begin(); if (++i < prior.index.size() && prior.index[i-1] < prior.index[i]) { - k = copy(prior.sufa.begin() + prior.index[i-1], + k = copy(prior.sufa.begin() + prior.index[i-1], prior.sufa.begin() + prior.index[i], k); } } @@ -410,13 +410,13 @@ namespace ugdiss if (++i < prior.index.size() && prior.index[i] > prior.index[i-1]) { size_t j = prior.index[i-1]; - while (j < prior.index[i] && n < nidx.size() + while (j < prior.index[i] && n < nidx.size() && crp->getToken(nidx[n])->id() < i) { assert(k < this->sufa.end()); if (sorter(prior.sufa[j],nidx[n])) *k++ = prior.sufa[j++]; - else + else *k++ = nidx[n++]; } while (j < prior.index[i]) @@ -436,7 +436,7 @@ namespace ugdiss while (++i < this->index.size()) { if (i < prior.index.size() && prior.index[i-1] < prior.index[i]) - k = copy(prior.sufa.begin() + prior.index[i-1], + k = copy(prior.sufa.begin() + prior.index[i-1], prior.sufa.begin() + prior.index[i], k); this->index[i] = k - this->sufa.begin(); } @@ -462,5 +462,5 @@ namespace ugdiss } } - + #endif diff --git a/moses/TranslationModel/UG/mm/ug_im_ttrack.h b/moses/TranslationModel/UG/mm/ug_im_ttrack.h index ac49ebcd4..20ab653f4 100644 --- a/moses/TranslationModel/UG/mm/ug_im_ttrack.h +++ b/moses/TranslationModel/UG/mm/ug_im_ttrack.h @@ -1,6 +1,6 @@ // -*- c++ -*- // In-memory corpus track -// (c) 2006-2012 Ulrich Germann. +// (c) 2006-2012 Ulrich Germann. #ifndef __ug_im_ttrack #define __ug_im_ttrack @@ -36,20 +36,20 @@ namespace ugdiss template<typename Token> class imTtrack; template<typename TOKEN> - typename boost::shared_ptr<imTtrack<TOKEN> > + typename boost::shared_ptr<imTtrack<TOKEN> > append(typename boost::shared_ptr<imTtrack<TOKEN> > const & crp, vector<TOKEN> const & snt); template<typename Token> class imTtrack : public Ttrack<Token> { - + private: size_t numToks; boost::shared_ptr<vector<vector<Token> > > myData; // pointer to corpus data friend class imTSA<Token>; - friend - typename boost::shared_ptr<imTtrack<Token> > + friend + typename boost::shared_ptr<imTtrack<Token> > append<Token>(typename boost::shared_ptr<imTtrack<Token> > const & crp, vector<Token> const & snt); void m_check_token_count(); // debugging function @@ -60,14 +60,14 @@ namespace ugdiss imTtrack(istream& in, TokenIndex const& V, ostream* log = NULL); imTtrack(size_t reserve = 0); // imTtrack(istream& in, Vocab& V); - + /** return pointer to beginning of sentence */ - Token const* sntStart(size_t sid) const; + Token const* sntStart(size_t sid) const; /** return pointer to beginning of sentence */ - Token const* sntEnd(size_t sid) const; + Token const* sntEnd(size_t sid) const; - size_t size() const; + size_t size() const; size_t numTokens() const; id_type findSid(Token const* t) const; @@ -82,16 +82,16 @@ namespace ugdiss size_t check = 0; BOOST_FOREACH(vector<Token> const& s, *myData) check += s.size(); - UTIL_THROW_IF2(check != this->numToks, "[" << HERE << "]" + UTIL_THROW_IF2(check != this->numToks, "[" << HERE << "]" << " Wrong token count after appending sentence!" - << " Counted " << check << " but expected " - << this->numToks << " in a total of " << myData->size() + << " Counted " << check << " but expected " + << this->numToks << " in a total of " << myData->size() << " sentences."); - + } template<typename Token> - Token const* + Token const* imTtrack<Token>:: sntStart(size_t sid) const // return pointer to beginning of sentence { @@ -99,9 +99,9 @@ namespace ugdiss if ((*myData)[sid].size() == 0) return NULL; return &((*myData)[sid].front()); } - + template<typename Token> - Token const* + Token const* imTtrack<Token>:: sntEnd(size_t sid) const // return pointer to end of sentence { @@ -109,9 +109,9 @@ namespace ugdiss if ((*myData)[sid].size() == 0) return NULL; return &(*myData)[sid].back()+1; } - + template<typename Token> - size_t + size_t imTtrack<Token>:: size() const // return size of corpus (in number of sentences) { @@ -120,15 +120,15 @@ namespace ugdiss // offset in the myIndex than there are sentences return myData->size(); } - + template<typename Token> - size_t + size_t imTtrack<Token>:: numTokens() const // return size of corpus (in number of words) { return numToks; } - + template<typename Token> imTtrack<Token>:: imTtrack(istream& in, TokenIndex const& V, ostream* log) @@ -140,19 +140,19 @@ namespace ugdiss boost::unordered_map<string,id_type> H; for (id_type i = 0; i < V.knownVocabSize(); ++i) H[V[i]] = i; - while (getline(in,line)) + while (getline(in,line)) { myData->push_back(vector<Token>()); - if (log && ++linectr%1000000==0) + if (log && ++linectr%1000000==0) *log << linectr/1000000 << "M lines of input processed" << endl; istringstream buf(line); - while (buf>>w) + while (buf>>w) myData->back().push_back(Token(H[w])); myData->back().resize(myData.back().size()); numToks += myData->back().size(); } } - + template<typename Token> imTtrack<Token>:: imTtrack(size_t reserve) @@ -171,7 +171,7 @@ namespace ugdiss BOOST_FOREACH(vector<Token> const& v, *d) numToks += v.size(); } - + template<typename Token> id_type imTtrack<Token>:: @@ -182,7 +182,7 @@ namespace ugdiss { vector<Token> const& v = (*myData)[i]; if (v.size() == 0) continue; - if (&v.front() <= t && &v.back() >= t) + if (&v.front() <= t && &v.back() >= t) break; } return i; @@ -190,7 +190,7 @@ namespace ugdiss /// add a sentence to the database template<typename TOKEN> - boost::shared_ptr<imTtrack<TOKEN> > + boost::shared_ptr<imTtrack<TOKEN> > append(boost::shared_ptr<imTtrack<TOKEN> > const& crp, vector<TOKEN> const & snt) { #if 1 diff --git a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer1.h b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer1.h index 53628e3b3..742e0dd4e 100644 --- a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer1.h +++ b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer1.h @@ -15,14 +15,14 @@ using namespace std; namespace ugdiss { - template<typename TKN> - class + template<typename TKN> + class LexicalPhraseScorer1 { typedef boost::unordered_map<id_type, float> inner_map_t; vector<inner_map_t> L1_given_L2; vector<inner_map_t> L2_given_L1; - void load_lex (string const& fname, TokenIndex & V1, TokenIndex & V2, + void load_lex (string const& fname, TokenIndex & V1, TokenIndex & V2, vector<inner_map_t> & lex); public: void open(string const& bname, string const& L1, string const& L2, @@ -34,14 +34,14 @@ namespace ugdiss TKN const* snt2, size_t const s2, size_t const e2, char const* const aln_start, char const* const aln_end, float & fwd_score, float& bwd_score); - float permissive_lookup(vector<inner_map_t> const& lex, + float permissive_lookup(vector<inner_map_t> const& lex, id_type const s, id_type const t) const; }; - + template<typename TKN> void LexicalPhraseScorer1<TKN>:: - load_lex (string const& fname, TokenIndex & V1, TokenIndex & V2, + load_lex (string const& fname, TokenIndex & V1, TokenIndex & V2, vector<inner_map_t> & lex) { boost::iostreams::filtering_istream in; @@ -52,20 +52,20 @@ namespace ugdiss while (in >> w1 >> w2 >> p) { id_type id1 = V1[w1]; - while (lex.size() <= id1) + while (lex.size() <= id1) lex.push_back(inner_map_t()); lex[id1][V2[w2]] = p; } } - + template<typename TKN> void LexicalPhraseScorer1<TKN>:: open(string const& bname, string const& L1, string const& L2, TokenIndex & V1, TokenIndex & V2) { - string lex1 = bname+L1+"-"+L2+"."+L1+"-given-"+L2+".lex.gz"; - string lex2 = bname+L1+"-"+L2+"."+L2+"-given-"+L1+".lex.gz"; + string lex1 = bname+L1+"-"+L2+"."+L1+"-given-"+L2+".lex.gz"; + string lex2 = bname+L1+"-"+L2+"."+L2+"-given-"+L1+".lex.gz"; cout << lex1 << endl; cout << lex2 << endl; load_lex(lex1,V1,V2,L1_given_L2); @@ -86,9 +86,9 @@ namespace ugdiss { i1 = aln[k]; i2 = aln[++k]; if (i1 < s1 || i1 >= e1 || i2 < s2 || i2 >= e2) continue; - p1[i1] += permissive_lookup(L2_given_L1, snt2[i2].id(), snt1[i1].id()); + p1[i1] += permissive_lookup(L2_given_L1, snt2[i2].id(), snt1[i1].id()); ++c1[i1]; - p2[i2] += permissive_lookup(L1_given_L2, snt1[i1].id(), snt2[i2].id()); + p2[i2] += permissive_lookup(L1_given_L2, snt1[i1].id(), snt2[i2].id()); ++c2[i2]; } fwd_score = 0; @@ -110,7 +110,7 @@ namespace ugdiss template<typename TKN> float LexicalPhraseScorer1<TKN>:: - permissive_lookup(vector<inner_map_t> const& lex, + permissive_lookup(vector<inner_map_t> const& lex, id_type const s, id_type const t) const { if (s >= lex.size()) return 1.0; @@ -135,9 +135,9 @@ namespace ugdiss // assert(snt1[i2].id() < L1_given_L2.size()); // assert(snt2[i2].id() < L2_given_L1.size()); if (i1 < s1 || i1 >= e1 || i2 < s2 || i2 >= e2) continue; - p1[i1] += permissive_lookup(L1_given_L2, snt1[i1].id(), snt2[i2].id()); + p1[i1] += permissive_lookup(L1_given_L2, snt1[i1].id(), snt2[i2].id()); ++c1[i1]; - p2[i2] += permissive_lookup(L2_given_L1, snt2[i2].id(), snt1[i1].id()); + p2[i2] += permissive_lookup(L2_given_L1, snt2[i2].id(), snt1[i1].id()); ++c2[i2]; } fwd_score = 0; diff --git a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h index b7e359223..fdd0366df 100644 --- a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h +++ b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h @@ -18,8 +18,8 @@ using namespace std; namespace ugdiss { - template<typename TKN> - class + template<typename TKN> + class LexicalPhraseScorer2 { vector<string> ftag; @@ -28,28 +28,28 @@ namespace ugdiss table_t COOC; void open(string const& fname); template<typename someint> - void + void score(TKN const* snt1, size_t const s1, size_t const e1, TKN const* snt2, size_t const s2, size_t const e2, vector<someint> const & aln, float const alpha, float & fwd_score, float& bwd_score) const; - void + void score(TKN const* snt1, size_t const s1, size_t const e1, TKN const* snt2, size_t const s2, size_t const e2, char const* const aln_start, char const* const aln_end, float const alpha, float & fwd_score, float& bwd_score) const; // plup: permissive lookup - float plup_fwd(id_type const s,id_type const t, float const alpha) const; + float plup_fwd(id_type const s,id_type const t, float const alpha) const; float plup_bwd(id_type const s,id_type const t, float const alpha) const; - // to be done: - // - on-the-fly smoothing ? - // - better (than permissive-lookup) treatment of unknown combinations + // to be done: + // - on-the-fly smoothing ? + // - better (than permissive-lookup) treatment of unknown combinations // permissive lookup is currently used for compatibility reasons // - zens-ney smoothed scoring via noisy-or combination }; - + template<typename TKN> void LexicalPhraseScorer2<TKN>:: @@ -64,7 +64,7 @@ namespace ugdiss LexicalPhraseScorer2<TKN>:: score(TKN const* snt1, size_t const s1, size_t const e1, TKN const* snt2, size_t const s2, size_t const e2, - vector<someint> const & aln, float const alpha, + vector<someint> const & aln, float const alpha, float & fwd_score, float& bwd_score) const { vector<float> p1(e1,0), p2(e2,0); @@ -74,9 +74,9 @@ namespace ugdiss { i1 = aln[k]; i2 = aln[++k]; if (i1 < s1 || i1 >= e1 || i2 < s2 || i2 >= e2) continue; - p1[i1] += plup_fwd(snt1[i1].id(),snt2[i2].id(),alpha); + p1[i1] += plup_fwd(snt1[i1].id(),snt2[i2].id(),alpha); ++c1[i1]; - p2[i2] += plup_bwd(snt1[i1].id(),snt2[i2].id(),alpha); + p2[i2] += plup_bwd(snt1[i1].id(),snt2[i2].id(),alpha); ++c2[i2]; } fwd_score = 0; @@ -105,19 +105,19 @@ namespace ugdiss << ": alpha parameter must be >= 0"); float ret = COOC[s][t]+alpha; ret = (ret?ret:1.)/(COOC.m1(s)+alpha); - UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__ + UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__ << ": result not > 0 and <= 1. alpha = " << alpha << "; " << COOC[s][t] << "/" << COOC.m1(s)); #if 0 - cerr << "[" << s << "," << t << "] " - << COOC.m1(s) << "/" - << COOC[s][t] << "/" + cerr << "[" << s << "," << t << "] " + << COOC.m1(s) << "/" + << COOC[s][t] << "/" << COOC.m2(t) << endl; #endif return ret; } - + template<typename TKN> float LexicalPhraseScorer2<TKN>:: @@ -128,11 +128,11 @@ namespace ugdiss << ": alpha parameter must be >= 0"); float ret = float(COOC[s][t]+alpha); ret = (ret?ret:1.)/(COOC.m2(t)+alpha); - UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__ + UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__ << ": result not > 0 and <= 1."); return ret; } - + template<typename TKN> void LexicalPhraseScorer2<TKN>:: @@ -148,9 +148,9 @@ namespace ugdiss { x = binread(binread(x,i1),i2); if (i1 < s1 || i1 >= e1 || i2 < s2 || i2 >= e2) continue; - p1[i1] += plup_fwd(snt1[i1].id(), snt2[i2].id(),alpha); + p1[i1] += plup_fwd(snt1[i1].id(), snt2[i2].id(),alpha); ++c1[i1]; - p2[i2] += plup_bwd(snt1[i1].id(), snt2[i2].id(),alpha); + p2[i2] += plup_bwd(snt1[i1].id(), snt2[i2].id(),alpha); ++c2[i2]; } fwd_score = 0; diff --git a/moses/TranslationModel/UG/mm/ug_lexical_reordering.cc b/moses/TranslationModel/UG/mm/ug_lexical_reordering.cc index 495501bd6..d0522c528 100644 --- a/moses/TranslationModel/UG/mm/ug_lexical_reordering.cc +++ b/moses/TranslationModel/UG/mm/ug_lexical_reordering.cc @@ -10,26 +10,26 @@ namespace Moses // bounds LFT and RGT and update the actual bounds L and R; update // the total count of alignment links in the underlying phrase // pair - bool + bool check(vector<ushort> const& v, // alignment row/column size_t const LFT, size_t const RGT, // hard limits ushort& L, ushort& R, size_t& count) // current bounds, count { if (v.size() == 0) return 0; - if (L > v.front() && (L=v.front()) < LFT) return false; + if (L > v.front() && (L=v.front()) < LFT) return false; if (R < v.back() && (R=v.back()) > RGT) return false; count += v.size(); return true; } - + /// return number of alignment points in box, -1 on failure - int + int expand_block(vector<vector<ushort> > const& row2col, vector<vector<ushort> > const& col2row, size_t row, size_t col, // seed coordinates - size_t const TOP, size_t const LFT, // hard limits - size_t const BOT, size_t const RGT, // hard limits - ushort* top = NULL, ushort* lft = NULL, + size_t const TOP, size_t const LFT, // hard limits + size_t const BOT, size_t const RGT, // hard limits + ushort* top = NULL, ushort* lft = NULL, ushort* bot = NULL, ushort* rgt = NULL) // store results { if (row < TOP || row > BOT || col < LFT || col > RGT) return -1; @@ -37,7 +37,7 @@ namespace Moses UTIL_THROW_IF2(col >= col2row.size(), "out of bounds"); // ==================================================== - // tables grow downwards, so TOP is smaller than BOT! + // tables grow downwards, so TOP is smaller than BOT! // ==================================================== ushort T, L, B, R; // box dimensions @@ -45,7 +45,7 @@ namespace Moses // if we start on an empty cell, search for the first alignment point if (row2col[row].size() == 0 && col2row[col].size() == 0) { - if (row == TOP) while (row < BOT && !row2col[++row].size()); + if (row == TOP) while (row < BOT && !row2col[++row].size()); else if (row == BOT) while (row > TOP && !row2col[--row].size()); if (col == LFT) while (col < RGT && !col2row[++col].size()); @@ -54,7 +54,7 @@ namespace Moses if (row2col[row].size() == 0 && col2row[col].size() == 0) return 0; } - if (row2col[row].size() == 0) + if (row2col[row].size() == 0) row = col2row[col].front(); if (col2row[col].size() == 0) col = row2col[row].front(); @@ -65,9 +65,9 @@ namespace Moses if ((R = row2col[row].back()) > RGT) return -1; if (B == T && R == L) return 1; - + // start/end of row / column coverage: - ushort rs = row, re = row, cs = col, ce = col; + ushort rs = row, re = row, cs = col, ce = col; int ret = row2col[row].size(); for (size_t tmp = 1; tmp; ret += tmp) { @@ -127,7 +127,7 @@ namespace Moses if (expand_block(a1,a2,x,y,T,L,B,R) >= 0) return Moses::LRModel::S; while (s2-- && a2[s2].size() == 0); - + Moses::LRModel::ReorderingType ret; ret = (a2[s2].size() == 0 ? po_other : a2[s2].back() < s1 ? Moses::LRModel::DR : diff --git a/moses/TranslationModel/UG/mm/ug_lexical_reordering.h b/moses/TranslationModel/UG/mm/ug_lexical_reordering.h index d432ea37e..9004b757e 100644 --- a/moses/TranslationModel/UG/mm/ug_lexical_reordering.h +++ b/moses/TranslationModel/UG/mm/ug_lexical_reordering.h @@ -7,13 +7,13 @@ namespace Moses { namespace bitext { typedef Moses::LRModel::ReorderingType PhraseOrientation; -PhraseOrientation +PhraseOrientation find_po_fwd(std::vector<std::vector<ushort> >& a1, std::vector<std::vector<ushort> >& a2, size_t b1, size_t e1, size_t b2, size_t e2); -PhraseOrientation +PhraseOrientation find_po_bwd(std::vector<std::vector<ushort> >& a1, std::vector<std::vector<ushort> >& a2, size_t b1, size_t e1, @@ -21,5 +21,5 @@ find_po_bwd(std::vector<std::vector<ushort> >& a1, - + }} // close namespaces diff --git a/moses/TranslationModel/UG/mm/ug_load_primer.h b/moses/TranslationModel/UG/mm/ug_load_primer.h index 1cd167a68..961c45da1 100644 --- a/moses/TranslationModel/UG/mm/ug_load_primer.h +++ b/moses/TranslationModel/UG/mm/ug_load_primer.h @@ -1,7 +1,7 @@ //-*- c++ -*- #pragma once #include <boost/iostreams/device/mapped_file.hpp> -// +// namespace Moses { class FastLoader @@ -14,5 +14,5 @@ namespace Moses void prime(boost::iostreams::mapped_file_source const& f); - + }; diff --git a/moses/TranslationModel/UG/mm/ug_lru_cache.h b/moses/TranslationModel/UG/mm/ug_lru_cache.h index d1c9a9767..0000b194f 100644 --- a/moses/TranslationModel/UG/mm/ug_lru_cache.h +++ b/moses/TranslationModel/UG/mm/ug_lru_cache.h @@ -30,25 +30,25 @@ namespace lru_cache // timeval tstamp; // time stamp typename boost::shared_ptr<VAL> ptr; // cached shared ptr }; - + mutable boost::shared_mutex m_lock; uint32_t m_qfront, m_qback; - vector<Record> m_recs; + vector<Record> m_recs; map_t m_idx; - void + void update_queue(KEY const& key, uint32_t const p) { // CALLER MUST LOCK! - // "remove" item in slot p from it's current position of the - // queue (which is different from the slot position) and move it + // "remove" item in slot p from it's current position of the + // queue (which is different from the slot position) and move it // to the end Record& r = m_recs[p]; if (m_recs.size() == 1) r.next = r.prev = m_qback = m_qfront = 0; - + if (r.key != key || p == m_qback) return; - + if (m_qfront == p) m_qfront = m_recs[r.next].prev = r.next; else @@ -65,8 +65,8 @@ namespace lru_cache size_t capacity() const { return m_recs.capacity(); } void reserve(size_t s) { m_recs.reserve(s); } - sptr<VAL> - get(KEY const& key) + sptr<VAL> + get(KEY const& key) { uint32_t p; { // brackets needed for lock scoping @@ -86,13 +86,13 @@ namespace lru_cache boost::lock_guard<boost::shared_mutex> lock(m_lock); pair<typename map_t::iterator,bool> foo; foo = m_idx.insert(make_pair(key,m_recs.size())); - + uint32_t p = foo.first->second; if (foo.second) // was not in the cache { if (m_recs.size() < m_recs.capacity()) m_recs.push_back(Record()); - else + else { foo.first->second = p = m_qfront; m_idx.erase(m_recs[p].key); diff --git a/moses/TranslationModel/UG/mm/ug_mm_2d_table.h b/moses/TranslationModel/UG/mm/ug_mm_2d_table.h index cfc86b8fc..2455ca603 100644 --- a/moses/TranslationModel/UG/mm/ug_mm_2d_table.h +++ b/moses/TranslationModel/UG/mm/ug_mm_2d_table.h @@ -24,12 +24,12 @@ namespace ugdiss ID id; VAL val; - bool + bool operator<(ID const otherId) const { return id < otherId; } - + bool operator<(Cell const& other) const { @@ -60,14 +60,14 @@ namespace ugdiss ID numCols; boost::shared_ptr<bio::mapped_file_source> file; - VAL m1(ID key) const - { - return (key < numRows) ? M1[key] : INIT(0); + VAL m1(ID key) const + { + return (key < numRows) ? M1[key] : INIT(0); } VAL m2(ID key) const { - return (key < numCols) ? M2[key] : INIT(0); + return (key < numCols) ? M2[key] : INIT(0); } @@ -106,7 +106,7 @@ namespace ugdiss Cell const* c = lower_bound(start,stop,key); return (c != stop && c->id == key ? c->val : INIT(0)); } - + template<typename OFFSET, typename ID, typename VAL, typename INIT> void mm2dTable<OFFSET,ID,VAL,INIT>:: @@ -140,10 +140,10 @@ namespace ugdiss // cout << numRows << " rows; " << numCols << " columns " << endl; M1 = reinterpret_cast<VAL const*>(index+numRows+1); M2 = M1+numRows; - // cout << "Table " << fname << " has " << numRows << " rows and " + // cout << "Table " << fname << " has " << numRows << " rows and " // << numCols << " columns." << endl; - // cout << "File size is " << file.size()*1024 << " bytes; "; - // cout << "M2 starts " << (reinterpret_cast<char const*>(M2) - file.data()) + // cout << "File size is " << file.size()*1024 << " bytes; "; + // cout << "M2 starts " << (reinterpret_cast<char const*>(M2) - file.data()) // << " bytes into the file" << endl; // cout << M2[0] << endl; } @@ -156,8 +156,8 @@ namespace ugdiss typename ICONT // inner container type > void - write_mm_2d_table(ostream& out, vector<ICONT> const& T, - vector<VAL> const* m1 = NULL, + write_mm_2d_table(ostream& out, vector<ICONT> const& T, + vector<VAL> const* m1 = NULL, vector<VAL> const* m2 = NULL) { assert(T.size()); @@ -223,7 +223,7 @@ namespace ugdiss OFFSET o = index[i]; // (index[i]-index[0])/sizeof(VAL); out.write(reinterpret_cast<char*>(&o),sizeof(OFFSET)); } - + // write marginals out.write(reinterpret_cast<char const*>(&(*m1)[0]),m1->size()*sizeof(VAL)); out.write(reinterpret_cast<char const*>(&(*m2)[0]),m2->size()*sizeof(VAL)); diff --git a/moses/TranslationModel/UG/mm/ug_mm_bitext.h b/moses/TranslationModel/UG/mm/ug_mm_bitext.h index 5b18ff1fa..be3fdfce8 100644 --- a/moses/TranslationModel/UG/mm/ug_mm_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_mm_bitext.h @@ -3,7 +3,7 @@ namespace Moses { - namespace bitext + namespace bitext { template<typename TKN> class mmBitext : public Bitext<TKN> @@ -17,18 +17,18 @@ namespace Moses template<typename TKN> mmBitext<TKN>:: mmBitext() - : Bitext<TKN>(new mmTtrack<TKN>(), new mmTtrack<TKN>(), new mmTtrack<char>(), - new TokenIndex(), new TokenIndex(), + : Bitext<TKN>(new mmTtrack<TKN>(), new mmTtrack<TKN>(), new mmTtrack<char>(), + new TokenIndex(), new TokenIndex(), new mmTSA<TKN>(), new mmTSA<TKN>()) {}; - + template<typename TKN> void mmBitext<TKN>:: load_document_map(string const& fname) { ifstream docmap(fname.c_str()); - // the docmap file should list the documents in the corpus + // the docmap file should list the documents in the corpus // in the order in which they appear with one line per document: // <docname> <number of lines / sentences> // @@ -38,22 +38,22 @@ namespace Moses this->m_sid2docid.reset(new vector<id_type>(this->T1->size())); while(getline(docmap,buffer)) { - istringstream line(buffer); + istringstream line(buffer); if (!(line>>docname)) continue; // empty line if (docname.size() && docname[0] == '#') continue; // comment size_t docid = this->m_docname2docid.size(); this->m_docname2docid[docname] = docid; this->m_docname.push_back(docname); line >> b; - VERBOSE(1, "DOCUMENT MAP " << docname + VERBOSE(1, "DOCUMENT MAP " << docname << " " << a << "-" << b+a << endl); for (b += a; a < b; ++a) (*this->m_sid2docid)[a] = docid; } - UTIL_THROW_IF2(b != this->T1->size(), + UTIL_THROW_IF2(b != this->T1->size(), "Document map doesn't match corpus!"); } - + template<typename TKN> void mmBitext<TKN>:: @@ -77,6 +77,6 @@ namespace Moses if (!access(docmapfile.c_str(),F_OK)) load_document_map(docmapfile); } - + } } diff --git a/moses/TranslationModel/UG/mm/ug_mm_tsa.h b/moses/TranslationModel/UG/mm/ug_mm_tsa.h index 9d5038e26..ff2d4c693 100644 --- a/moses/TranslationModel/UG/mm/ug_mm_tsa.h +++ b/moses/TranslationModel/UG/mm/ug_mm_tsa.h @@ -40,7 +40,7 @@ namespace ugdiss char const* index_jump(char const* a, char const* z, float ratio) const; char const* getLowerBound(id_type t) const; char const* getUpperBound(id_type t) const; - + public: mmTSA(); mmTSA(string fname, Ttrack<TOKEN> const* c); @@ -53,24 +53,24 @@ namespace ugdiss rawCnt(char const* p, char const * const q) const; void - getCounts(char const* p, char const * const q, + getCounts(char const* p, char const * const q, count_type& sids, count_type& raw) const; - char const* + char const* readSid(char const* p, char const* q, id_type& sid) const; - char const* + char const* readSid(char const* p, char const* q, ::uint64_t& sid) const; - char const* + char const* readOffset(char const* p, char const* q, uint16_t& offset) const; - char const* + char const* readOffset(char const* p, char const* q, ::uint64_t& offset) const; void sanityCheck() const; - }; + }; // ====================================================================== @@ -78,13 +78,13 @@ namespace ugdiss * assumes that keys are flagged with '1', values with '0' */ template<typename TOKEN> - char const* + char const* mmTSA<TOKEN>:: index_jump(char const* a, char const* z, float ratio) const { assert(ratio >= 0 && ratio < 1); char const* m = a+int(ratio*(z-a)); - if (m > a) + if (m > a) { while (m > a && *m < 0) --m; while (m > a && *m >= 0) --m; @@ -98,7 +98,7 @@ namespace ugdiss template<typename TOKEN> mmTSA<TOKEN>:: - mmTSA() + mmTSA() { this->startArray = NULL; this->endArray = NULL; @@ -136,9 +136,9 @@ namespace ugdiss filepos_type idxOffset; p = numread(p,idxOffset); p = numread(p,this->indexSize); - + // cerr << fname << ": " << idxOffset << " " << this->indexSize << endl; - + this->startArray = p; this->index = reinterpret_cast<filepos_type const*>(file.data()+idxOffset); this->endArray = reinterpret_cast<char const*>(index); @@ -153,7 +153,7 @@ namespace ugdiss mmTSA<TOKEN>:: getLowerBound(id_type id) const { - if (id >= this->indexSize) + if (id >= this->indexSize) return NULL; return this->startArray + this->index[id]; } @@ -165,7 +165,7 @@ namespace ugdiss mmTSA<TOKEN>:: getUpperBound(id_type id) const { - if (id >= this->indexSize) + if (id >= this->indexSize) return NULL; // if (index[id] == index[id+1]) // return NULL; @@ -232,13 +232,13 @@ namespace ugdiss } return ret; } - + // ====================================================================== template<typename TOKEN> - void + void mmTSA<TOKEN>:: - getCounts(char const* p, char const* const q, + getCounts(char const* p, char const* const q, count_type& sids, count_type& raw) const { raw = 0; diff --git a/moses/TranslationModel/UG/mm/ug_mm_ttrack.h b/moses/TranslationModel/UG/mm/ug_mm_ttrack.h index 51ba21778..bfee14e3e 100644 --- a/moses/TranslationModel/UG/mm/ug_mm_ttrack.h +++ b/moses/TranslationModel/UG/mm/ug_mm_ttrack.h @@ -26,7 +26,7 @@ namespace ugdiss { using namespace std; namespace bio=boost::iostreams; - + template<typename TKN=id_type> class mmTtrack : public Ttrack<TKN> { @@ -38,21 +38,21 @@ namespace ugdiss id_type numWords; bio::mapped_file_source file; Token const* data; // pointer to first word of first sentence - id_type const* index; /* pointer to index (change data type for corpora + id_type const* index; /* pointer to index (change data type for corpora * of more than four billion words) */ public: mmTtrack(string fname); mmTtrack(); - // return pointer to beginning of sentence - Token const* sntStart(size_t sid) const; + // return pointer to beginning of sentence + Token const* sntStart(size_t sid) const; - // return pointer to end of sentence - Token const* sntEnd(size_t sid) const; + // return pointer to end of sentence + Token const* sntEnd(size_t sid) const; // return size of corpus (in number of sentences) - size_t size() const; + size_t size() const; // return size of corpus (in number of sentences) size_t numTokens() const; @@ -60,23 +60,23 @@ namespace ugdiss // open an mmTtrack file void open(string fname); - // FUNCTIONS FOR BUILDING CORPUS TRACKS - // write a blank file header at the beginning of a new ttrack file + // FUNCTIONS FOR BUILDING CORPUS TRACKS + // write a blank file header at the beginning of a new ttrack file void write_blank_file_header(ostream& out) const; // write the sentence index /idx/ and fill the file header - void write_index_and_finalize(ostream& out, + void write_index_and_finalize(ostream& out, vector<id_type> const& idx, count_type tokenCount) const; // copy a contiguous sequence of sentences to another stream // return the number of tokens copied id_type copySentences(ostream& trg, id_type start, id_type stop) const; - + /** find the sentence id of a given token */ - id_type findSid(TKN const* t) const; + id_type findSid(TKN const* t) const; - id_type findSid(id_type tokenOffset) const; + id_type findSid(id_type tokenOffset) const; /// re-assign ids based on the id maps in /f/ void remap(string const fname, vector<id_type const*> const & f) const; @@ -88,7 +88,7 @@ namespace ugdiss void mmTtrack<TKN>:: remap(string const fname, vector<id_type const*> const & f) const - { + { bio::mapped_file myfile(fname); assert(myfile.is_open()); Moses::prime(myfile); @@ -110,7 +110,7 @@ namespace ugdiss mmTtrack<TKN>:: size() const { - return this->numSent; + return this->numSent; } template<typename TKN> @@ -118,17 +118,17 @@ namespace ugdiss mmTtrack<TKN>:: numTokens() const { - return this->numWords; + return this->numWords; } template<typename TKN> - TKN const* + TKN const* mmTtrack<TKN>:: sntStart(size_t sid) const // return pointer to beginning of sentence { if (sid >= this->numSent) { - cerr << "Fatal error: requested sentence #"<<sid<<" is beyond corpus size (" + cerr << "Fatal error: requested sentence #"<<sid<<" is beyond corpus size (" << this->numSent <<")" << endl; } assert(sid < this->numSent); @@ -136,14 +136,14 @@ namespace ugdiss } template<typename TKN> - TKN const* + TKN const* mmTtrack<TKN>:: sntEnd(size_t sid) const // return pointer to end of sentence { assert(sid < this->numSent); return data+index[sid+1]; } - + template<typename TKN> mmTtrack<TKN>:: mmTtrack() @@ -161,7 +161,7 @@ namespace ugdiss } template<typename TKN> - void + void mmTtrack<TKN>:: open(string fname) { @@ -235,7 +235,7 @@ namespace ugdiss } template<typename TKN> - id_type + id_type mmTtrack<TKN>:: copySentences(ostream& trg, id_type start, id_type stop) const { diff --git a/moses/TranslationModel/UG/mm/ug_mmbitext.cc b/moses/TranslationModel/UG/mm/ug_mmbitext.cc index 2c00665bb..34e3f1b1e 100644 --- a/moses/TranslationModel/UG/mm/ug_mmbitext.cc +++ b/moses/TranslationModel/UG/mm/ug_mmbitext.cc @@ -21,7 +21,7 @@ // ++this->in_progress; // this->lock.unlock(); // } - + // void // pstats:: // release() @@ -52,7 +52,7 @@ // mmbitext() // : ag(NULL) // { - + // } // bool @@ -78,13 +78,13 @@ // { // if (flip) { p = binread(p,trg); assert(p<x); p = binread(p,src); } // else { p = binread(p,src); assert(p<x); p = binread(p,trg); } -// if (src < start || src >= stop) +// if (src < start || src >= stop) // forbidden.set(trg); // else // { // lft = min(lft,trg); // rgt = max(rgt,trg); -// if (core_alignment) +// if (core_alignment) // { // if (flip) aln[trg].push_back(src); // else aln[src].push_back(trg); @@ -101,16 +101,16 @@ // } // cout << endl; // #endif - + // for (size_t i = lft; i <= rgt; ++i) -// if (forbidden[i]) +// if (forbidden[i]) // return false; - + // s2 = lft; for (s1 = s2; s1 && !forbidden[s1-1]; --s1); // e1 = rgt+1; for (e2 = e1; e2 < forbidden.size() && !forbidden[e2]; ++e2); - + // if (lft > rgt) return false; -// if (core_alignment) +// if (core_alignment) // { // core_alignment->clear(); // if (flip) @@ -147,11 +147,11 @@ // prep2(phrase); // } -// sptr<mmbitext::pstats> +// sptr<mmbitext::pstats> // mmbitext:: // prep2(iter const& phrase) // { -// if (!ag) +// if (!ag) // { // ag = new agenda(*this); // ag->add_workers(20); @@ -197,11 +197,11 @@ // continue; // } -// stats->lock.lock(); -// stats->good += 1; +// stats->lock.lock(); +// stats->good += 1; // stats->lock.unlock(); -// for (size_t k = 0; k < aln.size(); k += 2) +// for (size_t k = 0; k < aln.size(); k += 2) // aln[k] += s2 - s1; // Token const* o = (fwd ? ag.bitext.T2 : ag.bitext.T1).sntStart(sid); // float sample_weight = 1./((s2-s1+1)*(e2-e1+1)); @@ -215,14 +215,14 @@ // stats->add(b,sample_weight,aln); // if (i < e2) assert(b.extend(o[i].id())); // } -// if (fwd && s < s2) -// for (size_t k = 0; k < aln.size(); k += 2) +// if (fwd && s < s2) +// for (size_t k = 0; k < aln.size(); k += 2) // --aln[k]; // } // stats->release(); // } // } - + // void // mmbitext:: // pstats:: @@ -239,7 +239,7 @@ // agenda(mmbitext const& thebitext) // : shutdown(false), doomed(0), bitext(thebitext) // { - + // } // mmbitext:: @@ -259,13 +259,13 @@ // { // if (ag) delete ag; // } - + // sptr<mmbitext::pstats> // mmbitext:: // agenda:: // add_job(mmbitext::iter const& phrase, size_t const max_samples) // { -// static boost::posix_time::time_duration nodelay(0,0,0,0); +// static boost::posix_time::time_duration nodelay(0,0,0,0); // job j; // j.stats.reset(new mmbitext::pstats()); @@ -296,11 +296,11 @@ // bool // mmbitext:: // agenda:: -// get_task(uint64_t & sid, uint64_t & offset, uint64_t & len, +// get_task(uint64_t & sid, uint64_t & offset, uint64_t & len, // bool & fwd, sptr<mmbitext::pstats> & stats) // { // boost::unique_lock<boost::mutex> lock(this->lock); -// if (this->doomed || this->shutdown) +// if (this->doomed || this->shutdown) // { // if (this->doomed) --this->doomed; // return false; @@ -309,7 +309,7 @@ // // { // // cerr << "no jobs" << endl; // // this->ready.wait(lock); -// // if (this->doomed || this->shutdown) +// // if (this->doomed || this->shutdown) // // { // // if (this->doomed) --this->doomed; // // return false; @@ -346,7 +346,7 @@ // boost::lock_guard<boost::mutex> lock(stats->lock); // if (stats->raw_cnt == ctr) ++stats->raw_cnt; // size_t rnum = util::rand_excl(stats->raw_cnt - ctr++); -// // cout << stats->raw_cnt << " " << ctr-1 << " " +// // cout << stats->raw_cnt << " " << ctr-1 << " " // // << rnum << " " << max_samples - stats->good << endl; // if (rnum < max_samples - stats->good) // { @@ -364,7 +364,7 @@ // agenda:: // add_workers(int n) // { -// static boost::posix_time::time_duration nodelay(0,0,0,0); +// static boost::posix_time::time_duration nodelay(0,0,0,0); // boost::lock_guard<boost::mutex> lock(this->lock); // // house keeping: remove all workers that have finished // for (size_t i = 0; i < workers.size(); ) @@ -377,7 +377,7 @@ // } // else ++i; // } -// if (n < 0) +// if (n < 0) // { // this->doomed -= n; // } @@ -394,8 +394,8 @@ // mmbitext:: // jstats:: // jstats() -// { -// my_aln.reserve(1); +// { +// my_aln.reserve(1); // } // mmbitext:: @@ -406,8 +406,8 @@ // my_wcnt = other.wcnt(); // my_aln = other.aln(); // } - -// void + +// void // mmbitext:: // jstats:: // add(float w, vector<uchar> const& a) @@ -419,7 +419,7 @@ // { // size_t i = 0; // while (i < my_aln.size() && my_aln[i].second != a) ++i; -// if (i == my_aln.size()) +// if (i == my_aln.size()) // my_aln.push_back(pair<size_t,vector<uchar> >(1,a)); // else // my_aln[i].first++; @@ -431,7 +431,7 @@ // uint32_t // mmbitext:: // jstats:: -// rcnt() const +// rcnt() const // { return my_rcnt; } // float @@ -443,7 +443,7 @@ // vector<pair<size_t, vector<uchar> > > const& // mmbitext:: // jstats:: -// aln() const +// aln() const // { return my_aln; } // } diff --git a/moses/TranslationModel/UG/mm/ug_mmbitext.h b/moses/TranslationModel/UG/mm/ug_mmbitext.h index e7378e7f6..3837abc59 100644 --- a/moses/TranslationModel/UG/mm/ug_mmbitext.h +++ b/moses/TranslationModel/UG/mm/ug_mmbitext.h @@ -4,10 +4,10 @@ // Written by Ulrich Germann // things we can do to speed up things: -// - set up threads at startup time that force the +// - set up threads at startup time that force the // data in to memory sequentially // -// - use multiple agendas for better load balancing and to avoid +// - use multiple agendas for better load balancing and to avoid // competition for locks #include <string> @@ -46,8 +46,8 @@ namespace Moses { class jstats; // phrase pair ("joint") statistics class agenda { - boost::mutex lock; - boost::condition_variable ready; + boost::mutex lock; + boost::condition_variable ready; class job; class worker; list<job> joblist; @@ -59,9 +59,9 @@ namespace Moses { agenda(mmbitext const& bitext); ~agenda(); void add_workers(int n); - sptr<pstats> add_job(mmbitext::iter const& phrase, + sptr<pstats> add_job(mmbitext::iter const& phrase, size_t const max_samples); - bool get_task(uint64_t & sid, uint64_t & offset, uint64_t & len, + bool get_task(uint64_t & sid, uint64_t & offset, uint64_t & len, bool & fwd, sptr<mmbitext::pstats> & stats); }; @@ -72,22 +72,22 @@ namespace Moses { mmTtrack<char> Tx; // word alignments mmTtrack<Token> T1,T2; // token tracks TokenIndex V1,V2; // vocabs - mmTSA<Token> I1,I2; // suffix arrays + mmTSA<Token> I1,I2; // suffix arrays /// given the source phrase sid[start:stop] - // find the possible start (s1 .. s2) and end (e1 .. e2) + // find the possible start (s1 .. s2) and end (e1 .. e2) // points of the target phrase; if non-NULL, store word - // alignments in *core_alignment. If /flip/, source phrase is + // alignments in *core_alignment. If /flip/, source phrase is // L2. - bool + bool find_trg_phr_bounds - (size_t const sid, size_t const start, size_t const stop, - size_t & s1, size_t & s2, size_t & e1, size_t & e2, + (size_t const sid, size_t const start, size_t const stop, + size_t & s1, size_t & s2, size_t & e1, size_t & e2, vector<uchar> * core_alignment, bool const flip) const; boost::unordered_map<uint64_t,sptr<pstats> > cache1,cache2; private: - sptr<pstats> + sptr<pstats> prep2(iter const& phrase); public: mmbitext(); @@ -105,8 +105,8 @@ namespace Moses { jstats { uint32_t my_rcnt; // unweighted count - float my_wcnt; // weighted count - vector<pair<size_t, vector<uchar> > > my_aln; + float my_wcnt; // weighted count + vector<pair<size_t, vector<uchar> > > my_aln; boost::mutex lock; public: jstats(); @@ -117,7 +117,7 @@ namespace Moses { void add(float w, vector<uchar> const& a); }; - // struct + // struct // mmbitext: // phrasepair // { @@ -125,32 +125,32 @@ namespace Moses { // size_t len; // size_t cnt; // float fwd, bwd; - + // map<uint32_t,uint32_t> aln; // string toString(TokenIndex const& V) const; // bool operator<(phrase const& other) const; // bool operator>(phrase const& other) const; // phrase(pair<pair<Token const*, size_t>,jstats> const & foo); - + // }; - struct + struct mmbitext:: pstats { boost::mutex lock; // for parallel gathering of stats boost::condition_variable ready; // consumers can wait for this data structure to be ready. - size_t raw_cnt; // (approximate) raw occurrence count + size_t raw_cnt; // (approximate) raw occurrence count size_t sample_cnt; // number of instances selected during sampling size_t good; // number of selected instances with valid word alignments size_t sum_pairs; - // size_t snt_cnt; + // size_t snt_cnt; // size_t sample_snt; size_t in_progress; // keeps track of how many threads are currently working on this boost::unordered_map<uint64_t, jstats> trg; - pstats(); + pstats(); // vector<phrase> nbest; // void select_nbest(size_t const N=10); void release(); @@ -167,7 +167,7 @@ namespace Moses { public: worker(agenda& a); void operator()(); - + }; class diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.cc b/moses/TranslationModel/UG/mm/ug_phrasepair.cc index ec3423fdc..d533dafa3 100644 --- a/moses/TranslationModel/UG/mm/ug_phrasepair.cc +++ b/moses/TranslationModel/UG/mm/ug_phrasepair.cc @@ -3,10 +3,10 @@ namespace Moses { namespace bitext { -void +void fill_lr_vec2 -( LRModel::ModelType mdl, float const* const cnt, - float const total, float* v) +( LRModel::ModelType mdl, float const* const cnt, + float const total, float* v) { if (mdl == LRModel::Monotonic) { @@ -23,17 +23,17 @@ fill_lr_vec2 else if (mdl == LRModel::MSD) { float denom = log(total + 3); - v[LRModel::M] = log(cnt[LRModel::M] + 1) - denom; - v[LRModel::S] = log(cnt[LRModel::S] + 1) - denom; - v[LRModel::D] = log(cnt[LRModel::DR] + + v[LRModel::M] = log(cnt[LRModel::M] + 1) - denom; + v[LRModel::S] = log(cnt[LRModel::S] + 1) - denom; + v[LRModel::D] = log(cnt[LRModel::DR] + cnt[LRModel::DL] + 1) - denom; } else if (mdl == LRModel::MSLR) { float denom = log(total + 4); - v[LRModel::M] = log(cnt[LRModel::M] + 1) - denom; + v[LRModel::M] = log(cnt[LRModel::M] + 1) - denom; v[LRModel::S] = log(cnt[LRModel::S] + 1) - denom; - v[LRModel::DL] = log(cnt[LRModel::DL] + 1) - denom; + v[LRModel::DL] = log(cnt[LRModel::DL] + 1) - denom; v[LRModel::DR] = log(cnt[LRModel::DR] + 1) - denom; } else UTIL_THROW2("Reordering type not recognized!"); diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h index 70d4b0d82..53a9f761c 100644 --- a/moses/TranslationModel/UG/mm/ug_phrasepair.h +++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h @@ -11,7 +11,7 @@ namespace Moses namespace bitext { template<typename Token> - class + class PhrasePair { public: @@ -36,24 +36,24 @@ namespace Moses bool operator<(PhrasePair const& other) const; bool operator>(PhrasePair const& other) const; - bool operator<=(PhrasePair const& other) const; + bool operator<=(PhrasePair const& other) const; bool operator>=(PhrasePair const& other) const; void init(); - void init(uint64_t const pid1, bool is_inverse, + void init(uint64_t const pid1, bool is_inverse, Token const* x, uint32_t const len, pstats const* ps = NULL, size_t const numfeats=0); - PhrasePair const& - update(uint64_t const pid2, Token const* x, + PhrasePair const& + update(uint64_t const pid2, Token const* x, uint32_t const len, jstats const& js); void - fill_lr_vec(LRModel::Direction const& dir, - LRModel::ModelType const& mdl, + fill_lr_vec(LRModel::Direction const& dir, + LRModel::ModelType const& mdl, vector<float>& v) const; void - print(ostream& out, TokenIndex const& V1, TokenIndex const& V2, + print(ostream& out, TokenIndex const& V1, TokenIndex const& V2, LRModel const& LR) const; class SortByTargetIdSeq @@ -62,7 +62,7 @@ namespace Moses int cmp(PhrasePair const& a, PhrasePair const& b) const; bool operator()(PhrasePair const& a, PhrasePair const& b) const; }; - + class SortDescendingByJointCount { public: @@ -73,8 +73,8 @@ namespace Moses template<typename Token> void PhrasePair<Token> - ::init(uint64_t const pid1, bool is_inverse, - Token const* x, uint32_t const len, + ::init(uint64_t const pid1, bool is_inverse, + Token const* x, uint32_t const len, pstats const* ps, size_t const numfeats) { inverse = is_inverse; @@ -98,15 +98,15 @@ namespace Moses template<typename Token> PhrasePair<Token> const& PhrasePair<Token> - ::update(uint64_t const pid2, - Token const* x, uint32_t const len, jstats const& js) + ::update(uint64_t const pid2, + Token const* x, uint32_t const len, jstats const& js) { p2 = pid2; start2 = x; len2 = len; raw2 = js.cnt2(); joint = js.rcnt(); assert(js.aln().size()); - if (js.aln().size()) + if (js.aln().size()) aln = js.aln()[0].second; // float total_fwd = 0, total_bwd = 0; // for (int i = 0; i <= Moses::LRModel::NONE; i++) @@ -123,48 +123,48 @@ namespace Moses dfwd[i] = js.dcnt_fwd(po); dbwd[i] = js.dcnt_bwd(po); } - + indoc = js.indoc; return *this; } template<typename Token> - bool + bool PhrasePair<Token> - ::operator<(PhrasePair const& other) const - { - return this->score < other.score; + ::operator<(PhrasePair const& other) const + { + return this->score < other.score; } - + template<typename Token> - bool + bool PhrasePair<Token> ::operator>(PhrasePair const& other) const - { - return this->score > other.score; + { + return this->score > other.score; } template<typename Token> - bool + bool PhrasePair<Token> - ::operator<=(PhrasePair const& other) const - { - return this->score <= other.score; + ::operator<=(PhrasePair const& other) const + { + return this->score <= other.score; } - + template<typename Token> - bool + bool PhrasePair<Token> ::operator>=(PhrasePair const& other) const - { - return this->score >= other.score; + { + return this->score >= other.score; } template<typename Token> PhrasePair<Token> const& PhrasePair<Token> - ::operator+=(PhrasePair const& o) - { + ::operator+=(PhrasePair const& o) + { raw1 += o.raw1; raw2 += o.raw2; good1 += o.good1; @@ -178,16 +178,16 @@ namespace Moses template<typename Token> PhrasePair<Token> - ::PhrasePair(PhrasePair<Token> const& o) + ::PhrasePair(PhrasePair<Token> const& o) : start1(o.start1) , start2(o.start2) , len1(o.len1) , len2(o.len2) , p1(o.p1) , p2(o.p2) - , raw1(o.raw1) , raw2(o.raw2) + , raw1(o.raw1) , raw2(o.raw2) , sample1(o.sample1) , sample2(o.sample2) , good1(o.good1) , good2(o.good2) - , joint(o.joint) + , joint(o.joint) , fvals(o.fvals) - , aln(o.aln) + , aln(o.aln) , score(o.score) , inverse(o.inverse) , indoc(o.indoc) @@ -198,7 +198,7 @@ namespace Moses dbwd[i] = o.dbwd[i]; } } - + template<typename Token> int PhrasePair<Token> ::SortByTargetIdSeq @@ -207,7 +207,7 @@ namespace Moses size_t i = 0; Token const* x = a.start2; Token const* y = b.start2; - while (i < a.len2 && i < b.len2 && x->id() == y->id()) + while (i < a.len2 && i < b.len2 && x->id() == y->id()) { x = x->next(); y = y->next(); @@ -218,7 +218,7 @@ namespace Moses if (i == b.len2) return 1; return x->id() < y->id() ? -1 : 1; } - + template<typename Token> bool PhrasePair<Token> ::SortByTargetIdSeq @@ -237,16 +237,16 @@ namespace Moses } template<typename Token> - bool + bool PhrasePair<Token> ::SortDescendingByJointCount ::operator()(PhrasePair const& a, PhrasePair const& b) const { return this->cmp(a,b) < 0; } - + template<typename Token> - void + void PhrasePair<Token> ::init() { @@ -257,21 +257,21 @@ namespace Moses } - void - fill_lr_vec2(LRModel::ModelType mdl, float const* const cnt, + void + fill_lr_vec2(LRModel::ModelType mdl, float const* const cnt, float const total, float* v); - + template<typename Token> void PhrasePair<Token> - ::fill_lr_vec(LRModel::Direction const& dir, - LRModel::ModelType const& mdl, + ::fill_lr_vec(LRModel::Direction const& dir, + LRModel::ModelType const& mdl, vector<float>& v) const { // how many distinct scores do we have? size_t num_scores = (mdl == LRModel::MSLR ? 4 : mdl == LRModel::MSD ? 3 : 2); size_t offset; - if (dir == LRModel::Bidirectional) + if (dir == LRModel::Bidirectional) { offset = num_scores; num_scores *= 2; @@ -281,32 +281,32 @@ namespace Moses v.resize(num_scores); // determine the denominator - float total = 0; - for (size_t i = 0; i <= LRModel::NONE; ++i) + float total = 0; + for (size_t i = 0; i <= LRModel::NONE; ++i) total += dfwd[i]; if (dir != LRModel::Forward) // i.e., Backward or Bidirectional fill_lr_vec2(mdl, dbwd, total, &v[0]); if (dir != LRModel::Backward) // i.e., Forward or Bidirectional fill_lr_vec2(mdl, dfwd, total, &v[offset]); - } - + } + template<typename Token> void PhrasePair<Token> - ::print(ostream& out, TokenIndex const& V1, TokenIndex const& V2, + ::print(ostream& out, TokenIndex const& V1, TokenIndex const& V2, LRModel const& LR) const { - out << toString (V1, this->start1, this->len1) << " ::: " - << toString (V2, this->start2, this->len2) << " " + out << toString (V1, this->start1, this->len1) << " ::: " + << toString (V2, this->start2, this->len2) << " " << this->joint << " ["; for (size_t i = 0; i < this->indoc.size(); ++i) - { - if (i) out << " "; - out << this->indoc[i]; + { + if (i) out << " "; + out << this->indoc[i]; } - out << "] ["; + out << "] ["; vector<float> lrscores; this->fill_lr_vec(LR.GetDirection(), LR.GetModelType(), lrscores); for (size_t i = 0; i < lrscores.size(); ++i) @@ -322,7 +322,7 @@ namespace Moses if (i) *log << " "; *log << p.dfwd[i]; } - *log << "] ["; + *log << "] ["; for (int i = 0; i <= Moses::LRModel::NONE; i++) { // PhraseOrientation po = static_cast<PhraseOrientation>(i); diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc index fea57e719..95b93ec7b 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc @@ -1,7 +1,7 @@ #include "ug_sampling_bias.h" #include <iostream> #include <boost/foreach.hpp> - + #ifdef HAVE_CURLPP #include <curlpp/Options.hpp> #include <curlpp/cURLpp.hpp> @@ -15,11 +15,11 @@ namespace Moses using ugdiss::id_type; #ifdef HAVE_CURLPP - std::string + std::string query_bias_server(std::string const& url, std::string const& text) { // communicate with the bias server; resuts will be in ... - std::ostringstream os; + std::ostringstream os; curlpp::Easy myRequest; std::string query = url+curlpp::escape(text); myRequest.setOpt(new curlpp::options::Url(query)); @@ -32,7 +32,7 @@ namespace Moses DocumentBias ::DocumentBias - ( std::vector<id_type> const& sid2doc, + ( std::vector<id_type> const& sid2doc, std::map<std::string,id_type> const& docname2docid, std::string const& server_url, std::string const& text, std::ostream* log) @@ -45,15 +45,15 @@ namespace Moses #endif } - void + void DocumentBias ::init_from_json ( std::string const& json, std::map<std::string,id_type> const& docname2docid, std::ostream* log) - { // poor man's special purpose json parser for responses from the + { // poor man's special purpose json parser for responses from the // MMT bias server - - std::string d; float total = 0; std::map<std::string,float> bias; + + std::string d; float total = 0; std::map<std::string,float> bias; size_t i = 0; while (i < json.size() && json[i] != '"') ++i; while (++i < json.size()) { @@ -61,34 +61,34 @@ namespace Moses if (i >= json.size()) break; float& f = bias[json.substr(k,i-k)]; while (++i < json.size() && json[i] != ':'); - k = ++i; + k = ++i; while (++i < json.size() && json[i] != ',' && json[i] != '}'); total += (f = atof(json.substr(k, i-k).c_str())); k = ++i; while (i < json.size() && json[i] != '"') ++i; } - + typedef std::pair<std::string const,float> item; - if (total) { BOOST_FOREACH(item& x, bias) { x.second /= total; } } + if (total) { BOOST_FOREACH(item& x, bias) { x.second /= total; } } if (log) { - BOOST_FOREACH(item& x, bias) + BOOST_FOREACH(item& x, bias) { std::map<std::string,id_type>::const_iterator m; m = docname2docid.find(x.first); int docid = m != docname2docid.end() ? m->second : -1; - *log << "CONTEXT SERVER RESPONSE " + *log << "CONTEXT SERVER RESPONSE " << "[" << docid << "] " - << x.first << " " << x.second << std::endl; + << x.first << " " << x.second << std::endl; } } init(bias, docname2docid); - + // using xmlrpc_parse_json didn't always work (parser errors) // xmlrpc_value* b = xmlrpc_parse_json(env ,buf.str().c_str()); - // std::cerr << "|" << buf.str() << "|" << std::endl; - // // if (b == NULL) std::cerr << "OOpS" << std::endl; + // std::cerr << "|" << buf.str() << "|" << std::endl; + // // if (b == NULL) std::cerr << "OOpS" << std::endl; // xmlrpc_c::value_struct v(b); // = *b; - // std::map<std::string, xmlrpc_c::value> const + // std::map<std::string, xmlrpc_c::value> const // bmap = static_cast<map<std::string, xmlrpc_c::value> >(v); // std::map<std::string, float> bias; // typedef std::map<std::string, xmlrpc_c::value>::value_type item; @@ -99,11 +99,11 @@ namespace Moses // } // typedef std::map<std::string, float>::value_type fitem; // BOOST_FOREACH(fitem const& x, bias) - // std::cerr << x.first << " " << x.second/total << std::endl; + // std::cerr << x.first << " " << x.second/total << std::endl; // // delete b; } - void + void DocumentBias ::init(std::map<std::string,float> const& biasmap, std::map<std::string,id_type> const& docname2docid) @@ -119,60 +119,60 @@ namespace Moses BOOST_FOREACH(doc_record const& d, docname2docid) std::cerr << "BIAS " << d.first << " " << m_bias[d.second] << std::endl; } - - id_type + + id_type DocumentBias ::GetClass(id_type const idx) const - { - return m_sid2docid.at(idx); + { + return m_sid2docid.at(idx); } - - float + + float DocumentBias - ::operator[](id_type const idx) const - { - UTIL_THROW_IF2(idx >= m_sid2docid.size(), + ::operator[](id_type const idx) const + { + UTIL_THROW_IF2(idx >= m_sid2docid.size(), "Out of bounds: " << idx << "/" << m_sid2docid.size()); return m_bias[m_sid2docid[idx]]; } - size_t + size_t DocumentBias - ::size() const + ::size() const { return m_sid2docid.size(); } SentenceBias - ::SentenceBias(std::vector<float> const& bias) + ::SentenceBias(std::vector<float> const& bias) : m_bias(bias) { } SentenceBias ::SentenceBias(size_t const s) : m_bias(s) { } - id_type + id_type SentenceBias ::GetClass(id_type idx) const { return idx; } - float& + float& SentenceBias - ::operator[](id_type const idx) + ::operator[](id_type const idx) { UTIL_THROW_IF2(idx >= m_bias.size(), "Out of bounds"); return m_bias[idx]; } - float + float SentenceBias - ::operator[](id_type const idx) const - { + ::operator[](id_type const idx) const + { UTIL_THROW_IF2(idx >= m_bias.size(), "Out of bounds"); return m_bias[idx]; } - - size_t + + size_t SentenceBias ::size() const { return m_bias.size(); } - + } } diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.h b/moses/TranslationModel/UG/mm/ug_sampling_bias.h index faed69e63..f540ddc76 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.h +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.h @@ -15,54 +15,54 @@ namespace Moses std::string query_bias_server(std::string const& url, std::string const& text); - class SamplingBias + class SamplingBias { public: int loglevel; std::ostream* log; - virtual float + virtual float operator[](id_type const ID) const = 0; // returns (unnormalized bias) for the class of item ID - virtual size_t size() const = 0; + virtual size_t size() const = 0; // number of classes - - virtual id_type + + virtual id_type GetClass(id_type const ID) const = 0; // returns class of item ID }; - + class DocumentBias : public SamplingBias { std::vector<id_type> const& m_sid2docid; std::vector<float> m_bias; - + public: - + DocumentBias(std::vector<id_type> const& sid2doc, std::map<std::string,id_type> const& docname2docid, std::string const& server_url, std::string const& text, std::ostream* log); - void - init_from_json - ( std::string const& json, + void + init_from_json + ( std::string const& json, std::map<std::string,id_type> const& docname2docid, std::ostream* log ); - - void + + void init ( std::map<std::string,float> const& biasmap, std::map<std::string,id_type> const& docname2docid); - - id_type + + id_type GetClass(id_type const idx) const; - float + float operator[](id_type const idx) const; - size_t + size_t size() const; }; @@ -76,10 +76,10 @@ namespace Moses id_type GetClass(id_type idx) const; - float& operator[](id_type const idx); - float operator[](id_type const idx) const; + float& operator[](id_type const idx); + float operator[](id_type const idx) const; size_t size() const; - + }; } diff --git a/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h b/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h index 034a74bd9..3af929644 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h @@ -1,13 +1,13 @@ // -*- c++ -*- // (c) 2007-2010 Ulrich Germann // implementation of stuff related to ArrayEntries -// this file should only be included via ug_tsa_base.h, +// this file should only be included via ug_tsa_base.h, // never by itself #ifndef __ug_tsa_array_entry_h #define __ug_tsa_array_entry_h #include "ug_ttrack_position.h" -namespace ugdiss +namespace ugdiss { namespace tsa { @@ -20,7 +20,7 @@ namespace ugdiss ArrayEntry(); ArrayEntry(char const* p); - + template<typename TSA_TYPE> ArrayEntry(TSA_TYPE const* S, char const* p); @@ -34,7 +34,7 @@ namespace ugdiss } // template<typename TSA_TYPE> - // class SamplingArrayEntryIterator + // class SamplingArrayEntryIterator // : public tsa::ArrayEntry // { // size_t const N; // (approximate) total number of occurrences @@ -46,7 +46,7 @@ namespace ugdiss // public: // SamplingArrayEntryIterator(TSA_TYPE::tree_iterator const& m, size_t const s); // bool step(); // returns false when at end of range - // bool done(); // + // bool done(); // // }; // template<typename TSA_TYPE> @@ -60,7 +60,7 @@ namespace ugdiss // , root(m.root) // , stop(m.upper_bound(-1)) // { } - + // template<typename TSA_TYPE> // bool // SamplingArrayEntryIterator:: diff --git a/moses/TranslationModel/UG/mm/ug_tsa_base.h b/moses/TranslationModel/UG/mm/ug_tsa_base.h index 83593c79c..8a4117910 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_base.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_base.h @@ -26,7 +26,7 @@ namespace ugdiss namespace bio=boost::iostreams; template<typename TKN> - TKN const* + TKN const* next(TKN const* x) { return static_cast<TKN const*>(x ? x->next() : NULL); @@ -42,20 +42,20 @@ namespace ugdiss * ordering of sequences. Both are decleared/defined in * ug_corpus_token.{h|cc} */ - template<typename TKN> - class TSA + template<typename TKN> + class TSA { public: virtual ~TSA() {}; - typedef TSA_tree_iterator<TKN> tree_iterator; + typedef TSA_tree_iterator<TKN> tree_iterator; // allows iteration over the array as if it were a trie - typedef tsa::ArrayEntry ArrayEntry; + typedef tsa::ArrayEntry ArrayEntry; /* an entry in the array, for iteration over all occurrences of a * particular sequence */ - // typedef boost::dynamic_bitset<uint64_t> bitset; + // typedef boost::dynamic_bitset<uint64_t> bitset; typedef boost::shared_ptr<bitvector> bitset_pointer; typedef TKN Token; - typedef BitSetCache<TSA<TKN> > BSC_t; + typedef BitSetCache<TSA<TKN> > BSC_t; /* to allow caching of bit vectors that are expensive to create on * the fly */ @@ -67,7 +67,7 @@ namespace ugdiss char const* endArray; // ... and end ... // of memory block storing the actual TSA - size_t corpusSize; + size_t corpusSize; /** size of the corpus (in number of sentences) of the corpus * underlying the sequence array. * @@ -76,37 +76,37 @@ namespace ugdiss * suffix array is based on a subset * of the sentences of /corpus/. */ - - id_type numTokens; + + id_type numTokens; /** size of the corpus (in number of tokens) of the corpus underlying the - * sequence array. + * sequence array. * * ATTENTION: This number may differ from corpus->numTokens(), namely when - * the suffix array is based on a subset of the sentences of + * the suffix array is based on a subset of the sentences of * /corpus/. */ - id_type indexSize; - // (number of entries +1) in the index of root-level nodes + id_type indexSize; + // (number of entries +1) in the index of root-level nodes size_t BitSetCachingThreshold; - + //////////////////////////////////////////////////////////////// // private member functions: - /** @return an index position approximately /fraction/ between + /** @return an index position approximately /fraction/ between * /startRange/ and /endRange/. - */ - virtual - char const* - index_jump(char const* startRange, - char const* stopRange, + */ + virtual + char const* + index_jump(char const* startRange, + char const* stopRange, float fraction) const = 0; - - /** return the index position of the first item that + + /** return the index position of the first item that * is equal to or includes [refStart,refStart+refLen) as a prefix */ - char const* + char const* find_start(char const* lo, char const* const upX, TKN const* const refStart, int refLen, size_t d) const; @@ -114,19 +114,19 @@ namespace ugdiss /** return the index position of the first item that is greater than * [refStart,refStart+refLen) and does not include it as a prefix */ - char const* + char const* find_end(char const* lo, char const* const upX, TKN const* const refStart, int refLen, size_t d) const; - + /** return the index position of the first item that is longer than * [refStart,refStart+refLen) and includes it as a prefix */ - char const* + char const* find_longer(char const* lo, char const* const upX, TKN const* const refStart, int refLen, size_t d) const; - + /** Returns a char const* pointing to the position in the data block * where the first item starting with token /id/ is located. */ @@ -140,37 +140,37 @@ namespace ugdiss public: boost::shared_ptr<BSC_t> bsc; - + char const* arrayStart() const { return startArray; } char const* arrayEnd() const { return endArray; } - /** @return a pointer to the beginning of the index entry range covering + /** @return a pointer to the beginning of the index entry range covering * [keyStart,keyStop) */ - char const* + char const* lower_bound(typename vector<TKN>::const_iterator const& keyStart, typename vector<TKN>::const_iterator const& keyStop) const; - char const* + char const* lower_bound(TKN const* keyStart, TKN const* keyStop) const; - char const* + char const* lower_bound(TKN const* keyStart, int keyLen) const; - /** @return a pointer to the end point of the index entry range covering + /** @return a pointer to the end point of the index entry range covering * [keyStart,keyStop) */ - char const* - upper_bound(typename vector<TKN>::const_iterator const& keyStart, + char const* + upper_bound(typename vector<TKN>::const_iterator const& keyStart, typename vector<TKN>::const_iterator const& keyStop) const; - char const* + char const* upper_bound(TKN const* keyStart, int keyLength) const; /** dump all suffixes in order to /out/ */ void dump(ostream& out, TokenIndex const& T) const; - - /** fill the dynamic bit set with true for all sentences that contain + + /** fill the dynamic bit set with true for all sentences that contain * /phrase/. * @return the raw number of occurrences. */ @@ -188,70 +188,70 @@ namespace ugdiss setTokenBits(char const* startRange, char const* endRange, size_t len, bitvector& bs) const; - /** read the sentence ID into /sid/ - * @return position of associated offset. + /** read the sentence ID into /sid/ + * @return position of associated offset. * * The function provides an abstraction that uses the right * interpretation of the position based on the subclass * (memory-mapped or in-memory). */ virtual - char const* + char const* readSid(char const* p, char const* q, id_type& sid) const = 0; virtual - char const* + char const* readSid(char const* p, char const* q, ::uint64_t& sid) const = 0; - /** read the offset part of the index entry into /offset/ - * @return position of the next entry in the index. + /** read the offset part of the index entry into /offset/ + * @return position of the next entry in the index. * * The function provides an abstraction that uses the right * interpretation of the position based on the subclass * (memory-mapped or in-memory). */ virtual - char const* + char const* readOffset(char const* p, char const* q, uint16_t& offset) const = 0; virtual - char const* + char const* readOffset(char const* p, char const* q, ::uint64_t& offset) const = 0; - /** @return sentence count + /** @return sentence count */ count_type - sntCnt(char const* p, char const* const q) const; - + sntCnt(char const* p, char const* const q) const; + count_type - rawCnt2(TKN const* keyStart, size_t keyLen) const; + rawCnt2(TKN const* keyStart, size_t keyLen) const; /** @return raw occurrence count - * + * * depending on the subclass, this is constant time (imTSA) or * linear in in the number of occurrences (mmTSA). */ virtual count_type - rawCnt(char const* p, char const* const q) const = 0; + rawCnt(char const* p, char const* const q) const = 0; - /** get both sentence and word counts. + /** get both sentence and word counts. * * Avoids having to go over the byte range representing the range * of suffixes in question twice when dealing with memory-mapped * suffix arrays. - */ + */ virtual - void - getCounts(char const* p, char const* const q, - count_type& sids, count_type& raw) const = 0; + void + getCounts(char const* p, char const* const q, + count_type& sids, count_type& raw) const = 0; - string - suffixAt(char const* p, TokenIndex const* V=NULL, size_t maxlen=0) + string + suffixAt(char const* p, TokenIndex const* V=NULL, size_t maxlen=0) const; - string - suffixAt(ArrayEntry const& I, TokenIndex const* V=NULL, size_t maxlen=0) + string + suffixAt(ArrayEntry const& I, TokenIndex const* V=NULL, size_t maxlen=0) const; tsa::ArrayEntry& readEntry(char const* p, tsa::ArrayEntry& I) const; @@ -260,36 +260,36 @@ namespace ugdiss char const* dataEnd() const; bool sanityCheck1() const; - - /** Return an ID that represents a given phrase; + + /** Return an ID that represents a given phrase; This should NEVER be 0! - Structure of a phrase ID: + Structure of a phrase ID: leftmost 32 bits: sentence ID in the corpus next 16 bits: offset from the start of the sentence next 16 bits: length of the phrase */ - ::uint64_t + ::uint64_t getSequenceId(typename vector<TKN>::const_iterator const& pstart, typename vector<TKN>::const_iterator const& pstop) const; - - ::uint64_t + + ::uint64_t getSequenceId(TKN const* t, ushort plen) const; - + /** Return the phrase represented by phrase ID pid_ */ string getSequence(::uint64_t pid, TokenIndex const& V) const; - + /** Return the phrase represented by phrase ID pid_ */ vector<TKN> getSequence(::uint64_t pid) const; - TKN const* + TKN const* getSequenceStart(::uint64_t) const; ushort getSequenceLength(::uint64_t) const; - size_t + size_t getCorpusSize() const; Ttrack<TKN> const* @@ -297,13 +297,13 @@ namespace ugdiss bitset_pointer getBitSet(TKN const* startKey, size_t keyLen) const; - + boost::shared_ptr<bitvector> - findTree(TKN const* treeStart, TKN const* treeEnd, + findTree(TKN const* treeStart, TKN const* treeEnd, bitvector const* filter) const; - + size_t markOccurrences(char const* lo, char const* up, size_t len, - bitvector& bitset, + bitvector& bitset, bool markOnlyStartPosition) const; bool @@ -311,13 +311,13 @@ namespace ugdiss vector<tree_iterator>& dest) const; double aveIndexEntrySize() const - { - return (endArray-startArray)/double(numTokens); + { + return (endArray-startArray)/double(numTokens); } public: - // virtual - sptr<TSA_tree_iterator<TKN> > + // virtual + sptr<TSA_tree_iterator<TKN> > find(TKN const* start, size_t len) const { typedef TSA_tree_iterator<TKN> iter; @@ -333,7 +333,7 @@ namespace ugdiss // ====================================================================== // template<typename TOKEN> - // sptr<TSA_tree_iterator<TOKEN> > + // sptr<TSA_tree_iterator<TOKEN> > // TSA<TOKEN>:: // find(TOKEN const* start, size_t len) const // { @@ -354,7 +354,7 @@ namespace ugdiss * @return number of total occurrences of the phrase in the corpus */ template<typename TKN> - count_type + count_type TSA<TKN>:: fillBitSet(vector<TKN> const& key, bitvector& bitset) const @@ -362,7 +362,7 @@ namespace ugdiss if (!key.size()) return 0; return fillBitset(&(key[0]),key.size(),bitset); } - + // --------------------------------------------------------------------------- /** fill the dynamic bitset with information as to which sentences @@ -370,7 +370,7 @@ namespace ugdiss * @return number of total occurrences of the phrase in the corpus */ template<typename TKN> - count_type + count_type TSA<TKN>:: fillBitSet(TKN const* key, size_t keyLen, bitvector& bitset) const @@ -385,7 +385,7 @@ namespace ugdiss // --------------------------------------------------------------------------- template<typename TKN> - count_type + count_type TSA<TKN>:: setBits(char const* startRange, char const* endRange, bitvector& bs) const @@ -452,7 +452,7 @@ namespace ugdiss * of the token range matching [startKey,endKey) */ template<typename TKN> - char const* + char const* TSA<TKN>:: find_start(char const* lo, char const* const upX, TKN const* const refStart, int refLen, @@ -485,12 +485,12 @@ namespace ugdiss * of the token range matching [startKey,endKey) */ template<typename TKN> - char const* + char const* TSA<TKN>:: find_end(char const* lo, char const* const upX, TKN const* const refStart, int refLen, size_t d) const - + { char const* up = upX; if (lo >= up) return NULL; @@ -520,7 +520,7 @@ namespace ugdiss * but continues on */ template<typename TKN> - char const* + char const* TSA<TKN>:: find_longer(char const* lo, char const* const upX, TKN const* const refStart, int refLen, @@ -553,7 +553,7 @@ namespace ugdiss * given search phrase */ template<typename TKN> - char const* + char const* TSA<TKN>:: lower_bound(typename vector<TKN>::const_iterator const& keyStart, typename vector<TKN>::const_iterator const& keyStop) const @@ -570,7 +570,7 @@ namespace ugdiss * given search phrase */ template<typename TKN> - char const* + char const* TSA<TKN>:: lower_bound(TKN const* const keyStart, TKN const* const keyStop) const @@ -579,7 +579,7 @@ namespace ugdiss } template<typename TKN> - char const* + char const* TSA<TKN>:: lower_bound(TKN const* const keyStart, int keyLen) const { @@ -595,7 +595,7 @@ namespace ugdiss * given search phrase (i.e., points just beyond the range) */ template<typename TKN> - char const* + char const* TSA<TKN>:: upper_bound(typename vector<TKN>::const_iterator const& keyStart, typename vector<TKN>::const_iterator const& keyStop) const @@ -612,7 +612,7 @@ namespace ugdiss * given search phrase (i.e., points just beyond the range) */ template<typename TKN> - char const* + char const* TSA<TKN>:: upper_bound(TKN const* keyStart, int keyLength) const { @@ -645,7 +645,7 @@ namespace ugdiss { return getSequenceId(&(*pstart),pstop-pstart); } - + //--------------------------------------------------------------------------- template<typename TKN> @@ -667,14 +667,14 @@ namespace ugdiss //--------------------------------------------------------------------------- - template<typename TKN> + template<typename TKN> vector<TKN> TSA<TKN>:: getSequence(::uint64_t pid) const { size_t plen = pid % 65536; size_t offset = (pid >> 16) % 65536; - TKN const* w = corpus->sntStart(pid >> 32)+offset; + TKN const* w = corpus->sntStart(pid >> 32)+offset; vector<TKN> ret(plen); for (size_t i = 0; i < plen; i++, w = w->next()) { @@ -684,7 +684,7 @@ namespace ugdiss return ret; } - template<typename TKN> + template<typename TKN> string TSA<TKN>:: getSequence(::uint64_t pid, TokenIndex const& V) const @@ -698,21 +698,21 @@ namespace ugdiss return buf.str(); } - + //--------------------------------------------------------------------------- - template<typename TKN> + template<typename TKN> TKN const* TSA<TKN>:: getSequenceStart(::uint64_t pid) const { size_t offset = (pid >> 16) % 65536; - return corpus->sntStart(pid >> 32)+offset; + return corpus->sntStart(pid >> 32)+offset; } - + //--------------------------------------------------------------------------- - template<typename TKN> + template<typename TKN> ushort TSA<TKN>:: getSequenceLength(::uint64_t pid) const @@ -729,7 +729,7 @@ namespace ugdiss { return corpusSize; } - + //--------------------------------------------------------------------------- template<typename TKN> @@ -756,7 +756,7 @@ namespace ugdiss }; //--------------------------------------------------------------------------- - + /// find all instances of the tree described by [treeStart, treeEnd) template<typename TKN> typename TSA<TKN>::bitset_pointer @@ -764,7 +764,7 @@ namespace ugdiss getBitSet(TKN const* startKey, size_t keyLen) const { bitset_pointer ret; - if (bsc != NULL) + if (bsc != NULL) ret = bsc->get(startKey,keyLen); else { @@ -773,7 +773,7 @@ namespace ugdiss } return ret; } - + //--------------------------------------------------------------------------- template<typename TKN> @@ -809,12 +809,12 @@ namespace ugdiss vector<tree_iterator>& dest) const { dest.assign(terminals.count(),tree_iterator(this)); - for (size_t i = terminals.find_first(), k = 0; - i < terminals.size(); + for (size_t i = terminals.find_first(), k = 0; + i < terminals.size(); i = terminals.find_next(i),++k) { for (TKN const* x = base+i; x && x->id(); x = x->next()) - if (!dest[k].extend(x->id())) + if (!dest[k].extend(x->id())) return false; } typename tree_iterator::SortByApproximateCount sorter; diff --git a/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h b/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h index 3111f1c1d..d13449e36 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h @@ -20,7 +20,7 @@ namespace ugdiss { using namespace std; template<typename TSA> - class + class BitSetCache { public: @@ -33,15 +33,15 @@ namespace ugdiss myMap cached1,cached2; int threshold; public: - + BitSetCache() : tsa(NULL), threshold(0) {}; - BitSetCache(TSA const* t, size_t th=4194304) + BitSetCache(TSA const* t, size_t th=4194304) { init(t,th); }; - void - init(TSA const* t, size_t th=4194304) + void + init(TSA const* t, size_t th=4194304) { tsa = t; threshold = th; @@ -84,7 +84,7 @@ namespace ugdiss if (up-lo > threshold) { pair<char const*,ushort> k(lo,keyLen); - // cout << "bla " << keyStart->id() << " " + // cout << "bla " << keyStart->id() << " " // << cached2.size() << " " << up-lo << " " << k.second << endl; myMapIter m = cached2.find(k); if (m != cached2.end()) diff --git a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h index 508f09304..053ff2445 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h @@ -23,24 +23,24 @@ namespace ugdiss template<typename T> void display(T const* x, string label) { - cout << label << ":"; - for (;x;x=next(x)) cout << " " << x->lemma; - cout << endl; + cout << label << ":"; + for (;x;x=next(x)) cout << " " << x->lemma; + cout << endl; } #endif template<typename T> class TSA; // CLASS DEFINITION - // The TSA_tree_iterator allows traversal of a Token Sequence Array + // The TSA_tree_iterator allows traversal of a Token Sequence Array // as if it was a trie. // // down(): go to first child - // over(): go to next sibling + // over(): go to next sibling // up(): go to parent // extend(id): go to a specific child node // all four functions return true if successful, false otherwise - // lower_bound() and upper_bound() give the range of entries in the + // lower_bound() and upper_bound() give the range of entries in the // array covered by the "virtual trie node". template<typename TKN> class @@ -49,7 +49,7 @@ namespace ugdiss protected: vector<char const*> lower; vector<char const*> upper; - + // for debugging ... void showBounds(ostream& out) const; public: @@ -57,7 +57,7 @@ namespace ugdiss virtual ~TSA_tree_iterator() {}; - TSA<Token> const* root; + TSA<Token> const* root; // TO BE DONE: make the pointer private and add a const function // to return the pointer @@ -66,16 +66,16 @@ namespace ugdiss TSA_tree_iterator(TSA<Token> const* s, TSA_tree_iterator<Token> const& other); TSA_tree_iterator(TSA<Token> const* r, id_type const* s, size_t const len); // TSA_tree_iterator(TSA<Token> const* s, Token const& t); - TSA_tree_iterator(TSA<Token> const* s, - Token const* kstart, - size_t const len, + TSA_tree_iterator(TSA<Token> const* s, + Token const* kstart, + size_t const len, bool full_match_only=true); - TSA_tree_iterator(TSA<Token> const* s, - Token const* kstart, - Token const* kend, + TSA_tree_iterator(TSA<Token> const* s, + Token const* kstart, + Token const* kend, bool full_match_only=true); - TSA_tree_iterator(TSA<Token> const* s, - TokenIndex const& V, + TSA_tree_iterator(TSA<Token> const* s, + TokenIndex const& V, string const& key); char const* lower_bound(int p) const; @@ -104,49 +104,49 @@ namespace ugdiss bool match(id_type sid) const; // fillBitSet: deprecated; use markSentences() instead - count_type + count_type fillBitSet(boost::dynamic_bitset<typename ::uint64_t>& bitset) const; - - count_type + + count_type markEndOfSequence(Token const* start, Token const* stop, boost::dynamic_bitset<typename ::uint64_t>& dest) const; - count_type + count_type markSequence(Token const* start, Token const* stop, bitvector& dest) const; - - count_type + + count_type markSentences(boost::dynamic_bitset<typename ::uint64_t>& bitset) const; - - count_type + + count_type markOccurrences(boost::dynamic_bitset<typename ::uint64_t>& bitset, bool markOnlyStartPosition=false) const; - - count_type + + count_type markOccurrences(vector<ushort>& dest) const; - - ::uint64_t + + ::uint64_t getSequenceId() const; - - // equivalent but more efficient than + + // equivalent but more efficient than // bitvector tmp; markSentences(tmp); foo &= tmp; bitvector& filterSentences(bitvector& foo) const; - + /// a special auxiliary function for finding trees - void - tfAndRoot(bitvector const& ref, // reference root positions + void + tfAndRoot(bitvector const& ref, // reference root positions bitvector const& snt, // relevant sentences bitvector& dest) const; - + size_t arrayByteSpanSize(int p = -1) const - { + { if (lower.size()==0) return 0; // or endArray-startArray??? if (p < 0) p = lower.size()+p; assert(p >=0 && p < int(lower.size())); return lower.size() ? upper[p]-lower[p] : 0; } - + struct SortByApproximateCount { - bool operator()(TSA_tree_iterator const& a, + bool operator()(TSA_tree_iterator const& a, TSA_tree_iterator const& b) const { if (a.size()==0) return b.size() ? true : false; @@ -175,7 +175,7 @@ namespace ugdiss size_t grow(Token const* snt, bitvector const& cov) { - size_t x = cov.find_first(); + size_t x = cov.find_first(); while (x < cov.size() && extend(snt[x])) x = cov.find_next(x); return this->size(); @@ -183,7 +183,7 @@ namespace ugdiss sptr<vector<typename ttrack::Position> > randomSample(int level, size_t N) const; - + }; //--------------------------------------------------------------------------- @@ -205,7 +205,7 @@ namespace ugdiss assert(root->corpus->getToken(A)); assert(lo < root->getUpperBound(root->corpus->getToken(A)->id())); lower.push_back(lo); - Token const* foo = this->getToken(0); + Token const* foo = this->getToken(0); upper.push_back(root->upper_bound(foo,lower.size())); return lower.size(); } @@ -217,7 +217,7 @@ namespace ugdiss Token const* z = next(a); for (size_t i = 1; i < size(); ++i) z = next(z); if (z < root->corpus->sntStart(A.sid) || z >= root->corpus->sntEnd(A.sid)) - { + { char const* up = upper.back(); lo = root->find_longer(lo,up,a,lower.size(),0); if (!lo) return false; @@ -244,7 +244,7 @@ namespace ugdiss TSA_tree_iterator<Token>:: over() { - if (lower.size() == 0) + if (lower.size() == 0) return false; if (lower.size() == 1) { @@ -254,7 +254,7 @@ namespace ugdiss if (upper[0] < hi) { lower[0] = upper[0]; - Token const* foo = this->getToken(0); + Token const* foo = this->getToken(0); upper.back() = root->upper_bound(foo,lower.size()); } else @@ -264,11 +264,11 @@ namespace ugdiss char const* lo = root->getLowerBound(wid); if (lo == root->endArray) return false; char const* hi = root->getUpperBound(wid); - if (!hi) return false; + if (!hi) return false; if (lo == hi) continue; assert(lo); lower[0] = lo; - Token const* foo = this->getToken(0); + Token const* foo = this->getToken(0); upper.back() = root->upper_bound(foo,lower.size()); break; } @@ -293,7 +293,7 @@ namespace ugdiss // display(root->corpus->getToken(U),"L2"); - Token const* foo = this->getToken(0); + Token const* foo = this->getToken(0); // display(foo,"F!"); upper.back() = root->upper_bound(foo,lower.size()); return true; @@ -326,17 +326,17 @@ namespace ugdiss template<typename Token> TSA_tree_iterator<Token>:: TSA_tree_iterator(TSA<Token> const* s) - : root(s) + : root(s) {}; template<typename Token> TSA_tree_iterator<Token>:: TSA_tree_iterator(TSA<Token> const* s, TSA_tree_iterator<Token> const& other) - : root(s) + : root(s) { Token const* x = other.getToken(0); for (size_t i = 0; i < other.size() && this->extend(x->id()); ++i) - x = x->next(); + x = x->next(); }; @@ -345,9 +345,9 @@ namespace ugdiss TSA_tree_iterator<Token>:: TSA_tree_iterator (TSA<Token> const* r, - id_type const* s, + id_type const* s, size_t const len) - : root(r) + : root(r) { for (id_type const* e = s + len; s < e && extend(*s); ++s); }; @@ -357,16 +357,16 @@ namespace ugdiss #if 1 template<typename Token> TSA_tree_iterator<Token>:: - TSA_tree_iterator(TSA<Token> const* s, - TokenIndex const& V, + TSA_tree_iterator(TSA<Token> const* s, + TokenIndex const& V, string const& key) : root(s) { istringstream buf(key); string w; while (buf >> w) { - if (this->extend(V[w])) - continue; + if (this->extend(V[w])) + continue; else { lower.clear(); @@ -377,7 +377,7 @@ namespace ugdiss }; #endif -#if 0 +#if 0 // --------------------------------------------------------------------------- template<typename Token> @@ -394,7 +394,7 @@ namespace ugdiss template<typename Token> TSA_tree_iterator<Token>:: TSA_tree_iterator(TSA<Token> const* s, Token const& t) - : root(s) + : root(s) { if (!root) return; char const* up = root->getUpperBound(t.id()); @@ -409,33 +409,33 @@ namespace ugdiss template<typename Token> TSA_tree_iterator<Token>:: - TSA_tree_iterator(TSA<Token> const* s, Token const* kstart, + TSA_tree_iterator(TSA<Token> const* s, Token const* kstart, size_t const len, bool full_match_only) - : root(s) + : root(s) { if (!root) return; size_t i = 0; for (; i < len && kstart && extend(*kstart); ++i) kstart = kstart->next(); - if (full_match_only && i != len) + if (full_match_only && i != len) { lower.clear(); upper.clear(); } }; - // DEPRECATED: DO NOT USE. Use the one that takes the length + // DEPRECATED: DO NOT USE. Use the one that takes the length // instead of kend. template<typename Token> TSA_tree_iterator<Token>:: - TSA_tree_iterator(TSA<Token> const* s, Token const* kstart, + TSA_tree_iterator(TSA<Token> const* s, Token const* kstart, Token const* kend, bool full_match_only) - : root(s) + : root(s) { - for (;kstart != kend; kstart = kstart->next()) - if (!extend(*kstart)) + for (;kstart != kend; kstart = kstart->next()) + if (!extend(*kstart)) break; - if (full_match_only && kstart != kend) + if (full_match_only && kstart != kend) { lower.clear(); upper.clear(); @@ -445,7 +445,7 @@ namespace ugdiss // --------------------------------------------------------------------------- // EXTEND // --------------------------------------------------------------------------- - + template<typename Token> bool TSA_tree_iterator<Token>:: @@ -496,9 +496,9 @@ namespace ugdiss template<typename Token> size_t TSA_tree_iterator<Token>:: - size() const - { - return lower.size(); + size() const + { + return lower.size(); } // --------------------------------------------------------------------------- @@ -506,8 +506,8 @@ namespace ugdiss template<typename Token> id_type TSA_tree_iterator<Token>:: - getSid() const - { + getSid() const + { char const* p = (lower.size() ? lower.back() : root->startArray); char const* q = (upper.size() ? upper.back() : root->endArray); id_type sid; @@ -520,8 +520,8 @@ namespace ugdiss template<typename Token> ::uint64_t TSA_tree_iterator<Token>:: - getPid(int p) const - { + getPid(int p) const + { if (this->size() == 0) return 0; if (p < 0) p += upper.size(); char const* lb = lower_bound(p); @@ -531,7 +531,7 @@ namespace ugdiss ::uint64_t ret = (sid<<32) + (off<<16) + ::uint64_t(p+1); return ret; } - + // --------------------------------------------------------------------------- template<typename Token> @@ -614,7 +614,7 @@ namespace ugdiss //--------------------------------------------------------------------------- template<typename Token> - count_type + count_type TSA_tree_iterator<Token>:: fillBitSet(boost::dynamic_bitset<typename ::uint64_t>& bitset) const { @@ -624,7 +624,7 @@ namespace ugdiss //--------------------------------------------------------------------------- template<typename Token> - count_type + count_type TSA_tree_iterator<Token>:: markSentences(boost::dynamic_bitset<typename ::uint64_t>& bitset) const { @@ -651,7 +651,7 @@ namespace ugdiss //--------------------------------------------------------------------------- template<typename Token> - count_type + count_type TSA_tree_iterator<Token>:: markOccurrences(boost::dynamic_bitset<typename ::uint64_t>& bitset, bool markOnlyStartPosition) const { @@ -667,7 +667,7 @@ namespace ugdiss //--------------------------------------------------------------------------- template<typename Token> - count_type + count_type TSA_tree_iterator<Token>:: markOccurrences(vector<ushort>& dest) const { @@ -694,10 +694,10 @@ namespace ugdiss } //--------------------------------------------------------------------------- - // mark all endpoints of instances of the path represented by this + // mark all endpoints of instances of the path represented by this // iterator in the sentence [start,stop) template<typename Token> - count_type + count_type TSA_tree_iterator<Token>:: markEndOfSequence(Token const* start, Token const* stop, boost::dynamic_bitset<typename ::uint64_t>& dest) const @@ -726,10 +726,10 @@ namespace ugdiss } //--------------------------------------------------------------------------- - // mark all occurrences of the sequence represented by this + // mark all occurrences of the sequence represented by this // iterator in the sentence [start,stop) template<typename Token> - count_type + count_type TSA_tree_iterator<Token>:: markSequence(Token const* start, Token const* stop, @@ -784,7 +784,7 @@ namespace ugdiss { assert(x); buf << (i > start ? " " : ""); - if (V) buf << (*V)[x->id()]; + if (V) buf << (*V)[x->id()]; else buf << x->id(); } return buf.str(); @@ -807,13 +807,13 @@ namespace ugdiss { assert(x); buf << (i > start ? " " : ""); - buf << V[x->id()].str; + buf << V[x->id()].str; } return buf.str(); } #endif - /// @return true if the sentence [start,stop) contains the sequence + /// @return true if the sentence [start,stop) contains the sequence template<typename Token> bool TSA_tree_iterator<Token>:: @@ -823,7 +823,7 @@ namespace ugdiss for (Token const* t = start; t < stop; ++t) { if (*t != *a) continue; - Token const* b = a; + Token const* b = a; Token const* y = t; size_t i; for (i = 1; i < lower.size(); ++i) @@ -838,7 +838,7 @@ namespace ugdiss return false; } - /// @return true if the sentence /sid/ contains the sequence + /// @return true if the sentence /sid/ contains the sequence template<typename Token> bool TSA_tree_iterator<Token>:: @@ -851,9 +851,9 @@ namespace ugdiss // @param sntcheck: number of roots in the respective sentence // @param dest: bitvector to keep track of the exact root location template<typename Token> - void + void TSA_tree_iterator<Token>:: - tfAndRoot(bitvector const& ref, // reference root positions + tfAndRoot(bitvector const& ref, // reference root positions bitvector const& snt, // relevant sentences bitvector& dest) const { @@ -880,12 +880,12 @@ namespace ugdiss filterSentences(bitvector& bv) const { float aveSntLen = root->corpus->numTokens()/root->corpus->size(); - size_t ANDcost = bv.size()/8; // cost of dest&=ref; + size_t ANDcost = bv.size()/8; // cost of dest&=ref; float aveEntrySize = ((root->endArray-root->startArray) /root->corpus->numTokens()); if (arrayByteSpanSize()+ANDcost < aveEntrySize*aveSntLen*bv.count()) { - bitvector tmp(bv.size()); + bitvector tmp(bv.size()); markSentences(tmp); bv &= tmp; } @@ -906,9 +906,9 @@ namespace ugdiss if (level < 0) level += lower.size(); assert(level >=0); - sptr<vector<typename ttrack::Position> > + sptr<vector<typename ttrack::Position> > ret(new vector<typename ttrack::Position>(N)); - + size_t m=0; // number of samples selected so far typename Token::ArrayEntry I(lower.at(level)); @@ -916,7 +916,7 @@ namespace ugdiss while (m < N && (I.next) < stop) { root->readEntry(I.next,I); - + // t: expected number of remaining samples const double t = (stop - I.pos)/root->aveIndexEntrySize(); const double r = util::rand_excl(t); @@ -930,6 +930,6 @@ namespace ugdiss return ret; } - + } // end of namespace ugdiss #endif diff --git a/moses/TranslationModel/UG/mm/ug_ttrack_base.cc b/moses/TranslationModel/UG/mm/ug_ttrack_base.cc index 644c53c3a..60d20a5f9 100644 --- a/moses/TranslationModel/UG/mm/ug_ttrack_base.cc +++ b/moses/TranslationModel/UG/mm/ug_ttrack_base.cc @@ -9,12 +9,12 @@ namespace ugdiss { using namespace std; - + #if 0 template<> id_type Ttrack<id_type>:: - toID(id_type const& t) + toID(id_type const& t) { return t; } diff --git a/moses/TranslationModel/UG/mm/ug_ttrack_base.h b/moses/TranslationModel/UG/mm/ug_ttrack_base.h index f9864bda6..d087a9e58 100644 --- a/moses/TranslationModel/UG/mm/ug_ttrack_base.h +++ b/moses/TranslationModel/UG/mm/ug_ttrack_base.h @@ -2,7 +2,7 @@ // Base class for corpus tracks. mmTtrack (memory-mapped Ttrack) and imTtrack (in-memory Ttrack) // are derived from this class. -// This code is part of a refactorization of the earlier Ttrack class as a template class for +// This code is part of a refactorization of the earlier Ttrack class as a template class for // tokens of arbitrary fixed-length size. // (c) 2007-2009 Ulrich Germann. All rights reserved. @@ -27,8 +27,8 @@ namespace ugdiss typedef boost::dynamic_bitset<uint64_t> bdBitset; template<typename sid_t, typename off_t, typename len_t> - void - parse_pid(uint64_t const pid, sid_t & sid, + void + parse_pid(uint64_t const pid, sid_t & sid, off_t & off, len_t& len) { static uint64_t two32 = uint64_t(1)<<32; @@ -39,12 +39,12 @@ namespace ugdiss } template<typename Token> - string + string toString(TokenIndex const& V, Token const* x, size_t const len) { if (!len) return ""; UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!"); - ostringstream buf; + ostringstream buf; buf << V[x->id()]; size_t i = 1; for (x = x->next(); x && i < len; ++i, x = x->next()) @@ -63,66 +63,66 @@ namespace ugdiss typedef TKN Token; /** @return a pointer to beginning of sentence /sid/ */ - virtual - TKN const* - sntStart(size_t sid) const = 0; + virtual + TKN const* + sntStart(size_t sid) const = 0; /** @return end point of sentence /sid/ */ - virtual - TKN const* - sntEnd(size_t sid) const = 0; + virtual + TKN const* + sntEnd(size_t sid) const = 0; TKN const* getToken(Position const& p) const; template<typename T> - T const* - getTokenAs(Position const& p) const + T const* + getTokenAs(Position const& p) const { return reinterpret_cast<T const*>(getToken(p)); } template<typename T> T const* - sntStartAs(id_type sid) const + sntStartAs(id_type sid) const { return reinterpret_cast<T const*>(sntStart(sid)); } template<typename T> T const* - sntEndAs(id_type sid) const + sntEndAs(id_type sid) const { return reinterpret_cast<T const*>(sntEnd(sid)); } /** @return length of sentence /sid/ */ size_t sntLen(size_t sid) const { return sntEnd(sid) - sntStart(sid); } - size_t + size_t startPos(id_type sid) const { return sntStart(sid)-sntStart(0); } - - size_t + + size_t endPos(id_type sid) const { return sntEnd(sid)-sntStart(0); } /** Don't use this unless you want a copy of the sentence */ - vector<TKN> - operator[](id_type sid) const - { - return vector<TKN>(sntStart(sid),sntEnd(sid)); + vector<TKN> + operator[](id_type sid) const + { + return vector<TKN>(sntStart(sid),sntEnd(sid)); } /** @return size of corpus in number of sentences */ - virtual size_t size() const = 0; + virtual size_t size() const = 0; /** @return size of corpus in number of words/tokens */ - virtual size_t numTokens() const = 0; + virtual size_t numTokens() const = 0; - /** @return string representation of sentence /sid/ + /** @return string representation of sentence /sid/ * Currently only defined for Ttrack<id_type> */ string str(id_type sid, TokenIndex const& T) const; string pid2str(TokenIndex const* V, uint64_t pid) const; - // /** @return string representation of sentence /sid/ + // /** @return string representation of sentence /sid/ // * Currently only defined for Ttrack<id_type> */ // string str(id_type sid, Vocab const& V) const; - - /** counts the tokens in the corpus; used for example in the construction of + + /** counts the tokens in the corpus; used for example in the construction of * token sequence arrays */ count_type count_tokens(vector<count_type>& cnt, bdBitset const* filter, int lengthCutoff=0, ostream* log=NULL) const; @@ -130,7 +130,7 @@ namespace ugdiss // static id_type toID(TKN const& t); int cmp(Position const& A, Position const& B, int keyLength) const; - int cmp(Position const& A, TKN const* keyStart, int keyLength=-1, + int cmp(Position const& A, TKN const* keyStart, int keyLength=-1, int depth=0) const; virtual id_type findSid(TKN const* t) const = 0; // find the sentence id of a given token @@ -139,18 +139,18 @@ namespace ugdiss // the following three functions are currently not used by any program ... (deprecate?) TKN const* - find_next_within_sentence(TKN const* startKey, - int keyLength, + find_next_within_sentence(TKN const* startKey, + int keyLength, Position startHere) const; Position - find_first(TKN const* startKey, int keyLength, + find_first(TKN const* startKey, int keyLength, bdBitset const* filter=NULL) const; Position - find_next(TKN const* startKey, int keyLength, Position startAfter, + find_next(TKN const* startKey, int keyLength, Position startAfter, bdBitset const* filter=NULL) const; - + virtual size_t offset(TKN const* t) const { return t-sntStart(0); } }; @@ -171,11 +171,11 @@ namespace ugdiss template<typename TKN> count_type Ttrack<TKN>:: - count_tokens(vector<count_type>& cnt, bdBitset const* filter, + count_tokens(vector<count_type>& cnt, bdBitset const* filter, int lengthCutoff, ostream* log) const { - bdBitset filter2; - if (!filter) + bdBitset filter2; + if (!filter) { filter2.resize(this->size()); filter2.set(); @@ -184,21 +184,21 @@ namespace ugdiss cnt.clear(); cnt.reserve(500000); count_type totalCount=0; - + int64_t expectedTotal=0; for (size_t sid = 0; sid < this->size(); ++sid) expectedTotal += this->sntLen(sid); - + for (size_t sid = filter->find_first(); sid < filter->size(); sid = filter->find_next(sid)) { TKN const* k = sntStart(sid); TKN const* const stop = sntEnd(sid); - if (lengthCutoff && stop-k >= lengthCutoff) + if (lengthCutoff && stop-k >= lengthCutoff) { - if (log) - *log << "WARNING: skipping sentence #" << sid + if (log) + *log << "WARNING: skipping sentence #" << sid << " with more than 65536 tokens" << endl; expectedTotal -= stop-k; } @@ -217,7 +217,7 @@ namespace ugdiss if (this->size() == filter->count()) { if (totalCount != expectedTotal) - cerr << "OOPS: expected " << expectedTotal + cerr << "OOPS: expected " << expectedTotal << " tokens but counted " << totalCount << endl; assert(totalCount == expectedTotal); } @@ -256,16 +256,16 @@ namespace ugdiss a = next(a); b = next(b); // cerr << keyLength << "b. " << (a ? a->lemma : 0) << " " << (b ? b->lemma : 0) << endl; - if (--keyLength==0 || b < bosB || b >= eosB) - { + if (--keyLength==0 || b < bosB || b >= eosB) + { ret = (a < bosA || a >= eosA) ? 0 : 1; break; } } // cerr << "RETURNING " << ret << endl; - return ret; + return ret; } - + template<typename TKN> int Ttrack<TKN>:: @@ -287,17 +287,17 @@ namespace ugdiss if (*x > *key) return 2; key = key->next(); x = x->next(); - if (--keyLength==0) // || !key) + if (--keyLength==0) // || !key) return (x == stopx) ? 0 : 1; assert(key); } - return -1; + return -1; } template<typename TKN> - TKN const* + TKN const* Ttrack<TKN>:: - find_next_within_sentence(TKN const* startKey, int keyLength, + find_next_within_sentence(TKN const* startKey, int keyLength, Position startHere) const { for (TKN const* t = getToken(startHere); t; t = getToken(startHere)) @@ -308,12 +308,12 @@ namespace ugdiss { TKN const* k = startKey->next(); TKN const* t2 = t->next(); - if (t2) + if (t2) { - cout << t2->lemma << "." << int(t2->minpos) << " " + cout << t2->lemma << "." << int(t2->minpos) << " " << k->lemma << "." << int(k->minpos) << " " << t2->cmp(*k) << endl; - } + } } #endif int x = cmp(startHere,startKey,keyLength,0); @@ -330,8 +330,8 @@ namespace ugdiss { if (filter) { - for (size_t sid = filter->find_first(); - sid < filter->size(); + for (size_t sid = filter->find_first(); + sid < filter->size(); sid = filter->find_next(sid)) { TKN const* x = find_next_within_sentence(startKey,keyLength,Position(sid,0)); @@ -348,7 +348,7 @@ namespace ugdiss } return Position(this->size(),0); } - + template<typename TKN> typename Ttrack<TKN>::Position Ttrack<TKN>:: @@ -411,6 +411,6 @@ namespace ugdiss } return buf.str(); } - + } #endif diff --git a/moses/TranslationModel/UG/mm/ug_ttrack_position.h b/moses/TranslationModel/UG/mm/ug_ttrack_position.h index 64fab3afb..6d473f263 100644 --- a/moses/TranslationModel/UG/mm/ug_ttrack_position.h +++ b/moses/TranslationModel/UG/mm/ug_ttrack_position.h @@ -6,7 +6,7 @@ #include "ug_typedefs.h" // A token position in a Ttrack, with a LESS functor for comparing token -// positions in whatever sorting order the underlying token type implies. +// positions in whatever sorting order the underlying token type implies. // // (c) 2007-2010 Ulrich Germann. All rights reserved. @@ -26,19 +26,19 @@ namespace ugdiss Position(id_type _sid, ushort _off); template<typename TTRACK_TYPE> class LESS; // probably abandoned }; // end of deklaration of Position - -#if 1 + +#if 1 template<typename TTRACK_TYPE> - class + class Position:: LESS { TTRACK_TYPE const* c; public: typedef typename TTRACK_TYPE::Token Token; - + LESS(TTRACK_TYPE const* crp) : c(crp) {}; - + bool operator()(Position const& A, Position const& B) const { Token const* a = c->getToken(A); assert(a); @@ -48,30 +48,30 @@ namespace ugdiss Token const* bosA = c->sntStart(A.sid); Token const* eosA = c->sntEnd(A.sid); - + Token const* bosB = c->sntStart(B.sid); Token const* eosB = c->sntEnd(B.sid); - + #if 0 - Token const* z = a; + Token const* z = a; cout << "A: " << z->id(); for (z = next(z); z >= bosA && z < eosA; z = next(z)) - cout << "-" << z->id(); + cout << "-" << z->id(); cout << endl; - - z = b; + + z = b; cout << "B: " << z->id(); for (z = next(z); z >= bosB && z < eosB; z = next(z)) - cout << "-" << z->id(); + cout << "-" << z->id(); cout << endl; #endif while (*a == *b) { a = next(a); b = next(b); - if (a < bosA || a >= eosA) + if (a < bosA || a >= eosA) return (b >= bosB && b < eosB); - if (b < bosB || b >= eosB) + if (b < bosB || b >= eosB) return false; } int x = a->cmp(*b); @@ -86,4 +86,4 @@ namespace ugdiss } // end of namespace ttrack } // end of namespace ugdiss #endif - + diff --git a/moses/TranslationModel/UG/mm/ug_typedefs.h b/moses/TranslationModel/UG/mm/ug_typedefs.h index 83c8684e0..0181bef9e 100644 --- a/moses/TranslationModel/UG/mm/ug_typedefs.h +++ b/moses/TranslationModel/UG/mm/ug_typedefs.h @@ -24,7 +24,7 @@ namespace ugdiss typedef vector<vector<short> > short_2d_table; typedef vector<short_2d_table> short_3d_table; typedef vector<short_3d_table> short_4d_table; - + typedef vector<vector<int> > int_2d_table; typedef vector<int_2d_table> int_3d_table; typedef vector<int_3d_table> int_4d_table; diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 4e9e97766..6e680bbc5 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -19,7 +19,7 @@ namespace Moses using namespace std; using namespace boost; - void + void fillIdSeq(Phrase const& mophrase, size_t const ifactor, TokenIndex const& V, vector<id_type>& dest) { @@ -30,8 +30,8 @@ namespace Moses dest[i] = V[f->ToString()]; } } - - void + + void parseLine(string const& line, map<string,string> & param) { char_separator<char> sep("; "); @@ -79,13 +79,13 @@ namespace Moses , context_key(((char*)this)+1) // , m_tpc_ctr(0) , ofactor(1,0) - { - init(line); + { + init(line); setup_local_feature_functions(); Register(); } - void + void Mmsapt:: read_config_file(string fname, map<string,string>& param) { @@ -99,9 +99,9 @@ namespace Moses tokenizer<char_separator<char> >::const_iterator t = tokens.begin(); if (t == tokens.end()) continue; string& foo = param[*t++]; - if (t == tokens.end() || foo.size()) continue; + if (t == tokens.end() || foo.size()) continue; // second condition: do not overwrite settings from the line in moses.ini - UTIL_THROW_IF2(*t++ != "=" || t == tokens.end(), + UTIL_THROW_IF2(*t++ != "=" || t == tokens.end(), "Syntax error in Mmsapt config file '" << fname << "'."); for (foo = *t++; t != tokens.end(); foo += " " + *t++); } @@ -120,7 +120,7 @@ namespace Moses m_is_integer.push_back(ff->isIntegerValued(i)); } } - + bool Mmsapt::isLogVal(int i) const { return m_is_logval.at(i); } bool Mmsapt::isInteger(int i) const { return m_is_integer.at(i); } @@ -130,7 +130,7 @@ namespace Moses parseLine(line,this->param); this->m_numScoreComponents = atoi(param["num-features"].c_str()); - + m = param.find("config"); if (m != param.end()) read_config_file(m->second,param); @@ -138,17 +138,17 @@ namespace Moses m = param.find("base"); if (m != param.end()) { - m_bname = m->second; + m_bname = m->second; m = param.find("path"); UTIL_THROW_IF2((m != param.end() && m->second != m_bname), - "Conflicting aliases for path:\n" + "Conflicting aliases for path:\n" << "path=" << string(m->second) << "\n" << "base=" << m_bname.c_str() ); } else m_bname = param["path"]; L1 = param["L1"]; L2 = param["L2"]; - + UTIL_THROW_IF2(m_bname.size() == 0, "Missing corpus base name at " << HERE); UTIL_THROW_IF2(L1.size() == 0, "Missing L1 tag at " << HERE); UTIL_THROW_IF2(L2.size() == 0, "Missing L2 tag at " << HERE); @@ -157,11 +157,11 @@ namespace Moses pair<string,string> dflt("input-factor","0"); input_factor = atoi(param.insert(dflt).first->second.c_str()); // shouldn't that be a string? - + dflt = pair<string,string> ("output-factor","0"); output_factor = atoi(param.insert(dflt).first->second.c_str()); ofactor.assign(1,output_factor); - + dflt = pair<string,string> ("smooth",".01"); m_lbop_conf = atof(param.insert(dflt).first->second.c_str()); @@ -177,7 +177,7 @@ namespace Moses dflt = pair<string,string>("bias-loglevel","0"); m_bias_loglevel = atoi(param.insert(dflt).first->second.c_str()); - + dflt = pair<string,string>("table-limit","20"); m_tableLimit = atoi(param.insert(dflt).first->second.c_str()); @@ -188,25 +188,25 @@ namespace Moses // in plain language: cache size is at least 1000, and 10,000 by default // this cache keeps track of the most frequently used target // phrase collections even when not actively in use - + // Feature functions are initialized in function Load(); - param.insert(pair<string,string>("pfwd", "g")); - param.insert(pair<string,string>("pbwd", "g")); - param.insert(pair<string,string>("logcnt", "0")); - param.insert(pair<string,string>("coh", "0")); - param.insert(pair<string,string>("rare", "1")); - param.insert(pair<string,string>("prov", "1")); - + param.insert(pair<string,string>("pfwd", "g")); + param.insert(pair<string,string>("pbwd", "g")); + param.insert(pair<string,string>("logcnt", "0")); + param.insert(pair<string,string>("coh", "0")); + param.insert(pair<string,string>("rare", "1")); + param.insert(pair<string,string>("prov", "1")); + poolCounts = true; - + // this is for pre-comuted sentence-level bias; DEPRECATED! - if ((m = param.find("bias")) != param.end()) + if ((m = param.find("bias")) != param.end()) m_bias_file = m->second; - if ((m = param.find("bias-server")) != param.end()) + if ((m = param.find("bias-server")) != param.end()) m_bias_server = m->second; - if ((m = param.find("bias-logfile")) != param.end()) + if ((m = param.find("bias-logfile")) != param.end()) { m_bias_logfile = m->second; if (m_bias_logfile == "/dev/stderr") @@ -220,10 +220,10 @@ namespace Moses } } - if ((m = param.find("lr-func")) != param.end()) + if ((m = param.find("lr-func")) != param.end()) m_lr_func_name = m->second; - if ((m = param.find("extra")) != param.end()) + if ((m = param.find("extra")) != param.end()) m_extra_data = m->second; dflt = pair<string,string>("tuneable","true"); @@ -239,7 +239,7 @@ namespace Moses known_parameters.push_back("L1"); known_parameters.push_back("L2"); known_parameters.push_back("Mmsapt"); - known_parameters.push_back("PhraseDictionaryBitextSampling"); + known_parameters.push_back("PhraseDictionaryBitextSampling"); // alias for Mmsapt known_parameters.push_back("base"); // alias for path known_parameters.push_back("bias"); @@ -259,7 +259,7 @@ namespace Moses known_parameters.push_back("name"); known_parameters.push_back("num-features"); known_parameters.push_back("output-factor"); - known_parameters.push_back("path"); + known_parameters.push_back("path"); known_parameters.push_back("pbwd"); known_parameters.push_back("pfwd"); known_parameters.push_back("prov"); @@ -275,12 +275,12 @@ namespace Moses { UTIL_THROW_IF2(!binary_search(known_parameters.begin(), known_parameters.end(), m->first), - HERE << ": Unknown parameter specification for Mmsapt: " + HERE << ": Unknown parameter specification for Mmsapt: " << m->first); } } - void + void Mmsapt:: load_bias(string const fname) { @@ -298,7 +298,7 @@ namespace Moses // - sane word alignment? vector<string> text1,text2,symal; string line; - filtering_istream in1,in2,ina; + filtering_istream in1,in2,ina; open_input_stream(bname+L1+".txt.gz",in1); open_input_stream(bname+L2+".txt.gz",in2); @@ -314,7 +314,7 @@ namespace Moses assert(btdyn); cerr << "Loaded " << btdyn->T1->size() << " sentence pairs" << endl; } - + template<typename fftype> void Mmsapt:: @@ -334,7 +334,7 @@ namespace Moses ff.reset(new fftype(spec)); register_ff(ff, m_active_ff_dyn); } - else + else { sptr<fftype> ff(new fftype(spec)); register_ff(ff, m_active_ff_common); @@ -344,7 +344,7 @@ namespace Moses template<typename fftype> void Mmsapt:: - check_ff(string const ffname, float const xtra, + check_ff(string const ffname, float const xtra, vector<sptr<pscorer> >* registry) { string const& spec = param[ffname]; @@ -361,7 +361,7 @@ namespace Moses ff.reset(new fftype(xtra,spec)); register_ff(ff, m_active_ff_dyn); } - else + else { sptr<fftype> ff(new fftype(xtra,spec)); register_ff(ff, m_active_ff_common); @@ -394,28 +394,28 @@ namespace Moses // standard (default) feature set if (fsname == "standard") { - // lexical scores + // lexical scores string lexfile = m_bname + L1 + "-" + L2 + ".lex"; - sptr<PScoreLex1<Token> > + sptr<PScoreLex1<Token> > ff(new PScoreLex1<Token>(param["lex_alpha"],lexfile)); register_ff(ff,m_active_ff_common); - + // these are always computed on pooled data check_ff<PScoreRareness<Token> > ("rare", &m_active_ff_common); check_ff<PScoreUnaligned<Token> >("unal", &m_active_ff_common); check_ff<PScoreCoherence<Token> >("coh", &m_active_ff_common); - - // for these ones either way is possible (specification ends with '+' - // if corpus-specific + + // for these ones either way is possible (specification ends with '+' + // if corpus-specific check_ff<PScorePfwd<Token> >("pfwd", m_lbop_conf); check_ff<PScorePbwd<Token> >("pbwd", m_lbop_conf); check_ff<PScoreLogCnt<Token> >("logcnt"); - + // These are always corpus-specific check_ff<PScoreProvenance<Token> >("prov", &m_active_ff_fix); check_ff<PScoreProvenance<Token> >("prov", &m_active_ff_dyn); } - + // data source features (copies of phrase and word count specific to // this translation model) else if (fsname == "datasource") @@ -456,14 +456,14 @@ namespace Moses btfix.m_num_workers = this->m_workers; btfix.open(m_bname, L1, L2); btfix.setDefaultSampleSize(m_default_sample_size); - + btdyn.reset(new imbitext(btfix.V1, btfix.V2, m_default_sample_size, m_workers)); if (m_bias_file.size()) load_bias(m_bias_file); - - if (m_extra_data.size()) + + if (m_extra_data.size()) load_extra_data(m_extra_data, false); - + #if 0 // currently not used LexicalPhraseScorer2<Token>::table_t & COOC = calc_lex.scorer.COOC; @@ -490,18 +490,18 @@ namespace Moses } - TargetPhrase* + TargetPhrase* Mmsapt:: mkTPhrase(Phrase const& src, - PhrasePair<Token>* fix, - PhrasePair<Token>* dyn, + PhrasePair<Token>* fix, + PhrasePair<Token>* dyn, sptr<Bitext<Token> > const& dynbt) const { - UTIL_THROW_IF2(!fix && !dyn, HERE << + UTIL_THROW_IF2(!fix && !dyn, HERE << ": Can't create target phrase from nothing."); vector<float> fvals(this->m_numScoreComponents); PhrasePair<Token> pool = fix ? *fix : *dyn; - if (fix) + if (fix) { BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix) (*ff)(btfix, *fix, &fvals); @@ -511,7 +511,7 @@ namespace Moses BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn) (*ff)(*dynbt, *dyn, &fvals); } - + if (fix && dyn) { pool += *dyn; } else if (fix) { @@ -533,7 +533,7 @@ namespace Moses BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix) (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals); } - if (fix) + if (fix) { BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common) (*ff)(btfix, pool, &fvals); @@ -574,39 +574,39 @@ namespace Moses const InputPathList &inputPathQueue) const { InputPathList::const_iterator iter; - for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) + for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) { InputPath &inputPath = **iter; const Phrase &phrase = inputPath.GetPhrase(); PrefixExists(ttask, phrase); // launches parallel lookup } - for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) + for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) { InputPath &inputPath = **iter; const Phrase &phrase = inputPath.GetPhrase(); - const TargetPhraseCollection *targetPhrases + const TargetPhraseCollection *targetPhrases = this->GetTargetPhraseCollectionLEGACY(ttask,phrase); inputPath.SetTargetPhrases(*this, targetPhrases, NULL); } } - - TargetPhraseCollection const* + + TargetPhraseCollection const* Mmsapt:: GetTargetPhraseCollectionLEGACY(const Phrase& src) const { UTIL_THROW2("Don't call me without the translation task."); } - // This is not the most efficient way of phrase lookup! - TargetPhraseCollection const* + // This is not the most efficient way of phrase lookup! + TargetPhraseCollection const* Mmsapt:: GetTargetPhraseCollectionLEGACY(ttasksptr const& ttask, const Phrase& src) const { // map from Moses Phrase to internal id sequence - vector<id_type> sphrase; + vector<id_type> sphrase; fillIdSeq(src,input_factor,*(btfix.V1),sphrase); if (sphrase.size() == 0) return NULL; - + // Reserve a local copy of the dynamic bitext in its current form. /btdyn/ // is set to a new copy of the dynamic bitext every time a sentence pair // is added. /dyn/ keeps the old bitext around as long as we need it. @@ -631,11 +631,11 @@ namespace Moses << mdyn.size() << " " << mdyn.getPid() << endl; #endif - if (mdyn.size() != sphrase.size() && mfix.size() != sphrase.size()) + if (mdyn.size() != sphrase.size() && mfix.size() != sphrase.size()) return NULL; // phrase not found in either bitext // do we have cached results for this phrase? - uint64_t phrasekey = (mfix.size() == sphrase.size() + uint64_t phrasekey = (mfix.size() == sphrase.size() ? (mfix.getPid()<<1) : (mdyn.getPid()<<1)+1); // get context-specific cache of items previously looked up @@ -647,25 +647,25 @@ namespace Moses // was stored as the time stamp. For each word in the // vocabulary, we also store its most recent occurrence in the // bitext. Only if the timestamp of each word in the phrase is - // newer than the timestamp of the phrase itself we must update - // the entry. + // newer than the timestamp of the phrase itself we must update + // the entry. if (ret) return ret; // yes, was cached => DONE - + // OK: pt entry NOT found or NOT up to date - // lookup and expansion could be done in parallel threads, + // lookup and expansion could be done in parallel threads, // but ppdyn is probably small anyway // TO DO: have Bitexts return lists of PhrasePairs instead of pstats - // no need to expand pstats at every single lookup again, especially + // no need to expand pstats at every single lookup again, especially // for btfix. sptr<pstats> sfix,sdyn; - + if (mfix.size() == sphrase.size()) sfix = btfix.lookup(ttask, mfix); if (mdyn.size() == sphrase.size()) sdyn = dyn->lookup(ttask, mdyn); vector<PhrasePair<Token> > ppfix,ppdyn; PhrasePair<Token>::SortByTargetIdSeq sort_by_tgt_id; - if (sfix) + if (sfix) { expand(mfix, btfix, *sfix, ppfix, m_bias_log); sort(ppfix.begin(), ppfix.end(),sort_by_tgt_id); @@ -706,8 +706,8 @@ namespace Moses #if 0 - if (combine_pstats(src, - mfix.getPid(), sfix.get(), btfix, + if (combine_pstats(src, + mfix.getPid(), sfix.get(), btfix, mdyn.getPid(), sdyn.get(), *dyn, ret)) { #if 0 @@ -733,7 +733,7 @@ namespace Moses return ret; } - size_t + size_t Mmsapt:: SetTableLimit(size_t limit) { @@ -762,14 +762,14 @@ namespace Moses throw "CreateRuleLookupManager is currently not supported in Mmsapt!"; } - void + void Mmsapt:: InitializeForInput(ttasksptr const& ttask) { sptr<ContextScope> const& scope = ttask->GetScope(); - sptr<ContextForQuery> context + sptr<ContextForQuery> context = scope->get<ContextForQuery>(&btfix, true); - if (m_bias_server.size() && context->bias == NULL) + if (m_bias_server.size() && context->bias == NULL) { // we need to create the bias boost::unique_lock<boost::shared_mutex> lock(context->lock); string const& context_words = ttask->GetContextString(); @@ -778,18 +778,18 @@ namespace Moses if (m_bias_log) { *m_bias_log << HERE << endl - << "BIAS LOOKUP CONTEXT: " - << context_words << endl; + << "BIAS LOOKUP CONTEXT: " + << context_words << endl; context->bias_log = m_bias_log; } - context->bias + context->bias = btfix.SetupDocumentBias(m_bias_server, context_words, m_bias_log); context->bias->loglevel = m_bias_loglevel; context->bias->log = m_bias_log; } if (!context->cache1) context->cache1.reset(new pstats::cache_t); if (!context->cache2) context->cache2.reset(new pstats::cache_t); - } + } boost::unique_lock<boost::shared_mutex> mylock(m_lock); sptr<TPCollCache> localcache = scope->get<TPCollCache>(cache_key); if (!localcache) @@ -798,12 +798,12 @@ namespace Moses else localcache = m_cache; scope->set<TPCollCache>(cache_key, localcache); } - + if (m_lr_func_name.size() && m_lr_func == NULL) { FeatureFunction* lr = &FeatureFunction::FindFeatureFunction(m_lr_func_name); m_lr_func = dynamic_cast<LexicalReordering*>(lr); - UTIL_THROW_IF2(lr == NULL, "FF " << m_lr_func_name + UTIL_THROW_IF2(lr == NULL, "FF " << m_lr_func_name << " does not seem to be a lexical reordering function!"); // todo: verify that lr_func implements a hierarchical reordering model } @@ -813,7 +813,7 @@ namespace Moses // Mmsapt:: // PrefixExists(Moses::Phrase const& phrase) const // { - // return PrefixExists(phrase,NULL); + // return PrefixExists(phrase,NULL); // } bool @@ -821,11 +821,11 @@ namespace Moses PrefixExists(ttasksptr const& ttask, Moses::Phrase const& phrase) const { if (phrase.GetSize() == 0) return false; - vector<id_type> myphrase; + vector<id_type> myphrase; fillIdSeq(phrase,input_factor,*btfix.V1,myphrase); - + TSA<Token>::tree_iterator mfix(btfix.I1.get(),&myphrase[0],myphrase.size()); - if (mfix.size() == myphrase.size()) + if (mfix.size() == myphrase.size()) { btfix.prep(ttask, mfix); // cerr << phrase << " " << mfix.approxOccurrenceCount() << endl; @@ -872,7 +872,7 @@ namespace Moses // return btfix.SetupDocumentBias(bias); // } - vector<float> + vector<float> Mmsapt ::DefaultWeights() const { return vector<float>(this->GetNumScoreComponents(), 1.); } diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h index 4552ea8d2..5f688cfd8 100644 --- a/moses/TranslationModel/UG/mmsapt.h +++ b/moses/TranslationModel/UG/mmsapt.h @@ -38,13 +38,13 @@ // TO DO: // - make lexical phrase scorer take addition to the "dynamic overlay" into account // - switch to pool of sapts, where each sapt has its own provenance feature -// RESEARCH QUESTION: is this more effective than having multiple phrase tables, +// RESEARCH QUESTION: is this more effective than having multiple phrase tables, // each with its own set of features? namespace Moses { using namespace bitext; - class Mmsapt + class Mmsapt #ifndef NO_MOSES : public PhraseDictionary #endif @@ -54,7 +54,7 @@ namespace Moses friend class Alignment; std::map<std::string,std::string> param; std::string m_name; - public: + public: typedef L2R_Token<SimpleWordId> Token; typedef mmBitext<Token> mmbitext; typedef imBitext<Token> imbitext; @@ -63,21 +63,21 @@ namespace Moses typedef PhraseScorer<Token> pscorer; private: // vector<sptr<bitext> > shards; - mmbitext btfix; - sptr<imbitext> btdyn; + mmbitext btfix; + sptr<imbitext> btdyn; std::string m_bname, m_extra_data, m_bias_file,m_bias_server; std::string L1; std::string L2; float m_lbop_conf; // confidence level for lbop smoothing float m_lex_alpha; // alpha paramter (j+a)/(m+a) for lexical smoothing // alpha parameter for lexical smoothing (joint+alpha)/(marg + alpha) - // must be > 0 if dynamic + // must be > 0 if dynamic size_t m_default_sample_size; size_t m_workers; // number of worker threads for sampling the bitexts std::vector<std::string> m_feature_set_names; // one or more of: standard, datasource std::string m_bias_logfile; boost::scoped_ptr<ofstream> m_bias_logger; // for logging to a file - ostream* m_bias_log; + ostream* m_bias_log; int m_bias_loglevel; LexicalReordering* m_lr_func; // associated lexical reordering function string m_lr_func_name; // name of associated lexical reordering function @@ -88,47 +88,47 @@ namespace Moses boost::shared_ptr<SamplingBias> m_bias; // for global default bias boost::shared_ptr<TPCollCache> m_cache; // for global default bias size_t m_cache_size; // - size_t input_factor; // + size_t input_factor; // size_t output_factor; // we can actually return entire Tokens! // for display for human inspection (ttable dumps): std::vector<std::string> m_feature_names; // names of features activated - std::vector<bool> m_is_logval; // keeps track of which features are log valued - std::vector<bool> m_is_integer; // keeps track of which features are integer valued + std::vector<bool> m_is_logval; // keeps track of which features are log valued + std::vector<bool> m_is_integer; // keeps track of which features are integer valued std::vector<sptr<pscorer > > m_active_ff_fix; // activated feature functions (fix) std::vector<sptr<pscorer > > m_active_ff_dyn; // activated feature functions (dyn) - std::vector<sptr<pscorer > > m_active_ff_common; + std::vector<sptr<pscorer > > m_active_ff_common; // activated feature functions (dyn) - void + void register_ff(sptr<pscorer> const& ff, std::vector<sptr<pscorer> > & registry); template<typename fftype> - void + void check_ff(std::string const ffname,std::vector<sptr<pscorer> >* registry = NULL); - // add feature function if specified - + // add feature function if specified + template<typename fftype> - void - check_ff(std::string const ffname, float const xtra, + void + check_ff(std::string const ffname, float const xtra, std::vector<sptr<pscorer> >* registry = NULL); // add feature function if specified void add_corpus_specific_features(std::vector<sptr<pscorer > >& ffvec); - + // built-in feature functions // PScorePfwd<Token> calc_pfwd_fix, calc_pfwd_dyn; // PScorePbwd<Token> calc_pbwd_fix, calc_pbwd_dyn; - // PScoreLex<Token> calc_lex; + // PScoreLex<Token> calc_lex; // this one I'd like to see as an external ff eventually - // PScorePC<Token> apply_pp; // apply phrase penalty + // PScorePC<Token> apply_pp; // apply phrase penalty // PScoreLogCounts<Token> add_logcounts_fix; // PScoreLogCounts<Token> add_logcounts_dyn; void init(std::string const& line); mutable boost::shared_mutex m_lock; - // mutable boost::shared_mutex m_cache_lock; + // mutable boost::shared_mutex m_cache_lock; // for more complex operations on the cache bool withPbwd; bool poolCounts; @@ -141,25 +141,25 @@ namespace Moses void read_config_file(std::string fname, std::map<std::string,std::string>& param); // phrase table feature weights for alignment: - std::vector<float> feature_weights; + std::vector<float> feature_weights; - std::vector<std::vector<id_type> > wlex21; + std::vector<std::vector<id_type> > wlex21; // word translation lexicon (without counts, get these from calc_lex.COOC) typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> mm2dtable_t; mm2dtable_t COOCraw; - TargetPhrase* - mkTPhrase(Phrase const& src, - Moses::bitext::PhrasePair<Token>* fix, - Moses::bitext::PhrasePair<Token>* dyn, + TargetPhrase* + mkTPhrase(Phrase const& src, + Moses::bitext::PhrasePair<Token>* fix, + Moses::bitext::PhrasePair<Token>* dyn, sptr<Bitext<Token> > const& dynbt) const; void process_pstats (Phrase const& src, - uint64_t const pid1, - pstats const& stats, - Bitext<Token> const & bt, + uint64_t const pid1, + pstats const& stats, + Bitext<Token> const & bt, TargetPhraseCollection* tpcoll ) const; @@ -169,16 +169,16 @@ namespace Moses uint64_t const pid1a, pstats * statsa, Bitext<Token> const & bta, uint64_t const pid1b, pstats const* statsb, Bitext<Token> const & btb, TargetPhraseCollection* tpcoll) const; - + bool combine_pstats - (Phrase const& src, + (Phrase const& src, uint64_t const pid1a, pstats* statsa, Bitext<Token> const & bta, - uint64_t const pid1b, pstats const* statsb, Bitext<Token> const & btb, + uint64_t const pid1b, pstats const* statsb, Bitext<Token> const & btb, TargetPhraseCollection* tpcoll) const; void load_extra_data(std::string bname, bool locking); - void load_bias(std::string bname); + void load_bias(std::string bname); public: // Mmsapt(std::string const& description, std::string const& line); @@ -190,22 +190,22 @@ namespace Moses std::string const& GetName() const; #ifndef NO_MOSES - TargetPhraseCollection const* + TargetPhraseCollection const* GetTargetPhraseCollectionLEGACY(ttasksptr const& ttask, const Phrase& src) const; - TargetPhraseCollection const* + TargetPhraseCollection const* GetTargetPhraseCollectionLEGACY(const Phrase& src) const; - void + void GetTargetPhraseCollectionBatch(ttasksptr const& ttask, const InputPathList &inputPathQueue) const; - + //! Create a sentence-specific manager for SCFG rule lookup. ChartRuleLookupManager* CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &); - + ChartRuleLookupManager* - CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &, + CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &, std::size_t); #endif @@ -222,7 +222,7 @@ namespace Moses bool ProvidesPrefixCheck() const; // return true if prefix /phrase/ check exists // bool PrefixExists(Phrase const& phrase, SamplingBias const* const bias) const; bool PrefixExists(ttasksptr const& ttask, Phrase const& phrase) const; - + bool isLogVal(int i) const; bool isInteger(int i) const; @@ -232,7 +232,7 @@ namespace Moses void CleanUpAfterSentenceProcessing(ttasksptr const& ttask); // align two new sentences - sptr<std::vector<int> > + sptr<std::vector<int> > align(std::string const& src, std::string const& trg) const; std::vector<std::string> const& diff --git a/moses/TranslationModel/UG/mmsapt_align.cc b/moses/TranslationModel/UG/mmsapt_align.cc index 65cf979e1..13d8387d2 100644 --- a/moses/TranslationModel/UG/mmsapt_align.cc +++ b/moses/TranslationModel/UG/mmsapt_align.cc @@ -6,7 +6,7 @@ // using namespace bitext; // using namespace std; // using namespace boost; - + // struct PPgreater // { // bool operator()(PhrasePair const& a, PhrasePair const& b) @@ -28,7 +28,7 @@ // PhrasePair pp; // ushort s1,e1,s2,e2; // start and end positions // int prev; // preceding alignment hypothesis -// float score; +// float score; // bitvector scov; // source coverage // PhraseAlnHyp(PhrasePair const& ppx, int slen, // pair<uint32_t,uint32_t> const& sspan, @@ -37,7 +37,7 @@ // { // s1 = sspan.first; e1 = sspan.second; // s2 = tspan.first; e2 = tspan.second; -// for (size_t i = s1; i < e1; ++i) +// for (size_t i = s1; i < e1; ++i) // scov.set(i); // } @@ -78,13 +78,13 @@ // return po_other; // } -// float +// float // dprob_fwd(PhraseAlnHyp const& next) // { // return pp.dfwd[po_fwd(&next)]; // } -// float +// float // dprob_bwd(PhraseAlnHyp const& prev) // { // return pp.dbwd[po_bwd(&prev)]; @@ -102,15 +102,15 @@ // typedef pstats::trg_map_t jStatsTable; // Mmsapt const& PT; -// vector<id_type> s,t; +// vector<id_type> s,t; // pidmap_t sspan2pid, tspan2pid; // span -> phrase ID // pid2span_t spid2span,tpid2span; // vector<vector<sptr<pstats> > > spstats; -// vector<PhrasePair> PP; +// vector<PhrasePair> PP; // // position-independent phrase pair info // public: -// vector<PhraseAlnHyp> PAH; +// vector<PhraseAlnHyp> PAH; // vector<vector<int> > tpos2ahyp; // // maps from target start positions to PhraseAlnHyps starting at // // that position @@ -120,8 +120,8 @@ // void fill_sspan_maps(); // public: // Alignment(Mmsapt const& pt, string const& src, string const& trg); -// void show(ostream& out); -// void show(ostream& out, PhraseAlnHyp const& ah); +// void show(ostream& out); +// void show(ostream& out, PhraseAlnHyp const& ah); // }; // void @@ -129,11 +129,11 @@ // show(ostream& out, PhraseAlnHyp const& ah) // { // #if 0 -// LexicalPhraseScorer2<Token>::table_t const& +// LexicalPhraseScorer2<Token>::table_t const& // COOCjnt = PT.calc_lex.scorer.COOC; // out << setw(10) << exp(ah.score) << " " -// << PT.btfix.T2->pid2str(PT.btfix.V2.get(), ah.pp.p2) +// << PT.btfix.T2->pid2str(PT.btfix.V2.get(), ah.pp.p2) // << " <=> " // << PT.btfix.T1->pid2str(PT.btfix.V1.get(), ah.pp.p1); // vector<uchar> const& a = ah.pp.aln; @@ -168,7 +168,7 @@ // // << "]" << endl; // #endif // } - + // void // Alignment:: // show(ostream& out) @@ -192,7 +192,7 @@ // return spstats[sspan.first][k]; // else return sptr<pstats>(); // } - + // void // Alignment:: // fill_tspan_maps() @@ -207,7 +207,7 @@ // tpid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1)); // tspan2pid[i][k] = pid; // } -// } +// } // } // void @@ -230,11 +230,11 @@ // int y = p->second[0].second-1; // spstats[i].push_back(spstats[x][y-x]); // } -// else +// else // { // spstats[i].push_back(PT.btfix.lookup(m)); // cout << PT.btfix.T1->pid2str(PT.btfix.V1.get(),pid) << " " -// << spstats[i].back()->good << "/" << spstats[i].back()->sample_cnt +// << spstats[i].back()->good << "/" << spstats[i].back()->sample_cnt // << endl; // } // spid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1)); @@ -262,14 +262,14 @@ // // size_t m2 = COOC.m2(i); // // if (j*1000 > m1 && j*1000 > m2) // // cout << " " << (*PT.btfix.V1)[k]; -// // } +// // } // // } // // cout << endl; // // } - + // fill_tspan_maps(); // fill_sspan_maps(); -// tpos2ahyp.resize(t.size()); +// tpos2ahyp.resize(t.size()); // // now fill the association score table // PAH.reserve(1000000); // typedef pid2span_t::iterator psiter; @@ -301,12 +301,12 @@ // } // } - + // int // extend(vector<PhraseAlnHyp> & PAH, int edge, int next) // { -// if ((PAH[edge].scov & PAH[next].scov).count()) +// if ((PAH[edge].scov & PAH[next].scov).count()) // return -1; // int ret = PAH.size(); // PAH.push_back(PAH[next]); diff --git a/moses/TranslationModel/UG/ptable-describe-features.cc b/moses/TranslationModel/UG/ptable-describe-features.cc index dbd5accb9..c9dd3abd1 100644 --- a/moses/TranslationModel/UG/ptable-describe-features.cc +++ b/moses/TranslationModel/UG/ptable-describe-features.cc @@ -19,7 +19,7 @@ int main() { if (line.empty()) continue; size_t k = line.find_first_not_of(" "); - if (line.find("Mmsapt") != k && + if (line.find("Mmsapt") != k && line.find("PhraseDictionaryBitextSampling") != k) continue; Mmsapt PT(line); @@ -32,6 +32,6 @@ int main() } exit(0); } - - + + diff --git a/moses/TranslationModel/UG/ptable-lookup.cc b/moses/TranslationModel/UG/ptable-lookup.cc index e165011c7..94627a02c 100644 --- a/moses/TranslationModel/UG/ptable-lookup.cc +++ b/moses/TranslationModel/UG/ptable-lookup.cc @@ -19,13 +19,13 @@ class SimplePhrase : public Moses::Phrase vector<FactorType> const m_fo; // factor order public: SimplePhrase(): m_fo(1,FactorType(0)) {} - - void init(string const& s) + + void init(string const& s) { istringstream buf(s); string w; - while (buf >> w) + while (buf >> w) { - Word wrd; + Word wrd; this->AddWord().CreateFromString(Input,m_fo,StringPiece(w),false,false); } } @@ -63,15 +63,15 @@ int main(int argc, char* argv[]) cerr << "Phrase table implementation not supported by this utility." << endl; exit(1); } - + string line; while (true) { Sentence phrase; if (!phrase.Read(cin,ifo)) break; - if (pdta) + if (pdta) { - pdta->InitializeForInput(phrase); + pdta->InitializeForInput(phrase); // do we also need to call CleanupAfterSentenceProcessing at the end? } Phrase& p = phrase; @@ -79,13 +79,13 @@ int main(int argc, char* argv[]) cout << p << endl; TargetPhraseCollection const* trg = PT->GetTargetPhraseCollectionLEGACY(p); if (!trg) continue; - vector<size_t> order(trg->GetSize()); + vector<size_t> order(trg->GetSize()); for (size_t i = 0; i < order.size(); ++i) order[i] = i; sort(order.begin(),order.end(),TargetPhraseIndexSorter(*trg)); size_t k = 0; - // size_t precision = + // size_t precision = cout.precision(2); - + vector<string> fname; if (mmsapt) { @@ -119,6 +119,6 @@ int main(int argc, char* argv[]) } exit(0); } - - + + diff --git a/moses/TranslationModel/UG/sapt_phrase_key.h b/moses/TranslationModel/UG/sapt_phrase_key.h index e1ecf1573..0caf11e43 100644 --- a/moses/TranslationModel/UG/sapt_phrase_key.h +++ b/moses/TranslationModel/UG/sapt_phrase_key.h @@ -8,6 +8,6 @@ namespace sapt using namespace Moses; using namespace std; - + } diff --git a/moses/TranslationModel/UG/sapt_phrase_scorers.h b/moses/TranslationModel/UG/sapt_phrase_scorers.h index 9870ed7f0..ace907d73 100644 --- a/moses/TranslationModel/UG/sapt_phrase_scorers.h +++ b/moses/TranslationModel/UG/sapt_phrase_scorers.h @@ -1,6 +1,6 @@ // -*- c++ -*- // Phrase scoring functions for suffix array-based phrase tables -// written by Ulrich Germann +// written by Ulrich Germann #pragma once #include "sapt_pscore_unaligned.h" // count # of unaligned words #include "sapt_pscore_provenance.h" // reward for joint phrase occ. per corpus diff --git a/moses/TranslationModel/UG/sapt_pscore_base.h b/moses/TranslationModel/UG/sapt_pscore_base.h index ff705f952..388c83d9b 100644 --- a/moses/TranslationModel/UG/sapt_pscore_base.h +++ b/moses/TranslationModel/UG/sapt_pscore_base.h @@ -1,6 +1,6 @@ // -*- c++ -*- // Base classes for suffix array-based phrase scorers -// written by Ulrich Germann +// written by Ulrich Germann #pragma once #include "moses/TranslationModel/UG/mm/ug_bitext.h" #include "util/exception.hh" @@ -21,72 +21,72 @@ namespace Moses { string m_tag; vector<string> m_feature_names; public: - - virtual - void - operator()(Bitext<Token> const& pt, - PhrasePair<Token>& pp, - vector<float> * dest=NULL) + + virtual + void + operator()(Bitext<Token> const& pt, + PhrasePair<Token>& pp, + vector<float> * dest=NULL) const = 0; void setIndex(int const i) { m_index = i; } - + int getIndex() const { return m_index; } - int + int fcnt() const { return m_num_feats; } - + vector<string> const & fnames() const { return m_feature_names; } string const & fname(int i) const - { + { if (i < 0) i += m_num_feats; UTIL_THROW_IF2(i < 0 || i >= m_num_feats, "Feature name index out of range at " << HERE); - return m_feature_names.at(i); + return m_feature_names.at(i); } virtual bool - isLogVal(int i) const { return true; }; - // is this feature log valued? - + isLogVal(int i) const { return true; }; + // is this feature log valued? + virtual bool - isIntegerValued(int i) const { return false; }; - // is this feature integer valued (e.g., count features)? + isIntegerValued(int i) const { return false; }; + // is this feature integer valued (e.g., count features)? virtual bool allowPooling() const { return true; } - // does this feature function allow pooling of counts if + // does this feature function allow pooling of counts if // there are no occurrences in the respective corpus? - + virtual void load() { } }; - // base class for 'families' of phrase scorers that have a single + // base class for 'families' of phrase scorers that have a single template<typename Token> class - SingleRealValuedParameterPhraseScorerFamily + SingleRealValuedParameterPhraseScorerFamily : public PhraseScorer<Token> { protected: vector<float> m_x; - virtual - void - init(string const specs) - { + virtual + void + init(string const specs) + { using namespace boost; - UTIL_THROW_IF2(this->m_tag.size() == 0, + UTIL_THROW_IF2(this->m_tag.size() == 0, "m_tag must be initialized in constructor"); UTIL_THROW_IF2(specs.size() == 0,"empty specification string!"); UTIL_THROW_IF2(this->m_feature_names.size(), diff --git a/moses/TranslationModel/UG/sapt_pscore_coherence.h b/moses/TranslationModel/UG/sapt_pscore_coherence.h index a3211df54..c201c9651 100644 --- a/moses/TranslationModel/UG/sapt_pscore_coherence.h +++ b/moses/TranslationModel/UG/sapt_pscore_coherence.h @@ -1,5 +1,5 @@ // -*- c++ -*- -// written by Ulrich Germann +// written by Ulrich Germann #pragma once #include "moses/TranslationModel/UG/mm/ug_bitext.h" #include "util/exception.hh" @@ -13,16 +13,16 @@ namespace Moses { PScoreCoherence : public PhraseScorer<Token> { public: - PScoreCoherence(string const dummy) - { + PScoreCoherence(string const dummy) + { this->m_index = -1; this->m_num_feats = 1; this->m_feature_names.push_back(string("coherence")); } - - void - operator()(Bitext<Token> const& bt, - PhrasePair<Token>& pp, + + void + operator()(Bitext<Token> const& bt, + PhrasePair<Token>& pp, vector<float> * dest = NULL) const { if (!dest) dest = &pp.fvals; diff --git a/moses/TranslationModel/UG/sapt_pscore_lex1.h b/moses/TranslationModel/UG/sapt_pscore_lex1.h index a8e83da51..76ca2a9a4 100644 --- a/moses/TranslationModel/UG/sapt_pscore_lex1.h +++ b/moses/TranslationModel/UG/sapt_pscore_lex1.h @@ -1,6 +1,6 @@ // -*- c++ -*- // Phrase scorer that counts the number of unaligend words in the phrase -// written by Ulrich Germann +// written by Ulrich Germann #include "moses/TranslationModel/UG/mm/ug_bitext.h" #include "sapt_pscore_base.h" @@ -17,11 +17,11 @@ namespace Moses { string m_lexfile; public: LexicalPhraseScorer2<Token> scorer; - - PScoreLex1(string const& alphaspec, string const& lexfile) - { + + PScoreLex1(string const& alphaspec, string const& lexfile) + { this->m_index = -1; - this->m_num_feats = 2; + this->m_num_feats = 2; this->m_feature_names.reserve(2); this->m_feature_names.push_back("lexfwd"); this->m_feature_names.push_back("lexbwd"); @@ -31,13 +31,13 @@ namespace Moses { void load() - { - scorer.open(m_lexfile); + { + scorer.open(m_lexfile); } - - void - operator()(Bitext<Token> const& bt, - PhrasePair<Token>& pp, + + void + operator()(Bitext<Token> const& bt, + PhrasePair<Token>& pp, vector<float> * dest = NULL) const { if (!dest) dest = &pp.fvals; @@ -48,27 +48,27 @@ namespace Moses { cout << len1 << " " << len2 << endl; Token const* t1 = bt.T1->sntStart(sid1); for (size_t i = off1; i < off1 + len1; ++i) - cout << (*bt.V1)[t1[i].id()] << " "; + cout << (*bt.V1)[t1[i].id()] << " "; cout << __FILE__ << ":" << __LINE__ << endl; - + Token const* t2 = bt.T2->sntStart(sid2); for (size_t i = off2; i < off2 + len2; ++i) - cout << (*bt.V2)[t2[i].id()] << " "; + cout << (*bt.V2)[t2[i].id()] << " "; cout << __FILE__ << ":" << __LINE__ << endl; - + BOOST_FOREACH (int a, pp.aln) cout << a << " " ; cout << __FILE__ << ":" << __LINE__ << "\n" << endl; - + scorer.score(bt.T1->sntStart(sid1)+off1,0,len1, bt.T2->sntStart(sid2)+off2,0,len2, pp.aln, m_alpha, (*dest)[this->m_index], (*dest)[this->m_index+1]); #endif - scorer.score(pp.start1,0, pp.len1, - pp.start2,0, pp.len2, pp.aln, m_alpha, - (*dest)[this->m_index], + scorer.score(pp.start1,0, pp.len1, + pp.start2,0, pp.len2, pp.aln, m_alpha, + (*dest)[this->m_index], (*dest)[this->m_index+1]); } }; diff --git a/moses/TranslationModel/UG/sapt_pscore_logcnt.h b/moses/TranslationModel/UG/sapt_pscore_logcnt.h index 2790323ed..9dc5ac7ba 100644 --- a/moses/TranslationModel/UG/sapt_pscore_logcnt.h +++ b/moses/TranslationModel/UG/sapt_pscore_logcnt.h @@ -2,7 +2,7 @@ // Phrase scorer that rewards the number of phrase pair occurrences in a bitext // with the asymptotic function x/(j+x) where x > 0 is a function // parameter that determines the steepness of the rewards curve -// written by Ulrich Germann +// written by Ulrich Germann #include "sapt_pscore_base.h" #include <boost/dynamic_bitset.hpp> @@ -10,15 +10,15 @@ using namespace std; namespace Moses { namespace bitext { - + template<typename Token> class PScoreLogCnt : public PhraseScorer<Token> { string m_specs; public: - PScoreLogCnt(string const specs) - { + PScoreLogCnt(string const specs) + { this->m_index = -1; this->m_specs = specs; if (specs.find("r1") != string::npos) // raw source phrase counts @@ -35,11 +35,11 @@ namespace Moses { } bool - isIntegerValued(int i) const { return true; } + isIntegerValued(int i) const { return true; } - void - operator()(Bitext<Token> const& bt, - PhrasePair<Token>& pp, + void + operator()(Bitext<Token> const& bt, + PhrasePair<Token>& pp, vector<float> * dest = NULL) const { if (!dest) dest = &pp.fvals; @@ -49,15 +49,15 @@ namespace Moses { assert(pp.joint); assert(pp.raw2); size_t i = this->m_index; - if (m_specs.find("r1") != string::npos) + if (m_specs.find("r1") != string::npos) (*dest)[i++] = log(pp.raw1); - if (m_specs.find("s1") != string::npos) + if (m_specs.find("s1") != string::npos) (*dest)[i++] = log(pp.sample1); - if (m_specs.find("g1") != string::npos) + if (m_specs.find("g1") != string::npos) (*dest)[i++] = log(pp.good1); - if (m_specs.find("j") != string::npos) + if (m_specs.find("j") != string::npos) (*dest)[i++] = log(pp.joint); - if (m_specs.find("r2") != string::npos) + if (m_specs.find("r2") != string::npos) (*dest)[++i] = log(pp.raw2); } }; diff --git a/moses/TranslationModel/UG/sapt_pscore_pbwd.h b/moses/TranslationModel/UG/sapt_pscore_pbwd.h index f7b4686d7..9366777ef 100644 --- a/moses/TranslationModel/UG/sapt_pscore_pbwd.h +++ b/moses/TranslationModel/UG/sapt_pscore_pbwd.h @@ -1,5 +1,5 @@ //-*- c++ -*- -// written by Ulrich Germann +// written by Ulrich Germann #pragma once #include "moses/TranslationModel/UG/mm/ug_bitext.h" #include "util/exception.hh" @@ -15,12 +15,12 @@ namespace Moses { { float conf; string denom; - + public: - PScorePbwd(float const c, string d) - { + PScorePbwd(float const c, string d) + { this->m_index = -1; - conf = c; + conf = c; denom = d; size_t checksum = d.size(); BOOST_FOREACH(char const& x, denom) @@ -36,13 +36,13 @@ namespace Moses { << d << "' for Pbwd phrase scorer at " << HERE); } - void - operator()(Bitext<Token> const& bt, - PhrasePair<Token>& pp, + void + operator()(Bitext<Token> const& bt, + PhrasePair<Token>& pp, vector<float> * dest = NULL) const { if (!dest) dest = &pp.fvals; - // we use the denominator specification to scale the raw counts on the + // we use the denominator specification to scale the raw counts on the // target side; the clean way would be to counter-sample size_t i = this->m_index; BOOST_FOREACH(char const& x, denom) diff --git a/moses/TranslationModel/UG/sapt_pscore_pfwd.h b/moses/TranslationModel/UG/sapt_pscore_pfwd.h index ed48a93d2..c5de210a1 100644 --- a/moses/TranslationModel/UG/sapt_pscore_pfwd.h +++ b/moses/TranslationModel/UG/sapt_pscore_pfwd.h @@ -1,5 +1,5 @@ // -*- c++ -*- -// written by Ulrich Germann +// written by Ulrich Germann #pragma once #include "moses/TranslationModel/UG/mm/ug_bitext.h" #include "util/exception.hh" @@ -18,10 +18,10 @@ namespace Moses { public: - PScorePfwd(float const c, string d) - { + PScorePfwd(float const c, string d) + { this->m_index = -1; - conf = c; + conf = c; denom = d; size_t checksum = d.size(); BOOST_FOREACH(char const& x, denom) @@ -32,17 +32,17 @@ namespace Moses { this->m_feature_names.push_back(s); } this->m_num_feats = this->m_feature_names.size(); - UTIL_THROW_IF2(this->m_feature_names.size() != checksum, - "Unknown parameter in specification '" + UTIL_THROW_IF2(this->m_feature_names.size() != checksum, + "Unknown parameter in specification '" << d << "' for Pfwd phrase scorer at " << HERE); } - - void - operator()(Bitext<Token> const& bt, PhrasePair<Token> & pp, + + void + operator()(Bitext<Token> const& bt, PhrasePair<Token> & pp, vector<float> * dest = NULL) const { if (!dest) dest = &pp.fvals; - if (pp.joint > pp.good1) + if (pp.joint > pp.good1) { pp.joint = pp.good1; // cerr<<bt.toString(pp.p1,0)<<" ::: "<<bt.toString(pp.p2,1)<<endl; @@ -53,18 +53,18 @@ namespace Moses { { switch (c) { - case 'g': - (*dest)[i++] = log(lbop(pp.good1, pp.joint, conf)); + case 'g': + (*dest)[i++] = log(lbop(pp.good1, pp.joint, conf)); break; - case 's': - (*dest)[i++] = log(lbop(pp.sample1, pp.joint, conf)); + case 's': + (*dest)[i++] = log(lbop(pp.sample1, pp.joint, conf)); break; case 'r': - (*dest)[i++] = log(lbop(pp.raw1, pp.joint, conf)); + (*dest)[i++] = log(lbop(pp.raw1, pp.joint, conf)); } } } }; } } - + diff --git a/moses/TranslationModel/UG/sapt_pscore_phrasecount.h b/moses/TranslationModel/UG/sapt_pscore_phrasecount.h index e0a6eb48b..e0ce40117 100644 --- a/moses/TranslationModel/UG/sapt_pscore_phrasecount.h +++ b/moses/TranslationModel/UG/sapt_pscore_phrasecount.h @@ -1,5 +1,5 @@ // -*- c++ -*- -// written by Ulrich Germann +// written by Ulrich Germann #pragma once #include "moses/TranslationModel/UG/mm/ug_bitext.h" #include "util/exception.hh" @@ -20,15 +20,15 @@ namespace Moses { this->m_num_feats = 1; this->m_feature_names.push_back(string("phrasecount")); } - - void + + void operator()(Bitext<Token> const& bt, - PhrasePair<Token>& pp, + PhrasePair<Token>& pp, vector<float> * dest = NULL) const { if (!dest) dest = &pp.fvals; (*dest)[this->m_index] = 1; - } + } }; } } diff --git a/moses/TranslationModel/UG/sapt_pscore_provenance.h b/moses/TranslationModel/UG/sapt_pscore_provenance.h index c33b98fe7..ee7b08bda 100644 --- a/moses/TranslationModel/UG/sapt_pscore_provenance.h +++ b/moses/TranslationModel/UG/sapt_pscore_provenance.h @@ -2,7 +2,7 @@ // Phrase scorer that rewards the number of phrase pair occurrences in a bitext // with the asymptotic function j/(j+x) where x > 0 is a function // parameter that determines the steepness of the rewards curve -// written by Ulrich Germann +// written by Ulrich Germann #include "sapt_pscore_base.h" #include <boost/dynamic_bitset.hpp> @@ -10,7 +10,7 @@ using namespace std; namespace Moses { namespace bitext { - + // asymptotic provenance feature n/(n+x) template<typename Token> class @@ -18,18 +18,18 @@ namespace Moses { { public: - PScoreProvenance(string const& spec) + PScoreProvenance(string const& spec) { this->m_tag = "prov"; this->init(spec); } - + bool - isLogVal(int i) const { return false; } + isLogVal(int i) const { return false; } - void - operator()(Bitext<Token> const& bt, - PhrasePair<Token>& pp, + void + operator()(Bitext<Token> const& bt, + PhrasePair<Token>& pp, vector<float> * dest = NULL) const { if (!dest) dest = &pp.fvals; @@ -39,7 +39,7 @@ namespace Moses { } bool - allowPooling() const + allowPooling() const { return false; } }; diff --git a/moses/TranslationModel/UG/sapt_pscore_rareness.h b/moses/TranslationModel/UG/sapt_pscore_rareness.h index 58f204c88..34979243c 100644 --- a/moses/TranslationModel/UG/sapt_pscore_rareness.h +++ b/moses/TranslationModel/UG/sapt_pscore_rareness.h @@ -2,7 +2,7 @@ // Phrase scorer that rewards the number of phrase pair occurrences in a bitext // with the asymptotic function x/(j+x) where x > 0 is a function // parameter that determines the steepness of the rewards curve -// written by Ulrich Germann +// written by Ulrich Germann #include "sapt_pscore_base.h" #include <boost/dynamic_bitset.hpp> @@ -10,25 +10,25 @@ using namespace std; namespace Moses { namespace bitext { - + // rareness penalty: x/(n+x) template<typename Token> class PScoreRareness : public SingleRealValuedParameterPhraseScorerFamily<Token> { public: - PScoreRareness(string const spec) + PScoreRareness(string const spec) { this->m_tag = "rare"; this->init(spec); } bool - isLogVal(int i) const { return false; } + isLogVal(int i) const { return false; } - void - operator()(Bitext<Token> const& bt, - PhrasePair<Token>& pp, + void + operator()(Bitext<Token> const& bt, + PhrasePair<Token>& pp, vector<float> * dest = NULL) const { if (!dest) dest = &pp.fvals; diff --git a/moses/TranslationModel/UG/sapt_pscore_unaligned.h b/moses/TranslationModel/UG/sapt_pscore_unaligned.h index dafc1e129..8dceb1ad0 100644 --- a/moses/TranslationModel/UG/sapt_pscore_unaligned.h +++ b/moses/TranslationModel/UG/sapt_pscore_unaligned.h @@ -1,6 +1,6 @@ // -*- c++ -*- // Phrase scorer that counts the number of unaligend words in the phrase -// written by Ulrich Germann +// written by Ulrich Germann #include "sapt_pscore_base.h" #include <boost/dynamic_bitset.hpp> @@ -14,7 +14,7 @@ namespace Moses { { typedef boost::dynamic_bitset<typename ::uint64_t> bitvector; public: - PScoreUnaligned(string const spec) + PScoreUnaligned(string const spec) { this->m_index = -1; int f = this->m_num_feats = atoi(spec.c_str()); @@ -28,16 +28,16 @@ namespace Moses { this->m_feature_names[1] = "unal-t"; } } - + bool - isLogVal(int i) const { return false; } - + isLogVal(int i) const { return false; } + bool - isIntegerValued(int i) const { return true; } + isIntegerValued(int i) const { return true; } - void - operator()(Bitext<Token> const& bt, - PhrasePair<Token>& pp, + void + operator()(Bitext<Token> const& bt, + PhrasePair<Token>& pp, vector<float> * dest = NULL) const { if (!dest) dest = &pp.fvals; @@ -46,9 +46,9 @@ namespace Moses { // parse_pid(pp.p2, sid2, off2, len2); bitvector check1(pp.len1),check2(pp.len2); for (size_t i = 0; i < pp.aln.size(); ) - { - check1.set(pp.aln[i++]); - check2.set(pp.aln.at(i++)); + { + check1.set(pp.aln[i++]); + check2.set(pp.aln.at(i++)); } if (this->m_num_feats == 1) diff --git a/moses/TranslationModel/UG/sapt_pscore_wordcount.h b/moses/TranslationModel/UG/sapt_pscore_wordcount.h index 3227bb6ba..a5000be37 100644 --- a/moses/TranslationModel/UG/sapt_pscore_wordcount.h +++ b/moses/TranslationModel/UG/sapt_pscore_wordcount.h @@ -1,5 +1,5 @@ // -*- c++ -*- -// written by Ulrich Germann +// written by Ulrich Germann #pragma once #include "moses/TranslationModel/UG/mm/ug_bitext.h" #include "util/exception.hh" @@ -13,7 +13,7 @@ namespace Moses { class PScoreWC : public PhraseScorer<Token> { - public: + public: PScoreWC(string const dummy) { this->m_index = -1; @@ -21,14 +21,14 @@ namespace Moses { this->m_feature_names.push_back(string("wordcount")); } - void + void operator()(Bitext<Token> const& bt, - PhrasePair<Token>& pp, + PhrasePair<Token>& pp, vector<float> * dest = NULL) const { if (!dest) dest = &pp.fvals; (*dest)[this->m_index] = pp.len2; - } + } }; } } diff --git a/moses/TranslationModel/UG/sim-pe.cc b/moses/TranslationModel/UG/sim-pe.cc index 460d66c1f..00a705936 100644 --- a/moses/TranslationModel/UG/sim-pe.cc +++ b/moses/TranslationModel/UG/sim-pe.cc @@ -15,7 +15,7 @@ using namespace boost; vector<FactorType> fo(1,FactorType(0)); -ostream& +ostream& operator<<(ostream& out, Hypothesis const* x) { vector<const Hypothesis*> H; @@ -24,7 +24,7 @@ operator<<(ostream& out, Hypothesis const* x) for (; H.size(); H.pop_back()) { Phrase const& p = H.back()->GetCurrTargetPhrase(); - for (size_t pos = 0 ; pos < p.GetSize() ; pos++) + for (size_t pos = 0 ; pos < p.GetSize() ; pos++) out << *p.GetFactor(pos, 0) << (H.size() ? " " : ""); } return out; @@ -33,19 +33,19 @@ operator<<(ostream& out, Hypothesis const* x) vector<FactorType> ifo; size_t lineNumber; -string +string translate(string const& source) { StaticData const& global = StaticData::Instance(); - Sentence sentence; - istringstream ibuf(source+"\n"); + Sentence sentence; + istringstream ibuf(source+"\n"); sentence.Read(ibuf,ifo); // Manager manager(lineNumber, sentence, global.GetSearchAlgorithm()); Manager manager(sentence, global.GetSearchAlgorithm()); manager.ProcessSentence(); - + ostringstream obuf; const Hypothesis* h = manager.GetBestHypothesis(); obuf << h; @@ -58,7 +58,7 @@ int main(int argc, char* argv[]) Parameter params; if (!params.LoadParam(argc,argv) || !StaticData::LoadDataStatic(¶ms, argv[0])) exit(1); - + StaticData const& global = StaticData::Instance(); global.SetVerboseLevel(0); ifo = global.GetInputFactorOrder(); @@ -79,6 +79,6 @@ int main(int argc, char* argv[]) } exit(0); } - - + + diff --git a/moses/TranslationModel/UG/spe-check-coverage.cc b/moses/TranslationModel/UG/spe-check-coverage.cc index 6e838ad04..378dd800f 100644 --- a/moses/TranslationModel/UG/spe-check-coverage.cc +++ b/moses/TranslationModel/UG/spe-check-coverage.cc @@ -24,13 +24,13 @@ class SimplePhrase : public Moses::Phrase vector<FactorType> const m_fo; // factor order public: SimplePhrase(): m_fo(1,FactorType(0)) {} - - void init(string const& s) + + void init(string const& s) { istringstream buf(s); string w; - while (buf >> w) + while (buf >> w) { - Word wrd; + Word wrd; this->AddWord().CreateFromString(Input,m_fo,StringPiece(w),false,false); } } @@ -45,7 +45,7 @@ public: bool operator()(size_t a, size_t b) const { // return cmp(*my_tpc[a], *my_tpc[b]); - return (my_tpc[a]->GetScoreBreakdown().GetWeightedScore() > + return (my_tpc[a]->GetScoreBreakdown().GetWeightedScore() > my_tpc[b]->GetScoreBreakdown().GetWeightedScore()); } }; @@ -59,7 +59,7 @@ int main(int argc, char* argv[]) argfilter[1] = std::make_pair(string("--spe-trg"),1); argfilter[2] = std::make_pair(string("--spe-aln"),1); argfilter[3] = std::make_pair(string("--spe-show"),1); - + char** my_args; int my_acnt; char** mo_args; int mo_acnt; filter_arguments(argc, argv, mo_acnt, &mo_args, my_acnt, &my_args, argfilter); @@ -77,9 +77,9 @@ int main(int argc, char* argv[]) else if (!strcmp(my_args[i],"--spe-show")) vlevel = my_args[i+1]; } - + Parameter params; - if (!params.LoadParam(mo_acnt,mo_args) || + if (!params.LoadParam(mo_acnt,mo_args) || !StaticData::LoadDataStatic(¶ms, mo_args[0])) exit(1); @@ -95,15 +95,15 @@ int main(int argc, char* argv[]) exit(1); } mmsapt->SetTableLimit(0); - + string srcline,trgline,alnline; cout.precision(2); vector<string> fname = mmsapt->GetFeatureNames(); while (getline(spe_src,srcline)) { - UTIL_THROW_IF2(!getline(spe_trg,trgline), HERE + UTIL_THROW_IF2(!getline(spe_trg,trgline), HERE << ": missing data for online updates."); - UTIL_THROW_IF2(!getline(spe_aln,alnline), HERE + UTIL_THROW_IF2(!getline(spe_aln,alnline), HERE << ": missing data for online updates."); cout << string(80,'-') << "\n" << srcline << "\n" << trgline << "\n" << endl; @@ -127,29 +127,29 @@ int main(int argc, char* argv[]) if (!mmsapt->PrefixExists(p)) break; TargetPhraseCollection const* trg = PT->GetTargetPhraseCollectionLEGACY(p); if (!trg || !trg->GetSize()) continue; - + bool header_done = false; bool has_dynamic_match = vlevel == "all" || vlevel == "ALL"; - vector<size_t> order; order.reserve(trg->GetSize()); + vector<size_t> order; order.reserve(trg->GetSize()); size_t stop = trg->GetSize(); vector<size_t> o2(trg->GetSize()); for (size_t i = 0; i < stop; ++i) o2[i] = i; sort(o2.begin(),o2.end(),TargetPhraseIndexSorter(*trg)); - + for (size_t r = 0; r < stop; ++r) // r for rank { if (vlevel != "ALL") { Phrase const& phr = static_cast<Phrase const&>(*(*trg)[o2[r]]); - ostringstream buf; buf << phr; - string tphrase = buf.str(); + ostringstream buf; buf << phr; + string tphrase = buf.str(); tphrase.erase(tphrase.size()-1); size_t s = trgline.find(tphrase); if (s == string::npos) continue; size_t e = s + tphrase.size(); if ((s && trgline[s-1] != ' ') || (e < trgline.size() && trgline[e] != ' ')) - continue; + continue; } order.push_back(r); if (!has_dynamic_match) @@ -170,7 +170,7 @@ int main(int argc, char* argv[]) ScoreComponentCollection::IndexPair idx = scc.GetIndexes(PT); FVector const& scores = scc.GetScoresVector(); float wscore = scc.GetWeightedScore(); - if (vlevel == "new" && scores[idx.first + dynprovidx] == 0) + if (vlevel == "new" && scores[idx.first + dynprovidx] == 0) continue; if (!header_done) { @@ -201,7 +201,7 @@ int main(int argc, char* argv[]) } cout << " " << format(fmt) % (mmsapt->isInteger(j) ? round(f) : f); } - cout << " " << format("%10.3e") % exp(wscore) + cout << " " << format("%10.3e") % exp(wscore) << " " << format("%10.3e") % exp((*trg)[o2[r]]->GetFutureScore()) << endl; } mmsapt->Release(trg); @@ -213,6 +213,6 @@ int main(int argc, char* argv[]) // } exit(0); } -#endif - +#endif + diff --git a/moses/TranslationModel/UG/spe-check-coverage2.cc b/moses/TranslationModel/UG/spe-check-coverage2.cc index fa9ce1c85..3b4f559d2 100644 --- a/moses/TranslationModel/UG/spe-check-coverage2.cc +++ b/moses/TranslationModel/UG/spe-check-coverage2.cc @@ -20,7 +20,7 @@ typedef Bitext<Token>::iter iter; mmbitext bg; -void +void show(ostream& out, iter& f) { iter b(bg.I2.get(),f.getToken(0),f.size()); @@ -29,11 +29,11 @@ show(ostream& out, iter& f) else out << string(12,' '); out << " " << setw(5) << int(round(f.approxOccurrenceCount())) << " "; - out << f.str(bg.V1.get()) << endl; + out << f.str(bg.V1.get()) << endl; } -void +void dump(ostream& out, iter& f) { float cnt = f.size() ? f.approxOccurrenceCount() : 0; @@ -44,12 +44,12 @@ dump(ostream& out, iter& f) while (f.over()); f.up(); } - if (f.size() && cnt < f.approxOccurrenceCount() && f.approxOccurrenceCount() > 1) + if (f.size() && cnt < f.approxOccurrenceCount() && f.approxOccurrenceCount() > 1) show(out,f); } -void +void read_data(string fname, vector<string>& dest) { ifstream in(fname.c_str()); @@ -71,6 +71,6 @@ int main(int argc, char* argv[]) dump(cout,mfg); exit(0); } - - + + diff --git a/moses/TranslationModel/UG/spe-check-coverage3.cc b/moses/TranslationModel/UG/spe-check-coverage3.cc index ea8c85e99..a62daa7b8 100644 --- a/moses/TranslationModel/UG/spe-check-coverage3.cc +++ b/moses/TranslationModel/UG/spe-check-coverage3.cc @@ -22,7 +22,7 @@ typedef Bitext<Token>::iter iter; mmbitext bg; vector<string> src,trg,aln; -void +void show(ostream& out, iter& f) { iter b(bg.I2.get(),f.getToken(0),f.size()); @@ -31,11 +31,11 @@ show(ostream& out, iter& f) else out << string(12,' '); out << " " << setw(5) << int(round(f.approxOccurrenceCount())) << " "; - out << f.str(bg.V1.get()) << endl; + out << f.str(bg.V1.get()) << endl; } -void +void dump(ostream& out, iter& f) { float cnt = f.size() ? f.approxOccurrenceCount() : 0; @@ -46,12 +46,12 @@ dump(ostream& out, iter& f) while (f.over()); f.up(); } - if (f.size() && cnt < f.approxOccurrenceCount() && f.approxOccurrenceCount() > 1) + if (f.size() && cnt < f.approxOccurrenceCount() && f.approxOccurrenceCount() > 1) show(out,f); } -void +void read_data(string fname, vector<string>& dest) { ifstream in(fname.c_str()); @@ -60,14 +60,14 @@ read_data(string fname, vector<string>& dest) in.close(); } -void -show_snt(ostream& out, TokenIndex const& V, vector<Token> const& snt, +void +show_snt(ostream& out, TokenIndex const& V, vector<Token> const& snt, vector<vector<int> > const& a) { for (size_t i = 0; i < snt.size(); ++i) { cout << format("%d:%s[") % i % V[snt[i].id()]; - for (size_t k = 0; k < a[i].size(); ++k) + for (size_t k = 0; k < a[i].size(); ++k) cout << (k?",":"") << a[i][k]; cout << "] "; } @@ -77,7 +77,7 @@ show_snt(ostream& out, TokenIndex const& V, vector<Token> const& snt, void show_pair(size_t const sid) { - vector<Token> s,t; + vector<Token> s,t; fill_token_seq(*bg.V1,src[sid],s); fill_token_seq(*bg.V2,trg[sid],t); vector<vector<int> > a1(s.size()),a2(t.size()); @@ -97,11 +97,11 @@ void show_pair(size_t const sid) int main(int argc, char* argv[]) { - if (argc < 5) + if (argc < 5) { - cerr << "usage: " << argv[0] - << " <bg base name> <L1> <L2> <fg base name>" - << endl; + cerr << "usage: " << argv[0] + << " <bg base name> <L1> <L2> <fg base name>" + << endl; exit(1); } bg.open(argv[1],argv[2],argv[3]); @@ -122,10 +122,10 @@ int main(int argc, char* argv[]) bias[sid] = 0; // cout << src[sid] << endl << trg[sid] << endl; // show_pair(sid); - vector<Token> snt; + vector<Token> snt; fill_token_seq(*bg.V1,src[sid],snt); vector<vector<sptr<vector<PhrasePair<Token> > > > > FG,BG; - fg->lookup(snt,*fg->I1,FG,NULL,NULL,&bias,true); + fg->lookup(snt,*fg->I1,FG,NULL,NULL,&bias,true); bg.lookup(snt,*bg.I1,BG,NULL,NULL,NULL,true); set<sptr<vector<PhrasePair<Token> > > > seen; for (size_t i = 0; i < snt.size(); ++i) @@ -136,7 +136,7 @@ int main(int argc, char* argv[]) { if (!m0.extend(snt[i+k].id())) break; if (k && m0.approxOccurrenceCount() < 2) break; - if (m1.size() == k && (!m1.extend(snt[i+k].id()) || + if (m1.size() == k && (!m1.extend(snt[i+k].id()) || m1.approxOccurrenceCount() < 25)) { cout << toString((*fg->V1), m0.getToken(0), m0.size()) << " " @@ -156,8 +156,8 @@ int main(int argc, char* argv[]) sptr<pstats> bgstats; jstats const* bgjstats = NULL; Bitext<Token>::iter m2(bg.I2.get(), pp.start2, pp.len2); - if (m1.approxOccurrenceCount() > 5000 || - m2.approxOccurrenceCount() > 5000) + if (m1.approxOccurrenceCount() > 5000 || + m2.approxOccurrenceCount() > 5000) continue; if (m1.size() == pp.len1 && m2.size() == pp.len2) { @@ -173,9 +173,9 @@ int main(int argc, char* argv[]) cout << toString(*fg->V1, pp.start1, pp.len1) << " ::: " << toString(*fg->V2, pp.start2, pp.len2) << " " << format("[%u/%u/%u]") % pp.good1 % pp.joint % pp.good2; - if (bgjstats) - cout << " " << (format("[%u/%u/%u]") - % bgstats->good % bgjstats->rcnt() + if (bgjstats) + cout << " " << (format("[%u/%u/%u]") + % bgstats->good % bgjstats->rcnt() % (bgjstats->cnt2() * bgstats->good / bgstats->raw_cnt)); else if (m1.size() == pp.len1) @@ -189,6 +189,6 @@ int main(int argc, char* argv[]) } exit(0); } - - + + diff --git a/moses/TranslationModel/UG/try-align.cc b/moses/TranslationModel/UG/try-align.cc index daafec545..60eabb9e7 100644 --- a/moses/TranslationModel/UG/try-align.cc +++ b/moses/TranslationModel/UG/try-align.cc @@ -17,7 +17,7 @@ float lbop_level = .05; namespace stats { using namespace Moses::bitext; - float + float pmi(size_t j,size_t m1, size_t m2, size_t N) { #if smooth @@ -29,8 +29,8 @@ namespace stats return log(j) + log(N) - log(m1) - log(m2); #endif } - - float + + float npmi(size_t j,size_t m1, size_t m2, size_t N) { #if smooth @@ -39,11 +39,11 @@ namespace stats float p12 = lbop(N,j,lbop_level); return (log(p12) - log(p1) - log(p2)) / -log(p12); #else - return pmi(j,m1,m2,N) / (log(N) - log(j)); + return pmi(j,m1,m2,N) / (log(N) - log(j)); #endif } - float + float mi(size_t j,size_t m1, size_t m2, size_t N) { float ret = 0; @@ -79,7 +79,7 @@ struct PhrasePair float mi; // mutual information float score; - void + void set(vector<ttrack::Position> const& o1, vector<ttrack::Position> const& o2, size_t const N) @@ -90,7 +90,7 @@ struct PhrasePair { if (i1 && o1[i1].sid == o1[i1-1].sid) { ++i1; continue; } if (i2 && o2[i2].sid == o2[i2-1].sid) { ++i2; continue; } - + if (o1[i1].sid == o2[i2].sid) { ++j; ++i1; ++i2; ++m1; ++m2; } else if (o1[i1].sid < o2[i2].sid) { ++i1; ++m1; } else { ++i2; ++m2; } @@ -114,19 +114,19 @@ struct PhrasePair this->score = npmi; // npmi; // hmean; // /sqrt(z); } } stats; - + PhrasePair(ushort s1_=0, ushort e1_=0, ushort s2_=0, ushort e2_=0) : s1(s1_), e1(e1_), s2(s2_), e2(e2_), parent(-1) { } - bool + bool operator<(PhrasePair const& other) const - { - return (this->stats.score == other.stats.score + { + return (this->stats.score == other.stats.score ? (e1-s1 + e2-s2 > other.e1-other.s1 + other.e2-other.s2) - : (this->stats.score > other.stats.score)); + : (this->stats.score > other.stats.score)); } - + size_t len1() const { return e1 - s1; } size_t len2() const { return e2 - s2; } bool includes(PhrasePair const& o) const @@ -142,8 +142,8 @@ PhrasePair::stats_t::cache_t ppcache; struct SortByPositionInCorpus { - bool - operator()(ttrack::Position const& a, + bool + operator()(ttrack::Position const& a, ttrack::Position const& b) const { return a.sid != b.sid ? a.sid < b.sid : a.offset < b.offset; @@ -151,8 +151,8 @@ struct SortByPositionInCorpus }; -void -getoccs(tsa_t::tree_iterator const& m, +void +getoccs(tsa_t::tree_iterator const& m, vector<ttrack::Position>& occs) { occs.clear(); @@ -166,9 +166,9 @@ getoccs(tsa_t::tree_iterator const& m, sort(occs.begin(),occs.end(),SortByPositionInCorpus()); } -void -lookup_phrases(vector<id_type> const& snt, - TokenIndex& V, ttrack_t const& T, +void +lookup_phrases(vector<id_type> const& snt, + TokenIndex& V, ttrack_t const& T, tsa_t const& I, SinglePhrase::cache_t& cache, vector<vector<sptr<SinglePhrase> > >& dest) { @@ -182,7 +182,7 @@ lookup_phrases(vector<id_type> const& snt, if (m.approxOccurrenceCount() < 3) break; // if (k - i > 0) break; sptr<SinglePhrase>& o = cache[m.getPid()]; - if (!o) + if (!o) { o.reset(new SinglePhrase()); o->pid = m.getPid(); @@ -193,7 +193,7 @@ lookup_phrases(vector<id_type> const& snt, } } -struct +struct RowIndexSorter { vector<vector<float> > const& M; @@ -202,14 +202,14 @@ RowIndexSorter : M(m), my_col(c) { } template<typename T> - bool - operator()(T const& a, T const& b) const - { + bool + operator()(T const& a, T const& b) const + { return M.at(a).at(my_col) > M.at(b).at(my_col); } }; -struct +struct ColIndexSorter { vector<vector<float> > const& M; @@ -218,9 +218,9 @@ ColIndexSorter : M(m), my_row(r) { } template<typename T> - bool - operator()(T const& a, T const& b) const - { + bool + operator()(T const& a, T const& b) const + { return M.at(my_row).at(a) > M[my_row].at(b); } @@ -234,7 +234,7 @@ int main(int argc, char* argv[]) T1.reset(new ttrack_t()); T2.reset(new ttrack_t()); - + V1.open(base + L1 + ".tdx"); T1->open(base + L1 + ".mct"); I1.open(base + L1 + ".sfa", T1); @@ -259,7 +259,7 @@ int main(int argc, char* argv[]) vector<PhrasePair> pp_all,pp_good; vector<int> a1(snt1.size(),-1); vector<int> a2(snt2.size(),-1); - + vector<vector<int> > z1(snt1.size(),vector<int>(snt1.size(),-1)); vector<vector<int> > z2(snt2.size(),vector<int>(snt2.size(),-1)); vector<vector<vector<PhrasePair> > >ppm1(M1.size()),ppm2(M2.size()); @@ -282,9 +282,9 @@ int main(int argc, char* argv[]) for (size_t k2 = 0; k2 < M2[i2].size(); ++k2) { pp.e2 = i2 + k2 + 1; - sptr<PhrasePair::stats_t> & s + sptr<PhrasePair::stats_t> & s = ppcache[make_pair(M1[i1][k1]->pid,M2[i2][k2]->pid)]; - if (!s) + if (!s) { s.reset(new PhrasePair::stats_t()); s->set(M1[i1][k1]->occs,M2[i2][k2]->occs,T1->size()); @@ -294,8 +294,8 @@ int main(int argc, char* argv[]) // ppm1[i1][k1].push_back(pp); // ppm2[i2][k2].push_back(pp); size_t J = pp.stats.j * 100; - if (pp.stats.score > 0 - && J >= pp.stats.m1 + if (pp.stats.score > 0 + && J >= pp.stats.m1 && J > pp.stats.m2) { pp_all.push_back(pp); } } @@ -310,7 +310,7 @@ int main(int argc, char* argv[]) for (size_t r = pp.s1; r < pp.e1; ++r) for (size_t c = pp.s2; c < pp.e2; ++c) { - // M[r][c] += log(1-pp.stats.npmi); + // M[r][c] += log(1-pp.stats.npmi); M[r][c] += log(1-pp.stats.mi); } } @@ -342,11 +342,11 @@ int main(int argc, char* argv[]) } cout << endl; } -#endif +#endif #if 0 for (size_t k = 1; k < pp_all.size(); ++k) for (size_t i = k; i--;) - if (pp_all[i].s1 >= pp_all[k].s1 && + if (pp_all[i].s1 >= pp_all[k].s1 && pp_all[i].e1 <= pp_all[k].e1 && pp_all[i].s2 >= pp_all[k].s2 && pp_all[i].e2 <= pp_all[k].e2) @@ -360,35 +360,35 @@ int main(int argc, char* argv[]) { PhrasePair const& x = pp_all[p]; // if (x.stats.npmi < .7) break; - // if (z1[x.s1][x.e1-1] >= 0 || z2[x.s2][x.e2-1] >=0) + // if (z1[x.s1][x.e1-1] >= 0 || z2[x.s2][x.e2-1] >=0) // continue; - for (size_t i = x.s1; i < x.e1; ++i) + for (size_t i = x.s1; i < x.e1; ++i) { - if (assoc1[i] < 0) + if (assoc1[i] < 0) assoc1[i] = p; else { // PhrasePair& y = pp_all[assoc1[i]]; - // if (y.includes(x)) + // if (y.includes(x)) // assoc1[i] = p; } } - for (size_t i = x.s2; i < x.e2; ++i) + for (size_t i = x.s2; i < x.e2; ++i) { - if (assoc2[i] < 0) + if (assoc2[i] < 0) assoc2[i] = p; else { // PhrasePair& y = pp_all[assoc2[i]]; - // if (y.includes(x)) + // if (y.includes(x)) // assoc2[i] = p; } } z1[x.s1][x.e1-1] = p; z2[x.s2][x.e2-1] = p; continue; - cout << (boost::format("%.4f %.8f %.4f") - % x.stats.score + cout << (boost::format("%.4f %.8f %.4f") + % x.stats.score % x.stats.mi % x.stats.npmi); for (size_t z = x.s1; z < x.e1; ++z) @@ -396,8 +396,8 @@ int main(int argc, char* argv[]) cout << " :::"; for (size_t z = x.s2; z < x.e2; ++z) cout << " " << V2[snt2[z]]; - cout << " [" - << x.stats.m1 << "/" << x.stats.j << "/" << x.stats.m2 + cout << " [" + << x.stats.m1 << "/" << x.stats.j << "/" << x.stats.m2 << "]" << endl; } vector<bool> done(pp_all.size(),false); @@ -415,8 +415,8 @@ int main(int argc, char* argv[]) cout << " ::: "; for (size_t j = p.s2; j < p.e2; ++j) cout << j << ":" << V2[snt2[j]] << " "; - cout << "[" - << p.stats.m1 << "/" << p.stats.j << "/" << p.stats.m2 + cout << "[" + << p.stats.m1 << "/" << p.stats.j << "/" << p.stats.m2 << "] "<< p.stats.score << endl; // break; } @@ -433,20 +433,20 @@ int main(int argc, char* argv[]) cout << " ::: "; for (size_t j = p.s2; j < p.e2; ++j) cout << j << ":" << V2[snt2[j]] << " "; - cout << "[" - << p.stats.m1 << "/" << p.stats.j << "/" << p.stats.m2 + cout << "[" + << p.stats.m1 << "/" << p.stats.j << "/" << p.stats.m2 << "] "<< p.stats.score << endl; } -#endif +#endif // sort(pp_all.begin(),pp_all.end()); // BOOST_FOREACH(PhrasePair const& pp, pp_all) // { - // while (ppm1[pp.s1].size() < pp.e1 - pp.s1) + // while (ppm1[pp.s1].size() < pp.e1 - pp.s1) // ppm1[pp.s1].push_back(vector<PhrasePair>()); // vector<PhrasePair>& v1 = ppm1[pp.s1][pp.e1-pp.s1-1]; // if (v1.size() && v1[0].stats.score > pp.stats.score) // continue; - // while (ppm2[pp.s2].size() < pp.e2 - pp.s2) + // while (ppm2[pp.s2].size() < pp.e2 - pp.s2) // ppm2[pp.s2].push_back(vector<PhrasePair>()); // vector<PhrasePair>& v2 = ppm2[pp.s2][pp.e2-pp.s2-1]; // if (v2.size() && v2[0].stats.score > pp.stats.score) @@ -455,12 +455,12 @@ int main(int argc, char* argv[]) // v2.push_back(pp); // } - + // BOOST_FOREACH(vector<vector<PhrasePair> >& vv, ppm1) - // { - // BOOST_FOREACH(vector<PhrasePair>& v, vv) - // { - // sort(v.begin(),v.end()); + // { + // BOOST_FOREACH(vector<PhrasePair>& v, vv) + // { + // sort(v.begin(),v.end()); // if (v.size() > 1 && v[0].stats.score == v[1].stats.score) // v.clear(); // } @@ -468,19 +468,19 @@ int main(int argc, char* argv[]) // for (size_t i2 = 0; i2 < ppm2.size(); ++i2) // { // for (size_t k2 = 0; k2 < ppm2[i2].size(); ++k2) - // { + // { // vector<PhrasePair>& v2 = ppm2[i2][k2]; // sort(v2.begin(),v2.end()); - // if (v2.size() > 1 && v2[0].stats.score == v2[1].stats.score) + // if (v2.size() > 1 && v2[0].stats.score == v2[1].stats.score) // { // v2.clear(); // continue; // } // ushort i1 = v2[0].s1; // ushort k1 = v2[0].e1 - i1 -1; - - // if (ppm1[i1][k1].size() == 0 || - // ppm1[i1][k1][0].s2 != i2 || + + // if (ppm1[i1][k1].size() == 0 || + // ppm1[i1][k1][0].s2 != i2 || // ppm1[i1][k1][0].e2 != i2 + k2 + 1) // { v2.clear(); } // else pp_good.push_back(ppm2[i2][k2][0]); @@ -508,7 +508,7 @@ int main(int argc, char* argv[]) // // cout << V2[snt2[z]] << " "; // // cout << pp.m1 << "/" << pp.j << "/" << pp.m2 << endl; // // } - + } } diff --git a/moses/TranslationModel/UG/try-align2.cc b/moses/TranslationModel/UG/try-align2.cc index 57cf25035..a18ce8d92 100644 --- a/moses/TranslationModel/UG/try-align2.cc +++ b/moses/TranslationModel/UG/try-align2.cc @@ -29,7 +29,7 @@ float lbop_level = .05; namespace stats { using namespace Moses::bitext; - float + float pmi(size_t j,size_t m1, size_t m2, size_t N) { #if smooth @@ -41,8 +41,8 @@ namespace stats return log(j) + log(N) - log(m1) - log(m2); #endif } - - float + + float npmi(size_t j,size_t m1, size_t m2, size_t N) { #if smooth @@ -52,11 +52,11 @@ namespace stats float p12 = lbop(N,j,lbop_level); return (log(p12) - log(p1) - log(p2)) / -log(p12); #else - return pmi(j,m1,m2,N) / (log(N) - log(j)); + return pmi(j,m1,m2,N) / (log(N) - log(j)); #endif } - float + float mi(size_t j,size_t m1, size_t m2, size_t N) { float ret = 0; @@ -92,7 +92,7 @@ struct PhrasePair2 float mi; // mutual information float score; - void + void set(vector<ttrack::Position> const& o1, vector<ttrack::Position> const& o2, size_t const N) @@ -103,7 +103,7 @@ struct PhrasePair2 { if (i1 && o1[i1].sid == o1[i1-1].sid) { ++i1; continue; } if (i2 && o2[i2].sid == o2[i2-1].sid) { ++i2; continue; } - + if (o1[i1].sid == o2[i2].sid) { ++j; ++i1; ++i2; ++m1; ++m2; } else if (o1[i1].sid < o2[i2].sid) { ++i1; ++m1; } else { ++i2; ++m2; } @@ -127,19 +127,19 @@ struct PhrasePair2 this->score = npmi; // npmi; // hmean; // /sqrt(z); } } stats; - + PhrasePair2(ushort s1_=0, ushort e1_=0, ushort s2_=0, ushort e2_=0) : s1(s1_), e1(e1_), s2(s2_), e2(e2_), parent(-1) { } - bool + bool operator<(PhrasePair2 const& other) const - { - return (this->stats.score == other.stats.score + { + return (this->stats.score == other.stats.score ? (e1-s1 + e2-s2 > other.e1-other.s1 + other.e2-other.s2) - : (this->stats.score > other.stats.score)); + : (this->stats.score > other.stats.score)); } - + size_t len1() const { return e1 - s1; } size_t len2() const { return e2 - s2; } bool includes(PhrasePair2 const& o) const @@ -155,8 +155,8 @@ PhrasePair2::stats_t::cache_t ppcache; struct SortByPositionInCorpus { - bool - operator()(ttrack::Position const& a, + bool + operator()(ttrack::Position const& a, ttrack::Position const& b) const { return a.sid != b.sid ? a.sid < b.sid : a.offset < b.offset; @@ -164,8 +164,8 @@ struct SortByPositionInCorpus }; -void -getoccs(tsa_t::tree_iterator const& m, +void +getoccs(tsa_t::tree_iterator const& m, vector<ttrack::Position>& occs) { occs.clear(); @@ -179,9 +179,9 @@ getoccs(tsa_t::tree_iterator const& m, sort(occs.begin(),occs.end(),SortByPositionInCorpus()); } -void -lookup_phrases(vector<id_type> const& snt, - TokenIndex& V, ttrack_t const& T, +void +lookup_phrases(vector<id_type> const& snt, + TokenIndex& V, ttrack_t const& T, tsa_t const& I, SinglePhrase::cache_t& cache, vector<vector<sptr<SinglePhrase> > >& dest) { @@ -195,7 +195,7 @@ lookup_phrases(vector<id_type> const& snt, if (m.approxOccurrenceCount() < 3) break; // if (k - i > 0) break; sptr<SinglePhrase>& o = cache[m.getPid()]; - if (!o) + if (!o) { o.reset(new SinglePhrase()); o->pid = m.getPid(); @@ -207,7 +207,7 @@ lookup_phrases(vector<id_type> const& snt, } -struct +struct RowIndexSorter { vector<vector<float> > const& M; @@ -216,14 +216,14 @@ RowIndexSorter : M(m), my_col(c) { } template<typename T> - bool - operator()(T const& a, T const& b) const - { + bool + operator()(T const& a, T const& b) const + { return M.at(a).at(my_col) > M.at(b).at(my_col); } }; -struct +struct ColIndexSorter { vector<vector<float> > const& M; @@ -232,9 +232,9 @@ ColIndexSorter : M(m), my_row(r) { } template<typename T> - bool - operator()(T const& a, T const& b) const - { + bool + operator()(T const& a, T const& b) const + { return M.at(my_row).at(a) > M[my_row].at(b); } @@ -249,7 +249,7 @@ public: { #if 0 cout << pp.raw1 << " " << pp.sample1 << " " << pp.good1 << " " - << pp.raw2 << " " << pp.sample2 << " " << pp.good2 << " " + << pp.raw2 << " " << pp.sample2 << " " << pp.good2 << " " << pp.joint << " " << __FILE__ << ":" << __LINE__ << endl; #endif pp.good2 = ceil(pp.raw2 * float(pp.good1)/pp.raw1); @@ -266,7 +266,7 @@ class Alnhyp }; -size_t +size_t lcs(string const a, string const b) { using namespace stringdist; @@ -279,10 +279,10 @@ lcs(string const a, string const b) { StringDiff::Segment const& s = diff[i]; if (s.match != StringDiff::same && s.match != StringDiff::cap) - { + { if (len > ret) ret = len; - len = 0; - continue; + len = 0; + continue; } len += s.end_a - s.start_a; } @@ -290,9 +290,9 @@ lcs(string const a, string const b) return ret; } -size_t -mapstring(string const& utf8, - UnicodeString& U, +size_t +mapstring(string const& utf8, + UnicodeString& U, vector<int>& c2w, vector<int>* wlen=NULL) { @@ -338,10 +338,10 @@ align_letters(UnicodeString const& A, vector<int> const& a2p, // } } -void +void map_back(vector<vector<int> > const& W, vector<vector<int> > & X, - vector<uchar> const & aln) + vector<uchar> const & aln) { for (size_t i = 0; i < aln.size(); i += 2) { @@ -354,7 +354,7 @@ map_back(vector<vector<int> > const& W, } -void trymatch3(vector<PhrasePair<Token> > const& tcands, +void trymatch3(vector<PhrasePair<Token> > const& tcands, UnicodeString const& T, size_t const tlen, vector<int> const& t2p, TokenIndex const& V2, vector<vector<int> >&X) @@ -374,8 +374,8 @@ void trymatch3(vector<PhrasePair<Token> > const& tcands, cout << slen << " " << tlen << endl; cout << "W: " << W.size() << " rows; " << W[0].size() << " cols" << endl; cout << "X: " << X.size() << " rows; " << X[0].size() << " cols" << endl; - cout << "aln: "; - for (size_t a = 0; a < pp.aln.size(); a +=2) + cout << "aln: "; + for (size_t a = 0; a < pp.aln.size(); a +=2) cout << int(pp.aln[a]) << "-" << int(pp.aln[a+1]) << " "; cout << endl; #endif @@ -383,7 +383,7 @@ void trymatch3(vector<PhrasePair<Token> > const& tcands, } } -void minmatch_filter(vector<vector<int> > & X, +void minmatch_filter(vector<vector<int> > & X, vector<int> const& len1, vector<int> const& len2) { @@ -437,20 +437,20 @@ trymatch2(TokenIndex& V1, // source language vocab TokenIndex& V2, // target language vocab string const& source, // source phrase string const& target, // observed target candidate - vector<PhrasePair<Token> > const* const tcands, + vector<PhrasePair<Token> > const* const tcands, vector<vector<int> >& X) // destination alignment matrix // tcands: translations for source { - UnicodeString S,T; + UnicodeString S,T; vector<int> t2p, s2p; // maps from character position in string to word pos. vector<int> wlen_t, wlen_s; // individual word lengths size_t slen = mapstring(source, S, s2p, &wlen_s); size_t tlen = mapstring(target, T, t2p, &wlen_t); - + X.assign(slen,vector<int>(tlen,0)); - if (slen == 1 && tlen ==1 && S == T) + if (slen == 1 && tlen ==1 && S == T) X[0][0] = S.length(); - else + else { align_letters(S,s2p,T,t2p,X); if (tcands) trymatch3(*tcands, T, tlen, t2p, V2, X); @@ -475,7 +475,7 @@ trymatch2(TokenIndex& V1, // source language vocab // float -// trymatch(string const a, string const b, +// trymatch(string const a, string const b, // vector<PhrasePair<Token> > const* atrans, // vector<PhrasePair<Token> > const* btrans) // { @@ -501,11 +501,11 @@ trymatch2(TokenIndex& V1, // source language vocab // // float bar = float(lcs(foo,b))/min(foo.size(),b.size()); // float bar = float(lcs(foo,b)); -// if (bar > .5) +// if (bar > .5) // { // // score = max(pp.score * bar,score); // score = max(bar,score); -// // cout << "[" << bar << "] " << foo << " ::: " << b +// // cout << "[" << bar << "] " << foo << " ::: " << b // // << " (" << a << ") " << pp.score << endl; // } // } @@ -525,10 +525,10 @@ trymatch2(TokenIndex& V1, // source language vocab // string foo = toString(*BT.V1,pp.start2,pp.len2); // // float bar = float(lcs(a,foo))/min(a.size(),foo.size()); // float bar = float(lcs(a,foo)); -// if (bar > .5) +// if (bar > .5) // { // score = max(bar,score); -// // cout << "[" << bar<< "] " << a << " ::: " << foo +// // cout << "[" << bar<< "] " << a << " ::: " << foo // // << " (" << b << ") " << pp.score << endl; // } // } @@ -547,8 +547,8 @@ struct ahyp struct AlnPoint { enum status { no = 0, yes = 1, maybe = -1, undef = -7 }; - float score; - status state; + float score; + status state; AlnPoint() : score(0), state(undef) {} }; @@ -562,14 +562,14 @@ class AlnMatrix vector<bitvector> A1,A2; // final alignment matrix vector<bitvector> S1,S2; // shadow alignment matrix public: - vector<bitvector*> m1,m2; // margins + vector<bitvector*> m1,m2; // margins AlnMatrix(size_t const rows, size_t const cols); - bitvector const& + bitvector const& operator[](size_t const r) const { return A1.at(r); } bool - incorporate(span_t const& rspan, span_t const& cspan, + incorporate(span_t const& rspan, span_t const& cspan, vector<uchar> const& aln, bool const flip); size_t size() const { return A1.size(); } @@ -588,9 +588,9 @@ AlnMatrix(size_t const rows, size_t const cols) bool AlnMatrix:: -incorporate(span_t const& rspan, - span_t const& cspan, - vector<uchar> const& aln, +incorporate(span_t const& rspan, + span_t const& cspan, + vector<uchar> const& aln, bool const flip) { for (size_t r = rspan.first; r < rspan.second; ++r) @@ -622,7 +622,7 @@ incorporate(span_t const& rspan, if (m1[r] && (*m1[r]) != S1[r]) return false; for (size_t c = cspan.first; c < cspan.second; ++c) if (m2[c] && (*m2[c]) != S2[c]) return false; - + // all good, add new points for (size_t r = rspan.first; r < rspan.second; ++r) if (!m1[r]) { A1[r] = S1[r]; m1[r] = &A1[r]; } @@ -632,9 +632,9 @@ incorporate(span_t const& rspan, return true; } -struct alink -{ - size_t r,c,m; +struct alink +{ + size_t r,c,m; bool operator<(alink const& o) const { return m < o.m; } bool operator>(alink const& o) const { return m > o.m; } }; @@ -659,9 +659,9 @@ int main(int argc, char* argv[]) vector<vector<uint64_t> > pm1,pm2; BT.lookup(snt1,*BT.I1,pt1,&pm1,&scorer); BT.lookup(snt2,*BT.I2,pt2,&pm2,&scorer); - + // build map from phrases to positions - typedef boost::unordered_map<uint64_t, vector<span_t> > + typedef boost::unordered_map<uint64_t, vector<span_t> > p2s_map_t; typedef p2s_map_t::iterator p2s_iter; p2s_map_t p2s1,p2s2; @@ -684,7 +684,7 @@ int main(int argc, char* argv[]) BOOST_FOREACH(PhrasePair<Token> const& pp, *pt1[i][k]) { if (pp.score < 0) break; - if (p2s2.find(pp.p2) != p2s2.end()) + if (p2s2.find(pp.p2) != p2s2.end()) pp_all.push_back(pp); } } @@ -704,10 +704,10 @@ int main(int argc, char* argv[]) { PhrasePair<Token> const& pp = pp_all[p]; #if 0 - cout << (boost::format("%30s ::: %-30s ") + cout << (boost::format("%30s ::: %-30s ") % BT.toString(pp.p1,0).c_str() % BT.toString(pp.p2,1).c_str()); - cout << (boost::format("%.4f [%d/%d/%d]") + cout << (boost::format("%.4f [%d/%d/%d]") % pp.score % pp.good1 % pp.joint % pp.good2); for (size_t a = 0; a < pp.aln.size(); a += 2) cout << " " << int(pp.aln[a]) << "-" << int(pp.aln[a+1]); @@ -720,7 +720,7 @@ int main(int argc, char* argv[]) for (size_t i = v1[0].first; i < v1[0].second; ++i) if (a1[i] < 0) a1[i] = p; if (v2.size() == 1) - for (size_t i = v2[0].first; i < v2[0].second; ++i) + for (size_t i = v2[0].first; i < v2[0].second; ++i) if (a2[i] < 0) a2[i] = p; if (v1.size() == 1 && v2.size() == 1) @@ -740,11 +740,11 @@ int main(int argc, char* argv[]) vector<PhrasePair<Token> > const* atrans, *btrans; ahyp h; vector<ahyp> hyps; - vector<vector<int> > L(snt1.size(),vector<int>(snt2.size(),0)); + vector<vector<int> > L(snt1.size(),vector<int>(snt2.size(),0)); // L: matches by letter overlap for (h.s1 = 0; h.s1 < a1.size(); ++h.s1) - { + { if (a1[h.s1] >= 0) continue; ostringstream buf1; for (h.e1 = h.s1; h.e1 < a1.size() && a1[h.e1] < 0; ++h.e1) @@ -762,23 +762,23 @@ int main(int argc, char* argv[]) if (a2[h.s2] >= 0) continue; for (h.e2 = h.s2; h.e2 < a2.size() && a2[h.e2] < 0; ++h.e2) { - if (h.e2 > h.s2) + if (h.e2 > h.s2) { if (pt2[h.s2].size() + h.s2 <= h.e2) break; buf2 << " "; } buf2 << (*BT.V2)[snt2[h.e2].id()]; - btrans = (pt2[h.s2].size() - ? pt2[h.s2].at(h.e2-h.s2).get() + btrans = (pt2[h.s2].size() + ? pt2[h.s2].at(h.e2-h.s2).get() : NULL); vector<vector<int> > aln; - trymatch2(*BT.V1, *BT.V2, buf1.str(),buf2.str(), + trymatch2(*BT.V1, *BT.V2, buf1.str(),buf2.str(), atrans,aln); for (size_t i = 0; i < aln.size(); ++i) for (size_t k = 0; k < aln[i].size(); ++k) L[h.s1+i][h.s2+k] = max(L[h.s1+i][h.s2+k],aln[i][k]); - trymatch2(*BT.V2, *BT.V1, buf2.str(),buf1.str(), + trymatch2(*BT.V2, *BT.V1, buf2.str(),buf1.str(), btrans,aln); for (size_t i = 0; i < aln[0].size(); ++i) for (size_t k = 0; k < aln.size(); ++k) @@ -795,7 +795,7 @@ int main(int argc, char* argv[]) alink x; for (x.r = 0; x.r < L.size(); ++x.r) { - + for (x.c = 0; x.c < L[x.r].size(); ++x.c) { x.m = L[x.r][x.c]; @@ -807,22 +807,22 @@ int main(int argc, char* argv[]) BOOST_FOREACH(alink& x, links) { - if (L[x.r][x.c]) + if (L[x.r][x.c]) { cout << (*BT.V1)[snt1[x.r].id()] << " ::: " << (*BT.V2)[snt2[x.c].id()] << " ::: " << L[x.r][x.c] << endl; } - } + } // sort(hyps.begin(),hyps.end(),greater<ahyp>()); // BOOST_FOREACH(ahyp const& h, hyps) // { // if (h.score < .5) break; - // for (size_t i = h.s1; i <= h.e1; ++i) + // for (size_t i = h.s1; i <= h.e1; ++i) // cout << i << ":" << (*BT.V1)[snt1[i].id()] << " "; // cout << " ::: "; - // for (size_t i = h.s2; i <= h.e2; ++i) + // for (size_t i = h.s2; i <= h.e2; ++i) // cout << i << ":" << (*BT.V2)[snt2[i].id()] << " "; // cout << h.score << endl; // } @@ -854,15 +854,15 @@ int main(int argc, char* argv[]) // #if 0 // if (match) // { -// if (first) +// if (first) // { // cout << BT.toString(pm1[i][k],0) << endl; // first = false; // } -// cout << boost::format("%.4f") % pt.score << " " +// cout << boost::format("%.4f") % pt.score << " " // << setw(5) << d1 << " " << (match ? "* " : " ") // << toString(*BT.V2, pt.start2, pt.len2) << " [" -// << pt.good1 << "/" << pt.joint << "/" +// << pt.good1 << "/" << pt.joint << "/" // << pt.good2 << "]"; // for (size_t a = 0; a < pt.aln.size(); a += 2) // cout << " " << int(pt.aln[a]) << "-" << int(pt.aln[a+1]); @@ -879,7 +879,7 @@ int main(int argc, char* argv[]) // pp_all.push_back(pt); // // pp_all.back().m1 -= d1; // } - + // } // if (!first) cout << endl; // } diff --git a/moses/TranslationModel/UG/util/ibm1-align.cc b/moses/TranslationModel/UG/util/ibm1-align.cc index 08ac1f89b..3c43743d0 100644 --- a/moses/TranslationModel/UG/util/ibm1-align.cc +++ b/moses/TranslationModel/UG/util/ibm1-align.cc @@ -1,7 +1,7 @@ // -*- c++ -*- // Parallel text alignment via IBM1 / raw counts of word alignments // aiming at high precision (to seed Yawat alignments) -// This program is tailored for use with Yawat. +// This program is tailored for use with Yawat. // Written by Ulrich Germann. #include <string> @@ -29,20 +29,20 @@ public: table_t COOC; TokenIndex V1,V2; - void + void align(string const& s1, string const& s2, vector<int>& aln) const; - void - align(vector<id_type> const& x1, - vector<id_type> const& x2, + void + align(vector<id_type> const& x1, + vector<id_type> const& x2, vector<int>& aln) const; - - void - fill_amatrix(vector<id_type> const& x1, - vector<id_type> const& x2, + + void + fill_amatrix(vector<id_type> const& x1, + vector<id_type> const& x2, vector<vector<int> >& aln) const; - void + void open(string const base, string const L1, string const L2); }; @@ -75,10 +75,10 @@ u(StringPiece str, size_t start, size_t stop) return ret; } -void +void IBM1:: -fill_amatrix(vector<id_type> const& x1, - vector<id_type> const& x2, +fill_amatrix(vector<id_type> const& x1, + vector<id_type> const& x2, vector<vector<int> >& aln) const { aln.assign(x1.size(),vector<int>(x2.size())); @@ -108,8 +108,8 @@ fill_amatrix(vector<id_type> const& x1, void IBM1:: -align(vector<id_type> const& x1, - vector<id_type> const& x2, +align(vector<id_type> const& x1, + vector<id_type> const& x2, vector<int>& aln) const { vector<vector<int> > M; @@ -157,7 +157,7 @@ int main(int argc, char* argv[]) // cout << line1 << endl; // cout << line2 << endl; // for (size_t i = 0; i < a.size(); i += 2) - // cout << ibm1.V1[s1[a[i]]] << " - " + // cout << ibm1.V1[s1[a[i]]] << " - " // << ibm1.V2[s2[a[i+1]]] << endl; } // cout << endl; diff --git a/moses/TranslationModel/UG/util/tokenindex.dump.cc b/moses/TranslationModel/UG/util/tokenindex.dump.cc index 8ab68579d..0e885630f 100644 --- a/moses/TranslationModel/UG/util/tokenindex.dump.cc +++ b/moses/TranslationModel/UG/util/tokenindex.dump.cc @@ -13,7 +13,7 @@ using namespace std; using namespace ugdiss; -int +int main(int argc,char* argv[]) { if (argc > 1 && !strcmp(argv[1], "-h")) { |