diff options
author | Ulrich Germann <Ulrich.Germann@gmail.com> | 2015-04-05 16:17:47 +0300 |
---|---|---|
committer | Ulrich Germann <Ulrich.Germann@gmail.com> | 2015-04-05 16:29:00 +0300 |
commit | 46e31a285c8f9257a9d6ab411db74b5cbec9d0fe (patch) | |
tree | 9bf1afa3827e7252e6b9fd38e8ee27cef8693a9a /moses/TranslationModel/UG/mm | |
parent | 05c4e382ff7914369700eb516a61a45238292bdf (diff) |
- Code refactoring for Bitext class.
- Bug fixes and conceptual improvements in biased sampling. The sampling now
tries to stick to the bias, even when an unsuitable corpus dominates
the occurrences.
Diffstat (limited to 'moses/TranslationModel/UG/mm')
-rw-r--r-- | moses/TranslationModel/UG/mm/mmlex-build.cc | 73 | ||||
-rw-r--r-- | moses/TranslationModel/UG/mm/ug_bitext.cc | 264 | ||||
-rw-r--r-- | moses/TranslationModel/UG/mm/ug_bitext.h | 1346 | ||||
-rw-r--r-- | moses/TranslationModel/UG/mm/ug_bitext_agenda.h | 186 | ||||
-rw-r--r-- | moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h | 240 | ||||
-rw-r--r-- | moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h | 102 | ||||
-rw-r--r-- | moses/TranslationModel/UG/mm/ug_bitext_jstats.cc | 91 | ||||
-rw-r--r-- | moses/TranslationModel/UG/mm/ug_bitext_jstats.h | 48 | ||||
-rw-r--r-- | moses/TranslationModel/UG/mm/ug_bitext_pstats.cc | 83 | ||||
-rw-r--r-- | moses/TranslationModel/UG/mm/ug_bitext_pstats.h | 63 | ||||
-rw-r--r-- | moses/TranslationModel/UG/mm/ug_im_bitext.cc | 87 | ||||
-rw-r--r-- | moses/TranslationModel/UG/mm/ug_im_bitext.h | 130 | ||||
-rw-r--r-- | moses/TranslationModel/UG/mm/ug_mm_bitext.h | 81 | ||||
-rw-r--r-- | moses/TranslationModel/UG/mm/ug_phrasepair.h | 246 | ||||
-rw-r--r-- | moses/TranslationModel/UG/mm/ug_sampling_bias.h | 5 | ||||
-rw-r--r-- | moses/TranslationModel/UG/mm/ug_ttrack_base.h | 28 |
16 files changed, 1475 insertions, 1598 deletions
diff --git a/moses/TranslationModel/UG/mm/mmlex-build.cc b/moses/TranslationModel/UG/mm/mmlex-build.cc index 4ef0842e4..5e5ea194c 100644 --- a/moses/TranslationModel/UG/mm/mmlex-build.cc +++ b/moses/TranslationModel/UG/mm/mmlex-build.cc @@ -24,6 +24,7 @@ #include <boost/unordered_set.hpp> #include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h" +#include "moses/Util.h" #include "ug_mm_2d_table.h" #include "ug_mm_ttrack.h" #include "ug_corpus_token.h" @@ -241,10 +242,14 @@ processSentence(id_type sid) p = binread(p,r); p = binread(p,c); // cout << sid << " " << r << "-" << c << endl; - assert(r < check1.size()); - assert(c < check2.size()); - assert(s1+r < e1); - assert(s2+c < e2); + UTIL_THROW_IF2(r >= check1.size(), "out of bounds at line " << sid); + UTIL_THROW_IF2(c >= check2.size(), "out of bounds at line " << sid); + // assert(r < check1.size()); + // assert(c < check2.size()); + UTIL_THROW_IF2(s1+r >= e1, "out of bounds at line " << sid); + UTIL_THROW_IF2(s2+c >= e2, "out of bounds at line " << sid); + // assert(s1+r < e1); + // assert(s2+c < e2); check1.reset(r); check2.reset(c); id_type id1 = (s1+r)->id(); @@ -266,66 +271,6 @@ processSentence(id_type sid) CNT[wpair(0,(s2+i)->id())].a++; } -// void -// writeTable(string ofname, -// vector<vector<uint32_t> >& FREQ, -// vector<map<id_type,uint32_t> >& RARE) -// { -// ofstream out(ofname.c_str()); -// filepos_type idxOffset=0; - -// vector<uint32_t> m1; // marginals L1 -// vector<uint32_t> m2; // marginals L2 -// m1.resize(max(first_rare_id,V1.getNumTokens()),0); -// m2.resize(V2.getNumTokens(),0); -// vector<id_type> index(V1.getNumTokens()+1,0); -// numwrite(out,idxOffset); // blank for the time being -// numwrite(out,id_type(m1.size())); -// numwrite(out,id_type(m2.size())); - -// id_type cellCount=0; -// id_type stop = min(first_rare_id,id_type(m1.size())); -// for (id_type id1 = 0; id1 < stop; ++id1) -// { -// index[id1] = cellCount; -// vector<uint32_t> const& v = FREQ[id1]; -// for 
(id_type id2 = 0; id2 < id_type(v.size()); ++id2) -// { -// if (!v[id2]) continue; -// cellCount++; -// numwrite(out,id2); -// out.write(reinterpret_cast<char const*>(&v[id2]),sizeof(uint32_t)); -// m1[id1] += v[id2]; -// m2[id2] += v[id2]; -// } -// } -// for (id_type id1 = stop; id1 < id_type(m1.size()); ++id1) -// { -// index[id1] = cellCount; -// map<id_type,uint32_t> const& M = RARE[id1]; -// for (map<id_type,uint32_t>::const_iterator m = M.begin(); m != M.end(); ++m) -// { -// if (m->second == 0) continue; -// cellCount++; -// numwrite(out,m->first); -// out.write(reinterpret_cast<char const*>(&m->second),sizeof(float)); -// m1[id1] += m->second; -// m2[m->first] += m->second; -// } -// } -// index[m1.size()] = cellCount; -// idxOffset = out.tellp(); -// for (size_t i = 0; i < index.size(); ++i) -// numwrite(out,index[i]); -// out.write(reinterpret_cast<char const*>(&m1[0]),m1.size()*sizeof(float)); -// out.write(reinterpret_cast<char const*>(&m2[0]),m2.size()*sizeof(float)); - -// // re-write the file header -// out.seekp(0); -// numwrite(out,idxOffset); -// out.close(); -// } - int main(int argc, char* argv[]) { diff --git a/moses/TranslationModel/UG/mm/ug_bitext.cc b/moses/TranslationModel/UG/mm/ug_bitext.cc index 29104aaec..fe95596ab 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext.cc @@ -11,192 +11,6 @@ namespace Moses namespace bitext { -#if UG_BITEXT_TRACK_ACTIVE_THREADS - ThreadSafeCounter pstats::active; -#endif - - pstats:: - pstats() - : raw_cnt (0) - , sample_cnt (0) - , good (0) - , sum_pairs (0) - , in_progress (0) - { - for (int i = 0; i <= Moses::LRModel::NONE; ++i) - ofwd[i] = obwd[i] = 0; - } - - pstats:: - ~pstats() - { -#if UG_BITEXT_TRACK_ACTIVE_THREADS - // counter may not exist any more at destruction time, so try ... catch - try { --active; } catch (...) 
{} -#endif - } - - void - pstats:: - register_worker() - { - this->lock.lock(); - ++this->in_progress; - this->lock.unlock(); - } - - void - pstats:: - release() - { - this->lock.lock(); - if (this->in_progress-- == 1) // last one - >we're done - this->ready.notify_all(); - this->lock.unlock(); - } - - bool - pstats:: - add(uint64_t pid, float const w, - vector<uchar> const& a, - uint32_t const cnt2, - uint32_t fwd_o, - uint32_t bwd_o, int const docid) - { - boost::lock_guard<boost::mutex> guard(this->lock); - jstats& entry = this->trg[pid]; - entry.add(w,a,cnt2,fwd_o,bwd_o,docid); - if (this->good < entry.rcnt()) - { - UTIL_THROW(util::Exception, "more joint counts than good counts:" - << entry.rcnt() << "/" << this->good << "!"); - } - - if (docid >= 0) - { - while (int(indoc.size()) <= docid) indoc.push_back(0); - ++indoc[docid]; - } - - return true; - } - - jstats:: - jstats() - : my_rcnt(0), my_wcnt(0), my_cnt2(0) - { - for (int i = 0; i <= Moses::LRModel::NONE; ++i) - ofwd[i] = obwd[i] = 0; - my_aln.reserve(1); - } - - jstats:: - jstats(jstats const& other) - { - my_rcnt = other.rcnt(); - my_wcnt = other.wcnt(); - my_aln = other.aln(); - indoc = other.indoc; - for (int i = 0; i <= Moses::LRModel::NONE; i++) - { - ofwd[i] = other.ofwd[i]; - obwd[i] = other.obwd[i]; - } - } - - uint32_t - jstats:: - dcnt_fwd(PhraseOrientation const idx) const - { - assert(idx <= Moses::LRModel::NONE); - return ofwd[idx]; - } - - uint32_t - jstats:: - dcnt_bwd(PhraseOrientation const idx) const - { - assert(idx <= Moses::LRModel::NONE); - return obwd[idx]; - } - - void - jstats:: - add(float w, vector<uchar> const& a, uint32_t const cnt2, - uint32_t fwd_orient, uint32_t bwd_orient, int const docid) - { - boost::lock_guard<boost::mutex> lk(this->lock); - my_rcnt += 1; - my_wcnt += w; - // my_cnt2 += cnt2; // could I really be that stupid? 
[UG] - my_cnt2 = cnt2; - if (a.size()) - { - size_t i = 0; - while (i < my_aln.size() && my_aln[i].second != a) ++i; - if (i == my_aln.size()) - my_aln.push_back(pair<size_t,vector<uchar> >(1,a)); - else - my_aln[i].first++; - if (my_aln[i].first > my_aln[i/2].first) - push_heap(my_aln.begin(),my_aln.begin()+i+1); - } - ++ofwd[fwd_orient]; - ++obwd[bwd_orient]; - if (docid >= 0) - { - while (int(indoc.size()) <= docid) indoc.push_back(0); - ++indoc[docid]; - - // cout << docid << " => " << indoc[docid] << " " << HERE << endl; - - } - } - - uint32_t - jstats:: - rcnt() const - { return my_rcnt; } - - float - jstats:: - wcnt() const - { return my_wcnt; } - - uint32_t - jstats:: - cnt2() const - { return my_cnt2; } - - vector<pair<size_t, vector<uchar> > > const& - jstats:: - aln() const - { return my_aln; } - - void - jstats:: - invalidate() - { - if (my_wcnt > 0) - my_wcnt *= -1; - } - - void - jstats:: - validate() - { - if (my_wcnt < 0) - my_wcnt *= -1; - } - - bool - jstats:: - valid() - { - return my_wcnt >= 0; - } - - float lbop(size_t const tries, size_t const succ, float const confidence) { @@ -206,83 +20,6 @@ namespace Moses find_lower_bound_on_p(tries, succ, confidence))); } - template<> - sptr<imBitext<L2R_Token<SimpleWordId> > > - imBitext<L2R_Token<SimpleWordId> >:: - add(vector<string> const& s1, - vector<string> const& s2, - vector<string> const& aln) const - { - typedef L2R_Token<SimpleWordId> TKN; - assert(s1.size() == s2.size() && s1.size() == aln.size()); - -#ifndef NDEBUG - size_t first_new_snt = this->T1 ? 
this->T1->size() : 0; -#endif - - sptr<imBitext<TKN> > ret; - { - boost::unique_lock<boost::shared_mutex> guard(m_lock); - ret.reset(new imBitext<TKN>(*this)); - } - - // we add the sentences in separate threads (so it's faster) - boost::thread thread1(snt_adder<TKN>(s1,*ret->V1,ret->myT1,ret->myI1)); - // thread1.join(); // for debugging - boost::thread thread2(snt_adder<TKN>(s2,*ret->V2,ret->myT2,ret->myI2)); - BOOST_FOREACH(string const& a, aln) - { - istringstream ibuf(a); - ostringstream obuf; - uint32_t row,col; char c; - while (ibuf >> row >> c >> col) - { - UTIL_THROW_IF2(c != '-', "[" << HERE << "] " - << "Error in alignment information:\n" << a); - binwrite(obuf,row); - binwrite(obuf,col); - } - // important: DO NOT replace the two lines below this comment by - // char const* x = obuf.str().c_str(), as the memory x is pointing - // to is freed immediately upon deconstruction of the string object. - string foo = obuf.str(); - char const* x = foo.c_str(); - vector<char> v(x,x+foo.size()); - ret->myTx = append(ret->myTx, v); - } - - thread1.join(); - thread2.join(); - - ret->Tx = ret->myTx; - ret->T1 = ret->myT1; - ret->T2 = ret->myT2; - ret->I1 = ret->myI1; - ret->I2 = ret->myI2; - -#ifndef NDEBUG - // sanity check - for (size_t i = first_new_snt; i < ret->T1->size(); ++i) - { - size_t slen1 = ret->T1->sntLen(i); - size_t slen2 = ret->T2->sntLen(i); - char const* p = ret->Tx->sntStart(i); - char const* q = ret->Tx->sntEnd(i); - size_t k; - while (p < q) - { - p = binread(p,k); - assert(p); - assert(p < q); - assert(k < slen1); - p = binread(p,k); - assert(p); - assert(k < slen2); - } - } -#endif - return ret; - } // template<> void @@ -425,6 +162,5 @@ namespace Moses } cout << string(90,'-') << endl; } - } } diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index bd2975cf7..89aeeaa7a 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -1,7 +1,5 @@ //-*- c++ 
-*- - -#ifndef __ug_bitext_h -#define __ug_bitext_h +#pragma once // Implementations of word-aligned bitext. // Written by Ulrich Germann // @@ -26,11 +24,11 @@ #include <iomanip> #include <algorithm> -#include <boost/unordered_map.hpp> #include <boost/foreach.hpp> -#include <boost/thread.hpp> #include <boost/random.hpp> #include <boost/format.hpp> +#include <boost/thread.hpp> +#include <boost/unordered_map.hpp> #include <boost/math/distributions/binomial.hpp> #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h" @@ -59,6 +57,7 @@ #include "ug_lru_cache.h" #include "ug_lexical_reordering.h" #include "ug_sampling_bias.h" +#include "ug_phrasepair.h" #define PSTATS_CACHE_THRESHOLD 50 @@ -66,101 +65,10 @@ namespace Moses { class Mmsapt; namespace bitext { - // using namespace ugdiss; - // using namespace std; - - template<typename TKN> class Bitext; - template<typename TKN> class PhrasePair; using namespace ugdiss; - template<typename TKN> class Bitext; - - template<typename sid_t, typename off_t, typename len_t> - void - parse_pid(uint64_t const pid, sid_t & sid, - off_t & off, len_t& len) - { - static uint64_t two32 = uint64_t(1)<<32; - static uint64_t two16 = uint64_t(1)<<16; - len = pid%two16; - off = (pid%two32)>>16; - sid = pid>>32; - } - - float - lbop(size_t const tries, size_t const succ, - float const confidence); - - // "joint" (i.e., phrase pair) statistics - class - jstats - { - boost::mutex lock; - uint32_t my_rcnt; // unweighted count - float my_wcnt; // weighted count - uint32_t my_cnt2; - vector<pair<size_t, vector<uchar> > > my_aln; - uint32_t ofwd[Moses::LRModel::NONE+1], obwd[Moses::LRModel::NONE+1]; - public: - vector<uint32_t> indoc; - jstats(); - jstats(jstats const& other); - uint32_t rcnt() const; - uint32_t cnt2() const; // raw target phrase occurrence count - float wcnt() const; - - vector<pair<size_t, vector<uchar> > > const & aln() const; - void add(float w, vector<uchar> const& a, uint32_t const cnt2, - uint32_t 
fwd_orient, uint32_t bwd_orient, int const docid); - void invalidate(); - void validate(); - bool valid(); - uint32_t dcnt_fwd(PhraseOrientation const idx) const; - uint32_t dcnt_bwd(PhraseOrientation const idx) const; - }; - - struct - pstats - { - typedef boost::unordered_map<uint64_t, sptr<pstats> > map_t; - typedef ThreadSafeContainer<uint64_t, sptr<pstats>, map_t> cache_t; - -#if UG_BITEXT_TRACK_ACTIVE_THREADS - static ThreadSafeCounter active; -#endif - boost::mutex lock; // for parallel gathering of stats - boost::condition_variable ready; /* consumers can wait for this - * data structure to be ready. */ - - size_t raw_cnt; // (approximate) raw occurrence count - size_t sample_cnt; // number of instances selected during sampling - size_t good; // number of selected instances with valid word alignments - size_t sum_pairs; - size_t in_progress; // keeps track of how many threads are currently working on this - - // size_t Moses::LRModel::ReorderingType - uint32_t ofwd[Moses::LRModel::NONE+1], obwd[Moses::LRModel::NONE+1]; + float lbop(size_t const tries, size_t const succ, float const confidence); - vector<uint32_t> indoc; - - - // typedef typename boost::unordered_map<typename uint64_t, jstats> trg_map_t; - typedef std::map<uint64_t, jstats> trg_map_t; - trg_map_t trg; - pstats(); - ~pstats(); - void release(); - void register_worker(); - size_t count_workers() { return in_progress; } - - bool - add(uint64_t const pid, - float const w, - vector<uchar> const& a, - uint32_t const cnt2, - uint32_t fwd_o, uint32_t bwd_o, int const docid); - }; - struct ContextForQuery { @@ -174,297 +82,36 @@ namespace Moses { ContextForQuery() : bias_log(NULL) { } }; - template<typename Token> - string - toString(TokenIndex const& V, Token const* x, size_t const len) - { - if (!len) return ""; - UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!"); - ostringstream buf; - buf << V[x->id()]; - size_t i = 1; - for (x = x->next(); x && i < len; ++i, x = x->next()) - buf << " " 
<< V[x->id()]; - UTIL_THROW_IF2(i != len, HERE << ": Unexpected end of phrase!"); - return buf.str(); - } - template<typename Token> - class - PhrasePair + template<typename TKN> + class Bitext { public: - class Scorer { public: virtual float operator()(PhrasePair& pp) const = 0; }; - Token const* start1; - Token const* start2; - uint32_t len1; - uint32_t len2; - uint64_t p1, p2; - uint32_t raw1,raw2,sample1,sample2,good1,good2,joint; - vector<float> fvals; - float dfwd[Moses::LRModel::NONE+1]; // distortion counts // counts or probs? - float dbwd[Moses::LRModel::NONE+1]; // distortion counts - vector<uchar> aln; - float score; - bool inverse; - vector<uint32_t> indoc; - PhrasePair() { }; - PhrasePair(PhrasePair const& o); - - PhrasePair const& operator+=(PhrasePair const& other); - - bool operator<(PhrasePair const& other) const; - bool operator>(PhrasePair const& other) const; - bool operator<=(PhrasePair const& other) const; - bool operator>=(PhrasePair const& other) const; - - void init(); - void init(uint64_t const pid1, bool is_inverse, - Token const* x, uint32_t const len, - pstats const* ps = NULL, size_t const numfeats=0); - - // void init(uint64_t const pid1, pstats const& ps, size_t const numfeats); - // void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2, - // size_t const numfeats); - - // PhrasePair const& - // update(uint64_t const pid2, size_t r2 = 0); - - PhrasePair const& - update(uint64_t const pid2, Token const* x, - uint32_t const len, jstats const& js); - - // PhrasePair const& - // update(uint64_t const pid2, jstats const& js1, jstats const& js2); - - // PhrasePair const& - // update(uint64_t const pid2, size_t const raw2extra, jstats const& js); - - // float - // eval(vector<float> const& w); - - class SortByTargetIdSeq - { - public: - int cmp(PhrasePair const& a, PhrasePair const& b) const; - bool operator()(PhrasePair const& a, PhrasePair const& b) const; - }; - - class SortDescendingByJointCount - { - public: - int 
cmp(PhrasePair const& a, PhrasePair const& b) const; - bool operator()(PhrasePair const& a, PhrasePair const& b) const; - }; - }; + typedef TKN Token; + typedef typename TSA<Token>::tree_iterator iter; + typedef typename std::vector<PhrasePair<Token> > vec_ppair; + typedef typename lru_cache::LRU_Cache<uint64_t, vec_ppair> pplist_cache_t; - template<typename Token> - void - PhrasePair<Token>:: - init(uint64_t const pid1, bool is_inverse, Token const* x, uint32_t const len, - pstats const* ps, size_t const numfeats) - { - inverse = is_inverse; - start1 = x; len1 = len; - p1 = pid1; - p2 = 0; - if (ps) - { - raw1 = ps->raw_cnt; - sample1 = ps->sample_cnt; - good1 = ps->good; - } - else raw1 = sample1 = good1 = 0; - joint = 0; - good2 = 0; - sample2 = 0; - raw2 = 0; - fvals.resize(numfeats); - } + friend class Moses::Mmsapt; + protected: + mutable boost::shared_mutex m_lock; // for thread-safe operation - template<typename Token> - PhrasePair<Token> const& - PhrasePair<Token>:: - update(uint64_t const pid2, - Token const* x, uint32_t const len, jstats const& js) - { - p2 = pid2; - start2 = x; len2 = len; - raw2 = js.cnt2(); - joint = js.rcnt(); - assert(js.aln().size()); - if (js.aln().size()) - aln = js.aln()[0].second; - float total_fwd = 0, total_bwd = 0; - for (int i = 0; i <= Moses::LRModel::NONE; i++) - { - PhraseOrientation po = static_cast<PhraseOrientation>(i); - total_fwd += js.dcnt_fwd(po)+1; - total_bwd += js.dcnt_bwd(po)+1; - } + class agenda; // for parallel sampling see ug_bitext_agenda.h + mutable sptr<agenda> ag; + size_t m_num_workers; // number of workers available to the agenda - // should we do that here or leave the raw counts? 
- for (int i = 0; i <= Moses::LRModel::NONE; i++) - { - PhraseOrientation po = static_cast<PhraseOrientation>(i); - dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd; - dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd; - } + size_t m_default_sample_size; + size_t m_pstats_cache_threshold; // threshold for caching sampling results + sptr<pstats::cache_t> m_cache1, m_cache2; // caches for sampling results - indoc = js.indoc; - return *this; - } - - template<typename Token> - bool - PhrasePair<Token>:: - operator<(PhrasePair const& other) const - { return this->score < other.score; } - - template<typename Token> - bool - PhrasePair<Token>:: - operator>(PhrasePair const& other) const - { return this->score > other.score; } - - template<typename Token> - bool - PhrasePair<Token>:: - operator<=(PhrasePair const& other) const - { return this->score <= other.score; } - - template<typename Token> - bool - PhrasePair<Token>:: - operator>=(PhrasePair const& other) const - { return this->score >= other.score; } - - template<typename Token> - PhrasePair<Token> const& - PhrasePair<Token>:: - operator+=(PhrasePair const& o) - { - raw1 += o.raw1; - raw2 += o.raw2; - sample1 += o.sample1; - sample2 += o.sample2; - good1 += o.good1; - good2 += o.good2; - joint += o.joint; - return *this; - } - - template<typename Token> - PhrasePair<Token>:: - PhrasePair(PhrasePair<Token> const& o) - : start1(o.start1) - , start2(o.start2) - , len1(o.len1) - , len2(o.len2) - , p1(o.p1) - , p2(o.p2) - , raw1(o.raw1) - , raw2(o.raw2) - , sample1(o.sample1) - , sample2(o.sample2) - , good1(o.good1) - , good2(o.good2) - , joint(o.joint) - , fvals(o.fvals) - , aln(o.aln) - , score(o.score) - , inverse(o.inverse) - , indoc(o.indoc) - { - for (int i = 0; i <= Moses::LRModel::NONE; ++i) - { - dfwd[i] = o.dfwd[i]; - dbwd[i] = o.dbwd[i]; - } - } - - template<typename Token> - int - PhrasePair<Token>:: - SortByTargetIdSeq:: - cmp(PhrasePair const& a, PhrasePair const& b) const - { - size_t i = 0; - Token const* x = 
a.start2; - Token const* y = b.start2; - while (i < a.len2 && i < b.len2 && x->id() == y->id()) - { - x = x->next(); - y = y->next(); - ++i; - } - if (i == a.len2 && i == b.len2) return 0; - if (i == a.len2) return -1; - if (i == b.len2) return 1; - return x->id() < y->id() ? -1 : 1; - } - - template<typename Token> - bool - PhrasePair<Token>:: - SortByTargetIdSeq:: - operator()(PhrasePair const& a, PhrasePair const& b) const - { - return this->cmp(a,b) < 0; - } - - template<typename Token> - int - PhrasePair<Token>:: - SortDescendingByJointCount:: - cmp(PhrasePair const& a, PhrasePair const& b) const - { - // size_t i = 0; - if (a.joint == b.joint) return 0; - return a.joint > b.joint ? -1 : 1; - } - - template<typename Token> - bool - PhrasePair<Token>:: - SortDescendingByJointCount:: - operator()(PhrasePair const& a, PhrasePair const& b) const - { - return this->cmp(a,b) < 0; - } + map<string,id_type> m_docname2docid; // maps from doc names to ids + sptr<std::vector<id_type> > m_sid2docid; // maps from sentences to docs (ids) - template<typename Token> - void - PhrasePair<Token>:: - init() - { - inverse = false; - len1 = len2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0; - start1 = start2 = NULL; - p1 = p2 = 0; - } - - template<typename TKN> - class Bitext - { - friend class Moses::Mmsapt; + mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2; + // caches for unbiased sampling; biased sampling uses the caches that + // are stored locally on the translation task - protected: - mutable boost::shared_mutex m_lock; public: - typedef TKN Token; - typedef typename TSA<Token>::tree_iterator iter; - - class agenda; - // stores the list of unfinished jobs; - // maintains a pool of workers and assigns the jobs to them - - // to be done: work with multiple agendas for faster lookup - // (multiplex jobs); not sure if an agenda having more than - // four or so workers is efficient, because workers get into - // each other's way. 
- mutable sptr<agenda> ag; - sptr<Ttrack<char> > Tx; // word alignments sptr<Ttrack<Token> > T1; // token track sptr<Ttrack<Token> > T2; // token track @@ -473,76 +120,43 @@ namespace Moses { sptr<TSA<Token> > I1; // indices sptr<TSA<Token> > I2; // indices - map<string,id_type> m_docname2docid; // maps from doc names to ids - sptr<vector<id_type> > m_sid2docid; // maps from sentences to docs (ids) - /// given the source phrase sid[start:stop] // find the possible start (s1 .. s2) and end (e1 .. e2) // points of the target phrase; if non-NULL, store word // alignments in *core_alignment. If /flip/, source phrase is // L2. - bool - find_trg_phr_bounds + bool find_trg_phr_bounds ( size_t const sid, // sentence to investigate size_t const start, // start of source phrase size_t const stop, // last position of source phrase size_t & s1, size_t & s2, // beginning and end of target start size_t & e1, size_t & e2, // beginning and end of target end int& po_fwd, int& po_bwd, // phrase orientations - vector<uchar> * core_alignment, // stores the core alignment + std::vector<uchar> * core_alignment, // stores the core alignment bitvector* full_alignment, // stores full word alignment for this sent. bool const flip) const; // flip source and target (reverse lookup) - sptr<pstats::cache_t> m_cache1, m_cache2; - // caches for unbiased sampling; biased sampling uses the caches that - // are stored locally on the translation task - protected: - typedef typename - lru_cache::LRU_Cache<uint64_t, vector<PhrasePair<Token> > > - pplist_cache_t; - - size_t m_default_sample_size; - size_t m_num_workers; - size_t m_pstats_cache_threshold; - mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2; - - protected: - + // prep2 launches sampling and returns immediately. 
+ // lookup (below) waits for the job to finish before it returns sptr<pstats> prep2(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const; - // prep2 launches sampling and returns immediately, lookup (below) waits - // for the job to finish before it returns - + public: - Bitext(size_t const max_sample = 1000, - size_t const xnum_workers = 16); + Bitext(size_t const max_sample = 1000, size_t const xnum_workers = 16); - Bitext(Ttrack<Token>* const t1, - Ttrack<Token>* const t2, - Ttrack<char>* const tx, - TokenIndex* const v1, - TokenIndex* const v2, - TSA<Token>* const i1, - TSA<Token>* const i2, - size_t const max_sample=1000, + Bitext(Ttrack<Token>* const t1, Ttrack<Token>* const t2, + Ttrack<char>* const tx, + TokenIndex* const v1, TokenIndex* const v2, + TSA<Token>* const i1, TSA<Token>* const i2, + size_t const max_sample=1000, size_t const xnum_workers=16); - virtual void open(string const base, string const L1, string const L2) = 0; + virtual void + open(string const base, string const L1, string const L2) = 0; sptr<pstats> lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const; -#if 0 - // needs to be adapted to the new API - void - lookup(vector<Token> const& snt, TSA<Token>& idx, - vector<vector<sptr<vector<PhrasePair<Token> > > > >& dest, - vector<vector<uint64_t> >* pidmap = NULL, - typename PhrasePair<Token>::Scorer* scorer=NULL, - sptr<SamplingBias const> const bias, - bool multithread=true) const; -#endif - void prep(ttasksptr const& ttask, iter const& phrase) const; void setDefaultSampleSize(size_t const max_samples); @@ -556,11 +170,23 @@ namespace Moses { loadSentenceBias(string const& fname) const; sptr<DocumentBias> - SetupDocumentBias(string const& bserver, string const& text, - ostream* log) const; + SetupDocumentBias(string const& bserver, string const& text, ostream* log) const; + +#if 0 + // needs to be adapted to the new API + void + lookup(std::vector<Token> const& snt, TSA<Token>& idx, + 
std::vector<std::vector<sptr<std::vector<PhrasePair<Token> > > > >& dest, + std::vector<std::vector<uint64_t> >* pidmap = NULL, + typename PhrasePair<Token>::Scorer* scorer=NULL, + sptr<SamplingBias const> const bias, + bool multithread=true) const; +#endif }; +#include "ug_bitext_agenda.h" + template<typename Token> sptr<SentenceBias> Bitext<Token>:: @@ -594,8 +220,6 @@ namespace Moses { return buf.str(); } - - template<typename Token> size_t Bitext<Token>:: @@ -620,8 +244,8 @@ namespace Moses { template<typename Token> Bitext<Token>:: Bitext(size_t const max_sample, size_t const xnum_workers) - : m_default_sample_size(max_sample) - , m_num_workers(xnum_workers) + : m_num_workers(xnum_workers) + , m_default_sample_size(max_sample) , m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD) , m_cache1(new pstats::cache_t) , m_cache2(new pstats::cache_t) @@ -638,639 +262,14 @@ namespace Moses { TSA<Token>* const i2, size_t const max_sample, size_t const xnum_workers) - : Tx(tx), T1(t1), T2(t2), V1(v1), V2(v2), I1(i1), I2(i2) + : m_num_workers(xnum_workers) , m_default_sample_size(max_sample) - , m_num_workers(xnum_workers) , m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD) , m_cache1(new pstats::cache_t) , m_cache2(new pstats::cache_t) + , Tx(tx), T1(t1), T2(t2), V1(v1), V2(v2), I1(i1), I2(i2) { } - // agenda is a pool of jobs - template<typename Token> - class - Bitext<Token>:: - agenda - { - boost::mutex lock; - class job - { -#if UG_BITEXT_TRACK_ACTIVE_THREADS - static ThreadSafeCounter active; -#endif - Bitext<Token> const* const m_bitext; - boost::mutex lock; - friend class agenda; - boost::taus88 rnd; // every job has its own pseudo random generator - double rnddenom; // denominator for scaling random sampling - size_t min_diverse; // minimum number of distinct translations - public: - size_t workers; // how many workers are working on this job? 
- sptr<TSA<Token> const> root; // root of the underlying suffix array - char const* next; // next position to read from - char const* stop; // end of index range - size_t max_samples; // how many samples to extract at most - size_t ctr; /* # of phrase occurrences considered so far - * # of samples chosen is stored in stats->good - */ - size_t len; // phrase length - bool fwd; // if true, source phrase is L1 - sptr<pstats> stats; // stores statistics collected during sampling - sptr<SamplingBias const> const m_bias; // sentence-level bias for sampling - float bias_total; - bool step(uint64_t & sid, uint64_t & offset); // select another occurrence - bool done() const; - job(Bitext<Token> const* const theBitext, - typename TSA<Token>::tree_iterator const& m, - sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd, - sptr<SamplingBias const> const& bias); - ~job(); - }; - public: - class - worker - { - agenda& ag; - public: - worker(agenda& a) : ag(a) {} - void operator()(); - }; - private: - list<sptr<job> > joblist; - vector<sptr<boost::thread> > workers; - bool shutdown; - size_t doomed; - public: - Bitext<Token> const& bt; - agenda(Bitext<Token> const& bitext); - ~agenda(); - void add_workers(int n); - - sptr<pstats> - add_job(Bitext<Token> const* const theBitext, - typename TSA<Token>::tree_iterator const& phrase, - size_t const max_samples, sptr<SamplingBias const> const& bias); - - sptr<job> get_job(); - }; - - template<typename Token> - bool - Bitext<Token>:: - agenda:: - job:: - step(uint64_t & sid, uint64_t & offset) - { - boost::lock_guard<boost::mutex> jguard(lock); - bool ret = (max_samples == 0) && (next < stop); - if (ret) - { - next = root->readSid(next,stop,sid); - next = root->readOffset(next,stop,offset); - boost::lock_guard<boost::mutex> sguard(stats->lock); - if (stats->raw_cnt == ctr) ++stats->raw_cnt; - if (m_bias && (*m_bias)[sid] == 0) - return false; - stats->sample_cnt++; - } - else - { - while (next < stop && (stats->good < max_samples || - 
stats->trg.size() < min_diverse)) - { - next = root->readSid(next,stop,sid); - next = root->readOffset(next,stop,offset); - if (m_bias) - { - id_type docid = m_bias->GetClass(sid); - if (stats->indoc.size() > docid) - { - uint32_t N = stats->good; - float k = min(stats->indoc[docid],N); - float p = (*m_bias)[sid]; - - typedef boost::math::binomial_distribution<> binomial; - using namespace boost::math; - if (cdf(complement(binomial(N+1, p), k)) < .05) continue; - } - } - { // brackets required for lock scoping; - // see sguard immediately below - boost::lock_guard<boost::mutex> sguard(stats->lock); - if (stats->raw_cnt == ctr) ++stats->raw_cnt; - size_t scalefac = (stats->raw_cnt - ctr++); - size_t rnum = scalefac * (rnd()/(rnd.max()+1.)); - size_t th = (bias_total - ? ((*m_bias)[sid]/bias_total * stats->raw_cnt - * max_samples) - : max_samples); -#if 0 - cerr << rnum << "/" << scalefac << " vs. " - << max_samples - stats->good << " (" - << max_samples << " - " << stats->good << ")" - << " th=" << th; - if (m_bias) - cerr << " with bias " << (*m_bias)[sid] - << " => " << th; - else cerr << " without bias"; - cerr << endl; -#endif -#if 0 - cerr << "bias total: " << bias_total - << " bias local: " << (*m_bias)[sid] - << " rnum: " << rnum - << " good: " << stats->good - << " th: " << th - << " raw: " << stats->raw_cnt - << endl; -#endif - if (rnum + stats->good < th) - { - stats->sample_cnt++; - ret = true; - break; - } - } - } - } - - // boost::lock_guard<boost::mutex> sguard(stats->lock); - // abuse of lock for clean output to cerr - // cerr << stats->sample_cnt++; - return ret; - } - - template<typename Token> - void - Bitext<Token>:: - agenda:: - add_workers(int n) - { - static boost::posix_time::time_duration nodelay(0,0,0,0); - boost::lock_guard<boost::mutex> guard(this->lock); - - int target = max(1, int(n + workers.size() - this->doomed)); - // house keeping: remove all workers that have finished - for (size_t i = 0; i < workers.size(); ) - { - if 
(workers[i]->timed_join(nodelay)) - { - if (i + 1 < workers.size()) - workers[i].swap(workers.back()); - workers.pop_back(); - } - else ++i; - } - // cerr << workers.size() << "/" << target << " active" << endl; - if (int(workers.size()) > target) - this->doomed = workers.size() - target; - else - while (int(workers.size()) < target) - { - sptr<boost::thread> w(new boost::thread(worker(*this))); - workers.push_back(w); - } - } - - template<typename Token> - void - Bitext<Token>:: - agenda:: - worker:: - operator()() - { - // things to do: - // - have each worker maintain their own pstats object and merge results at the end; - // - ensure the minimum size of samples considered by a non-locked counter that is only - // ever incremented -- who cares if we look at more samples than required, as long - // as we look at at least the minimum required - // This way, we can reduce the number of lock / unlock operations we need to do during - // sampling. - size_t s1=0, s2=0, e1=0, e2=0; - uint64_t sid=0, offset=0; // of the source phrase - while(sptr<job> j = ag.get_job()) - { - j->stats->register_worker(); - vector<uchar> aln; - bitvector full_alignment(100*100); - while (j->step(sid,offset)) - { - int docid = j->m_bias ? 
j->m_bias->GetClass(sid) : -1; - - Token const* t = ag.bt.T2->sntStart(sid); - Token const* eos = ag.bt.T2->sntEnd(sid); -#if 0 - cerr << "[" << j->stats->good + 1 << "] "; - while (t != eos) cerr << (*ag.bt.V2)[(t++)->id()] << " "; - cerr << "[" << docid << "]" << endl; -#endif - aln.clear(); - int po_fwd=Moses::LRModel::NONE,po_bwd=Moses::LRModel::NONE; - if (j->fwd) - { - if (!ag.bt.find_trg_phr_bounds - (sid,offset,offset+j->len,s1,s2,e1,e2,po_fwd,po_bwd, - &aln,&full_alignment,false)) - continue; - } - else if (!ag.bt.find_trg_phr_bounds - (sid,offset,offset+j->len,s1,s2,e1,e2,po_fwd,po_bwd, - &aln,NULL,true)) // NULL,NULL,true)) - continue; - j->stats->lock.lock(); - j->stats->good += 1; - j->stats->sum_pairs += (s2-s1+1)*(e2-e1+1); - ++j->stats->ofwd[po_fwd]; - ++j->stats->obwd[po_bwd]; - j->stats->lock.unlock(); - // for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2) - for (size_t k = 1; k < aln.size(); k += 2) - aln[k] += s2 - s1; - Token const* o = (j->fwd ? ag.bt.T2 : ag.bt.T1)->sntStart(sid); - float sample_weight = 1./((s2-s1+1)*(e2-e1+1)); - - vector<uint64_t> seen; - seen.reserve(100); - // It is possible that the phrase extraction extracts the same - // phrase twice, e.g., when word a co-occurs with sequence b b b - // but is aligned only to the middle word. We can only count - // each phrase pair once per source phrase occurrence, or else - // run the risk of having more joint counts than marginal - // counts. - - for (size_t s = s1; s <= s2; ++s) - { - sptr<iter> b = (j->fwd ? ag.bt.I2 : ag.bt.I1)->find(o+s,e1-s); - if (!b || b->size() < e1 -s) - UTIL_THROW(util::Exception, "target phrase not found"); - // assert(b); - for (size_t i = e1; i <= e2; ++i) - { - uint64_t tpid = b->getPid(); - size_t s = 0; - while (s < seen.size() && seen[s] != tpid) ++s; - if (s < seen.size()) - { -#if 0 - size_t sid, off, len; - parse_pid(tpid,sid,off,len); - cerr << "HA, gotcha! 
" << sid << ":" << off << " at " << HERE << endl; - for (size_t z = 0; z < len; ++z) - { - id_type tid = ag.bt.T2->sntStart(sid)[off+z].id(); - cerr << (*ag.bt.V2)[tid] << " "; - } - cerr << endl; -#endif - continue; - } - seen.push_back(tpid); - if (! j->stats->add(tpid,sample_weight,aln, - b->approxOccurrenceCount(), - po_fwd,po_bwd,docid)) - { - cerr << "FATAL ERROR AT " << __FILE__ - << ":" << __LINE__ << endl; - assert(0); - ostringstream msg; - for (size_t z = 0; z < j->len; ++z) - { - id_type tid = ag.bt.T1->sntStart(sid)[offset+z].id(); - cerr << (*ag.bt.V1)[tid] << " "; - } - cerr << endl; - for (size_t z = s; z <= i; ++z) - cerr << (*ag.bt.V2)[(o+z)->id()] << " "; - cerr << endl; - assert(0); - UTIL_THROW(util::Exception,"Error in sampling."); - } - if (i < e2) - { -#ifndef NDEBUG - bool ok = b->extend(o[i].id()); - assert(ok); -#else - b->extend(o[i].id()); - // cerr << "boo" << endl; -#endif - } - } - // if (j->fwd && s < s2) - // for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2) - if (s < s2) - for (size_t k = 1; k < aln.size(); k += 2) - --aln[k]; - } - // j->stats->lock.unlock(); - } - j->stats->release(); - } - } - - template<typename Token> - Bitext<Token>:: - agenda:: - job:: - ~job() - { - if (stats) stats.reset(); -#if UG_BITEXT_TRACK_ACTIVE_THREADS - try { --active; } catch (...) {} -#endif - // counter may not exist any more at destruction time - } - - template<typename Token> - Bitext<Token>:: - agenda:: - job:: - job(Bitext<Token> const* const theBitext, - typename TSA<Token>::tree_iterator const& m, - sptr<TSA<Token> > const& r, size_t maxsmpl, - bool isfwd, sptr<SamplingBias const> const& bias) - : m_bitext(theBitext) - , rnd(0) - , rnddenom(rnd.max() + 1.) 
- , min_diverse(1) - , workers(0) - , root(r) - , next(m.lower_bound(-1)) - , stop(m.upper_bound(-1)) - , max_samples(maxsmpl) - , ctr(0) - , len(m.size()) - , fwd(isfwd) - , m_bias(bias) - { - stats.reset(new pstats()); - stats->raw_cnt = m.approxOccurrenceCount(); - bias_total = 0; - // we need to renormalize on the fly, as the summ of all sentence probs over - // all candidates (not all sentences in the corpus) needs to add to 1. - // Profiling question: how much does that cost us? - if (m_bias) - { - int ctr = 0; - stats->raw_cnt = 0; - for (char const* x = m.lower_bound(-1); x < stop;) - { - uint32_t sid; ushort offset; - x = root->readSid(x,stop,sid); - x = root->readOffset(x,stop,offset); -#if 0 - cerr << ctr++ << " " << m.str(m_bitext->V1.get()) - << " " << sid << "/" << root->getCorpusSize() - << " " << offset << " " << stop-x << endl; -#endif - bias_total += (*m_bias)[sid]; - ++stats->raw_cnt; - } - } -#if UG_BITEXT_TRACK_ACTIVE_THREADS - ++active; - // if (active%5 == 0) - // cerr << size_t(active) << " active jobs at " << __FILE__ << ":" << __LINE__ << endl; -#endif - } - - template<typename Token> - sptr<pstats> - Bitext<Token>:: - agenda:: - add_job(Bitext<Token> const* const theBitext, - typename TSA<Token>::tree_iterator const& phrase, - size_t const max_samples, sptr<SamplingBias const> const& bias) - { - boost::unique_lock<boost::mutex> lk(this->lock); - static boost::posix_time::time_duration nodelay(0,0,0,0); - bool fwd = phrase.root == bt.I1.get(); - sptr<job> j(new job(theBitext, phrase, fwd ? 
bt.I1 : bt.I2, - max_samples, fwd, bias)); - j->stats->register_worker(); - - joblist.push_back(j); - if (joblist.size() == 1) - { - size_t i = 0; - while (i < workers.size()) - { - if (workers[i]->timed_join(nodelay)) - { - if (doomed) - { - if (i+1 < workers.size()) - workers[i].swap(workers.back()); - workers.pop_back(); - --doomed; - } - else - workers[i++] = sptr<boost::thread>(new boost::thread(worker(*this))); - } - else ++i; - } - } - return j->stats; - } - - template<typename Token> - sptr<typename Bitext<Token>::agenda::job> - Bitext<Token>:: - agenda:: - get_job() - { - // cerr << workers.size() << " workers on record" << endl; - sptr<job> ret; - if (this->shutdown) return ret; - boost::unique_lock<boost::mutex> lock(this->lock); - if (this->doomed) - { - --this->doomed; - return ret; - } - typename list<sptr<job> >::iterator j = joblist.begin(); - while (j != joblist.end()) - { - if ((*j)->done()) - { - (*j)->stats->release(); - joblist.erase(j++); - } - else if ((*j)->workers >= 4) - { - ++j; - } - else break; - } - if (joblist.size()) - { - ret = j == joblist.end() ? 
joblist.front() : *j; - boost::lock_guard<boost::mutex> jguard(ret->lock); - ++ret->workers; - } - return ret; - } - - - template<typename TKN> - class mmBitext : public Bitext<TKN> - { - void load_document_map(string const& fname); - public: - void open(string const base, string const L1, string L2); - mmBitext(); - }; - - template<typename TKN> - mmBitext<TKN>:: - mmBitext() - : Bitext<TKN>(new mmTtrack<TKN>(), - new mmTtrack<TKN>(), - new mmTtrack<char>(), - new TokenIndex(), - new TokenIndex(), - new mmTSA<TKN>(), - new mmTSA<TKN>()) - {}; - - template<typename TKN> - void - mmBitext<TKN>:: - load_document_map(string const& fname) - { - ifstream docmap(fname.c_str()); - // the docmap file should list the documents in the corpus - // in the order in which they appear with one line per document: - // <docname> <number of lines / sentences> - // - // in the future, we might also allow listing documents with - // sentence ranges. - string buffer,docname; size_t a=0,b; - this->m_sid2docid.reset(new vector<id_type>(this->T1->size())); - while(getline(docmap,buffer)) - { - istringstream line(buffer); - if (!(line>>docname)) continue; // empty line - if (docname.size() && docname[0] == '#') continue; // comment - size_t docid = this->m_docname2docid.size(); - this->m_docname2docid[docname] = docid; - line >> b; - VERBOSE(1, "DOCUMENT MAP " << docname - << " " << a << "-" << b+a << endl); - for (b += a; a < b; ++a) - (*this->m_sid2docid)[a] = docid; - } - UTIL_THROW_IF2(b != this->T1->size(), - "Document map doesn't match corpus!"); - } - - template<typename TKN> - void - mmBitext<TKN>:: - open(string const base, string const L1, string L2) - { - mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get()); - mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get()); - mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get()); - t1.open(base+L1+".mct"); - t2.open(base+L2+".mct"); - tx.open(base+L1+"-"+L2+".mam"); - 
this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex(); - this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex(); - mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get()); - mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get()); - i1.open(base+L1+".sfa", this->T1); - i2.open(base+L2+".sfa", this->T2); - assert(this->T1->size() == this->T2->size()); - - string docmapfile = base+"dmp"; - if (!access(docmapfile.c_str(),F_OK)) - load_document_map(docmapfile); - } - - - template<typename TKN> - class imBitext : public Bitext<TKN> - { - sptr<imTtrack<char> > myTx; - sptr<imTtrack<TKN> > myT1; - sptr<imTtrack<TKN> > myT2; - sptr<imTSA<TKN> > myI1; - sptr<imTSA<TKN> > myI2; - static ThreadSafeCounter my_revision; - public: - size_t revision() const { return my_revision; } - void open(string const base, string const L1, string L2); - imBitext(sptr<TokenIndex> const& V1, - sptr<TokenIndex> const& V2, - size_t max_sample = 5000, size_t num_workers=4); - imBitext(size_t max_sample = 5000, size_t num_workers=4); - imBitext(imBitext const& other); - - // sptr<imBitext<TKN> > - // add(vector<TKN> const& s1, vector<TKN> const& s2, vector<ushort> & a); - - sptr<imBitext<TKN> > - add(vector<string> const& s1, - vector<string> const& s2, - vector<string> const& a) const; - - }; - - template<typename TKN> - ThreadSafeCounter - imBitext<TKN>::my_revision; - - template<typename TKN> - imBitext<TKN>:: - imBitext(size_t max_sample, size_t num_workers) - : Bitext<TKN>(max_sample, num_workers) - { - this->m_default_sample_size = max_sample; - this->V1.reset(new TokenIndex()); - this->V2.reset(new TokenIndex()); - this->V1->setDynamic(true); - this->V2->setDynamic(true); - ++my_revision; - } - - template<typename TKN> - imBitext<TKN>:: - imBitext(sptr<TokenIndex> const& v1, - sptr<TokenIndex> const& v2, - size_t max_sample, size_t num_workers) - : Bitext<TKN>(max_sample, num_workers) - { - // this->default_sample_size = max_sample; - this->V1 = v1; - this->V2 = 
v2; - this->V1->setDynamic(true); - this->V2->setDynamic(true); - ++my_revision; - } - - - template<typename TKN> - imBitext<TKN>:: - imBitext(imBitext<TKN> const& other) - { - this->myTx = other.myTx; - this->myT1 = other.myT1; - this->myT2 = other.myT2; - this->myI1 = other.myI1; - this->myI2 = other.myI2; - this->Tx = this->myTx; - this->T1 = this->myT1; - this->T2 = this->myT2; - this->I1 = this->myI1; - this->I2 = this->myI2; - this->V1 = other.V1; - this->V2 = other.V2; - this->m_default_sample_size = other.m_default_sample_size; - this->m_num_workers = other.m_num_workers; - ++my_revision; - } - template<typename TKN> class snt_adder; template<> class snt_adder<L2R_Token<SimpleWordId> >; @@ -1278,147 +277,17 @@ namespace Moses { class snt_adder<L2R_Token<SimpleWordId> > { typedef L2R_Token<SimpleWordId> TKN; - vector<string> const & snt; + std::vector<string> const & snt; TokenIndex & V; sptr<imTtrack<TKN> > & track; sptr<imTSA<TKN > > & index; public: - snt_adder(vector<string> const& s, TokenIndex& v, + snt_adder(std::vector<string> const& s, TokenIndex& v, sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i); void operator()(); }; - // template<typename TKN> - // class snt_adder - // { - // vector<string> const & snt; - // TokenIndex & V; - // sptr<imTtrack<TKN> > & track; - // sptr<imTSA<TKN > > & index; - // public: - // snt_adder(vector<string> const& s, TokenIndex& v, - // sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i); - - // template<typename T> - // void operator()(); - // }; - - // // template<> - // void - // snt_adder<L2R_Token<SimpleWordId> >:: - // operator()(); - - // template<> - // void - // snt_adder<char>:: - // operator()() - // { - // vector<id_type> sids; - // sids.reserve(snt.size()); - // BOOST_FOREACH(string const& s, snt) - // { - // sids.push_back(track ? 
track->size() : 0); - // istringstream buf(s); - // string w; - // vector<char> s; - // s.reserve(100); - // while (buf >> w) - // s.push_back(vector<char>(V[w])); - // track = append(track,s); - // } - // index.reset(new imTSA<char>(*index,track,sids,V.tsize())); - // } - - // template<typename TKN> - // snt_adder<TKN>:: - // snt_adder(vector<string> const& s, TokenIndex& v, - // sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i) - // : snt(s), V(v), track(t), index(i) - // { - // throw "Not implemented yet."; - // } - - template<> - sptr<imBitext<L2R_Token<SimpleWordId> > > - imBitext<L2R_Token<SimpleWordId> >:: - add(vector<string> const& s1, - vector<string> const& s2, - vector<string> const& aln) const; - - template<typename TKN> - sptr<imBitext<TKN> > - imBitext<TKN>:: - add(vector<string> const& s1, - vector<string> const& s2, - vector<string> const& aln) const - { - throw "Not yet implemented"; - } - // template<typename TKN> - // sptr<imBitext<TKN> > - // imBitext<TKN>:: - // add(vector<TKN> const& s1, vector<TKN> const& s2, vector<ushort> & a) - // { - // boost::unique_lock<boost::shared_mutex> guard(m_lock); - // sptr<imBitext<TKN> > ret(new imBitext<TKN>()); - // vector<id_type> sids(1,this->myT1.size()-1); - // ret->myT1 = add(this->myT1,s1); - // ret->myT2 = add(this->myT2,s2); - // size_t v1size = this->V1.tsize(); - // size_t v2size = this->V2.tsize(); - // BOOST_FOREACH(TKN const& t, s1) { if (t->id() >= v1size) v1size = t->id() + 1; } - // BOOST_FOREACH(TKN const& t, s2) { if (t->id() >= v2size) v2size = t->id() + 1; } - // ret->myI1.reset(new imTSA<TKN>(*this->I1,ret->myT1,sids,v1size)); - // ret->myI2.reset(new imTSA<TKN>(*this->I2,ret->myT2,sids,v2size)); - // ostringstream abuf; - // BOOST_FOREACH(ushort x, a) binwrite(abuf,x); - // vector<char> foo(abuf.str().begin(),abuf.str().end()); - // ret->myTx = add(this->myTx,foo); - // ret->T1 = ret->myT1; - // ret->T2 = ret->myT2; - // ret->Tx = ret->myTx; - // ret->I1 = ret->myI1; - // ret->I2 = 
ret->myI2; - // ret->V1 = this->V1; - // ret->V2 = this->V2; - // return ret; - // } - - - // template<typename TKN> - // imBitext<TKN>:: - // imBitext() - // : Bitext<TKN>(new imTtrack<TKN>(), - // new imTtrack<TKN>(), - // new imTtrack<char>(), - // new TokenIndex(), - // new TokenIndex(), - // new imTSA<TKN>(), - // new imTSA<TKN>()) - // {} - - - template<typename TKN> - void - imBitext<TKN>:: - open(string const base, string const L1, string L2) - { - mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get()); - mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get()); - mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get()); - t1.open(base+L1+".mct"); - t2.open(base+L2+".mct"); - tx.open(base+L1+"-"+L2+".mam"); - this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex(); - this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex(); - mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get()); - mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get()); - i1.open(base+L1+".sfa", this->T1); - i2.open(base+L2+".sfa", this->T2); - assert(this->T1->size() == this->T2->size()); - } - template<typename Token> bool Bitext<Token>:: @@ -1427,7 +296,7 @@ namespace Moses { size_t const start, size_t const stop, size_t & s1, size_t & s2, size_t & e1, size_t & e2, int & po_fwd, int & po_bwd, - vector<uchar>* core_alignment, bitvector* full_alignment, + std::vector<uchar>* core_alignment, bitvector* full_alignment, bool const flip) const { // if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl; @@ -1464,7 +333,7 @@ namespace Moses { size_t src,trg; size_t lft = forbidden.size(); size_t rgt = 0; - vector<vector<ushort> > aln1(slen1),aln2(slen2); + std::vector<std::vector<ushort> > aln1(slen1),aln2(slen2); char const* p = Tx->sntStart(sid); char const* x = Tx->sntEnd(sid); @@ -1532,33 +401,6 @@ namespace Moses { return ret; } -// template<typename Token> -// sptr<DocumentBias> -// Bitext<Token>:: -// 
SetupDocBias(string const& bserver, map<id_type,size_t> const& ctx) const -// { - -// sptr<DocumentBias> ret; -// #ifdef HAVE_CURLPP -// map<id_type,size_t>::const_iterator w = ctx.begin(); -// while(w != ctx.end() && w->second == 0) ++w; -// if (w == ctx.end()) return ret; -// string context; context.reserve(5000); -// context += (*V1)[w->first]; -// while (++w != ctx.end()) -// { -// if (w->second == 0) continue; -// context += " "; -// context += (*V1)[w->first]; -// } -// cerr << HERE << endl; -// cerr << "BIAS LOOKUP CONTEXT: " << context << endl; -// ret = GetDocBiasFromServer(bserver+curlpp::escape(context)); -// #endif -// return ret; -// } - - template<typename Token> void Bitext<Token>:: @@ -1587,7 +429,7 @@ namespace Moses { // - no caching for rare phrases and special requests (max_sample) // (still need to test what a good caching threshold is ...) // - use the task-specific cache when there is a sampling bias - if (max_sample == m_default_sample_size + if (max_sample == int(m_default_sample_size) && phrase.approxOccurrenceCount() > m_pstats_cache_threshold) { cache = (phrase.root == I1.get() @@ -1622,7 +464,7 @@ namespace Moses { { Ttrack<Token> const& m_other; sptr<pstats> m_pstats; - vector<PhrasePair<Token> >& m_pplist; + std::vector<PhrasePair<Token> >& m_pplist; typename PhrasePair<Token>::Scorer const* m_scorer; PhrasePair<Token> m_pp; Token const* m_token; @@ -1635,7 +477,7 @@ namespace Moses { pstats2pplist(typename TSA<Token>::tree_iterator const& m, Ttrack<Token> const& other, sptr<pstats> const& ps, - vector<PhrasePair<Token> >& dest, + std::vector<PhrasePair<Token> >& dest, typename PhrasePair<Token>::Scorer const* scorer) : m_other(other) , m_pstats(ps) @@ -1665,7 +507,8 @@ namespace Moses { uint32_t sid,off,len; parse_pid(a->first, sid, off, len); m_pp.update(a->first, m_other.sntStart(sid)+off, len, a->second); - m_pp.good2 = max(uint32_t(m_pp.raw2 * float(m_pp.good1)/m_pp.raw1),m_pp.joint); + m_pp.good2 = max(uint32_t(m_pp.raw2 * 
float(m_pp.good1)/m_pp.raw1), + m_pp.joint); size_t J = m_pp.joint<<7; // hard coded threshold of 1/128 if (m_pp.good1 > J || m_pp.good2 > J) continue; if (m_scorer) @@ -1683,13 +526,13 @@ namespace Moses { template<typename Token> void Bitext<Token>:: - lookup(vector<Token> const& snt, TSA<Token>& idx, - vector<vector<sptr<vector<PhrasePair<Token> > > > >& dest, - vector<vector<uint64_t> >* pidmap, + lookup(std::vector<Token> const& snt, TSA<Token>& idx, + std::vector<std::vector<sptr<std::vector<PhrasePair<Token> > > > >& dest, + std::vector<std::vector<uint64_t> >* pidmap, typename PhrasePair<Token>::Scorer* scorer, sptr<SamplingBias const> const& bias, bool multithread) const { - // typedef vector<vector<sptr<vector<PhrasePair<Token> > > > > ret_t; + // typedef std::vector<std::vector<sptr<std::vector<PhrasePair<Token> > > > > ret_t; dest.clear(); dest.resize(snt.size()); @@ -1698,7 +541,7 @@ namespace Moses { // collect statistics in parallel, then build PT entries as // the sampling finishes bool fwd = &idx == I1.get(); - vector<boost::thread*> workers; // background threads doing the lookup + std::vector<boost::thread*> workers; // background threads doing the lookup pplist_cache_t& C = (fwd ? 
m_pplist_cache1 : m_pplist_cache2); if (C.capacity() < 100000) C.reserve(100000); for (size_t i = 0; i < snt.size(); ++i) @@ -1709,12 +552,12 @@ namespace Moses { { uint64_t key = m.getPid(); if (pidmap) (*pidmap)[i].push_back(key); - sptr<vector<PhrasePair<Token> > > pp = C.get(key); + sptr<std::vector<PhrasePair<Token> > > pp = C.get(key); if (pp) dest[i].push_back(pp); else { - pp.reset(new vector<PhrasePair<Token> >()); + pp.reset(new std::vector<PhrasePair<Token> >()); C.set(key,pp); dest[i].push_back(pp); sptr<pstats> x = prep2(m, this->default_sample_size,bias); @@ -1780,49 +623,12 @@ namespace Moses { return ret; } #endif - - template<typename Token> - Bitext<Token>:: - agenda:: - ~agenda() - { - this->lock.lock(); - this->shutdown = true; - this->lock.unlock(); - for (size_t i = 0; i < workers.size(); ++i) - workers[i]->join(); - } template<typename Token> - Bitext<Token>:: - agenda:: - agenda(Bitext<Token> const& thebitext) - : shutdown(false), doomed(0), bt(thebitext) - { } - - template<typename Token> - bool - Bitext<Token>:: - agenda:: - job:: - done() const - { - return (max_samples && stats->good >= max_samples) || next == stop; - } - -#if UG_BITEXT_TRACK_ACTIVE_THREADS - template<typename TKN> - ThreadSafeCounter - Bitext<TKN>:: - agenda:: - job::active; -#endif - - template<typename Token> void expand(typename Bitext<Token>::iter const& m, Bitext<Token> const& bt, pstats const& ps, - vector<PhrasePair<Token> >& dest, ostream* log) + std::vector<PhrasePair<Token> >& dest, ostream* log) { bool fwd = m.root == bt.I1.get(); dest.reserve(ps.trg.size()); @@ -1887,5 +693,9 @@ namespace Moses { #endif } // end of namespace bitext } // end of namespace moses -#endif + +#include "ug_im_bitext.h" +#include "ug_mm_bitext.h" + + diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h new file mode 100644 index 000000000..a9632c056 --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h @@ -0,0 
+1,186 @@ +// -*- c++ -*- +// to be included from ug_bitext.h + +// The agenda handles parallel sampling. +// It maintains a queue of unfinished sampling jobs and +// assigns them to a pool of workers. +// +template<typename Token> +class Bitext<Token> +::agenda +{ +public: + class job; + class worker; +private: + boost::mutex lock; + std::list<sptr<job> > joblist; + std::vector<sptr<boost::thread> > workers; + bool shutdown; + size_t doomed; + +public: + + + Bitext<Token> const& bt; + + agenda(Bitext<Token> const& bitext); + ~agenda(); + + void + add_workers(int n); + + sptr<pstats> + add_job(Bitext<Token> const* const theBitext, + typename TSA<Token>::tree_iterator const& phrase, + size_t const max_samples, sptr<SamplingBias const> const& bias); + // add_job(Bitext<Token> const* const theBitext, + // typename TSA<Token>::tree_iterator const& phrase, + // size_t const max_samples, SamplingBias const* const bias); + + sptr<job> + get_job(); +}; + +template<typename Token> +class +Bitext<Token>::agenda:: +worker +{ + agenda& ag; +public: + worker(agenda& a) : ag(a) {} + void operator()(); +}; + +#include "ug_bitext_agenda_worker.h" +#include "ug_bitext_agenda_job.h" + +template<typename Token> +void Bitext<Token> +::agenda +::add_workers(int n) +{ + static boost::posix_time::time_duration nodelay(0,0,0,0); + boost::lock_guard<boost::mutex> guard(this->lock); + + int target = max(1, int(n + workers.size() - this->doomed)); + // house keeping: remove all workers that have finished + for (size_t i = 0; i < workers.size(); ) + { + if (workers[i]->timed_join(nodelay)) + { + if (i + 1 < workers.size()) + workers[i].swap(workers.back()); + workers.pop_back(); + } + else ++i; + } + // cerr << workers.size() << "/" << target << " active" << endl; + if (int(workers.size()) > target) + this->doomed = workers.size() - target; + else + while (int(workers.size()) < target) + { + sptr<boost::thread> w(new boost::thread(worker(*this))); + workers.push_back(w); + } +} + + 
+template<typename Token> +sptr<pstats> Bitext<Token> +::agenda +::add_job(Bitext<Token> const* const theBitext, + typename TSA<Token>::tree_iterator const& phrase, + size_t const max_samples, sptr<SamplingBias const> const& bias) +{ + boost::unique_lock<boost::mutex> lk(this->lock); + static boost::posix_time::time_duration nodelay(0,0,0,0); + bool fwd = phrase.root == bt.I1.get(); + sptr<job> j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2, + max_samples, fwd, bias)); + j->stats->register_worker(); + + joblist.push_back(j); + if (joblist.size() == 1) + { + size_t i = 0; + while (i < workers.size()) + { + if (workers[i]->timed_join(nodelay)) + { + if (doomed) + { + if (i+1 < workers.size()) + workers[i].swap(workers.back()); + workers.pop_back(); + --doomed; + } + else + workers[i++] = sptr<boost::thread>(new boost::thread(worker(*this))); + } + else ++i; + } + } + return j->stats; +} + +template<typename Token> +sptr<typename Bitext<Token>::agenda::job> +Bitext<Token> +::agenda +::get_job() +{ + // cerr << workers.size() << " workers on record" << endl; + sptr<job> ret; + if (this->shutdown) return ret; + boost::unique_lock<boost::mutex> lock(this->lock); + if (this->doomed) + { // the number of workers has been reduced, tell the redundant once to quit + --this->doomed; + return ret; + } + + typename list<sptr<job> >::iterator j = joblist.begin(); + while (j != joblist.end()) + { + if ((*j)->done()) + { + (*j)->stats->release(); + joblist.erase(j++); + } + else if ((*j)->workers >= 4) ++j; // no more than 4 workers per job + else break; // found one + } + if (joblist.size()) + { + ret = j == joblist.end() ? 
joblist.front() : *j; + // if we've reached the end of the queue (all jobs have 4 workers on them), + // take the first in the queue + boost::lock_guard<boost::mutex> jguard(ret->lock); + ++ret->workers; + } + return ret; +} + +template<typename Token> +Bitext<Token>:: +agenda:: +~agenda() +{ + this->lock.lock(); + this->shutdown = true; + this->lock.unlock(); + for (size_t i = 0; i < workers.size(); ++i) + workers[i]->join(); +} + +template<typename Token> +Bitext<Token>:: +agenda:: +agenda(Bitext<Token> const& thebitext) + : shutdown(false), doomed(0), bt(thebitext) +{ } + + diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h new file mode 100644 index 000000000..efbebad52 --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h @@ -0,0 +1,240 @@ +// -*- c++ -*- +// class declaration of template<typename Token> class Bitxt<Token>::agenda::job +// to be included by ug_bitext.h +// todo: add check to enforce this + +template<typename Token> +class +Bitext<Token>::agenda:: +job +{ +#if UG_BITEXT_TRACK_ACTIVE_THREADS + static ThreadSafeCounter active; +#endif + Bitext<Token> const* const m_bitext; + boost::mutex lock; + friend class agenda; + boost::taus88 rnd; // every job has its own pseudo random generator + double rnddenom; // denominator for scaling random sampling + size_t min_diverse; // minimum number of distinct translations + + bool flip_coin(uint64_t & sid, uint64_t & offset); + bool step(uint64_t & sid, uint64_t & offset); // proceed to next occurrence + +public: + size_t workers; // how many workers are working on this job? 
+ sptr<TSA<Token> const> root; // root of the underlying suffix array + char const* next; // next position to read from + char const* stop; // end of index range + size_t max_samples; // how many samples to extract at most + size_t ctr; /* # of phrase occurrences considered so far + * # of samples chosen is stored in stats->good + */ + size_t len; // phrase length + bool fwd; // if true, source phrase is L1 + sptr<pstats> stats; // stores statistics collected during sampling + sptr<SamplingBias const> const m_bias; // sentence-level bias for sampling + float bias_total; + bool nextSample(uint64_t & sid, uint64_t & offset); // select next occurrence + + int + check_sample_distribution(uint64_t const& sid, uint64_t const& offset); + // for biased sampling: ensure the distribution approximately matches + // the bias + + bool done() const; + job(Bitext<Token> const* const theBitext, + typename TSA<Token>::tree_iterator const& m, + sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd, + sptr<SamplingBias const> const& bias); + ~job(); +}; + +template<typename Token> +Bitext<Token>::agenda::job +::~job() +{ + if (stats) stats.reset(); +#if UG_BITEXT_TRACK_ACTIVE_THREADS + // counter may not exist any more at destruction time, hence try .. catch ... + try { --active; } catch (...) {} +#endif +} + +template<typename Token> +Bitext<Token>::agenda::job +::job(Bitext<Token> const* const theBitext, + typename TSA<Token>::tree_iterator const& m, + sptr<TSA<Token> > const& r, size_t maxsmpl, + bool isfwd, sptr<SamplingBias const> const& bias) + : m_bitext(theBitext) + , rnd(0) + , rnddenom(rnd.max() + 1.) 
+ , min_diverse(1) + , workers(0) + , root(r) + , next(m.lower_bound(-1)) + , stop(m.upper_bound(-1)) + , max_samples(maxsmpl) + , ctr(0) + , len(m.size()) + , fwd(isfwd) + , m_bias(bias) +{ + stats.reset(new pstats()); + stats->raw_cnt = m.approxOccurrenceCount(); + bias_total = 0; + + // we need to renormalize on the fly, as the summ of all sentence probs over + // all candidates (not all sentences in the corpus) needs to add to 1. + // Profiling question: how much does that cost us? + if (m_bias) + { + int ctr = 0; + stats->raw_cnt = 0; + for (char const* x = m.lower_bound(-1); x < stop;) + { + uint32_t sid; ushort offset; + x = root->readSid(x,stop,sid); + x = root->readOffset(x,stop,offset); +#if 0 + cerr << ctr++ << " " << m.str(m_bitext->V1.get()) + << " " << sid << "/" << root->getCorpusSize() + << " " << offset << " " << stop-x << endl; +#endif + bias_total += (*m_bias)[sid]; + ++stats->raw_cnt; + } + } +#if UG_BITEXT_TRACK_ACTIVE_THREADS + ++active; + // if (active%5 == 0) + // cerr << size_t(active) << " active jobs at " << __FILE__ << ":" << __LINE__ << endl; +#endif +} + +template<typename Token> +bool Bitext<Token>::agenda::job +::done() const +{ + return (max_samples && stats->good >= max_samples) || next == stop; +} + +template<typename Token> +int Bitext<Token>::agenda::job +::check_sample_distribution(uint64_t const& sid, uint64_t const& offset) +{ // ensure that the sampled distribution approximately matches the bias + // @return 0: SKIP this occurrence + // @return 1: consider this occurrence for sampling + // @return 2: include this occurrence in the sample by all means + + if (!m_bias) return 1; + + using namespace boost::math; + typedef boost::math::binomial_distribution<> binomial; + + ostream* log = m_bias->loglevel > 1 ? m_bias->log : NULL; + + float p = (*m_bias)[sid]; + id_type docid = m_bias->GetClass(sid); + uint32_t k = docid < stats->indoc.size() ? 
stats->indoc[docid] : 0; + + // always consider candidates from dominating documents and + // from documents that have not been considered at all yet + bool ret = (p > .5 || k == 0); + + if (ret && !log) return 1; + + uint32_t N = stats->good; // number of trials + float d = cdf(complement(binomial(N, p), k)); + // d: probability that samples contains k or more instances from doc #docid + ret = ret || d >= .05; + + if (log) + { + Token const* t = root->getCorpus()->sntStart(sid)+offset; + Token const* x = t - min(offset,uint64_t(3)); + Token const* e = t+4; + if (e > root->getCorpus()->sntEnd(sid)) + e = root->getCorpus()->sntEnd(sid); + *log << docid << ":" << sid << " " << size_t(k) << "/" << N + << " @" << p << " => " << d << " ["; + for (size_t i = 0; i < stats->indoc.size(); ++i) + { + if (i) *log << " "; + *log << stats->indoc[i]; + } + *log << "] "; + for (; x < e; ++x) *log << (*m_bitext->V1)[x->id()] << " "; + if (!ret) *log << "SKIP"; + else if (p < .5 && d > .9) *log << "FORCE"; + *log << endl; + } + + return (ret ? (p < .5 && d > .9) ? 2 : 1 : 0); +} + +template<typename Token> +bool Bitext<Token>::agenda::job +::flip_coin(uint64_t & sid, uint64_t & offset) +{ + int no_maybe_yes = m_bias ? check_sample_distribution(sid, offset) : 1; + if (no_maybe_yes == 0) return false; // no + if (no_maybe_yes > 1) return true; // yes + // ... 
maybe: flip a coin + size_t options_chosen = stats->good; + size_t options_total = max(stats->raw_cnt, this->ctr); + size_t options_left = (options_total - this->ctr); + size_t random_number = options_left * (rnd()/(rnd.max()+1.)); + size_t threshold; + if (bias_total) // we have a bias and there are candidates with non-zero prob + threshold = ((*m_bias)[sid]/bias_total * options_total * max_samples); + else // no bias, or all have prob 0 (can happen with a very opinionated bias) + threshold = max_samples; + return random_number + options_chosen < threshold; +} + +template<typename Token> +bool Bitext<Token>::agenda::job +::step(uint64_t & sid, uint64_t & offset) +{ // caller must lock! + if (next == stop) return false; + UTIL_THROW_IF2 + ( next > stop, "Fatal error at " << HERE << ". How did that happen?" ); + // boost::lock_guard<boost::mutex> jguard(lock); // caller must lock! + next = root->readSid(next, stop, sid); + next = root->readOffset(next, stop, offset); + ++ctr; + return true; +} + +template<typename Token> +bool Bitext<Token>::agenda::job +::nextSample(uint64_t & sid, uint64_t & offset) +{ + boost::lock_guard<boost::mutex> jguard(lock); + if (max_samples == 0) // no sampling, consider all occurrences + return step(sid, offset); + + while (step(sid,offset)) + { + size_t good = stats->good; + size_t diversity = stats->trg.size(); + if (good >= max_samples && diversity >= min_diverse) + return false; // done + + // flip_coin softly enforces approximation of the sampling to the + // bias (occurrences that would steer the sample too far from the bias + // are ruled out), and flips a biased coin otherwise. 
+ if (!flip_coin(sid,offset)) continue; + return true; + } + return false; +} + +#if UG_BITEXT_TRACK_ACTIVE_THREADS +template<typename TKN> +ThreadSafeCounter Bitext<TKN>::agenda +::job +::active; +#endif diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h new file mode 100644 index 000000000..92ed3d36a --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h @@ -0,0 +1,102 @@ +// to be included from ug_bitext_agenda.h + +template<typename Token> +void +Bitext<Token>::agenda +::worker +::operator()() +{ + // things to do: + // + // - have each worker maintain their own pstats object and merge + // results at the end (to minimize mutex locking); + // + // - use a non-locked, monotonically increasing counter to + // ensure the minimum size of samples considered --- it's OK if + // we look at more samples than required. This way, we can + // reduce the number of lock / unlock operations we need to do + // during sampling. + + uint64_t sid=0, offset=0; // sid and offset of source phrase + size_t s1=0, s2=0, e1=0, e2=0; // soft and hard boundaries of target phrase + vector<uchar> aln; // stores phrase-pair-internal alignment + while(sptr<job> j = ag.get_job()) + { + j->stats->register_worker(); + bitvector full_alignment(100*100); // Is full_alignment still needed??? + while (j->nextSample(sid,offset)) + { + aln.clear(); + int po_fwd = Moses::LRModel::NONE; + int po_bwd = Moses::LRModel::NONE; + int docid = j->m_bias ? j->m_bias->GetClass(sid) : -1; + bitvector* full_aln = j->fwd ? &full_alignment : NULL; + + // find soft and hard boundaries of target phrase + bool good = (ag.bt.find_trg_phr_bounds + (sid, offset, offset + j->len, // input parameters + s1, s2, e1, e2, po_fwd, po_bwd, // bounds & orientation + &aln, full_aln, !j->fwd)); // aln info / flip sides? 
+ + if (!good) + { // no good, probably because phrase is not coherent + j->stats->count_sample(docid, 0, po_fwd, po_bwd); + continue; + } + + // all good: register this sample as valid + size_t num_pairs = (s2-s1+1) * (e2-e1+1); + j->stats->count_sample(docid, num_pairs, po_fwd, po_bwd); + +#if 0 + Token const* t = ag.bt.T2->sntStart(sid); + Token const* eos = ag.bt.T2->sntEnd(sid); + cerr << "[" << j->stats->good + 1 << "] "; + while (t != eos) cerr << (*ag.bt.V2)[(t++)->id()] << " "; + cerr << "[" << docid << "]" << endl; +#endif + + float sample_weight = 1./num_pairs; + Token const* o = (j->fwd ? ag.bt.T2 : ag.bt.T1)->sntStart(sid); + + // adjust offsets in phrase-internal aligment + for (size_t k = 1; k < aln.size(); k += 2) aln[k] += s2 - s1; + + vector<uint64_t> seen; seen.reserve(10); + // It is possible that the phrase extraction extracts the same + // phrase twice, e.g., when word a co-occurs with sequence b b b + // but is aligned only to the middle word. We can only count + // each phrase pair once per source phrase occurrence, or else + // run the risk of having more joint counts than marginal + // counts. + + for (size_t s = s1; s <= s2; ++s) + { + TSA<Token> const& I = j->fwd ? 
*ag.bt.I2 : *ag.bt.I1; + sptr<iter> b = I.find(o + s, e1 - s); + UTIL_THROW_IF2(!b || b->size() < e1-s, "target phrase not found"); + + for (size_t i = e1; i <= e2; ++i) + { + uint64_t tpid = b->getPid(); + + // poor man's protection against over-counting + size_t s = 0; + while (s < seen.size() && seen[s] != tpid) ++s; + if (s < seen.size()) continue; + seen.push_back(tpid); + + size_t raw2 = b->approxOccurrenceCount(); + j->stats->add(tpid, sample_weight, aln, raw2, + po_fwd, po_bwd, docid); + bool ok = (i == e2) || b->extend(o[i].id()); + UTIL_THROW_IF2(!ok, "Could not extend target phrase."); + } + if (s < s2) // shift phrase-internal alignments + for (size_t k = 1; k < aln.size(); k += 2) + --aln[k]; + } + } + j->stats->release(); // indicate that you're done working on j->stats + } +} diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc new file mode 100644 index 000000000..2dda3ab9a --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc @@ -0,0 +1,91 @@ +#include "ug_bitext_jstats.h" +namespace Moses +{ + namespace bitext + { + + uint32_t jstats::rcnt() const { return my_rcnt; } + float jstats::wcnt() const { return my_wcnt; } + uint32_t jstats::cnt2() const { return my_cnt2; } + + // What was that used for again? 
UG + bool jstats::valid() { return my_wcnt >= 0; } + void jstats::validate() { if (my_wcnt < 0) my_wcnt *= -1; } + void jstats::invalidate() { if (my_wcnt > 0) my_wcnt *= -1; } + + jstats:: + jstats() + : my_rcnt(0), my_cnt2(0), my_wcnt(0) + { + for (int i = 0; i <= Moses::LRModel::NONE; ++i) + ofwd[i] = obwd[i] = 0; + my_aln.reserve(1); + } + + jstats:: + jstats(jstats const& other) + { + my_rcnt = other.rcnt(); + my_wcnt = other.wcnt(); + my_aln = other.aln(); + indoc = other.indoc; + for (int i = 0; i <= Moses::LRModel::NONE; i++) + { + ofwd[i] = other.ofwd[i]; + obwd[i] = other.obwd[i]; + } + } + + uint32_t + jstats:: + dcnt_fwd(PhraseOrientation const idx) const + { + assert(idx <= Moses::LRModel::NONE); + return ofwd[idx]; + } + + uint32_t + jstats:: + dcnt_bwd(PhraseOrientation const idx) const + { + assert(idx <= Moses::LRModel::NONE); + return obwd[idx]; + } + + void + jstats:: + add(float w, vector<uchar> const& a, uint32_t const cnt2, + uint32_t fwd_orient, uint32_t bwd_orient, int const docid) + { + boost::lock_guard<boost::mutex> lk(this->lock); + my_cnt2 = cnt2; + my_rcnt += 1; + my_wcnt += w; + if (a.size()) + { + size_t i = 0; + while (i < my_aln.size() && my_aln[i].second != a) ++i; + if (i == my_aln.size()) + my_aln.push_back(pair<size_t,vector<uchar> >(1,a)); + else + my_aln[i].first++; + if (my_aln[i].first > my_aln[i/2].first) + push_heap(my_aln.begin(),my_aln.begin()+i+1); + } + ++ofwd[fwd_orient]; + ++obwd[bwd_orient]; + if (docid >= 0) + { + while (int(indoc.size()) <= docid) indoc.push_back(0); + ++indoc[docid]; + } + } + + vector<pair<size_t, vector<uchar> > > const& + jstats:: + aln() const + { return my_aln; } + + + } +} diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h new file mode 100644 index 000000000..13c86e34d --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h @@ -0,0 +1,48 @@ +// -*- c++ -*- +#pragma once +#include "ug_typedefs.h" +#include 
"ug_lexical_reordering.h" +#include <boost/thread.hpp> + +namespace Moses +{ + namespace bitext + { + using namespace ugdiss; + + // "joint" (i.e., phrase pair) statistics + class + jstats + { + boost::mutex lock; + uint32_t my_rcnt; // unweighted joint count + uint32_t my_cnt2; // raw counts L2 + float my_wcnt; // weighted joint count + + // to do: use a static alignment pattern store that stores each pattern only + // once, so that we don't have to store so many alignment vectors + vector<pair<size_t, vector<uchar> > > my_aln; // internal word alignment + + uint32_t ofwd[Moses::LRModel::NONE+1]; // forward distortion type counts + uint32_t obwd[Moses::LRModel::NONE+1]; // backward distortion type counts + + public: + vector<uint32_t> indoc; // counts origin of samples (for biased sampling) + jstats(); + jstats(jstats const& other); + uint32_t rcnt() const; // raw joint counts + uint32_t cnt2() const; // raw target phrase occurrence count + float wcnt() const; // weighted joint counts + + vector<pair<size_t, vector<uchar> > > const & aln() const; + void add(float w, vector<uchar> const& a, uint32_t const cnt2, + uint32_t fwd_orient, uint32_t bwd_orient, + int const docid); + void invalidate(); + void validate(); + bool valid(); + uint32_t dcnt_fwd(PhraseOrientation const idx) const; + uint32_t dcnt_bwd(PhraseOrientation const idx) const; + }; + } +} diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc new file mode 100644 index 000000000..bbae42e85 --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc @@ -0,0 +1,83 @@ +#include "ug_bitext_pstats.h" + +namespace Moses +{ + namespace bitext + { + +#if UG_BITEXT_TRACK_ACTIVE_THREADS + ThreadSafeCounter pstats::active; +#endif + + pstats:: + pstats() : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0) + { + for (int i = 0; i <= Moses::LRModel::NONE; ++i) + ofwd[i] = obwd[i] = 0; + } + + pstats:: + ~pstats() + { +#if 
UG_BITEXT_TRACK_ACTIVE_THREADS + // counter may not exist any more at destruction time, so try ... catch + try { --active; } catch (...) {} +#endif + } + + void + pstats:: + register_worker() + { + this->lock.lock(); + ++this->in_progress; + this->lock.unlock(); + } + + void + pstats:: + release() + { + this->lock.lock(); + if (this->in_progress-- == 1) // last one -> we're done + this->ready.notify_all(); + this->lock.unlock(); + } + + void + pstats + ::count_sample(int const docid, size_t const num_pairs, + int const po_fwd, int const po_bwd) + { // counts one sampled occurrence; only occurrences with num_pairs > 0 count as good + boost::lock_guard<boost::mutex> guard(lock); + ++sample_cnt; + if (num_pairs == 0) return; + ++good; + sum_pairs += num_pairs; + ++ofwd[po_fwd]; + ++obwd[po_bwd]; + while (int(indoc.size()) <= docid) indoc.push_back(0); // no-op when docid < 0 + if (docid >= 0) ++indoc[docid]; // a negative docid means "no document info" (cf. jstats::add); never index with it + } + + bool + pstats:: + add(uint64_t pid, float const w, + vector<uchar> const& a, + uint32_t const cnt2, + uint32_t fwd_o, + uint32_t bwd_o, int const docid) + { + boost::lock_guard<boost::mutex> guard(this->lock); + jstats& entry = this->trg[pid]; + entry.add(w, a, cnt2, fwd_o, bwd_o, docid); + if (this->good < entry.rcnt()) + { + UTIL_THROW(util::Exception, "more joint counts than good counts:" + << entry.rcnt() << "/" << this->good << "!"); + } + return true; + } + + } +} diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h new file mode 100644 index 000000000..c5b6c0152 --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h @@ -0,0 +1,63 @@ +// -*- c++ -*- +#pragma once + +#include <boost/thread.hpp> +#include <boost/unordered_map.hpp> + +#include "ug_typedefs.h" +#include "ug_bitext_jstats.h" +#include "moses/thread_safe_container.h" + +namespace Moses +{ + namespace bitext + { + struct + pstats + { + typedef boost::unordered_map<uint64_t, sptr<pstats> > map_t; + typedef ThreadSafeContainer<uint64_t, sptr<pstats>, map_t> cache_t; + typedef std::vector<uchar> alnvec; +#if UG_BITEXT_TRACK_ACTIVE_THREADS +
static ThreadSafeCounter active; +#endif + boost::mutex lock; // for parallel gathering of stats + boost::condition_variable ready; // consumers can wait for me to be ready + + size_t raw_cnt; // (approximate) raw occurrence count + size_t sample_cnt; // number of instances selected during sampling + size_t good; // number of selected instances with valid word alignments + size_t sum_pairs; // total number of target phrases extracted (can be > raw_cnt) + size_t in_progress; // how many threads are currently working on this? + + uint32_t ofwd[Moses::LRModel::NONE+1]; // distribution of fwd phrase orientations + uint32_t obwd[Moses::LRModel::NONE+1]; // distribution of bwd phrase orientations + + std::vector<uint32_t> indoc; // distribution over where samples came from + + typedef std::map<uint64_t, jstats> trg_map_t; + trg_map_t trg; + pstats(); + ~pstats(); + void release(); + void register_worker(); + size_t count_workers() { return in_progress; } + + bool + add(uint64_t const pid, // target phrase id + float const w, // sample weight (1./(# of phrases extractable)) + alnvec const& a, // local alignment + uint32_t const cnt2, // raw target phrase count + uint32_t fwd_o, // fwd. phrase orientation + uint32_t bwd_o, // bwd. 
phrase orientation + int const docid); // document where sample was found + + void + count_sample(int const docid, // document where sample was found + size_t const num_pairs, // # of phrases extractable here + int const po_fwd, // fwd phrase orientation + int const po_bwd); // bwd phrase orientation + }; + + } +} diff --git a/moses/TranslationModel/UG/mm/ug_im_bitext.cc b/moses/TranslationModel/UG/mm/ug_im_bitext.cc new file mode 100644 index 000000000..9f26a181b --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_im_bitext.cc @@ -0,0 +1,87 @@ +#include "ug_im_bitext.h" + +namespace Moses +{ + namespace bitext + { + + template<> + sptr<imBitext<L2R_Token<SimpleWordId> > > + imBitext<L2R_Token<SimpleWordId> >:: + add(vector<string> const& s1, + vector<string> const& s2, + vector<string> const& aln) const + { + typedef L2R_Token<SimpleWordId> TKN; + assert(s1.size() == s2.size() && s1.size() == aln.size()); + +#ifndef NDEBUG + size_t first_new_snt = this->T1 ? this->T1->size() : 0; +#endif + + sptr<imBitext<TKN> > ret; + { + boost::unique_lock<boost::shared_mutex> guard(m_lock); + ret.reset(new imBitext<TKN>(*this)); + } + + // we add the sentences in separate threads (so it's faster) + boost::thread thread1(snt_adder<TKN>(s1,*ret->V1,ret->myT1,ret->myI1)); + // thread1.join(); // for debugging + boost::thread thread2(snt_adder<TKN>(s2,*ret->V2,ret->myT2,ret->myI2)); + BOOST_FOREACH(string const& a, aln) + { + istringstream ibuf(a); + ostringstream obuf; + uint32_t row,col; char c; + while (ibuf >> row >> c >> col) + { + UTIL_THROW_IF2(c != '-', "[" << HERE << "] " + << "Error in alignment information:\n" << a); + binwrite(obuf,row); + binwrite(obuf,col); + } + // important: DO NOT replace the two lines below this comment by + // char const* x = obuf.str().c_str(), as the memory x is pointing + // to is freed immediately upon deconstruction of the string object. 
+ string foo = obuf.str(); + char const* x = foo.c_str(); + vector<char> v(x,x+foo.size()); + ret->myTx = append(ret->myTx, v); + } + + thread1.join(); + thread2.join(); + + ret->Tx = ret->myTx; + ret->T1 = ret->myT1; + ret->T2 = ret->myT2; + ret->I1 = ret->myI1; + ret->I2 = ret->myI2; + +#ifndef NDEBUG + // sanity check + for (size_t i = first_new_snt; i < ret->T1->size(); ++i) + { + size_t slen1 = ret->T1->sntLen(i); + size_t slen2 = ret->T2->sntLen(i); + char const* p = ret->Tx->sntStart(i); + char const* q = ret->Tx->sntEnd(i); + size_t k; + while (p < q) + { + p = binread(p,k); + assert(p); + assert(p < q); + assert(k < slen1); + p = binread(p,k); + assert(p); + assert(k < slen2); + } + } +#endif + return ret; + } + + } +} diff --git a/moses/TranslationModel/UG/mm/ug_im_bitext.h b/moses/TranslationModel/UG/mm/ug_im_bitext.h new file mode 100644 index 000000000..a620b7219 --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_im_bitext.h @@ -0,0 +1,130 @@ +// -*- c++ -*- +#pragma once +#include "ug_bitext.h" + +namespace Moses +{ + namespace bitext + { + template<typename TKN> + class imBitext : public Bitext<TKN> + { + sptr<imTtrack<char> > myTx; + sptr<imTtrack<TKN> > myT1; + sptr<imTtrack<TKN> > myT2; + sptr<imTSA<TKN> > myI1; + sptr<imTSA<TKN> > myI2; + static ThreadSafeCounter my_revision; + public: + size_t revision() const { return my_revision; } + void open(string const base, string const L1, string L2); + imBitext(sptr<TokenIndex> const& V1, + sptr<TokenIndex> const& V2, + size_t max_sample = 5000, size_t num_workers=4); + imBitext(size_t max_sample = 5000, size_t num_workers=4); + imBitext(imBitext const& other); + + // sptr<imBitext<TKN> > + // add(vector<TKN> const& s1, vector<TKN> const& s2, vector<ushort> & a); + + sptr<imBitext<TKN> > + add(vector<string> const& s1, + vector<string> const& s2, + vector<string> const& a) const; + + }; + + template<typename TKN> + ThreadSafeCounter + imBitext<TKN>::my_revision; + + template<typename TKN> + 
imBitext<TKN>:: + imBitext(size_t max_sample, size_t num_workers) + : Bitext<TKN>(max_sample, num_workers) + { + this->m_default_sample_size = max_sample; + this->V1.reset(new TokenIndex()); + this->V2.reset(new TokenIndex()); + this->V1->setDynamic(true); + this->V2->setDynamic(true); + ++my_revision; + } + + template<typename TKN> + imBitext<TKN>:: + imBitext(sptr<TokenIndex> const& v1, + sptr<TokenIndex> const& v2, + size_t max_sample, size_t num_workers) + : Bitext<TKN>(max_sample, num_workers) + { + // this->default_sample_size = max_sample; + this->V1 = v1; + this->V2 = v2; + this->V1->setDynamic(true); + this->V2->setDynamic(true); + ++my_revision; + } + + + template<typename TKN> + imBitext<TKN>:: + imBitext(imBitext<TKN> const& other) + { + this->myTx = other.myTx; + this->myT1 = other.myT1; + this->myT2 = other.myT2; + this->myI1 = other.myI1; + this->myI2 = other.myI2; + this->Tx = this->myTx; + this->T1 = this->myT1; + this->T2 = this->myT2; + this->I1 = this->myI1; + this->I2 = this->myI2; + this->V1 = other.V1; + this->V2 = other.V2; + this->m_default_sample_size = other.m_default_sample_size; + this->m_num_workers = other.m_num_workers; + ++my_revision; + } + + template<> + sptr<imBitext<L2R_Token<SimpleWordId> > > + imBitext<L2R_Token<SimpleWordId> >:: + add(vector<string> const& s1, + vector<string> const& s2, + vector<string> const& aln) const; + + template<typename TKN> + sptr<imBitext<TKN> > + imBitext<TKN>:: + add(vector<string> const& s1, + vector<string> const& s2, + vector<string> const& aln) const + { + throw "Not yet implemented"; + } + + // What's up with this function???? 
UG + template<typename TKN> + void + imBitext<TKN>:: + open(string const base, string const L1, string L2) + { + mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get()); + mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get()); + mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get()); + t1.open(base+L1+".mct"); + t2.open(base+L2+".mct"); + tx.open(base+L1+"-"+L2+".mam"); + this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex(); + this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex(); + mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get()); + mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get()); + i1.open(base+L1+".sfa", this->T1); + i2.open(base+L2+".sfa", this->T2); + assert(this->T1->size() == this->T2->size()); + } + + } +} diff --git a/moses/TranslationModel/UG/mm/ug_mm_bitext.h b/moses/TranslationModel/UG/mm/ug_mm_bitext.h new file mode 100644 index 000000000..211793277 --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_mm_bitext.h @@ -0,0 +1,81 @@ +// -*- c++ -*- +// don't include this file directly! it is included by ug_bitext.h + +namespace Moses +{ + namespace bitext + { + template<typename TKN> + class mmBitext : public Bitext<TKN> + { + void load_document_map(string const& fname); + public: + void open(string const base, string const L1, string L2); + mmBitext(); + }; + + template<typename TKN> + mmBitext<TKN>:: + mmBitext() + : Bitext<TKN>(new mmTtrack<TKN>(), new mmTtrack<TKN>(), new mmTtrack<char>(), + new TokenIndex(), new TokenIndex(), + new mmTSA<TKN>(), new mmTSA<TKN>()) + {}; + + template<typename TKN> + void + mmBitext<TKN>:: + load_document_map(string const& fname) + { + ifstream docmap(fname.c_str()); + // the docmap file should list the documents in the corpus + // in the order in which they appear with one line per document: + // <docname> <number of lines / sentences> + // + // in the future, we might also allow listing documents with + // sentence ranges.
+ string buffer,docname; size_t a=0,b=0; // a: first sentence id of the current document; b starts defined because it is compared below even if the map has no entries + this->m_sid2docid.reset(new vector<id_type>(this->T1->size())); + while(getline(docmap,buffer)) + { + istringstream line(buffer); + if (!(line>>docname)) continue; // empty line + if (docname.size() && docname[0] == '#') continue; // comment + size_t docid = this->m_docname2docid.size(); + this->m_docname2docid[docname] = docid; + line >> b; UTIL_THROW_IF2(a + b > this->T1->size(), "Document map doesn't match corpus!"); // refuse to fill m_sid2docid past its end + VERBOSE(1, "DOCUMENT MAP " << docname + << " " << a << "-" << b+a << endl); + for (b += a; a < b; ++a) + (*this->m_sid2docid)[a] = docid; + } + UTIL_THROW_IF2(b != this->T1->size(), + "Document map doesn't match corpus!"); + } + + template<typename TKN> + void + mmBitext<TKN>:: + open(string const base, string const L1, string L2) + { + mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get()); + mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get()); + mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get()); + t1.open(base+L1+".mct"); + t2.open(base+L2+".mct"); + tx.open(base+L1+"-"+L2+".mam"); + this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex(); + this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex(); + mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get()); + mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get()); + i1.open(base+L1+".sfa", this->T1); + i2.open(base+L2+".sfa", this->T2); + assert(this->T1->size() == this->T2->size()); + + string docmapfile = base+"dmp"; + if (!access(docmapfile.c_str(),F_OK)) + load_document_map(docmapfile); + } + + } +} diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h new file mode 100644 index 000000000..28a926587 --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h @@ -0,0 +1,246 @@ +// -*- c++ -*- +#pragma once +#include <vector> +#include "ug_typedefs.h" +#include "ug_bitext_pstats.h" + +namespace Moses +{ + namespace bitext + { + template<typename Token> + class + PhrasePair + { + public: + class
Scorer { public: virtual float operator()(PhrasePair& pp) const = 0; }; + Token const* start1; + Token const* start2; + uint32_t len1; + uint32_t len2; + uint64_t p1, p2; + uint32_t raw1, raw2, sample1, sample2, good1, good2, joint; + std::vector<float> fvals; + float dfwd[Moses::LRModel::NONE+1]; // distortion counts // counts or probs? + float dbwd[Moses::LRModel::NONE+1]; // distortion counts + std::vector<uchar> aln; + float score; + bool inverse; + std::vector<uint32_t> indoc; + PhrasePair() { }; + PhrasePair(PhrasePair const& o); + + PhrasePair const& operator+=(PhrasePair const& other); + + bool operator<(PhrasePair const& other) const; + bool operator>(PhrasePair const& other) const; + bool operator<=(PhrasePair const& other) const; + bool operator>=(PhrasePair const& other) const; + + void init(); + void init(uint64_t const pid1, bool is_inverse, + Token const* x, uint32_t const len, + pstats const* ps = NULL, size_t const numfeats=0); + + PhrasePair const& + update(uint64_t const pid2, Token const* x, + uint32_t const len, jstats const& js); + + class SortByTargetIdSeq + { + public: + int cmp(PhrasePair const& a, PhrasePair const& b) const; + bool operator()(PhrasePair const& a, PhrasePair const& b) const; + }; + + class SortDescendingByJointCount + { + public: + int cmp(PhrasePair const& a, PhrasePair const& b) const; + bool operator()(PhrasePair const& a, PhrasePair const& b) const; + }; + }; + + template<typename Token> + void PhrasePair<Token> + ::init(uint64_t const pid1, bool is_inverse, + Token const* x, uint32_t const len, + pstats const* ps, size_t const numfeats) + { + inverse = is_inverse; + start1 = x; len1 = len; + p1 = pid1; + p2 = 0; + if (ps) + { + raw1 = ps->raw_cnt; + sample1 = ps->sample_cnt; + good1 = ps->good; + } + else raw1 = sample1 = good1 = 0; + joint = 0; + good2 = 0; + sample2 = 0; + raw2 = 0; + fvals.resize(numfeats); + } + + template<typename Token> + PhrasePair<Token> const& + PhrasePair<Token> + ::update(uint64_t const 
pid2, + Token const* x, uint32_t const len, jstats const& js) + { + p2 = pid2; + start2 = x; len2 = len; + raw2 = js.cnt2(); + joint = js.rcnt(); + assert(js.aln().size()); + if (js.aln().size()) + aln = js.aln()[0].second; + float total_fwd = 0, total_bwd = 0; + for (int i = 0; i <= Moses::LRModel::NONE; i++) + { + PhraseOrientation po = static_cast<PhraseOrientation>(i); + total_fwd += js.dcnt_fwd(po)+1; + total_bwd += js.dcnt_bwd(po)+1; + } + + // should we do that here or leave the raw counts? + for (int i = 0; i <= Moses::LRModel::NONE; i++) + { + PhraseOrientation po = static_cast<PhraseOrientation>(i); + dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd; + dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd; + } + + indoc = js.indoc; + return *this; + } + + template<typename Token> + bool + PhrasePair<Token> + ::operator<(PhrasePair const& other) const + { + return this->score < other.score; + } + + template<typename Token> + bool + PhrasePair<Token> + ::operator>(PhrasePair const& other) const + { + return this->score > other.score; + } + + template<typename Token> + bool + PhrasePair<Token> + ::operator<=(PhrasePair const& other) const + { + return this->score <= other.score; + } + + template<typename Token> + bool + PhrasePair<Token> + ::operator>=(PhrasePair const& other) const + { + return this->score >= other.score; + } + + template<typename Token> + PhrasePair<Token> const& + PhrasePair<Token> + ::operator+=(PhrasePair const& o) + { + raw1 += o.raw1; + raw2 += o.raw2; + good1 += o.good1; + good2 += o.good2; + joint += o.joint; + sample1 += o.sample1; + sample2 += o.sample2; + return *this; + } + + template<typename Token> + PhrasePair<Token> + ::PhrasePair(PhrasePair<Token> const& o) + : start1(o.start1) , start2(o.start2) + , len1(o.len1) , len2(o.len2) + , p1(o.p1) , p2(o.p2) + , raw1(o.raw1) , raw2(o.raw2) + , sample1(o.sample1) , sample2(o.sample2) + , good1(o.good1) , good2(o.good2) + , joint(o.joint) + , fvals(o.fvals) + , aln(o.aln) + , score(o.score) + , 
inverse(o.inverse) + , indoc(o.indoc) + { + for (int i = 0; i <= Moses::LRModel::NONE; ++i) + { + dfwd[i] = o.dfwd[i]; + dbwd[i] = o.dbwd[i]; + } + } + + template<typename Token> + int PhrasePair<Token> + ::SortByTargetIdSeq + ::cmp(PhrasePair const& a, PhrasePair const& b) const + { + size_t i = 0; + Token const* x = a.start2; + Token const* y = b.start2; + while (i < a.len2 && i < b.len2 && x->id() == y->id()) + { + x = x->next(); + y = y->next(); + ++i; + } + if (i == a.len2 && i == b.len2) return 0; + if (i == a.len2) return -1; + if (i == b.len2) return 1; + return x->id() < y->id() ? -1 : 1; + } + + template<typename Token> + bool PhrasePair<Token> + ::SortByTargetIdSeq + ::operator()(PhrasePair const& a, PhrasePair const& b) const + { + return this->cmp(a,b) < 0; + } + + template<typename Token> + int PhrasePair<Token> + ::SortDescendingByJointCount + ::cmp(PhrasePair const& a, PhrasePair const& b) const + { + if (a.joint == b.joint) return 0; + return a.joint > b.joint ? -1 : 1; + } + + template<typename Token> + bool PhrasePair<Token> + ::SortDescendingByJointCount + ::operator()(PhrasePair const& a, PhrasePair const& b) const + { + return this->cmp(a,b) < 0; + } + + template<typename Token> + void PhrasePair<Token> + ::init() + { + inverse = false; + len1 = len2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0; + start1 = start2 = NULL; + p1 = p2 = 0; + } + } +} diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.h b/moses/TranslationModel/UG/mm/ug_sampling_bias.h index f7c95f439..faed69e63 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.h +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.h @@ -4,9 +4,9 @@ #include <map> #include<vector> #include <string> +#include <iostream> #include "moses/Util.h" #include "ug_typedefs.h" - namespace Moses { namespace bitext @@ -18,7 +18,8 @@ namespace Moses class SamplingBias { public: - + int loglevel; + std::ostream* log; virtual float operator[](id_type const ID) const = 0; // 
returns (unnormalized bias) for the class of item ID diff --git a/moses/TranslationModel/UG/mm/ug_ttrack_base.h b/moses/TranslationModel/UG/mm/ug_ttrack_base.h index 7c11b3942..f9864bda6 100644 --- a/moses/TranslationModel/UG/mm/ug_ttrack_base.h +++ b/moses/TranslationModel/UG/mm/ug_ttrack_base.h @@ -17,6 +17,7 @@ #include "ug_ttrack_position.h" #include "tpt_typedefs.h" #include "tpt_tokenindex.h" +#include "moses/Util.h" // #include "ug_vocab.h" namespace ugdiss @@ -25,6 +26,33 @@ namespace ugdiss typedef boost::dynamic_bitset<uint64_t> bdBitset; + template<typename sid_t, typename off_t, typename len_t> + void + parse_pid(uint64_t const pid, sid_t & sid, + off_t & off, len_t& len) + { + static uint64_t two32 = uint64_t(1)<<32; + static uint64_t two16 = uint64_t(1)<<16; + len = pid%two16; + off = (pid%two32)>>16; + sid = pid>>32; + } + + template<typename Token> + string + toString(TokenIndex const& V, Token const* x, size_t const len) + { + if (!len) return ""; + UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!"); + ostringstream buf; + buf << V[x->id()]; + size_t i = 1; + for (x = x->next(); x && i < len; ++i, x = x->next()) + buf << " " << V[x->id()]; + UTIL_THROW_IF2(i != len, HERE << ": Unexpected end of phrase!"); + return buf.str(); + } + template<typename TKN=id_type> class Ttrack { |