github.com/moses-smt/mosesdecoder.git
author     Ulrich Germann <Ulrich.Germann@gmail.com>   2015-04-05 16:17:47 +0300
committer  Ulrich Germann <Ulrich.Germann@gmail.com>   2015-04-05 16:29:00 +0300
commit     46e31a285c8f9257a9d6ab411db74b5cbec9d0fe (patch)
tree       9bf1afa3827e7252e6b9fd38e8ee27cef8693a9a /moses/TranslationModel/UG/mm
parent     05c4e382ff7914369700eb516a61a45238292bdf (diff)
- Code refactoring for Bitext class.
- Bug fixes and conceptual improvements in biased sampling. The sampling now tries to stick to the bias, even when an unsuitable corpus dominates the occurrences.
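
The "stick to the bias" behaviour described above is implemented in the diff as a one-sided binomial test on per-document sample counts (see the sampling step in job::step, which this commit moves into ug_bitext_agenda_job.h): if N samples have been accepted so far, k of them already come from the candidate sentence's document, and the bias assigns that sentence probability p, further candidates from the same document are skipped once P(X > k) < 0.05 for X ~ Binomial(N+1, p). A minimal sketch of that check, with a hypothetical function name and signature (only the Boost.Math call and the 0.05 threshold are taken from the diff):

    // Sketch of the over-representation test used during biased sampling.
    // N = samples accepted so far across all documents (stats->good in the diff)
    // k = samples already accepted from this candidate's document (stats->indoc[docid])
    // p = bias probability assigned to the candidate sentence ((*m_bias)[sid])
    #include <algorithm>
    #include <cstddef>
    #include <boost/math/distributions/binomial.hpp>

    bool overrepresented(std::size_t N, std::size_t k, double p)
    {
      if (N == 0) return false;                      // nothing sampled yet
      boost::math::binomial_distribution<> model(N + 1, p);
      double cap  = std::min<double>(k, N);          // the diff caps k at N
      double tail = cdf(complement(model, cap));     // P(X > k)
      return tail < 0.05;  // true: this document already contributes more
                           // samples than the bias would predict, so skip
    }

The helper name and argument list above are for illustration only; in the decoder the values come straight from the job's pstats object and the SamplingBias.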
Diffstat (limited to 'moses/TranslationModel/UG/mm')
-rw-r--r--  moses/TranslationModel/UG/mm/mmlex-build.cc               73
-rw-r--r--  moses/TranslationModel/UG/mm/ug_bitext.cc                264
-rw-r--r--  moses/TranslationModel/UG/mm/ug_bitext.h                1346
-rw-r--r--  moses/TranslationModel/UG/mm/ug_bitext_agenda.h          186
-rw-r--r--  moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h      240
-rw-r--r--  moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h   102
-rw-r--r--  moses/TranslationModel/UG/mm/ug_bitext_jstats.cc          91
-rw-r--r--  moses/TranslationModel/UG/mm/ug_bitext_jstats.h           48
-rw-r--r--  moses/TranslationModel/UG/mm/ug_bitext_pstats.cc          83
-rw-r--r--  moses/TranslationModel/UG/mm/ug_bitext_pstats.h           63
-rw-r--r--  moses/TranslationModel/UG/mm/ug_im_bitext.cc              87
-rw-r--r--  moses/TranslationModel/UG/mm/ug_im_bitext.h              130
-rw-r--r--  moses/TranslationModel/UG/mm/ug_mm_bitext.h               81
-rw-r--r--  moses/TranslationModel/UG/mm/ug_phrasepair.h             246
-rw-r--r--  moses/TranslationModel/UG/mm/ug_sampling_bias.h            5
-rw-r--r--  moses/TranslationModel/UG/mm/ug_ttrack_base.h             28
16 files changed, 1475 insertions(+), 1598 deletions(-)
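
The first file below, mmlex-build.cc, replaces debug-only assert() checks on the co-occurrence records with UTIL_THROW_IF2 from the newly included moses/Util.h, so out-of-range row/column indices are reported together with the offending sentence id even in release builds, where assert() compiles away. A minimal sketch of that pattern, using a simplified stand-in macro rather than the real UTIL_THROW_IF2 definition:

    // Simplified stand-in for the assert -> throw-with-message pattern.
    // The real macro streams its message into a util::Exception;
    // std::runtime_error keeps this sketch self-contained.
    #include <sstream>
    #include <stdexcept>

    #define THROW_IF(condition, message)             \
      do {                                           \
        if (condition) {                             \
          std::ostringstream buf_;                   \
          buf_ << message;                           \
          throw std::runtime_error(buf_.str());      \
        }                                            \
      } while (0)

    void check_alignment_point(std::size_t r, std::size_t n_rows, std::size_t sid)
    {
      // assert(r < n_rows);   // silently disappears under -DNDEBUG
      THROW_IF(r >= n_rows, "out of bounds at line " << sid);  // always active
    }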
diff --git a/moses/TranslationModel/UG/mm/mmlex-build.cc b/moses/TranslationModel/UG/mm/mmlex-build.cc
index 4ef0842e4..5e5ea194c 100644
--- a/moses/TranslationModel/UG/mm/mmlex-build.cc
+++ b/moses/TranslationModel/UG/mm/mmlex-build.cc
@@ -24,6 +24,7 @@
#include <boost/unordered_set.hpp>
#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
+#include "moses/Util.h"
#include "ug_mm_2d_table.h"
#include "ug_mm_ttrack.h"
#include "ug_corpus_token.h"
@@ -241,10 +242,14 @@ processSentence(id_type sid)
p = binread(p,r);
p = binread(p,c);
// cout << sid << " " << r << "-" << c << endl;
- assert(r < check1.size());
- assert(c < check2.size());
- assert(s1+r < e1);
- assert(s2+c < e2);
+ UTIL_THROW_IF2(r >= check1.size(), "out of bounds at line " << sid);
+ UTIL_THROW_IF2(c >= check2.size(), "out of bounds at line " << sid);
+ // assert(r < check1.size());
+ // assert(c < check2.size());
+ UTIL_THROW_IF2(s1+r >= e1, "out of bounds at line " << sid);
+ UTIL_THROW_IF2(s2+c >= e2, "out of bounds at line " << sid);
+ // assert(s1+r < e1);
+ // assert(s2+c < e2);
check1.reset(r);
check2.reset(c);
id_type id1 = (s1+r)->id();
@@ -266,66 +271,6 @@ processSentence(id_type sid)
CNT[wpair(0,(s2+i)->id())].a++;
}
-// void
-// writeTable(string ofname,
-// vector<vector<uint32_t> >& FREQ,
-// vector<map<id_type,uint32_t> >& RARE)
-// {
-// ofstream out(ofname.c_str());
-// filepos_type idxOffset=0;
-
-// vector<uint32_t> m1; // marginals L1
-// vector<uint32_t> m2; // marginals L2
-// m1.resize(max(first_rare_id,V1.getNumTokens()),0);
-// m2.resize(V2.getNumTokens(),0);
-// vector<id_type> index(V1.getNumTokens()+1,0);
-// numwrite(out,idxOffset); // blank for the time being
-// numwrite(out,id_type(m1.size()));
-// numwrite(out,id_type(m2.size()));
-
-// id_type cellCount=0;
-// id_type stop = min(first_rare_id,id_type(m1.size()));
-// for (id_type id1 = 0; id1 < stop; ++id1)
-// {
-// index[id1] = cellCount;
-// vector<uint32_t> const& v = FREQ[id1];
-// for (id_type id2 = 0; id2 < id_type(v.size()); ++id2)
-// {
-// if (!v[id2]) continue;
-// cellCount++;
-// numwrite(out,id2);
-// out.write(reinterpret_cast<char const*>(&v[id2]),sizeof(uint32_t));
-// m1[id1] += v[id2];
-// m2[id2] += v[id2];
-// }
-// }
-// for (id_type id1 = stop; id1 < id_type(m1.size()); ++id1)
-// {
-// index[id1] = cellCount;
-// map<id_type,uint32_t> const& M = RARE[id1];
-// for (map<id_type,uint32_t>::const_iterator m = M.begin(); m != M.end(); ++m)
-// {
-// if (m->second == 0) continue;
-// cellCount++;
-// numwrite(out,m->first);
-// out.write(reinterpret_cast<char const*>(&m->second),sizeof(float));
-// m1[id1] += m->second;
-// m2[m->first] += m->second;
-// }
-// }
-// index[m1.size()] = cellCount;
-// idxOffset = out.tellp();
-// for (size_t i = 0; i < index.size(); ++i)
-// numwrite(out,index[i]);
-// out.write(reinterpret_cast<char const*>(&m1[0]),m1.size()*sizeof(float));
-// out.write(reinterpret_cast<char const*>(&m2[0]),m2.size()*sizeof(float));
-
-// // re-write the file header
-// out.seekp(0);
-// numwrite(out,idxOffset);
-// out.close();
-// }
-
int
main(int argc, char* argv[])
{
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.cc b/moses/TranslationModel/UG/mm/ug_bitext.cc
index 29104aaec..fe95596ab 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.cc
+++ b/moses/TranslationModel/UG/mm/ug_bitext.cc
@@ -11,192 +11,6 @@ namespace Moses
namespace bitext
{
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
- ThreadSafeCounter pstats::active;
-#endif
-
- pstats::
- pstats()
- : raw_cnt (0)
- , sample_cnt (0)
- , good (0)
- , sum_pairs (0)
- , in_progress (0)
- {
- for (int i = 0; i <= Moses::LRModel::NONE; ++i)
- ofwd[i] = obwd[i] = 0;
- }
-
- pstats::
- ~pstats()
- {
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
- // counter may not exist any more at destruction time, so try ... catch
- try { --active; } catch (...) {}
-#endif
- }
-
- void
- pstats::
- register_worker()
- {
- this->lock.lock();
- ++this->in_progress;
- this->lock.unlock();
- }
-
- void
- pstats::
- release()
- {
- this->lock.lock();
- if (this->in_progress-- == 1) // last one -> we're done
- this->ready.notify_all();
- this->lock.unlock();
- }
-
- bool
- pstats::
- add(uint64_t pid, float const w,
- vector<uchar> const& a,
- uint32_t const cnt2,
- uint32_t fwd_o,
- uint32_t bwd_o, int const docid)
- {
- boost::lock_guard<boost::mutex> guard(this->lock);
- jstats& entry = this->trg[pid];
- entry.add(w,a,cnt2,fwd_o,bwd_o,docid);
- if (this->good < entry.rcnt())
- {
- UTIL_THROW(util::Exception, "more joint counts than good counts:"
- << entry.rcnt() << "/" << this->good << "!");
- }
-
- if (docid >= 0)
- {
- while (int(indoc.size()) <= docid) indoc.push_back(0);
- ++indoc[docid];
- }
-
- return true;
- }
-
- jstats::
- jstats()
- : my_rcnt(0), my_wcnt(0), my_cnt2(0)
- {
- for (int i = 0; i <= Moses::LRModel::NONE; ++i)
- ofwd[i] = obwd[i] = 0;
- my_aln.reserve(1);
- }
-
- jstats::
- jstats(jstats const& other)
- {
- my_rcnt = other.rcnt();
- my_wcnt = other.wcnt();
- my_aln = other.aln();
- indoc = other.indoc;
- for (int i = 0; i <= Moses::LRModel::NONE; i++)
- {
- ofwd[i] = other.ofwd[i];
- obwd[i] = other.obwd[i];
- }
- }
-
- uint32_t
- jstats::
- dcnt_fwd(PhraseOrientation const idx) const
- {
- assert(idx <= Moses::LRModel::NONE);
- return ofwd[idx];
- }
-
- uint32_t
- jstats::
- dcnt_bwd(PhraseOrientation const idx) const
- {
- assert(idx <= Moses::LRModel::NONE);
- return obwd[idx];
- }
-
- void
- jstats::
- add(float w, vector<uchar> const& a, uint32_t const cnt2,
- uint32_t fwd_orient, uint32_t bwd_orient, int const docid)
- {
- boost::lock_guard<boost::mutex> lk(this->lock);
- my_rcnt += 1;
- my_wcnt += w;
- // my_cnt2 += cnt2; // could I really be that stupid? [UG]
- my_cnt2 = cnt2;
- if (a.size())
- {
- size_t i = 0;
- while (i < my_aln.size() && my_aln[i].second != a) ++i;
- if (i == my_aln.size())
- my_aln.push_back(pair<size_t,vector<uchar> >(1,a));
- else
- my_aln[i].first++;
- if (my_aln[i].first > my_aln[i/2].first)
- push_heap(my_aln.begin(),my_aln.begin()+i+1);
- }
- ++ofwd[fwd_orient];
- ++obwd[bwd_orient];
- if (docid >= 0)
- {
- while (int(indoc.size()) <= docid) indoc.push_back(0);
- ++indoc[docid];
-
- // cout << docid << " => " << indoc[docid] << " " << HERE << endl;
-
- }
- }
-
- uint32_t
- jstats::
- rcnt() const
- { return my_rcnt; }
-
- float
- jstats::
- wcnt() const
- { return my_wcnt; }
-
- uint32_t
- jstats::
- cnt2() const
- { return my_cnt2; }
-
- vector<pair<size_t, vector<uchar> > > const&
- jstats::
- aln() const
- { return my_aln; }
-
- void
- jstats::
- invalidate()
- {
- if (my_wcnt > 0)
- my_wcnt *= -1;
- }
-
- void
- jstats::
- validate()
- {
- if (my_wcnt < 0)
- my_wcnt *= -1;
- }
-
- bool
- jstats::
- valid()
- {
- return my_wcnt >= 0;
- }
-
-
float
lbop(size_t const tries, size_t const succ, float const confidence)
{
@@ -206,83 +20,6 @@ namespace Moses
find_lower_bound_on_p(tries, succ, confidence)));
}
- template<>
- sptr<imBitext<L2R_Token<SimpleWordId> > >
- imBitext<L2R_Token<SimpleWordId> >::
- add(vector<string> const& s1,
- vector<string> const& s2,
- vector<string> const& aln) const
- {
- typedef L2R_Token<SimpleWordId> TKN;
- assert(s1.size() == s2.size() && s1.size() == aln.size());
-
-#ifndef NDEBUG
- size_t first_new_snt = this->T1 ? this->T1->size() : 0;
-#endif
-
- sptr<imBitext<TKN> > ret;
- {
- boost::unique_lock<boost::shared_mutex> guard(m_lock);
- ret.reset(new imBitext<TKN>(*this));
- }
-
- // we add the sentences in separate threads (so it's faster)
- boost::thread thread1(snt_adder<TKN>(s1,*ret->V1,ret->myT1,ret->myI1));
- // thread1.join(); // for debugging
- boost::thread thread2(snt_adder<TKN>(s2,*ret->V2,ret->myT2,ret->myI2));
- BOOST_FOREACH(string const& a, aln)
- {
- istringstream ibuf(a);
- ostringstream obuf;
- uint32_t row,col; char c;
- while (ibuf >> row >> c >> col)
- {
- UTIL_THROW_IF2(c != '-', "[" << HERE << "] "
- << "Error in alignment information:\n" << a);
- binwrite(obuf,row);
- binwrite(obuf,col);
- }
- // important: DO NOT replace the two lines below this comment by
- // char const* x = obuf.str().c_str(), as the memory x is pointing
- // to is freed immediately upon destruction of the string object.
- string foo = obuf.str();
- char const* x = foo.c_str();
- vector<char> v(x,x+foo.size());
- ret->myTx = append(ret->myTx, v);
- }
-
- thread1.join();
- thread2.join();
-
- ret->Tx = ret->myTx;
- ret->T1 = ret->myT1;
- ret->T2 = ret->myT2;
- ret->I1 = ret->myI1;
- ret->I2 = ret->myI2;
-
-#ifndef NDEBUG
- // sanity check
- for (size_t i = first_new_snt; i < ret->T1->size(); ++i)
- {
- size_t slen1 = ret->T1->sntLen(i);
- size_t slen2 = ret->T2->sntLen(i);
- char const* p = ret->Tx->sntStart(i);
- char const* q = ret->Tx->sntEnd(i);
- size_t k;
- while (p < q)
- {
- p = binread(p,k);
- assert(p);
- assert(p < q);
- assert(k < slen1);
- p = binread(p,k);
- assert(p);
- assert(k < slen2);
- }
- }
-#endif
- return ret;
- }
// template<>
void
@@ -425,6 +162,5 @@ namespace Moses
}
cout << string(90,'-') << endl;
}
-
}
}
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h
index bd2975cf7..89aeeaa7a 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext.h
@@ -1,7 +1,5 @@
//-*- c++ -*-
-
-#ifndef __ug_bitext_h
-#define __ug_bitext_h
+#pragma once
// Implementations of word-aligned bitext.
// Written by Ulrich Germann
//
@@ -26,11 +24,11 @@
#include <iomanip>
#include <algorithm>
-#include <boost/unordered_map.hpp>
#include <boost/foreach.hpp>
-#include <boost/thread.hpp>
#include <boost/random.hpp>
#include <boost/format.hpp>
+#include <boost/thread.hpp>
+#include <boost/unordered_map.hpp>
#include <boost/math/distributions/binomial.hpp>
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
@@ -59,6 +57,7 @@
#include "ug_lru_cache.h"
#include "ug_lexical_reordering.h"
#include "ug_sampling_bias.h"
+#include "ug_phrasepair.h"
#define PSTATS_CACHE_THRESHOLD 50
@@ -66,101 +65,10 @@ namespace Moses {
class Mmsapt;
namespace bitext
{
- // using namespace ugdiss;
- // using namespace std;
-
- template<typename TKN> class Bitext;
- template<typename TKN> class PhrasePair;
using namespace ugdiss;
- template<typename TKN> class Bitext;
-
- template<typename sid_t, typename off_t, typename len_t>
- void
- parse_pid(uint64_t const pid, sid_t & sid,
- off_t & off, len_t& len)
- {
- static uint64_t two32 = uint64_t(1)<<32;
- static uint64_t two16 = uint64_t(1)<<16;
- len = pid%two16;
- off = (pid%two32)>>16;
- sid = pid>>32;
- }
-
- float
- lbop(size_t const tries, size_t const succ,
- float const confidence);
-
- // "joint" (i.e., phrase pair) statistics
- class
- jstats
- {
- boost::mutex lock;
- uint32_t my_rcnt; // unweighted count
- float my_wcnt; // weighted count
- uint32_t my_cnt2;
- vector<pair<size_t, vector<uchar> > > my_aln;
- uint32_t ofwd[Moses::LRModel::NONE+1], obwd[Moses::LRModel::NONE+1];
- public:
- vector<uint32_t> indoc;
- jstats();
- jstats(jstats const& other);
- uint32_t rcnt() const;
- uint32_t cnt2() const; // raw target phrase occurrence count
- float wcnt() const;
-
- vector<pair<size_t, vector<uchar> > > const & aln() const;
- void add(float w, vector<uchar> const& a, uint32_t const cnt2,
- uint32_t fwd_orient, uint32_t bwd_orient, int const docid);
- void invalidate();
- void validate();
- bool valid();
- uint32_t dcnt_fwd(PhraseOrientation const idx) const;
- uint32_t dcnt_bwd(PhraseOrientation const idx) const;
- };
-
- struct
- pstats
- {
- typedef boost::unordered_map<uint64_t, sptr<pstats> > map_t;
- typedef ThreadSafeContainer<uint64_t, sptr<pstats>, map_t> cache_t;
-
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
- static ThreadSafeCounter active;
-#endif
- boost::mutex lock; // for parallel gathering of stats
- boost::condition_variable ready; /* consumers can wait for this
- * data structure to be ready. */
-
- size_t raw_cnt; // (approximate) raw occurrence count
- size_t sample_cnt; // number of instances selected during sampling
- size_t good; // number of selected instances with valid word alignments
- size_t sum_pairs;
- size_t in_progress; // keeps track of how many threads are currently working on this
-
- // size_t Moses::LRModel::ReorderingType
- uint32_t ofwd[Moses::LRModel::NONE+1], obwd[Moses::LRModel::NONE+1];
+ float lbop(size_t const tries, size_t const succ, float const confidence);
- vector<uint32_t> indoc;
-
-
- // typedef typename boost::unordered_map<typename uint64_t, jstats> trg_map_t;
- typedef std::map<uint64_t, jstats> trg_map_t;
- trg_map_t trg;
- pstats();
- ~pstats();
- void release();
- void register_worker();
- size_t count_workers() { return in_progress; }
-
- bool
- add(uint64_t const pid,
- float const w,
- vector<uchar> const& a,
- uint32_t const cnt2,
- uint32_t fwd_o, uint32_t bwd_o, int const docid);
- };
-
struct
ContextForQuery
{
@@ -174,297 +82,36 @@ namespace Moses {
ContextForQuery() : bias_log(NULL) { }
};
- template<typename Token>
- string
- toString(TokenIndex const& V, Token const* x, size_t const len)
- {
- if (!len) return "";
- UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!");
- ostringstream buf;
- buf << V[x->id()];
- size_t i = 1;
- for (x = x->next(); x && i < len; ++i, x = x->next())
- buf << " " << V[x->id()];
- UTIL_THROW_IF2(i != len, HERE << ": Unexpected end of phrase!");
- return buf.str();
- }
- template<typename Token>
- class
- PhrasePair
+ template<typename TKN>
+ class Bitext
{
public:
- class Scorer { public: virtual float operator()(PhrasePair& pp) const = 0; };
- Token const* start1;
- Token const* start2;
- uint32_t len1;
- uint32_t len2;
- uint64_t p1, p2;
- uint32_t raw1,raw2,sample1,sample2,good1,good2,joint;
- vector<float> fvals;
- float dfwd[Moses::LRModel::NONE+1]; // distortion counts // counts or probs?
- float dbwd[Moses::LRModel::NONE+1]; // distortion counts
- vector<uchar> aln;
- float score;
- bool inverse;
- vector<uint32_t> indoc;
- PhrasePair() { };
- PhrasePair(PhrasePair const& o);
-
- PhrasePair const& operator+=(PhrasePair const& other);
-
- bool operator<(PhrasePair const& other) const;
- bool operator>(PhrasePair const& other) const;
- bool operator<=(PhrasePair const& other) const;
- bool operator>=(PhrasePair const& other) const;
-
- void init();
- void init(uint64_t const pid1, bool is_inverse,
- Token const* x, uint32_t const len,
- pstats const* ps = NULL, size_t const numfeats=0);
-
- // void init(uint64_t const pid1, pstats const& ps, size_t const numfeats);
- // void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2,
- // size_t const numfeats);
-
- // PhrasePair const&
- // update(uint64_t const pid2, size_t r2 = 0);
-
- PhrasePair const&
- update(uint64_t const pid2, Token const* x,
- uint32_t const len, jstats const& js);
-
- // PhrasePair const&
- // update(uint64_t const pid2, jstats const& js1, jstats const& js2);
-
- // PhrasePair const&
- // update(uint64_t const pid2, size_t const raw2extra, jstats const& js);
-
- // float
- // eval(vector<float> const& w);
-
- class SortByTargetIdSeq
- {
- public:
- int cmp(PhrasePair const& a, PhrasePair const& b) const;
- bool operator()(PhrasePair const& a, PhrasePair const& b) const;
- };
-
- class SortDescendingByJointCount
- {
- public:
- int cmp(PhrasePair const& a, PhrasePair const& b) const;
- bool operator()(PhrasePair const& a, PhrasePair const& b) const;
- };
- };
+ typedef TKN Token;
+ typedef typename TSA<Token>::tree_iterator iter;
+ typedef typename std::vector<PhrasePair<Token> > vec_ppair;
+ typedef typename lru_cache::LRU_Cache<uint64_t, vec_ppair> pplist_cache_t;
- template<typename Token>
- void
- PhrasePair<Token>::
- init(uint64_t const pid1, bool is_inverse, Token const* x, uint32_t const len,
- pstats const* ps, size_t const numfeats)
- {
- inverse = is_inverse;
- start1 = x; len1 = len;
- p1 = pid1;
- p2 = 0;
- if (ps)
- {
- raw1 = ps->raw_cnt;
- sample1 = ps->sample_cnt;
- good1 = ps->good;
- }
- else raw1 = sample1 = good1 = 0;
- joint = 0;
- good2 = 0;
- sample2 = 0;
- raw2 = 0;
- fvals.resize(numfeats);
- }
+ friend class Moses::Mmsapt;
+ protected:
+ mutable boost::shared_mutex m_lock; // for thread-safe operation
- template<typename Token>
- PhrasePair<Token> const&
- PhrasePair<Token>::
- update(uint64_t const pid2,
- Token const* x, uint32_t const len, jstats const& js)
- {
- p2 = pid2;
- start2 = x; len2 = len;
- raw2 = js.cnt2();
- joint = js.rcnt();
- assert(js.aln().size());
- if (js.aln().size())
- aln = js.aln()[0].second;
- float total_fwd = 0, total_bwd = 0;
- for (int i = 0; i <= Moses::LRModel::NONE; i++)
- {
- PhraseOrientation po = static_cast<PhraseOrientation>(i);
- total_fwd += js.dcnt_fwd(po)+1;
- total_bwd += js.dcnt_bwd(po)+1;
- }
+ class agenda; // for parallel sampling see ug_bitext_agenda.h
+ mutable sptr<agenda> ag;
+ size_t m_num_workers; // number of workers available to the agenda
- // should we do that here or leave the raw counts?
- for (int i = 0; i <= Moses::LRModel::NONE; i++)
- {
- PhraseOrientation po = static_cast<PhraseOrientation>(i);
- dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd;
- dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd;
- }
+ size_t m_default_sample_size;
+ size_t m_pstats_cache_threshold; // threshold for caching sampling results
+ sptr<pstats::cache_t> m_cache1, m_cache2; // caches for sampling results
- indoc = js.indoc;
- return *this;
- }
-
- template<typename Token>
- bool
- PhrasePair<Token>::
- operator<(PhrasePair const& other) const
- { return this->score < other.score; }
-
- template<typename Token>
- bool
- PhrasePair<Token>::
- operator>(PhrasePair const& other) const
- { return this->score > other.score; }
-
- template<typename Token>
- bool
- PhrasePair<Token>::
- operator<=(PhrasePair const& other) const
- { return this->score <= other.score; }
-
- template<typename Token>
- bool
- PhrasePair<Token>::
- operator>=(PhrasePair const& other) const
- { return this->score >= other.score; }
-
- template<typename Token>
- PhrasePair<Token> const&
- PhrasePair<Token>::
- operator+=(PhrasePair const& o)
- {
- raw1 += o.raw1;
- raw2 += o.raw2;
- sample1 += o.sample1;
- sample2 += o.sample2;
- good1 += o.good1;
- good2 += o.good2;
- joint += o.joint;
- return *this;
- }
-
- template<typename Token>
- PhrasePair<Token>::
- PhrasePair(PhrasePair<Token> const& o)
- : start1(o.start1)
- , start2(o.start2)
- , len1(o.len1)
- , len2(o.len2)
- , p1(o.p1)
- , p2(o.p2)
- , raw1(o.raw1)
- , raw2(o.raw2)
- , sample1(o.sample1)
- , sample2(o.sample2)
- , good1(o.good1)
- , good2(o.good2)
- , joint(o.joint)
- , fvals(o.fvals)
- , aln(o.aln)
- , score(o.score)
- , inverse(o.inverse)
- , indoc(o.indoc)
- {
- for (int i = 0; i <= Moses::LRModel::NONE; ++i)
- {
- dfwd[i] = o.dfwd[i];
- dbwd[i] = o.dbwd[i];
- }
- }
-
- template<typename Token>
- int
- PhrasePair<Token>::
- SortByTargetIdSeq::
- cmp(PhrasePair const& a, PhrasePair const& b) const
- {
- size_t i = 0;
- Token const* x = a.start2;
- Token const* y = b.start2;
- while (i < a.len2 && i < b.len2 && x->id() == y->id())
- {
- x = x->next();
- y = y->next();
- ++i;
- }
- if (i == a.len2 && i == b.len2) return 0;
- if (i == a.len2) return -1;
- if (i == b.len2) return 1;
- return x->id() < y->id() ? -1 : 1;
- }
-
- template<typename Token>
- bool
- PhrasePair<Token>::
- SortByTargetIdSeq::
- operator()(PhrasePair const& a, PhrasePair const& b) const
- {
- return this->cmp(a,b) < 0;
- }
-
- template<typename Token>
- int
- PhrasePair<Token>::
- SortDescendingByJointCount::
- cmp(PhrasePair const& a, PhrasePair const& b) const
- {
- // size_t i = 0;
- if (a.joint == b.joint) return 0;
- return a.joint > b.joint ? -1 : 1;
- }
-
- template<typename Token>
- bool
- PhrasePair<Token>::
- SortDescendingByJointCount::
- operator()(PhrasePair const& a, PhrasePair const& b) const
- {
- return this->cmp(a,b) < 0;
- }
+ map<string,id_type> m_docname2docid; // maps from doc names to ids
+ sptr<std::vector<id_type> > m_sid2docid; // maps from sentences to docs (ids)
- template<typename Token>
- void
- PhrasePair<Token>::
- init()
- {
- inverse = false;
- len1 = len2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0;
- start1 = start2 = NULL;
- p1 = p2 = 0;
- }
-
- template<typename TKN>
- class Bitext
- {
- friend class Moses::Mmsapt;
+ mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
+ // caches for unbiased sampling; biased sampling uses the caches that
+ // are stored locally on the translation task
- protected:
- mutable boost::shared_mutex m_lock;
public:
- typedef TKN Token;
- typedef typename TSA<Token>::tree_iterator iter;
-
- class agenda;
- // stores the list of unfinished jobs;
- // maintains a pool of workers and assigns the jobs to them
-
- // to be done: work with multiple agendas for faster lookup
- // (multiplex jobs); not sure if an agenda having more than
- // four or so workers is efficient, because workers get into
- // each other's way.
- mutable sptr<agenda> ag;
-
sptr<Ttrack<char> > Tx; // word alignments
sptr<Ttrack<Token> > T1; // token track
sptr<Ttrack<Token> > T2; // token track
@@ -473,76 +120,43 @@ namespace Moses {
sptr<TSA<Token> > I1; // indices
sptr<TSA<Token> > I2; // indices
- map<string,id_type> m_docname2docid; // maps from doc names to ids
- sptr<vector<id_type> > m_sid2docid; // maps from sentences to docs (ids)
-
/// given the source phrase sid[start:stop]
// find the possible start (s1 .. s2) and end (e1 .. e2)
// points of the target phrase; if non-NULL, store word
// alignments in *core_alignment. If /flip/, source phrase is
// L2.
- bool
- find_trg_phr_bounds
+ bool find_trg_phr_bounds
( size_t const sid, // sentence to investigate
size_t const start, // start of source phrase
size_t const stop, // last position of source phrase
size_t & s1, size_t & s2, // beginning and end of target start
size_t & e1, size_t & e2, // beginning and end of target end
int& po_fwd, int& po_bwd, // phrase orientations
- vector<uchar> * core_alignment, // stores the core alignment
+ std::vector<uchar> * core_alignment, // stores the core alignment
bitvector* full_alignment, // stores full word alignment for this sent.
bool const flip) const; // flip source and target (reverse lookup)
- sptr<pstats::cache_t> m_cache1, m_cache2;
- // caches for unbiased sampling; biased sampling uses the caches that
- // are stored locally on the translation task
- protected:
- typedef typename
- lru_cache::LRU_Cache<uint64_t, vector<PhrasePair<Token> > >
- pplist_cache_t;
-
- size_t m_default_sample_size;
- size_t m_num_workers;
- size_t m_pstats_cache_threshold;
- mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
-
- protected:
-
+ // prep2 launches sampling and returns immediately.
+ // lookup (below) waits for the job to finish before it returns
sptr<pstats>
prep2(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
- // prep2 launches sampling and returns immediately, lookup (below) waits
- // for the job to finish before it returns
-
+
public:
- Bitext(size_t const max_sample = 1000,
- size_t const xnum_workers = 16);
+ Bitext(size_t const max_sample = 1000, size_t const xnum_workers = 16);
- Bitext(Ttrack<Token>* const t1,
- Ttrack<Token>* const t2,
- Ttrack<char>* const tx,
- TokenIndex* const v1,
- TokenIndex* const v2,
- TSA<Token>* const i1,
- TSA<Token>* const i2,
- size_t const max_sample=1000,
+ Bitext(Ttrack<Token>* const t1, Ttrack<Token>* const t2,
+ Ttrack<char>* const tx,
+ TokenIndex* const v1, TokenIndex* const v2,
+ TSA<Token>* const i1, TSA<Token>* const i2,
+ size_t const max_sample=1000,
size_t const xnum_workers=16);
- virtual void open(string const base, string const L1, string const L2) = 0;
+ virtual void
+ open(string const base, string const L1, string const L2) = 0;
sptr<pstats>
lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
-#if 0
- // needs to be adapted to the new API
- void
- lookup(vector<Token> const& snt, TSA<Token>& idx,
- vector<vector<sptr<vector<PhrasePair<Token> > > > >& dest,
- vector<vector<uint64_t> >* pidmap = NULL,
- typename PhrasePair<Token>::Scorer* scorer=NULL,
- sptr<SamplingBias const> const bias,
- bool multithread=true) const;
-#endif
-
void prep(ttasksptr const& ttask, iter const& phrase) const;
void setDefaultSampleSize(size_t const max_samples);
@@ -556,11 +170,23 @@ namespace Moses {
loadSentenceBias(string const& fname) const;
sptr<DocumentBias>
- SetupDocumentBias(string const& bserver, string const& text,
- ostream* log) const;
+ SetupDocumentBias(string const& bserver, string const& text, ostream* log) const;
+
+#if 0
+ // needs to be adapted to the new API
+ void
+ lookup(std::vector<Token> const& snt, TSA<Token>& idx,
+ std::vector<std::vector<sptr<std::vector<PhrasePair<Token> > > > >& dest,
+ std::vector<std::vector<uint64_t> >* pidmap = NULL,
+ typename PhrasePair<Token>::Scorer* scorer=NULL,
+ sptr<SamplingBias const> const bias,
+ bool multithread=true) const;
+#endif
};
+#include "ug_bitext_agenda.h"
+
template<typename Token>
sptr<SentenceBias>
Bitext<Token>::
@@ -594,8 +220,6 @@ namespace Moses {
return buf.str();
}
-
-
template<typename Token>
size_t
Bitext<Token>::
@@ -620,8 +244,8 @@ namespace Moses {
template<typename Token>
Bitext<Token>::
Bitext(size_t const max_sample, size_t const xnum_workers)
- : m_default_sample_size(max_sample)
- , m_num_workers(xnum_workers)
+ : m_num_workers(xnum_workers)
+ , m_default_sample_size(max_sample)
, m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD)
, m_cache1(new pstats::cache_t)
, m_cache2(new pstats::cache_t)
@@ -638,639 +262,14 @@ namespace Moses {
TSA<Token>* const i2,
size_t const max_sample,
size_t const xnum_workers)
- : Tx(tx), T1(t1), T2(t2), V1(v1), V2(v2), I1(i1), I2(i2)
+ : m_num_workers(xnum_workers)
, m_default_sample_size(max_sample)
- , m_num_workers(xnum_workers)
, m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD)
, m_cache1(new pstats::cache_t)
, m_cache2(new pstats::cache_t)
+ , Tx(tx), T1(t1), T2(t2), V1(v1), V2(v2), I1(i1), I2(i2)
{ }
- // agenda is a pool of jobs
- template<typename Token>
- class
- Bitext<Token>::
- agenda
- {
- boost::mutex lock;
- class job
- {
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
- static ThreadSafeCounter active;
-#endif
- Bitext<Token> const* const m_bitext;
- boost::mutex lock;
- friend class agenda;
- boost::taus88 rnd; // every job has its own pseudo random generator
- double rnddenom; // denominator for scaling random sampling
- size_t min_diverse; // minimum number of distinct translations
- public:
- size_t workers; // how many workers are working on this job?
- sptr<TSA<Token> const> root; // root of the underlying suffix array
- char const* next; // next position to read from
- char const* stop; // end of index range
- size_t max_samples; // how many samples to extract at most
- size_t ctr; /* # of phrase occurrences considered so far
- * # of samples chosen is stored in stats->good
- */
- size_t len; // phrase length
- bool fwd; // if true, source phrase is L1
- sptr<pstats> stats; // stores statistics collected during sampling
- sptr<SamplingBias const> const m_bias; // sentence-level bias for sampling
- float bias_total;
- bool step(uint64_t & sid, uint64_t & offset); // select another occurrence
- bool done() const;
- job(Bitext<Token> const* const theBitext,
- typename TSA<Token>::tree_iterator const& m,
- sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd,
- sptr<SamplingBias const> const& bias);
- ~job();
- };
- public:
- class
- worker
- {
- agenda& ag;
- public:
- worker(agenda& a) : ag(a) {}
- void operator()();
- };
- private:
- list<sptr<job> > joblist;
- vector<sptr<boost::thread> > workers;
- bool shutdown;
- size_t doomed;
- public:
- Bitext<Token> const& bt;
- agenda(Bitext<Token> const& bitext);
- ~agenda();
- void add_workers(int n);
-
- sptr<pstats>
- add_job(Bitext<Token> const* const theBitext,
- typename TSA<Token>::tree_iterator const& phrase,
- size_t const max_samples, sptr<SamplingBias const> const& bias);
-
- sptr<job> get_job();
- };
-
- template<typename Token>
- bool
- Bitext<Token>::
- agenda::
- job::
- step(uint64_t & sid, uint64_t & offset)
- {
- boost::lock_guard<boost::mutex> jguard(lock);
- bool ret = (max_samples == 0) && (next < stop);
- if (ret)
- {
- next = root->readSid(next,stop,sid);
- next = root->readOffset(next,stop,offset);
- boost::lock_guard<boost::mutex> sguard(stats->lock);
- if (stats->raw_cnt == ctr) ++stats->raw_cnt;
- if (m_bias && (*m_bias)[sid] == 0)
- return false;
- stats->sample_cnt++;
- }
- else
- {
- while (next < stop && (stats->good < max_samples ||
- stats->trg.size() < min_diverse))
- {
- next = root->readSid(next,stop,sid);
- next = root->readOffset(next,stop,offset);
- if (m_bias)
- {
- id_type docid = m_bias->GetClass(sid);
- if (stats->indoc.size() > docid)
- {
- uint32_t N = stats->good;
- float k = min(stats->indoc[docid],N);
- float p = (*m_bias)[sid];
-
- typedef boost::math::binomial_distribution<> binomial;
- using namespace boost::math;
- if (cdf(complement(binomial(N+1, p), k)) < .05) continue;
- }
- }
- { // brackets required for lock scoping;
- // see sguard immediately below
- boost::lock_guard<boost::mutex> sguard(stats->lock);
- if (stats->raw_cnt == ctr) ++stats->raw_cnt;
- size_t scalefac = (stats->raw_cnt - ctr++);
- size_t rnum = scalefac * (rnd()/(rnd.max()+1.));
- size_t th = (bias_total
- ? ((*m_bias)[sid]/bias_total * stats->raw_cnt
- * max_samples)
- : max_samples);
-#if 0
- cerr << rnum << "/" << scalefac << " vs. "
- << max_samples - stats->good << " ("
- << max_samples << " - " << stats->good << ")"
- << " th=" << th;
- if (m_bias)
- cerr << " with bias " << (*m_bias)[sid]
- << " => " << th;
- else cerr << " without bias";
- cerr << endl;
-#endif
-#if 0
- cerr << "bias total: " << bias_total
- << " bias local: " << (*m_bias)[sid]
- << " rnum: " << rnum
- << " good: " << stats->good
- << " th: " << th
- << " raw: " << stats->raw_cnt
- << endl;
-#endif
- if (rnum + stats->good < th)
- {
- stats->sample_cnt++;
- ret = true;
- break;
- }
- }
- }
- }
-
- // boost::lock_guard<boost::mutex> sguard(stats->lock);
- // abuse of lock for clean output to cerr
- // cerr << stats->sample_cnt++;
- return ret;
- }
-
- template<typename Token>
- void
- Bitext<Token>::
- agenda::
- add_workers(int n)
- {
- static boost::posix_time::time_duration nodelay(0,0,0,0);
- boost::lock_guard<boost::mutex> guard(this->lock);
-
- int target = max(1, int(n + workers.size() - this->doomed));
- // house keeping: remove all workers that have finished
- for (size_t i = 0; i < workers.size(); )
- {
- if (workers[i]->timed_join(nodelay))
- {
- if (i + 1 < workers.size())
- workers[i].swap(workers.back());
- workers.pop_back();
- }
- else ++i;
- }
- // cerr << workers.size() << "/" << target << " active" << endl;
- if (int(workers.size()) > target)
- this->doomed = workers.size() - target;
- else
- while (int(workers.size()) < target)
- {
- sptr<boost::thread> w(new boost::thread(worker(*this)));
- workers.push_back(w);
- }
- }
-
- template<typename Token>
- void
- Bitext<Token>::
- agenda::
- worker::
- operator()()
- {
- // things to do:
- // - have each worker maintain their own pstats object and merge results at the end;
- // - ensure the minimum size of samples considered by a non-locked counter that is only
- // ever incremented -- who cares if we look at more samples than required, as long
- // as we look at at least the minimum required
- // This way, we can reduce the number of lock / unlock operations we need to do during
- // sampling.
- size_t s1=0, s2=0, e1=0, e2=0;
- uint64_t sid=0, offset=0; // of the source phrase
- while(sptr<job> j = ag.get_job())
- {
- j->stats->register_worker();
- vector<uchar> aln;
- bitvector full_alignment(100*100);
- while (j->step(sid,offset))
- {
- int docid = j->m_bias ? j->m_bias->GetClass(sid) : -1;
-
- Token const* t = ag.bt.T2->sntStart(sid);
- Token const* eos = ag.bt.T2->sntEnd(sid);
-#if 0
- cerr << "[" << j->stats->good + 1 << "] ";
- while (t != eos) cerr << (*ag.bt.V2)[(t++)->id()] << " ";
- cerr << "[" << docid << "]" << endl;
-#endif
- aln.clear();
- int po_fwd=Moses::LRModel::NONE,po_bwd=Moses::LRModel::NONE;
- if (j->fwd)
- {
- if (!ag.bt.find_trg_phr_bounds
- (sid,offset,offset+j->len,s1,s2,e1,e2,po_fwd,po_bwd,
- &aln,&full_alignment,false))
- continue;
- }
- else if (!ag.bt.find_trg_phr_bounds
- (sid,offset,offset+j->len,s1,s2,e1,e2,po_fwd,po_bwd,
- &aln,NULL,true)) // NULL,NULL,true))
- continue;
- j->stats->lock.lock();
- j->stats->good += 1;
- j->stats->sum_pairs += (s2-s1+1)*(e2-e1+1);
- ++j->stats->ofwd[po_fwd];
- ++j->stats->obwd[po_bwd];
- j->stats->lock.unlock();
- // for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2)
- for (size_t k = 1; k < aln.size(); k += 2)
- aln[k] += s2 - s1;
- Token const* o = (j->fwd ? ag.bt.T2 : ag.bt.T1)->sntStart(sid);
- float sample_weight = 1./((s2-s1+1)*(e2-e1+1));
-
- vector<uint64_t> seen;
- seen.reserve(100);
- // It is possible that the phrase extraction extracts the same
- // phrase twice, e.g., when word a co-occurs with sequence b b b
- // but is aligned only to the middle word. We can only count
- // each phrase pair once per source phrase occurrence, or else
- // run the risk of having more joint counts than marginal
- // counts.
-
- for (size_t s = s1; s <= s2; ++s)
- {
- sptr<iter> b = (j->fwd ? ag.bt.I2 : ag.bt.I1)->find(o+s,e1-s);
- if (!b || b->size() < e1 -s)
- UTIL_THROW(util::Exception, "target phrase not found");
- // assert(b);
- for (size_t i = e1; i <= e2; ++i)
- {
- uint64_t tpid = b->getPid();
- size_t s = 0;
- while (s < seen.size() && seen[s] != tpid) ++s;
- if (s < seen.size())
- {
-#if 0
- size_t sid, off, len;
- parse_pid(tpid,sid,off,len);
- cerr << "HA, gotcha! " << sid << ":" << off << " at " << HERE << endl;
- for (size_t z = 0; z < len; ++z)
- {
- id_type tid = ag.bt.T2->sntStart(sid)[off+z].id();
- cerr << (*ag.bt.V2)[tid] << " ";
- }
- cerr << endl;
-#endif
- continue;
- }
- seen.push_back(tpid);
- if (! j->stats->add(tpid,sample_weight,aln,
- b->approxOccurrenceCount(),
- po_fwd,po_bwd,docid))
- {
- cerr << "FATAL ERROR AT " << __FILE__
- << ":" << __LINE__ << endl;
- assert(0);
- ostringstream msg;
- for (size_t z = 0; z < j->len; ++z)
- {
- id_type tid = ag.bt.T1->sntStart(sid)[offset+z].id();
- cerr << (*ag.bt.V1)[tid] << " ";
- }
- cerr << endl;
- for (size_t z = s; z <= i; ++z)
- cerr << (*ag.bt.V2)[(o+z)->id()] << " ";
- cerr << endl;
- assert(0);
- UTIL_THROW(util::Exception,"Error in sampling.");
- }
- if (i < e2)
- {
-#ifndef NDEBUG
- bool ok = b->extend(o[i].id());
- assert(ok);
-#else
- b->extend(o[i].id());
- // cerr << "boo" << endl;
-#endif
- }
- }
- // if (j->fwd && s < s2)
- // for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2)
- if (s < s2)
- for (size_t k = 1; k < aln.size(); k += 2)
- --aln[k];
- }
- // j->stats->lock.unlock();
- }
- j->stats->release();
- }
- }
-
- template<typename Token>
- Bitext<Token>::
- agenda::
- job::
- ~job()
- {
- if (stats) stats.reset();
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
- try { --active; } catch (...) {}
-#endif
- // counter may not exist any more at destruction time
- }
-
- template<typename Token>
- Bitext<Token>::
- agenda::
- job::
- job(Bitext<Token> const* const theBitext,
- typename TSA<Token>::tree_iterator const& m,
- sptr<TSA<Token> > const& r, size_t maxsmpl,
- bool isfwd, sptr<SamplingBias const> const& bias)
- : m_bitext(theBitext)
- , rnd(0)
- , rnddenom(rnd.max() + 1.)
- , min_diverse(1)
- , workers(0)
- , root(r)
- , next(m.lower_bound(-1))
- , stop(m.upper_bound(-1))
- , max_samples(maxsmpl)
- , ctr(0)
- , len(m.size())
- , fwd(isfwd)
- , m_bias(bias)
- {
- stats.reset(new pstats());
- stats->raw_cnt = m.approxOccurrenceCount();
- bias_total = 0;
- // we need to renormalize on the fly, as the sum of all sentence probs over
- // all candidates (not all sentences in the corpus) needs to add to 1.
- // Profiling question: how much does that cost us?
- if (m_bias)
- {
- int ctr = 0;
- stats->raw_cnt = 0;
- for (char const* x = m.lower_bound(-1); x < stop;)
- {
- uint32_t sid; ushort offset;
- x = root->readSid(x,stop,sid);
- x = root->readOffset(x,stop,offset);
-#if 0
- cerr << ctr++ << " " << m.str(m_bitext->V1.get())
- << " " << sid << "/" << root->getCorpusSize()
- << " " << offset << " " << stop-x << endl;
-#endif
- bias_total += (*m_bias)[sid];
- ++stats->raw_cnt;
- }
- }
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
- ++active;
- // if (active%5 == 0)
- // cerr << size_t(active) << " active jobs at " << __FILE__ << ":" << __LINE__ << endl;
-#endif
- }
-
- template<typename Token>
- sptr<pstats>
- Bitext<Token>::
- agenda::
- add_job(Bitext<Token> const* const theBitext,
- typename TSA<Token>::tree_iterator const& phrase,
- size_t const max_samples, sptr<SamplingBias const> const& bias)
- {
- boost::unique_lock<boost::mutex> lk(this->lock);
- static boost::posix_time::time_duration nodelay(0,0,0,0);
- bool fwd = phrase.root == bt.I1.get();
- sptr<job> j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2,
- max_samples, fwd, bias));
- j->stats->register_worker();
-
- joblist.push_back(j);
- if (joblist.size() == 1)
- {
- size_t i = 0;
- while (i < workers.size())
- {
- if (workers[i]->timed_join(nodelay))
- {
- if (doomed)
- {
- if (i+1 < workers.size())
- workers[i].swap(workers.back());
- workers.pop_back();
- --doomed;
- }
- else
- workers[i++] = sptr<boost::thread>(new boost::thread(worker(*this)));
- }
- else ++i;
- }
- }
- return j->stats;
- }
-
- template<typename Token>
- sptr<typename Bitext<Token>::agenda::job>
- Bitext<Token>::
- agenda::
- get_job()
- {
- // cerr << workers.size() << " workers on record" << endl;
- sptr<job> ret;
- if (this->shutdown) return ret;
- boost::unique_lock<boost::mutex> lock(this->lock);
- if (this->doomed)
- {
- --this->doomed;
- return ret;
- }
- typename list<sptr<job> >::iterator j = joblist.begin();
- while (j != joblist.end())
- {
- if ((*j)->done())
- {
- (*j)->stats->release();
- joblist.erase(j++);
- }
- else if ((*j)->workers >= 4)
- {
- ++j;
- }
- else break;
- }
- if (joblist.size())
- {
- ret = j == joblist.end() ? joblist.front() : *j;
- boost::lock_guard<boost::mutex> jguard(ret->lock);
- ++ret->workers;
- }
- return ret;
- }
-
-
- template<typename TKN>
- class mmBitext : public Bitext<TKN>
- {
- void load_document_map(string const& fname);
- public:
- void open(string const base, string const L1, string L2);
- mmBitext();
- };
-
- template<typename TKN>
- mmBitext<TKN>::
- mmBitext()
- : Bitext<TKN>(new mmTtrack<TKN>(),
- new mmTtrack<TKN>(),
- new mmTtrack<char>(),
- new TokenIndex(),
- new TokenIndex(),
- new mmTSA<TKN>(),
- new mmTSA<TKN>())
- {};
-
- template<typename TKN>
- void
- mmBitext<TKN>::
- load_document_map(string const& fname)
- {
- ifstream docmap(fname.c_str());
- // the docmap file should list the documents in the corpus
- // in the order in which they appear with one line per document:
- // <docname> <number of lines / sentences>
- //
- // in the future, we might also allow listing documents with
- // sentence ranges.
- string buffer,docname; size_t a=0,b;
- this->m_sid2docid.reset(new vector<id_type>(this->T1->size()));
- while(getline(docmap,buffer))
- {
- istringstream line(buffer);
- if (!(line>>docname)) continue; // empty line
- if (docname.size() && docname[0] == '#') continue; // comment
- size_t docid = this->m_docname2docid.size();
- this->m_docname2docid[docname] = docid;
- line >> b;
- VERBOSE(1, "DOCUMENT MAP " << docname
- << " " << a << "-" << b+a << endl);
- for (b += a; a < b; ++a)
- (*this->m_sid2docid)[a] = docid;
- }
- UTIL_THROW_IF2(b != this->T1->size(),
- "Document map doesn't match corpus!");
- }
-
- template<typename TKN>
- void
- mmBitext<TKN>::
- open(string const base, string const L1, string L2)
- {
- mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get());
- mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get());
- mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get());
- t1.open(base+L1+".mct");
- t2.open(base+L2+".mct");
- tx.open(base+L1+"-"+L2+".mam");
- this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
- this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
- mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
- mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
- i1.open(base+L1+".sfa", this->T1);
- i2.open(base+L2+".sfa", this->T2);
- assert(this->T1->size() == this->T2->size());
-
- string docmapfile = base+"dmp";
- if (!access(docmapfile.c_str(),F_OK))
- load_document_map(docmapfile);
- }
-
-
- template<typename TKN>
- class imBitext : public Bitext<TKN>
- {
- sptr<imTtrack<char> > myTx;
- sptr<imTtrack<TKN> > myT1;
- sptr<imTtrack<TKN> > myT2;
- sptr<imTSA<TKN> > myI1;
- sptr<imTSA<TKN> > myI2;
- static ThreadSafeCounter my_revision;
- public:
- size_t revision() const { return my_revision; }
- void open(string const base, string const L1, string L2);
- imBitext(sptr<TokenIndex> const& V1,
- sptr<TokenIndex> const& V2,
- size_t max_sample = 5000, size_t num_workers=4);
- imBitext(size_t max_sample = 5000, size_t num_workers=4);
- imBitext(imBitext const& other);
-
- // sptr<imBitext<TKN> >
- // add(vector<TKN> const& s1, vector<TKN> const& s2, vector<ushort> & a);
-
- sptr<imBitext<TKN> >
- add(vector<string> const& s1,
- vector<string> const& s2,
- vector<string> const& a) const;
-
- };
-
- template<typename TKN>
- ThreadSafeCounter
- imBitext<TKN>::my_revision;
-
- template<typename TKN>
- imBitext<TKN>::
- imBitext(size_t max_sample, size_t num_workers)
- : Bitext<TKN>(max_sample, num_workers)
- {
- this->m_default_sample_size = max_sample;
- this->V1.reset(new TokenIndex());
- this->V2.reset(new TokenIndex());
- this->V1->setDynamic(true);
- this->V2->setDynamic(true);
- ++my_revision;
- }
-
- template<typename TKN>
- imBitext<TKN>::
- imBitext(sptr<TokenIndex> const& v1,
- sptr<TokenIndex> const& v2,
- size_t max_sample, size_t num_workers)
- : Bitext<TKN>(max_sample, num_workers)
- {
- // this->default_sample_size = max_sample;
- this->V1 = v1;
- this->V2 = v2;
- this->V1->setDynamic(true);
- this->V2->setDynamic(true);
- ++my_revision;
- }
-
-
- template<typename TKN>
- imBitext<TKN>::
- imBitext(imBitext<TKN> const& other)
- {
- this->myTx = other.myTx;
- this->myT1 = other.myT1;
- this->myT2 = other.myT2;
- this->myI1 = other.myI1;
- this->myI2 = other.myI2;
- this->Tx = this->myTx;
- this->T1 = this->myT1;
- this->T2 = this->myT2;
- this->I1 = this->myI1;
- this->I2 = this->myI2;
- this->V1 = other.V1;
- this->V2 = other.V2;
- this->m_default_sample_size = other.m_default_sample_size;
- this->m_num_workers = other.m_num_workers;
- ++my_revision;
- }
-
template<typename TKN> class snt_adder;
template<> class snt_adder<L2R_Token<SimpleWordId> >;
@@ -1278,147 +277,17 @@ namespace Moses {
class snt_adder<L2R_Token<SimpleWordId> >
{
typedef L2R_Token<SimpleWordId> TKN;
- vector<string> const & snt;
+ std::vector<string> const & snt;
TokenIndex & V;
sptr<imTtrack<TKN> > & track;
sptr<imTSA<TKN > > & index;
public:
- snt_adder(vector<string> const& s, TokenIndex& v,
+ snt_adder(std::vector<string> const& s, TokenIndex& v,
sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i);
void operator()();
};
- // template<typename TKN>
- // class snt_adder
- // {
- // vector<string> const & snt;
- // TokenIndex & V;
- // sptr<imTtrack<TKN> > & track;
- // sptr<imTSA<TKN > > & index;
- // public:
- // snt_adder(vector<string> const& s, TokenIndex& v,
- // sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i);
-
- // template<typename T>
- // void operator()();
- // };
-
- // // template<>
- // void
- // snt_adder<L2R_Token<SimpleWordId> >::
- // operator()();
-
- // template<>
- // void
- // snt_adder<char>::
- // operator()()
- // {
- // vector<id_type> sids;
- // sids.reserve(snt.size());
- // BOOST_FOREACH(string const& s, snt)
- // {
- // sids.push_back(track ? track->size() : 0);
- // istringstream buf(s);
- // string w;
- // vector<char> s;
- // s.reserve(100);
- // while (buf >> w)
- // s.push_back(vector<char>(V[w]));
- // track = append(track,s);
- // }
- // index.reset(new imTSA<char>(*index,track,sids,V.tsize()));
- // }
-
- // template<typename TKN>
- // snt_adder<TKN>::
- // snt_adder(vector<string> const& s, TokenIndex& v,
- // sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i)
- // : snt(s), V(v), track(t), index(i)
- // {
- // throw "Not implemented yet.";
- // }
-
- template<>
- sptr<imBitext<L2R_Token<SimpleWordId> > >
- imBitext<L2R_Token<SimpleWordId> >::
- add(vector<string> const& s1,
- vector<string> const& s2,
- vector<string> const& aln) const;
-
- template<typename TKN>
- sptr<imBitext<TKN> >
- imBitext<TKN>::
- add(vector<string> const& s1,
- vector<string> const& s2,
- vector<string> const& aln) const
- {
- throw "Not yet implemented";
- }
- // template<typename TKN>
- // sptr<imBitext<TKN> >
- // imBitext<TKN>::
- // add(vector<TKN> const& s1, vector<TKN> const& s2, vector<ushort> & a)
- // {
- // boost::unique_lock<boost::shared_mutex> guard(m_lock);
- // sptr<imBitext<TKN> > ret(new imBitext<TKN>());
- // vector<id_type> sids(1,this->myT1.size()-1);
- // ret->myT1 = add(this->myT1,s1);
- // ret->myT2 = add(this->myT2,s2);
- // size_t v1size = this->V1.tsize();
- // size_t v2size = this->V2.tsize();
- // BOOST_FOREACH(TKN const& t, s1) { if (t->id() >= v1size) v1size = t->id() + 1; }
- // BOOST_FOREACH(TKN const& t, s2) { if (t->id() >= v2size) v2size = t->id() + 1; }
- // ret->myI1.reset(new imTSA<TKN>(*this->I1,ret->myT1,sids,v1size));
- // ret->myI2.reset(new imTSA<TKN>(*this->I2,ret->myT2,sids,v2size));
- // ostringstream abuf;
- // BOOST_FOREACH(ushort x, a) binwrite(abuf,x);
- // vector<char> foo(abuf.str().begin(),abuf.str().end());
- // ret->myTx = add(this->myTx,foo);
- // ret->T1 = ret->myT1;
- // ret->T2 = ret->myT2;
- // ret->Tx = ret->myTx;
- // ret->I1 = ret->myI1;
- // ret->I2 = ret->myI2;
- // ret->V1 = this->V1;
- // ret->V2 = this->V2;
- // return ret;
- // }
-
-
- // template<typename TKN>
- // imBitext<TKN>::
- // imBitext()
- // : Bitext<TKN>(new imTtrack<TKN>(),
- // new imTtrack<TKN>(),
- // new imTtrack<char>(),
- // new TokenIndex(),
- // new TokenIndex(),
- // new imTSA<TKN>(),
- // new imTSA<TKN>())
- // {}
-
-
- template<typename TKN>
- void
- imBitext<TKN>::
- open(string const base, string const L1, string L2)
- {
- mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get());
- mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get());
- mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get());
- t1.open(base+L1+".mct");
- t2.open(base+L2+".mct");
- tx.open(base+L1+"-"+L2+".mam");
- this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
- this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
- mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
- mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
- i1.open(base+L1+".sfa", this->T1);
- i2.open(base+L2+".sfa", this->T2);
- assert(this->T1->size() == this->T2->size());
- }
-
template<typename Token>
bool
Bitext<Token>::
@@ -1427,7 +296,7 @@ namespace Moses {
size_t const start, size_t const stop,
size_t & s1, size_t & s2, size_t & e1, size_t & e2,
int & po_fwd, int & po_bwd,
- vector<uchar>* core_alignment, bitvector* full_alignment,
+ std::vector<uchar>* core_alignment, bitvector* full_alignment,
bool const flip) const
{
// if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl;
@@ -1464,7 +333,7 @@ namespace Moses {
size_t src,trg;
size_t lft = forbidden.size();
size_t rgt = 0;
- vector<vector<ushort> > aln1(slen1),aln2(slen2);
+ std::vector<std::vector<ushort> > aln1(slen1),aln2(slen2);
char const* p = Tx->sntStart(sid);
char const* x = Tx->sntEnd(sid);
@@ -1532,33 +401,6 @@ namespace Moses {
return ret;
}
-// template<typename Token>
-// sptr<DocumentBias>
-// Bitext<Token>::
-// SetupDocBias(string const& bserver, map<id_type,size_t> const& ctx) const
-// {
-
-// sptr<DocumentBias> ret;
-// #ifdef HAVE_CURLPP
-// map<id_type,size_t>::const_iterator w = ctx.begin();
-// while(w != ctx.end() && w->second == 0) ++w;
-// if (w == ctx.end()) return ret;
-// string context; context.reserve(5000);
-// context += (*V1)[w->first];
-// while (++w != ctx.end())
-// {
-// if (w->second == 0) continue;
-// context += " ";
-// context += (*V1)[w->first];
-// }
-// cerr << HERE << endl;
-// cerr << "BIAS LOOKUP CONTEXT: " << context << endl;
-// ret = GetDocBiasFromServer(bserver+curlpp::escape(context));
-// #endif
-// return ret;
-// }
-
-
template<typename Token>
void
Bitext<Token>::
@@ -1587,7 +429,7 @@ namespace Moses {
// - no caching for rare phrases and special requests (max_sample)
// (still need to test what a good caching threshold is ...)
// - use the task-specific cache when there is a sampling bias
- if (max_sample == m_default_sample_size
+ if (max_sample == int(m_default_sample_size)
&& phrase.approxOccurrenceCount() > m_pstats_cache_threshold)
{
cache = (phrase.root == I1.get()
@@ -1622,7 +464,7 @@ namespace Moses {
{
Ttrack<Token> const& m_other;
sptr<pstats> m_pstats;
- vector<PhrasePair<Token> >& m_pplist;
+ std::vector<PhrasePair<Token> >& m_pplist;
typename PhrasePair<Token>::Scorer const* m_scorer;
PhrasePair<Token> m_pp;
Token const* m_token;
@@ -1635,7 +477,7 @@ namespace Moses {
pstats2pplist(typename TSA<Token>::tree_iterator const& m,
Ttrack<Token> const& other,
sptr<pstats> const& ps,
- vector<PhrasePair<Token> >& dest,
+ std::vector<PhrasePair<Token> >& dest,
typename PhrasePair<Token>::Scorer const* scorer)
: m_other(other)
, m_pstats(ps)
@@ -1665,7 +507,8 @@ namespace Moses {
uint32_t sid,off,len;
parse_pid(a->first, sid, off, len);
m_pp.update(a->first, m_other.sntStart(sid)+off, len, a->second);
- m_pp.good2 = max(uint32_t(m_pp.raw2 * float(m_pp.good1)/m_pp.raw1),m_pp.joint);
+ m_pp.good2 = max(uint32_t(m_pp.raw2 * float(m_pp.good1)/m_pp.raw1),
+ m_pp.joint);
size_t J = m_pp.joint<<7; // hard coded threshold of 1/128
if (m_pp.good1 > J || m_pp.good2 > J) continue;
if (m_scorer)
@@ -1683,13 +526,13 @@ namespace Moses {
template<typename Token>
void
Bitext<Token>::
- lookup(vector<Token> const& snt, TSA<Token>& idx,
- vector<vector<sptr<vector<PhrasePair<Token> > > > >& dest,
- vector<vector<uint64_t> >* pidmap,
+ lookup(std::vector<Token> const& snt, TSA<Token>& idx,
+ std::vector<std::vector<sptr<std::vector<PhrasePair<Token> > > > >& dest,
+ std::vector<std::vector<uint64_t> >* pidmap,
typename PhrasePair<Token>::Scorer* scorer,
sptr<SamplingBias const> const& bias, bool multithread) const
{
- // typedef vector<vector<sptr<vector<PhrasePair<Token> > > > > ret_t;
+ // typedef std::vector<std::vector<sptr<std::vector<PhrasePair<Token> > > > > ret_t;
dest.clear();
dest.resize(snt.size());
@@ -1698,7 +541,7 @@ namespace Moses {
// collect statistics in parallel, then build PT entries as
// the sampling finishes
bool fwd = &idx == I1.get();
- vector<boost::thread*> workers; // background threads doing the lookup
+ std::vector<boost::thread*> workers; // background threads doing the lookup
pplist_cache_t& C = (fwd ? m_pplist_cache1 : m_pplist_cache2);
if (C.capacity() < 100000) C.reserve(100000);
for (size_t i = 0; i < snt.size(); ++i)
@@ -1709,12 +552,12 @@ namespace Moses {
{
uint64_t key = m.getPid();
if (pidmap) (*pidmap)[i].push_back(key);
- sptr<vector<PhrasePair<Token> > > pp = C.get(key);
+ sptr<std::vector<PhrasePair<Token> > > pp = C.get(key);
if (pp)
dest[i].push_back(pp);
else
{
- pp.reset(new vector<PhrasePair<Token> >());
+ pp.reset(new std::vector<PhrasePair<Token> >());
C.set(key,pp);
dest[i].push_back(pp);
sptr<pstats> x = prep2(m, this->default_sample_size,bias);
@@ -1780,49 +623,12 @@ namespace Moses {
return ret;
}
#endif
-
- template<typename Token>
- Bitext<Token>::
- agenda::
- ~agenda()
- {
- this->lock.lock();
- this->shutdown = true;
- this->lock.unlock();
- for (size_t i = 0; i < workers.size(); ++i)
- workers[i]->join();
- }
template<typename Token>
- Bitext<Token>::
- agenda::
- agenda(Bitext<Token> const& thebitext)
- : shutdown(false), doomed(0), bt(thebitext)
- { }
-
- template<typename Token>
- bool
- Bitext<Token>::
- agenda::
- job::
- done() const
- {
- return (max_samples && stats->good >= max_samples) || next == stop;
- }
-
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
- template<typename TKN>
- ThreadSafeCounter
- Bitext<TKN>::
- agenda::
- job::active;
-#endif
-
- template<typename Token>
void
expand(typename Bitext<Token>::iter const& m,
Bitext<Token> const& bt, pstats const& ps,
- vector<PhrasePair<Token> >& dest, ostream* log)
+ std::vector<PhrasePair<Token> >& dest, ostream* log)
{
bool fwd = m.root == bt.I1.get();
dest.reserve(ps.trg.size());
@@ -1887,5 +693,9 @@ namespace Moses {
#endif
} // end of namespace bitext
} // end of namespace moses
-#endif
+
+#include "ug_im_bitext.h"
+#include "ug_mm_bitext.h"
+
+
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h
new file mode 100644
index 000000000..a9632c056
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h
@@ -0,0 +1,186 @@
+// -*- c++ -*-
+// to be included from ug_bitext.h
+
+// The agenda handles parallel sampling.
+// It maintains a queue of unfinished sampling jobs and
+// assigns them to a pool of workers.
+//
+template<typename Token>
+class Bitext<Token>
+::agenda
+{
+public:
+ class job;
+ class worker;
+private:
+ boost::mutex lock;
+ std::list<sptr<job> > joblist;
+ std::vector<sptr<boost::thread> > workers;
+ bool shutdown;
+ size_t doomed;
+
+public:
+
+
+ Bitext<Token> const& bt;
+
+ agenda(Bitext<Token> const& bitext);
+ ~agenda();
+
+ void
+ add_workers(int n);
+
+ sptr<pstats>
+ add_job(Bitext<Token> const* const theBitext,
+ typename TSA<Token>::tree_iterator const& phrase,
+ size_t const max_samples, sptr<SamplingBias const> const& bias);
+ // add_job(Bitext<Token> const* const theBitext,
+ // typename TSA<Token>::tree_iterator const& phrase,
+ // size_t const max_samples, SamplingBias const* const bias);
+
+ sptr<job>
+ get_job();
+};
+
+template<typename Token>
+class
+Bitext<Token>::agenda::
+worker
+{
+ agenda& ag;
+public:
+ worker(agenda& a) : ag(a) {}
+ void operator()();
+};
+
+#include "ug_bitext_agenda_worker.h"
+#include "ug_bitext_agenda_job.h"
+
+template<typename Token>
+void Bitext<Token>
+::agenda
+::add_workers(int n)
+{
+ static boost::posix_time::time_duration nodelay(0,0,0,0);
+ boost::lock_guard<boost::mutex> guard(this->lock);
+
+ int target = max(1, int(n + workers.size() - this->doomed));
+ // house keeping: remove all workers that have finished
+ for (size_t i = 0; i < workers.size(); )
+ {
+ if (workers[i]->timed_join(nodelay))
+ {
+ if (i + 1 < workers.size())
+ workers[i].swap(workers.back());
+ workers.pop_back();
+ }
+ else ++i;
+ }
+ // cerr << workers.size() << "/" << target << " active" << endl;
+ if (int(workers.size()) > target)
+ this->doomed = workers.size() - target;
+ else
+ while (int(workers.size()) < target)
+ {
+ sptr<boost::thread> w(new boost::thread(worker(*this)));
+ workers.push_back(w);
+ }
+}
+
+
+template<typename Token>
+sptr<pstats> Bitext<Token>
+::agenda
+::add_job(Bitext<Token> const* const theBitext,
+ typename TSA<Token>::tree_iterator const& phrase,
+ size_t const max_samples, sptr<SamplingBias const> const& bias)
+{
+ boost::unique_lock<boost::mutex> lk(this->lock);
+ static boost::posix_time::time_duration nodelay(0,0,0,0);
+ bool fwd = phrase.root == bt.I1.get();
+ sptr<job> j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2,
+ max_samples, fwd, bias));
+ j->stats->register_worker();
+
+ joblist.push_back(j);
+ if (joblist.size() == 1)
+ {
+ size_t i = 0;
+ while (i < workers.size())
+ {
+ if (workers[i]->timed_join(nodelay))
+ {
+ if (doomed)
+ {
+ if (i+1 < workers.size())
+ workers[i].swap(workers.back());
+ workers.pop_back();
+ --doomed;
+ }
+ else
+ workers[i++] = sptr<boost::thread>(new boost::thread(worker(*this)));
+ }
+ else ++i;
+ }
+ }
+ return j->stats;
+}
+
+template<typename Token>
+sptr<typename Bitext<Token>::agenda::job>
+Bitext<Token>
+::agenda
+::get_job()
+{
+ // cerr << workers.size() << " workers on record" << endl;
+ sptr<job> ret;
+ if (this->shutdown) return ret;
+ boost::unique_lock<boost::mutex> lock(this->lock);
+ if (this->doomed)
+    { // the number of workers has been reduced; tell the redundant ones to quit
+ --this->doomed;
+ return ret;
+ }
+
+ typename list<sptr<job> >::iterator j = joblist.begin();
+ while (j != joblist.end())
+ {
+ if ((*j)->done())
+ {
+ (*j)->stats->release();
+ joblist.erase(j++);
+ }
+ else if ((*j)->workers >= 4) ++j; // no more than 4 workers per job
+ else break; // found one
+ }
+ if (joblist.size())
+ {
+ ret = j == joblist.end() ? joblist.front() : *j;
+ // if we've reached the end of the queue (all jobs have 4 workers on them),
+ // take the first in the queue
+ boost::lock_guard<boost::mutex> jguard(ret->lock);
+ ++ret->workers;
+ }
+ return ret;
+}
+
+template<typename Token>
+Bitext<Token>::
+agenda::
+~agenda()
+{
+ this->lock.lock();
+ this->shutdown = true;
+ this->lock.unlock();
+ for (size_t i = 0; i < workers.size(); ++i)
+ workers[i]->join();
+}
+
+template<typename Token>
+Bitext<Token>::
+agenda::
+agenda(Bitext<Token> const& thebitext)
+ : shutdown(false), doomed(0), bt(thebitext)
+{ }
+
+
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h
new file mode 100644
index 000000000..efbebad52
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h
@@ -0,0 +1,240 @@
+// -*- c++ -*-
+// class declaration of template<typename Token> class Bitext<Token>::agenda::job
+// to be included by ug_bitext.h
+// todo: add check to enforce this
+
+template<typename Token>
+class
+Bitext<Token>::agenda::
+job
+{
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+ static ThreadSafeCounter active;
+#endif
+ Bitext<Token> const* const m_bitext;
+ boost::mutex lock;
+ friend class agenda;
+ boost::taus88 rnd; // every job has its own pseudo random generator
+ double rnddenom; // denominator for scaling random sampling
+ size_t min_diverse; // minimum number of distinct translations
+
+ bool flip_coin(uint64_t & sid, uint64_t & offset);
+ bool step(uint64_t & sid, uint64_t & offset); // proceed to next occurrence
+
+public:
+ size_t workers; // how many workers are working on this job?
+ sptr<TSA<Token> const> root; // root of the underlying suffix array
+ char const* next; // next position to read from
+ char const* stop; // end of index range
+ size_t max_samples; // how many samples to extract at most
+ size_t ctr; /* # of phrase occurrences considered so far
+ * # of samples chosen is stored in stats->good
+ */
+ size_t len; // phrase length
+ bool fwd; // if true, source phrase is L1
+ sptr<pstats> stats; // stores statistics collected during sampling
+ sptr<SamplingBias const> const m_bias; // sentence-level bias for sampling
+ float bias_total;
+ bool nextSample(uint64_t & sid, uint64_t & offset); // select next occurrence
+
+ int
+ check_sample_distribution(uint64_t const& sid, uint64_t const& offset);
+ // for biased sampling: ensure the distribution approximately matches
+ // the bias
+
+ bool done() const;
+ job(Bitext<Token> const* const theBitext,
+ typename TSA<Token>::tree_iterator const& m,
+ sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd,
+ sptr<SamplingBias const> const& bias);
+ ~job();
+};
+
+template<typename Token>
+Bitext<Token>::agenda::job
+::~job()
+{
+ if (stats) stats.reset();
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+ // counter may not exist any more at destruction time, hence try .. catch ...
+ try { --active; } catch (...) {}
+#endif
+}
+
+template<typename Token>
+Bitext<Token>::agenda::job
+::job(Bitext<Token> const* const theBitext,
+ typename TSA<Token>::tree_iterator const& m,
+ sptr<TSA<Token> > const& r, size_t maxsmpl,
+ bool isfwd, sptr<SamplingBias const> const& bias)
+ : m_bitext(theBitext)
+ , rnd(0)
+ , rnddenom(rnd.max() + 1.)
+ , min_diverse(1)
+ , workers(0)
+ , root(r)
+ , next(m.lower_bound(-1))
+ , stop(m.upper_bound(-1))
+ , max_samples(maxsmpl)
+ , ctr(0)
+ , len(m.size())
+ , fwd(isfwd)
+ , m_bias(bias)
+{
+ stats.reset(new pstats());
+ stats->raw_cnt = m.approxOccurrenceCount();
+ bias_total = 0;
+
+  // we need to renormalize on the fly, as the sum of all sentence probs over
+  // all candidates (not all sentences in the corpus) needs to add up to 1.
+ // Profiling question: how much does that cost us?
+ if (m_bias)
+ {
+ int ctr = 0;
+ stats->raw_cnt = 0;
+ for (char const* x = m.lower_bound(-1); x < stop;)
+ {
+ uint32_t sid; ushort offset;
+ x = root->readSid(x,stop,sid);
+ x = root->readOffset(x,stop,offset);
+#if 0
+ cerr << ctr++ << " " << m.str(m_bitext->V1.get())
+ << " " << sid << "/" << root->getCorpusSize()
+ << " " << offset << " " << stop-x << endl;
+#endif
+ bias_total += (*m_bias)[sid];
+ ++stats->raw_cnt;
+ }
+ }
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+ ++active;
+ // if (active%5 == 0)
+ // cerr << size_t(active) << " active jobs at " << __FILE__ << ":" << __LINE__ << endl;
+#endif
+}
+
+template<typename Token>
+bool Bitext<Token>::agenda::job
+::done() const
+{
+ return (max_samples && stats->good >= max_samples) || next == stop;
+}
+
+template<typename Token>
+int Bitext<Token>::agenda::job
+::check_sample_distribution(uint64_t const& sid, uint64_t const& offset)
+{ // ensure that the sampled distribution approximately matches the bias
+ // @return 0: SKIP this occurrence
+ // @return 1: consider this occurrence for sampling
+ // @return 2: include this occurrence in the sample by all means
+
+ if (!m_bias) return 1;
+
+ using namespace boost::math;
+ typedef boost::math::binomial_distribution<> binomial;
+
+ ostream* log = m_bias->loglevel > 1 ? m_bias->log : NULL;
+
+ float p = (*m_bias)[sid];
+ id_type docid = m_bias->GetClass(sid);
+ uint32_t k = docid < stats->indoc.size() ? stats->indoc[docid] : 0;
+
+ // always consider candidates from dominating documents and
+ // from documents that have not been considered at all yet
+ bool ret = (p > .5 || k == 0);
+
+ if (ret && !log) return 1;
+
+ uint32_t N = stats->good; // number of trials
+ float d = cdf(complement(binomial(N, p), k));
+  // d: probability that the sample contains k or more instances from doc #docid
+ ret = ret || d >= .05;
+
+ if (log)
+ {
+ Token const* t = root->getCorpus()->sntStart(sid)+offset;
+ Token const* x = t - min(offset,uint64_t(3));
+ Token const* e = t+4;
+ if (e > root->getCorpus()->sntEnd(sid))
+ e = root->getCorpus()->sntEnd(sid);
+ *log << docid << ":" << sid << " " << size_t(k) << "/" << N
+ << " @" << p << " => " << d << " [";
+ for (size_t i = 0; i < stats->indoc.size(); ++i)
+ {
+ if (i) *log << " ";
+ *log << stats->indoc[i];
+ }
+ *log << "] ";
+ for (; x < e; ++x) *log << (*m_bitext->V1)[x->id()] << " ";
+ if (!ret) *log << "SKIP";
+ else if (p < .5 && d > .9) *log << "FORCE";
+ *log << endl;
+ }
+
+ return (ret ? (p < .5 && d > .9) ? 2 : 1 : 0);
+}
+
+template<typename Token>
+bool Bitext<Token>::agenda::job
+::flip_coin(uint64_t & sid, uint64_t & offset)
+{
+ int no_maybe_yes = m_bias ? check_sample_distribution(sid, offset) : 1;
+ if (no_maybe_yes == 0) return false; // no
+ if (no_maybe_yes > 1) return true; // yes
+ // ... maybe: flip a coin
+ size_t options_chosen = stats->good;
+ size_t options_total = max(stats->raw_cnt, this->ctr);
+ size_t options_left = (options_total - this->ctr);
+ size_t random_number = options_left * (rnd()/(rnd.max()+1.));
+ size_t threshold;
+ if (bias_total) // we have a bias and there are candidates with non-zero prob
+ threshold = ((*m_bias)[sid]/bias_total * options_total * max_samples);
+ else // no bias, or all have prob 0 (can happen with a very opinionated bias)
+ threshold = max_samples;
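+  // Illustration (unbiased case): with max_samples = 100 and raw_cnt = 1000,
+  // acceptance starts near 100/1000 = 10% and adapts as sampling runs ahead
+  // of or behind the target. With a uniform bias, (*m_bias)[sid]/bias_total
+  // is roughly 1/options_total, so the biased threshold also reduces to
+  // max_samples.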
+ return random_number + options_chosen < threshold;
+}
+
+template<typename Token>
+bool Bitext<Token>::agenda::job
+::step(uint64_t & sid, uint64_t & offset)
+{ // caller must lock!
+ if (next == stop) return false;
+ UTIL_THROW_IF2
+ ( next > stop, "Fatal error at " << HERE << ". How did that happen?" );
+ // boost::lock_guard<boost::mutex> jguard(lock); // caller must lock!
+ next = root->readSid(next, stop, sid);
+ next = root->readOffset(next, stop, offset);
+ ++ctr;
+ return true;
+}
+
+template<typename Token>
+bool Bitext<Token>::agenda::job
+::nextSample(uint64_t & sid, uint64_t & offset)
+{
+ boost::lock_guard<boost::mutex> jguard(lock);
+ if (max_samples == 0) // no sampling, consider all occurrences
+ return step(sid, offset);
+
+ while (step(sid,offset))
+ {
+ size_t good = stats->good;
+ size_t diversity = stats->trg.size();
+ if (good >= max_samples && diversity >= min_diverse)
+ return false; // done
+
+      // flip_coin gently steers the sample towards the bias: occurrences
+      // that would push the sample too far away from the bias distribution
+      // are ruled out; for all others, a biased coin is flipped.
+ if (!flip_coin(sid,offset)) continue;
+ return true;
+ }
+ return false;
+}
+
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+template<typename TKN>
+ThreadSafeCounter Bitext<TKN>::agenda
+::job
+::active;
+#endif
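
For illustration, the binomial tail test in job::check_sample_distribution can
be reproduced with Boost.Math alone; the snippet below is a standalone sketch
with made-up numbers (a document with bias 0.2 that already accounts for 8 of
20 good samples is skipped), not part of this patch:

    // bias_check_demo.cc -- illustrative only
    #include <boost/math/distributions/binomial.hpp>
    #include <iostream>

    int main()
    {
      double   p = 0.2; // bias assigned to the document in question
      unsigned N = 20;  // good samples collected so far (stats->good)
      unsigned k = 8;   // samples already drawn from this doc (stats->indoc[docid])
      boost::math::binomial_distribution<> bin(N, p);
      double d = cdf(complement(bin, k)); // upper tail, as in check_sample_distribution
      std::cout << "tail probability: " << d << std::endl;
      std::cout << (d < .05 ? "over-represented -> skip" : "consider for sampling")
                << std::endl;
      return 0;
    }
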
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h
new file mode 100644
index 000000000..92ed3d36a
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h
@@ -0,0 +1,102 @@
+// to be included from ug_bitext_agenda.h
+
+template<typename Token>
+void
+Bitext<Token>::agenda
+::worker
+::operator()()
+{
+ // things to do:
+ //
+  // - have each worker maintain its own pstats object and merge
+ // results at the end (to minimize mutex locking);
+ //
+ // - use a non-locked, monotonically increasing counter to
+ // ensure the minimum size of samples considered --- it's OK if
+ // we look at more samples than required. This way, we can
+ // reduce the number of lock / unlock operations we need to do
+ // during sampling.
+
+ uint64_t sid=0, offset=0; // sid and offset of source phrase
+ size_t s1=0, s2=0, e1=0, e2=0; // soft and hard boundaries of target phrase
+ vector<uchar> aln; // stores phrase-pair-internal alignment
+ while(sptr<job> j = ag.get_job())
+ {
+ j->stats->register_worker();
+ bitvector full_alignment(100*100); // Is full_alignment still needed???
+ while (j->nextSample(sid,offset))
+ {
+ aln.clear();
+ int po_fwd = Moses::LRModel::NONE;
+ int po_bwd = Moses::LRModel::NONE;
+ int docid = j->m_bias ? j->m_bias->GetClass(sid) : -1;
+ bitvector* full_aln = j->fwd ? &full_alignment : NULL;
+
+ // find soft and hard boundaries of target phrase
+ bool good = (ag.bt.find_trg_phr_bounds
+ (sid, offset, offset + j->len, // input parameters
+ s1, s2, e1, e2, po_fwd, po_bwd, // bounds & orientation
+ &aln, full_aln, !j->fwd)); // aln info / flip sides?
+
+ if (!good)
+ { // no good, probably because phrase is not coherent
+ j->stats->count_sample(docid, 0, po_fwd, po_bwd);
+ continue;
+ }
+
+ // all good: register this sample as valid
+ size_t num_pairs = (s2-s1+1) * (e2-e1+1);
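+	    // (s2-s1+1)*(e2-e1+1) is the number of distinct target phrases
+	    // extractable from this occurrence: start anywhere in [s1,s2],
+	    // end anywhere in [e1,e2]; each pair later gets weight
+	    // 1/num_pairs, so the occurrence contributes one unit of mass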
+ j->stats->count_sample(docid, num_pairs, po_fwd, po_bwd);
+
+#if 0
+ Token const* t = ag.bt.T2->sntStart(sid);
+ Token const* eos = ag.bt.T2->sntEnd(sid);
+ cerr << "[" << j->stats->good + 1 << "] ";
+ while (t != eos) cerr << (*ag.bt.V2)[(t++)->id()] << " ";
+ cerr << "[" << docid << "]" << endl;
+#endif
+
+ float sample_weight = 1./num_pairs;
+ Token const* o = (j->fwd ? ag.bt.T2 : ag.bt.T1)->sntStart(sid);
+
+	  // adjust offsets in phrase-internal alignment
+ for (size_t k = 1; k < aln.size(); k += 2) aln[k] += s2 - s1;
+
+ vector<uint64_t> seen; seen.reserve(10);
+ // It is possible that the phrase extraction extracts the same
+ // phrase twice, e.g., when word a co-occurs with sequence b b b
+ // but is aligned only to the middle word. We can only count
+ // each phrase pair once per source phrase occurrence, or else
+ // run the risk of having more joint counts than marginal
+ // counts.
+
+ for (size_t s = s1; s <= s2; ++s)
+ {
+ TSA<Token> const& I = j->fwd ? *ag.bt.I2 : *ag.bt.I1;
+ sptr<iter> b = I.find(o + s, e1 - s);
+ UTIL_THROW_IF2(!b || b->size() < e1-s, "target phrase not found");
+
+ for (size_t i = e1; i <= e2; ++i)
+ {
+ uint64_t tpid = b->getPid();
+
+	      // poor man's protection against over-counting
+	      size_t z = 0;
+	      while (z < seen.size() && seen[z] != tpid) ++z;
+	      if (z < seen.size()) continue;
+	      seen.push_back(tpid);
+
+ size_t raw2 = b->approxOccurrenceCount();
+ j->stats->add(tpid, sample_weight, aln, raw2,
+ po_fwd, po_bwd, docid);
+ bool ok = (i == e2) || b->extend(o[i].id());
+ UTIL_THROW_IF2(!ok, "Could not extend target phrase.");
+ }
+ if (s < s2) // shift phrase-internal alignments
+ for (size_t k = 1; k < aln.size(); k += 2)
+ --aln[k];
+ }
+ }
+ j->stats->release(); // indicate that you're done working on j->stats
+ }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc
new file mode 100644
index 000000000..2dda3ab9a
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc
@@ -0,0 +1,91 @@
+#include "ug_bitext_jstats.h"
+namespace Moses
+{
+ namespace bitext
+ {
+
+ uint32_t jstats::rcnt() const { return my_rcnt; }
+ float jstats::wcnt() const { return my_wcnt; }
+ uint32_t jstats::cnt2() const { return my_cnt2; }
+
+ // What was that used for again? UG
+ bool jstats::valid() { return my_wcnt >= 0; }
+ void jstats::validate() { if (my_wcnt < 0) my_wcnt *= -1; }
+ void jstats::invalidate() { if (my_wcnt > 0) my_wcnt *= -1; }
+
+ jstats::
+ jstats()
+ : my_rcnt(0), my_cnt2(0), my_wcnt(0)
+ {
+ for (int i = 0; i <= Moses::LRModel::NONE; ++i)
+ ofwd[i] = obwd[i] = 0;
+ my_aln.reserve(1);
+ }
+
+ jstats::
+ jstats(jstats const& other)
+ {
+      my_rcnt = other.rcnt();
+      my_cnt2 = other.cnt2();
+      my_wcnt = other.wcnt();
+ my_aln = other.aln();
+ indoc = other.indoc;
+ for (int i = 0; i <= Moses::LRModel::NONE; i++)
+ {
+ ofwd[i] = other.ofwd[i];
+ obwd[i] = other.obwd[i];
+ }
+ }
+
+ uint32_t
+ jstats::
+ dcnt_fwd(PhraseOrientation const idx) const
+ {
+ assert(idx <= Moses::LRModel::NONE);
+ return ofwd[idx];
+ }
+
+ uint32_t
+ jstats::
+ dcnt_bwd(PhraseOrientation const idx) const
+ {
+ assert(idx <= Moses::LRModel::NONE);
+ return obwd[idx];
+ }
+
+ void
+ jstats::
+ add(float w, vector<uchar> const& a, uint32_t const cnt2,
+ uint32_t fwd_orient, uint32_t bwd_orient, int const docid)
+ {
+ boost::lock_guard<boost::mutex> lk(this->lock);
+ my_cnt2 = cnt2;
+ my_rcnt += 1;
+ my_wcnt += w;
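+      // my_aln keeps one entry per distinct alignment pattern together with
+      // its frequency, arranged as a heap so that the most frequent pattern
+      // sits at the front (PhrasePair<Token>::update() picks aln()[0] as the
+      // representative alignment)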
+ if (a.size())
+ {
+ size_t i = 0;
+ while (i < my_aln.size() && my_aln[i].second != a) ++i;
+ if (i == my_aln.size())
+ my_aln.push_back(pair<size_t,vector<uchar> >(1,a));
+ else
+ my_aln[i].first++;
+ if (my_aln[i].first > my_aln[i/2].first)
+ push_heap(my_aln.begin(),my_aln.begin()+i+1);
+ }
+ ++ofwd[fwd_orient];
+ ++obwd[bwd_orient];
+ if (docid >= 0)
+ {
+ while (int(indoc.size()) <= docid) indoc.push_back(0);
+ ++indoc[docid];
+ }
+ }
+
+ vector<pair<size_t, vector<uchar> > > const&
+ jstats::
+ aln() const
+ { return my_aln; }
+
+
+ }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h
new file mode 100644
index 000000000..13c86e34d
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h
@@ -0,0 +1,48 @@
+// -*- c++ -*-
+#pragma once
+#include "ug_typedefs.h"
+#include "ug_lexical_reordering.h"
+#include <boost/thread.hpp>
+
+namespace Moses
+{
+ namespace bitext
+ {
+ using namespace ugdiss;
+
+ // "joint" (i.e., phrase pair) statistics
+ class
+ jstats
+ {
+ boost::mutex lock;
+ uint32_t my_rcnt; // unweighted joint count
+ uint32_t my_cnt2; // raw counts L2
+ float my_wcnt; // weighted joint count
+
+ // to do: use a static alignment pattern store that stores each pattern only
+ // once, so that we don't have to store so many alignment vectors
+ vector<pair<size_t, vector<uchar> > > my_aln; // internal word alignment
+
+ uint32_t ofwd[Moses::LRModel::NONE+1]; // forward distortion type counts
+ uint32_t obwd[Moses::LRModel::NONE+1]; // backward distortion type counts
+
+ public:
+ vector<uint32_t> indoc; // counts origin of samples (for biased sampling)
+ jstats();
+ jstats(jstats const& other);
+ uint32_t rcnt() const; // raw joint counts
+ uint32_t cnt2() const; // raw target phrase occurrence count
+ float wcnt() const; // weighted joint counts
+
+ vector<pair<size_t, vector<uchar> > > const & aln() const;
+ void add(float w, vector<uchar> const& a, uint32_t const cnt2,
+ uint32_t fwd_orient, uint32_t bwd_orient,
+ int const docid);
+ void invalidate();
+ void validate();
+ bool valid();
+ uint32_t dcnt_fwd(PhraseOrientation const idx) const;
+ uint32_t dcnt_bwd(PhraseOrientation const idx) const;
+ };
+ }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc
new file mode 100644
index 000000000..bbae42e85
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc
@@ -0,0 +1,83 @@
+#include "ug_bitext_pstats.h"
+
+namespace Moses
+{
+ namespace bitext
+ {
+
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+ ThreadSafeCounter pstats::active;
+#endif
+
+ pstats::
+ pstats() : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0)
+ {
+ for (int i = 0; i <= Moses::LRModel::NONE; ++i)
+ ofwd[i] = obwd[i] = 0;
+ }
+
+ pstats::
+ ~pstats()
+ {
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+ // counter may not exist any more at destruction time, so try ... catch
+ try { --active; } catch (...) {}
+#endif
+ }
+
+ void
+ pstats::
+ register_worker()
+ {
+ this->lock.lock();
+ ++this->in_progress;
+ this->lock.unlock();
+ }
+
+ void
+ pstats::
+ release()
+ {
+ this->lock.lock();
+      if (this->in_progress-- == 1) // last one -> we're done
+ this->ready.notify_all();
+ this->lock.unlock();
+ }
+
+ void
+ pstats
+ ::count_sample(int const docid, size_t const num_pairs,
+ int const po_fwd, int const po_bwd)
+ {
+ boost::lock_guard<boost::mutex> guard(lock);
+ ++sample_cnt;
+ if (num_pairs == 0) return;
+ ++good;
+ sum_pairs += num_pairs;
+ ++ofwd[po_fwd];
+ ++obwd[po_bwd];
+ while (int(indoc.size()) <= docid) indoc.push_back(0);
+ ++indoc[docid];
+ }
+
+ bool
+ pstats::
+ add(uint64_t pid, float const w,
+ vector<uchar> const& a,
+ uint32_t const cnt2,
+ uint32_t fwd_o,
+ uint32_t bwd_o, int const docid)
+ {
+ boost::lock_guard<boost::mutex> guard(this->lock);
+ jstats& entry = this->trg[pid];
+ entry.add(w, a, cnt2, fwd_o, bwd_o, docid);
+ if (this->good < entry.rcnt())
+ {
+ UTIL_THROW(util::Exception, "more joint counts than good counts:"
+ << entry.rcnt() << "/" << this->good << "!");
+ }
+ return true;
+ }
+
+ }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h
new file mode 100644
index 000000000..c5b6c0152
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h
@@ -0,0 +1,63 @@
+// -*- c++ -*-
+#pragma once
+
+#include <boost/thread.hpp>
+#include <boost/unordered_map.hpp>
+
+#include "ug_typedefs.h"
+#include "ug_bitext_jstats.h"
+#include "moses/thread_safe_container.h"
+
+namespace Moses
+{
+ namespace bitext
+ {
+ struct
+ pstats
+ {
+ typedef boost::unordered_map<uint64_t, sptr<pstats> > map_t;
+ typedef ThreadSafeContainer<uint64_t, sptr<pstats>, map_t> cache_t;
+ typedef std::vector<uchar> alnvec;
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+ static ThreadSafeCounter active;
+#endif
+ boost::mutex lock; // for parallel gathering of stats
+ boost::condition_variable ready; // consumers can wait for me to be ready
+
+ size_t raw_cnt; // (approximate) raw occurrence count
+ size_t sample_cnt; // number of instances selected during sampling
+ size_t good; // number of selected instances with valid word alignments
+ size_t sum_pairs; // total number of target phrases extracted (can be > raw_cnt)
+ size_t in_progress; // how many threads are currently working on this?
+
+ uint32_t ofwd[Moses::LRModel::NONE+1]; // distribution of fwd phrase orientations
+ uint32_t obwd[Moses::LRModel::NONE+1]; // distribution of bwd phrase orientations
+
+ std::vector<uint32_t> indoc; // distribution over where samples came from
+
+ typedef std::map<uint64_t, jstats> trg_map_t;
+ trg_map_t trg;
+ pstats();
+ ~pstats();
+ void release();
+ void register_worker();
+ size_t count_workers() { return in_progress; }
+
+ bool
+ add(uint64_t const pid, // target phrase id
+ float const w, // sample weight (1./(# of phrases extractable))
+ alnvec const& a, // local alignment
+ uint32_t const cnt2, // raw target phrase count
+ uint32_t fwd_o, // fwd. phrase orientation
+ uint32_t bwd_o, // bwd. phrase orientation
+ int const docid); // document where sample was found
+
+ void
+ count_sample(int const docid, // document where sample was found
+ size_t const num_pairs, // # of phrases extractable here
+ int const po_fwd, // fwd phrase orientation
+ int const po_bwd); // bwd phrase orientation
+ };
+
+ }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_im_bitext.cc b/moses/TranslationModel/UG/mm/ug_im_bitext.cc
new file mode 100644
index 000000000..9f26a181b
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_im_bitext.cc
@@ -0,0 +1,87 @@
+#include "ug_im_bitext.h"
+
+namespace Moses
+{
+ namespace bitext
+ {
+
+ template<>
+ sptr<imBitext<L2R_Token<SimpleWordId> > >
+ imBitext<L2R_Token<SimpleWordId> >::
+ add(vector<string> const& s1,
+ vector<string> const& s2,
+ vector<string> const& aln) const
+ {
+ typedef L2R_Token<SimpleWordId> TKN;
+ assert(s1.size() == s2.size() && s1.size() == aln.size());
+
+#ifndef NDEBUG
+ size_t first_new_snt = this->T1 ? this->T1->size() : 0;
+#endif
+
+ sptr<imBitext<TKN> > ret;
+ {
+ boost::unique_lock<boost::shared_mutex> guard(m_lock);
+ ret.reset(new imBitext<TKN>(*this));
+ }
+
+ // we add the sentences in separate threads (so it's faster)
+ boost::thread thread1(snt_adder<TKN>(s1,*ret->V1,ret->myT1,ret->myI1));
+ // thread1.join(); // for debugging
+ boost::thread thread2(snt_adder<TKN>(s2,*ret->V2,ret->myT2,ret->myI2));
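+      // each alignment string lists word alignment points as "row-col"
+      // pairs (e.g. "0-0 1-2 2-1"); they are converted below to the binary
+      // (binwrite) format used by the word-alignment track Tx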
+ BOOST_FOREACH(string const& a, aln)
+ {
+ istringstream ibuf(a);
+ ostringstream obuf;
+ uint32_t row,col; char c;
+ while (ibuf >> row >> c >> col)
+ {
+ UTIL_THROW_IF2(c != '-', "[" << HERE << "] "
+ << "Error in alignment information:\n" << a);
+ binwrite(obuf,row);
+ binwrite(obuf,col);
+ }
+	  // important: DO NOT replace the two lines below this comment by
+	  // char const* x = obuf.str().c_str(), as the memory x points to
+	  // is freed as soon as the temporary string object is destroyed.
+ string foo = obuf.str();
+ char const* x = foo.c_str();
+ vector<char> v(x,x+foo.size());
+ ret->myTx = append(ret->myTx, v);
+ }
+
+ thread1.join();
+ thread2.join();
+
+ ret->Tx = ret->myTx;
+ ret->T1 = ret->myT1;
+ ret->T2 = ret->myT2;
+ ret->I1 = ret->myI1;
+ ret->I2 = ret->myI2;
+
+#ifndef NDEBUG
+ // sanity check
+ for (size_t i = first_new_snt; i < ret->T1->size(); ++i)
+ {
+ size_t slen1 = ret->T1->sntLen(i);
+ size_t slen2 = ret->T2->sntLen(i);
+ char const* p = ret->Tx->sntStart(i);
+ char const* q = ret->Tx->sntEnd(i);
+ size_t k;
+ while (p < q)
+ {
+ p = binread(p,k);
+ assert(p);
+ assert(p < q);
+ assert(k < slen1);
+ p = binread(p,k);
+ assert(p);
+ assert(k < slen2);
+ }
+ }
+#endif
+ return ret;
+ }
+
+ }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_im_bitext.h b/moses/TranslationModel/UG/mm/ug_im_bitext.h
new file mode 100644
index 000000000..a620b7219
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_im_bitext.h
@@ -0,0 +1,130 @@
+// -*- c++ -*-
+#pragma once
+#include "ug_bitext.h"
+
+namespace Moses
+{
+ namespace bitext
+ {
+ template<typename TKN>
+ class imBitext : public Bitext<TKN>
+ {
+ sptr<imTtrack<char> > myTx;
+ sptr<imTtrack<TKN> > myT1;
+ sptr<imTtrack<TKN> > myT2;
+ sptr<imTSA<TKN> > myI1;
+ sptr<imTSA<TKN> > myI2;
+ static ThreadSafeCounter my_revision;
+ public:
+ size_t revision() const { return my_revision; }
+ void open(string const base, string const L1, string L2);
+ imBitext(sptr<TokenIndex> const& V1,
+ sptr<TokenIndex> const& V2,
+ size_t max_sample = 5000, size_t num_workers=4);
+ imBitext(size_t max_sample = 5000, size_t num_workers=4);
+ imBitext(imBitext const& other);
+
+ // sptr<imBitext<TKN> >
+ // add(vector<TKN> const& s1, vector<TKN> const& s2, vector<ushort> & a);
+
+ sptr<imBitext<TKN> >
+ add(vector<string> const& s1,
+ vector<string> const& s2,
+ vector<string> const& a) const;
+
+ };
+
+ template<typename TKN>
+ ThreadSafeCounter
+ imBitext<TKN>::my_revision;
+
+ template<typename TKN>
+ imBitext<TKN>::
+ imBitext(size_t max_sample, size_t num_workers)
+ : Bitext<TKN>(max_sample, num_workers)
+ {
+ this->m_default_sample_size = max_sample;
+ this->V1.reset(new TokenIndex());
+ this->V2.reset(new TokenIndex());
+ this->V1->setDynamic(true);
+ this->V2->setDynamic(true);
+ ++my_revision;
+ }
+
+ template<typename TKN>
+ imBitext<TKN>::
+ imBitext(sptr<TokenIndex> const& v1,
+ sptr<TokenIndex> const& v2,
+ size_t max_sample, size_t num_workers)
+ : Bitext<TKN>(max_sample, num_workers)
+ {
+ // this->default_sample_size = max_sample;
+ this->V1 = v1;
+ this->V2 = v2;
+ this->V1->setDynamic(true);
+ this->V2->setDynamic(true);
+ ++my_revision;
+ }
+
+
+ template<typename TKN>
+ imBitext<TKN>::
+ imBitext(imBitext<TKN> const& other)
+ {
+ this->myTx = other.myTx;
+ this->myT1 = other.myT1;
+ this->myT2 = other.myT2;
+ this->myI1 = other.myI1;
+ this->myI2 = other.myI2;
+ this->Tx = this->myTx;
+ this->T1 = this->myT1;
+ this->T2 = this->myT2;
+ this->I1 = this->myI1;
+ this->I2 = this->myI2;
+ this->V1 = other.V1;
+ this->V2 = other.V2;
+ this->m_default_sample_size = other.m_default_sample_size;
+ this->m_num_workers = other.m_num_workers;
+ ++my_revision;
+ }
+
+ template<>
+ sptr<imBitext<L2R_Token<SimpleWordId> > >
+ imBitext<L2R_Token<SimpleWordId> >::
+ add(vector<string> const& s1,
+ vector<string> const& s2,
+ vector<string> const& aln) const;
+
+ template<typename TKN>
+ sptr<imBitext<TKN> >
+ imBitext<TKN>::
+ add(vector<string> const& s1,
+ vector<string> const& s2,
+ vector<string> const& aln) const
+ {
+ throw "Not yet implemented";
+ }
+
+ // What's up with this function???? UG
+ template<typename TKN>
+ void
+ imBitext<TKN>::
+ open(string const base, string const L1, string L2)
+ {
+ mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get());
+ mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get());
+ mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get());
+ t1.open(base+L1+".mct");
+ t2.open(base+L2+".mct");
+ tx.open(base+L1+"-"+L2+".mam");
+ this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
+ this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
+ mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
+ mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
+ i1.open(base+L1+".sfa", this->T1);
+ i2.open(base+L2+".sfa", this->T2);
+ assert(this->T1->size() == this->T2->size());
+ }
+
+ }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_mm_bitext.h b/moses/TranslationModel/UG/mm/ug_mm_bitext.h
new file mode 100644
index 000000000..211793277
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_mm_bitext.h
@@ -0,0 +1,81 @@
+// -*- c++ -*-
+// don't include this file directly! it is included by ug_bitext.h
+
+namespace Moses
+{
+ namespace bitext
+ {
+ template<typename TKN>
+ class mmBitext : public Bitext<TKN>
+ {
+ void load_document_map(string const& fname);
+ public:
+ void open(string const base, string const L1, string L2);
+ mmBitext();
+ };
+
+ template<typename TKN>
+ mmBitext<TKN>::
+ mmBitext()
+ : Bitext<TKN>(new mmTtrack<TKN>(), new mmTtrack<TKN>(), new mmTtrack<char>(),
+ new TokenIndex(), new TokenIndex(),
+ new mmTSA<TKN>(), new mmTSA<TKN>())
+ {};
+
+ template<typename TKN>
+ void
+ mmBitext<TKN>::
+ load_document_map(string const& fname)
+ {
+ ifstream docmap(fname.c_str());
+ // the docmap file should list the documents in the corpus
+      // in the order in which they appear, with one line per document:
+ // <docname> <number of lines / sentences>
+ //
+ // in the future, we might also allow listing documents with
+ // sentence ranges.
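+      // example line (hypothetical document name):
+      //   europarl 1997
+      // i.e., the next 1997 sentence pairs belong to document "europarl"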
+      string buffer,docname; size_t a=0,b=0;
+ this->m_sid2docid.reset(new vector<id_type>(this->T1->size()));
+ while(getline(docmap,buffer))
+ {
+ istringstream line(buffer);
+ if (!(line>>docname)) continue; // empty line
+ if (docname.size() && docname[0] == '#') continue; // comment
+ size_t docid = this->m_docname2docid.size();
+ this->m_docname2docid[docname] = docid;
+ line >> b;
+ VERBOSE(1, "DOCUMENT MAP " << docname
+ << " " << a << "-" << b+a << endl);
+ for (b += a; a < b; ++a)
+ (*this->m_sid2docid)[a] = docid;
+ }
+ UTIL_THROW_IF2(b != this->T1->size(),
+ "Document map doesn't match corpus!");
+ }
+
+ template<typename TKN>
+ void
+ mmBitext<TKN>::
+ open(string const base, string const L1, string L2)
+ {
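+      // expected files relative to 'base': <L>.mct (corpus track),
+      // <L>.tdx (token index), <L>.sfa (suffix array),
+      // <L1>-<L2>.mam (word alignment track), and optionally 'dmp'
+      // (document map, see load_document_map above)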
+ mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get());
+ mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get());
+ mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get());
+ t1.open(base+L1+".mct");
+ t2.open(base+L2+".mct");
+ tx.open(base+L1+"-"+L2+".mam");
+ this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
+ this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
+ mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
+ mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
+ i1.open(base+L1+".sfa", this->T1);
+ i2.open(base+L2+".sfa", this->T2);
+ assert(this->T1->size() == this->T2->size());
+
+ string docmapfile = base+"dmp";
+ if (!access(docmapfile.c_str(),F_OK))
+ load_document_map(docmapfile);
+ }
+
+ }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h
new file mode 100644
index 000000000..28a926587
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h
@@ -0,0 +1,246 @@
+// -*- c++ -*-
+#pragma once
+#include <vector>
+#include "ug_typedefs.h"
+#include "ug_bitext_pstats.h"
+
+namespace Moses
+{
+ namespace bitext
+ {
+ template<typename Token>
+ class
+ PhrasePair
+ {
+ public:
+ class Scorer { public: virtual float operator()(PhrasePair& pp) const = 0; };
+ Token const* start1;
+ Token const* start2;
+ uint32_t len1;
+ uint32_t len2;
+ uint64_t p1, p2;
+ uint32_t raw1, raw2, sample1, sample2, good1, good2, joint;
+ std::vector<float> fvals;
+      float dfwd[Moses::LRModel::NONE+1]; // fwd. orientation distribution (smoothed probs, see update())
+      float dbwd[Moses::LRModel::NONE+1]; // bwd. orientation distribution (smoothed probs, see update())
+ std::vector<uchar> aln;
+ float score;
+ bool inverse;
+ std::vector<uint32_t> indoc;
+ PhrasePair() { };
+ PhrasePair(PhrasePair const& o);
+
+ PhrasePair const& operator+=(PhrasePair const& other);
+
+ bool operator<(PhrasePair const& other) const;
+ bool operator>(PhrasePair const& other) const;
+ bool operator<=(PhrasePair const& other) const;
+ bool operator>=(PhrasePair const& other) const;
+
+ void init();
+ void init(uint64_t const pid1, bool is_inverse,
+ Token const* x, uint32_t const len,
+ pstats const* ps = NULL, size_t const numfeats=0);
+
+ PhrasePair const&
+ update(uint64_t const pid2, Token const* x,
+ uint32_t const len, jstats const& js);
+
+ class SortByTargetIdSeq
+ {
+ public:
+ int cmp(PhrasePair const& a, PhrasePair const& b) const;
+ bool operator()(PhrasePair const& a, PhrasePair const& b) const;
+ };
+
+ class SortDescendingByJointCount
+ {
+ public:
+ int cmp(PhrasePair const& a, PhrasePair const& b) const;
+ bool operator()(PhrasePair const& a, PhrasePair const& b) const;
+ };
+ };
+
+ template<typename Token>
+ void PhrasePair<Token>
+ ::init(uint64_t const pid1, bool is_inverse,
+ Token const* x, uint32_t const len,
+ pstats const* ps, size_t const numfeats)
+ {
+ inverse = is_inverse;
+ start1 = x; len1 = len;
+ p1 = pid1;
+ p2 = 0;
+ if (ps)
+ {
+ raw1 = ps->raw_cnt;
+ sample1 = ps->sample_cnt;
+ good1 = ps->good;
+ }
+ else raw1 = sample1 = good1 = 0;
+ joint = 0;
+ good2 = 0;
+ sample2 = 0;
+ raw2 = 0;
+ fvals.resize(numfeats);
+ }
+
+ template<typename Token>
+ PhrasePair<Token> const&
+ PhrasePair<Token>
+ ::update(uint64_t const pid2,
+ Token const* x, uint32_t const len, jstats const& js)
+ {
+ p2 = pid2;
+ start2 = x; len2 = len;
+ raw2 = js.cnt2();
+ joint = js.rcnt();
+ assert(js.aln().size());
+ if (js.aln().size())
+ aln = js.aln()[0].second;
+ float total_fwd = 0, total_bwd = 0;
+ for (int i = 0; i <= Moses::LRModel::NONE; i++)
+ {
+ PhraseOrientation po = static_cast<PhraseOrientation>(i);
+ total_fwd += js.dcnt_fwd(po)+1;
+ total_bwd += js.dcnt_bwd(po)+1;
+ }
+
+ // should we do that here or leave the raw counts?
+ for (int i = 0; i <= Moses::LRModel::NONE; i++)
+ {
+ PhraseOrientation po = static_cast<PhraseOrientation>(i);
+ dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd;
+ dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd;
+ }
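+      // the +1 above is add-one smoothing: dfwd/dbwd end up holding smoothed
+      // orientation probabilities rather than raw counts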
+
+ indoc = js.indoc;
+ return *this;
+ }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>
+ ::operator<(PhrasePair const& other) const
+ {
+ return this->score < other.score;
+ }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>
+ ::operator>(PhrasePair const& other) const
+ {
+ return this->score > other.score;
+ }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>
+ ::operator<=(PhrasePair const& other) const
+ {
+ return this->score <= other.score;
+ }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>
+ ::operator>=(PhrasePair const& other) const
+ {
+ return this->score >= other.score;
+ }
+
+ template<typename Token>
+ PhrasePair<Token> const&
+ PhrasePair<Token>
+ ::operator+=(PhrasePair const& o)
+ {
+ raw1 += o.raw1;
+ raw2 += o.raw2;
+ good1 += o.good1;
+ good2 += o.good2;
+ joint += o.joint;
+ sample1 += o.sample1;
+ sample2 += o.sample2;
+ return *this;
+ }
+
+ template<typename Token>
+ PhrasePair<Token>
+ ::PhrasePair(PhrasePair<Token> const& o)
+ : start1(o.start1) , start2(o.start2)
+ , len1(o.len1) , len2(o.len2)
+ , p1(o.p1) , p2(o.p2)
+ , raw1(o.raw1) , raw2(o.raw2)
+ , sample1(o.sample1) , sample2(o.sample2)
+ , good1(o.good1) , good2(o.good2)
+ , joint(o.joint)
+ , fvals(o.fvals)
+ , aln(o.aln)
+ , score(o.score)
+ , inverse(o.inverse)
+ , indoc(o.indoc)
+ {
+ for (int i = 0; i <= Moses::LRModel::NONE; ++i)
+ {
+ dfwd[i] = o.dfwd[i];
+ dbwd[i] = o.dbwd[i];
+ }
+ }
+
+ template<typename Token>
+ int PhrasePair<Token>
+ ::SortByTargetIdSeq
+ ::cmp(PhrasePair const& a, PhrasePair const& b) const
+ {
+ size_t i = 0;
+ Token const* x = a.start2;
+ Token const* y = b.start2;
+ while (i < a.len2 && i < b.len2 && x->id() == y->id())
+ {
+ x = x->next();
+ y = y->next();
+ ++i;
+ }
+ if (i == a.len2 && i == b.len2) return 0;
+ if (i == a.len2) return -1;
+ if (i == b.len2) return 1;
+ return x->id() < y->id() ? -1 : 1;
+ }
+
+ template<typename Token>
+ bool PhrasePair<Token>
+ ::SortByTargetIdSeq
+ ::operator()(PhrasePair const& a, PhrasePair const& b) const
+ {
+ return this->cmp(a,b) < 0;
+ }
+
+ template<typename Token>
+ int PhrasePair<Token>
+ ::SortDescendingByJointCount
+ ::cmp(PhrasePair const& a, PhrasePair const& b) const
+ {
+ if (a.joint == b.joint) return 0;
+ return a.joint > b.joint ? -1 : 1;
+ }
+
+ template<typename Token>
+ bool PhrasePair<Token>
+ ::SortDescendingByJointCount
+ ::operator()(PhrasePair const& a, PhrasePair const& b) const
+ {
+ return this->cmp(a,b) < 0;
+ }
+
+ template<typename Token>
+ void PhrasePair<Token>
+ ::init()
+ {
+ inverse = false;
+ len1 = len2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0;
+ start1 = start2 = NULL;
+ p1 = p2 = 0;
+ }
+ }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.h b/moses/TranslationModel/UG/mm/ug_sampling_bias.h
index f7c95f439..faed69e63 100644
--- a/moses/TranslationModel/UG/mm/ug_sampling_bias.h
+++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.h
@@ -4,9 +4,9 @@
#include <map>
#include<vector>
#include <string>
+#include <iostream>
#include "moses/Util.h"
#include "ug_typedefs.h"
-
namespace Moses
{
namespace bitext
@@ -18,7 +18,8 @@ namespace Moses
class SamplingBias
{
public:
-
+ int loglevel;
+ std::ostream* log;
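+    // verbosity level and log stream for diagnostics during biased
+    // sampling (see Bitext<Token>::agenda::job::check_sample_distribution)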
virtual float
operator[](id_type const ID) const = 0;
// returns (unnormalized bias) for the class of item ID
diff --git a/moses/TranslationModel/UG/mm/ug_ttrack_base.h b/moses/TranslationModel/UG/mm/ug_ttrack_base.h
index 7c11b3942..f9864bda6 100644
--- a/moses/TranslationModel/UG/mm/ug_ttrack_base.h
+++ b/moses/TranslationModel/UG/mm/ug_ttrack_base.h
@@ -17,6 +17,7 @@
#include "ug_ttrack_position.h"
#include "tpt_typedefs.h"
#include "tpt_tokenindex.h"
+#include "moses/Util.h"
// #include "ug_vocab.h"
namespace ugdiss
@@ -25,6 +26,33 @@ namespace ugdiss
typedef boost::dynamic_bitset<uint64_t> bdBitset;
+ template<typename sid_t, typename off_t, typename len_t>
+ void
+ parse_pid(uint64_t const pid, sid_t & sid,
+ off_t & off, len_t& len)
+ {
+ static uint64_t two32 = uint64_t(1)<<32;
+ static uint64_t two16 = uint64_t(1)<<16;
+ len = pid%two16;
+ off = (pid%two32)>>16;
+ sid = pid>>32;
+ }
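+  // a phrase id (pid) packs (sentence id, offset, length) into one uint64_t:
+  // bits 63..32 hold the sid, bits 31..16 the offset, bits 15..0 the length;
+  // i.e., pid = (sid << 32) | (offset << 16) | len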
+
+ template<typename Token>
+ string
+ toString(TokenIndex const& V, Token const* x, size_t const len)
+ {
+ if (!len) return "";
+ UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!");
+ ostringstream buf;
+ buf << V[x->id()];
+ size_t i = 1;
+ for (x = x->next(); x && i < len; ++i, x = x->next())
+ buf << " " << V[x->id()];
+ UTIL_THROW_IF2(i != len, HERE << ": Unexpected end of phrase!");
+ return buf.str();
+ }
+
template<typename TKN=id_type>
class Ttrack
{