diff options
author | Ulrich Germann <ugermann@inf.ed.ac.uk> | 2015-07-05 15:08:57 +0300 |
---|---|---|
committer | Ulrich Germann <ugermann@inf.ed.ac.uk> | 2015-07-05 15:08:57 +0300 |
commit | 4dd2ea3117bdc156cde45d9d126e07a245c6381d (patch) | |
tree | 82142945e1ba423ff6eb1165485bc9ddcc5d223a | |
parent | e1f31666c3c9b37ff299c7d12e3be1c1cd151f07 (diff) |
Added random sampling to BitextSampler.ranked-sampling-v0.1.0
-rw-r--r-- | moses/TranslationModel/UG/mm/ug_bitext_sampler.h | 557 | ||||
-rw-r--r-- | moses/TranslationModel/UG/mmsapt.cpp | 34 |
2 files changed, 356 insertions, 235 deletions
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h index 9333dd879..d3845415d 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h @@ -1,8 +1,14 @@ // -*- mode: c++; tab-width: 2; indent-tabs-mode: nil -*- #pragma once + +#include <algorithm> + +#include <boost/random.hpp> #include <boost/thread.hpp> #include <boost/thread/locks.hpp> #include <boost/intrusive_ptr.hpp> +#include <boost/math/distributions/binomial.hpp> + #include "ug_bitext.h" #include "ug_bitext_pstats.h" #include "ug_sampling_bias.h" @@ -16,246 +22,357 @@ namespace Moses namespace bitext { - enum sampling_method { full_coverage, random_sampling, ranked_sampling }; +enum sampling_method { full_coverage, random_sampling, ranked_sampling }; - typedef ugdiss::ttrack::Position TokenPosition; - class CandidateSorter - { - SamplingBias const& score; - public: - CandidateSorter(SamplingBias const& s) : score(s) {} - bool operator()(TokenPosition const& a, TokenPosition const& b) const - { return score[a.sid] > score[b.sid]; } - }; +typedef ugdiss::ttrack::Position TokenPosition; +class CandidateSorter +{ + SamplingBias const& score; +public: + CandidateSorter(SamplingBias const& s) : score(s) {} + bool operator()(TokenPosition const& a, TokenPosition const& b) const + { return score[a.sid] > score[b.sid]; } +}; - template<typename Token> - class - BitextSampler : public reference_counter - { - typedef Bitext<Token> bitext; - typedef TSA<Token> tsa; - typedef SamplingBias bias; - typedef typename Bitext<Token>::iter tsa_iter; - mutable boost::condition_variable m_ready; - mutable boost::mutex m_lock; - // const members - // sptr<bitext const> const m_bitext; // keep bitext alive while I am - // should be an - iptr<bitext const> const m_bitext; // keep bitext alive as long as I am - size_t const m_plen; // length of lookup phrase - bool const m_fwd; // forward or backward direction? - sptr<tsa const> const m_root; // root of suffix array - char const* m_next; // current position - char const* m_stop; // end of search range - sampling_method const m_method; /* look at all / random sample / - * ranked samples */ - sptr<bias const> const m_bias; // bias over candidates - size_t const m_samples; // how many samples at most - // non-const members - sptr<pstats> m_stats; // destination for phrase stats - size_t m_ctr; // number of samples considered - float m_total_bias; // for random sampling with bias - bool m_finished; - bool consider_sample(TokenPosition const& p); - size_t perform_ranked_sampling(); +template<typename Token> +class +BitextSampler : public reference_counter +{ + typedef Bitext<Token> bitext; + typedef TSA<Token> tsa; + typedef SamplingBias bias; + typedef typename Bitext<Token>::iter tsa_iter; + mutable boost::condition_variable m_ready; + mutable boost::mutex m_lock; + // const members + // sptr<bitext const> const m_bitext; // keep bitext alive while I am + // should be an + iptr<bitext const> const m_bitext; // keep bitext alive as long as I am + size_t const m_plen; // length of lookup phrase + bool const m_fwd; // forward or backward direction? + sptr<tsa const> const m_root; // root of suffix array + char const* m_next; // current position + char const* m_stop; // end of search range + sampling_method const m_method; // look at all/random/ranked samples + sptr<bias const> const m_bias; // bias over candidates + size_t const m_samples; // how many samples at most + // non-const members + sptr<pstats> m_stats; // destination for phrase stats + size_t m_ctr; // number of samples considered + float m_total_bias; // for random sampling with bias + bool m_finished; + + boost::taus88 m_rnd; // every job has its own pseudo random generator + double m_rnddenom; // denominator for scaling random sampling + double m_bias_total; + + bool consider_sample(TokenPosition const& p); + size_t perform_ranked_sampling(); + size_t perform_random_sampling(); + + int check_sample_distribution(uint64_t const& sid, uint64_t const& offset); + bool flip_coin(ugdiss::id_type & sid, ushort & offset); - public: - BitextSampler(BitextSampler const& other); - BitextSampler const& operator=(BitextSampler const& other); - BitextSampler(bitext const* const bitext, typename bitext::iter const& phrase, - sptr<SamplingBias const> const& bias, size_t const max_samples, - sampling_method const method); - ~BitextSampler(); - bool operator()(); // run sampling - sptr<pstats> stats(); - bool done() const; - }; - - template<typename Token> - BitextSampler<Token>:: - BitextSampler(Bitext<Token> const* const bitext, - typename bitext::iter const& phrase, - sptr<SamplingBias const> const& bias, size_t const max_samples, - sampling_method const method) - : m_bitext(bitext) - , m_plen(phrase.size()) - , m_fwd(phrase.root == bitext->I1.get()) - , m_root(m_fwd ? bitext->I1 : bitext->I2) - , m_next(phrase.lower_bound(-1)) - , m_stop(phrase.upper_bound(-1)) - , m_method(method) - , m_bias(bias) - , m_samples(max_samples) - , m_ctr(0) - , m_total_bias(0) - , m_finished(false) - { - m_stats.reset(new pstats); - m_stats->raw_cnt = phrase.ca(); - m_stats->register_worker(); - // cerr << phrase.str(bitext->V1.get()) << " [" << HERE << "]" << endl; - } - - template<typename Token> - BitextSampler<Token>:: - BitextSampler(BitextSampler const& other) - : m_bitext(other.m_bitext) - , m_plen(other.m_plen) - , m_fwd(other.m_fwd) - , m_root(other.m_root) - , m_next(other.m_next) - , m_stop(other.m_stop) - , m_method(other.m_method) - , m_bias(other.m_bias) - , m_samples(other.m_samples) - { - // lock both instances - boost::unique_lock<boost::mutex> mylock(m_lock); - boost::unique_lock<boost::mutex> yrlock(other.m_lock); - // actually, BitextSamplers should only copied on job submission - m_stats = other.m_stats; - m_stats->register_worker(); - m_ctr = other.m_ctr; - m_total_bias = other.m_total_bias; - m_finished = other.m_finished; - } +public: + BitextSampler(BitextSampler const& other); + BitextSampler const& operator=(BitextSampler const& other); + BitextSampler(bitext const* const bitext, typename bitext::iter const& phrase, + sptr<SamplingBias const> const& bias, size_t const max_samples, + sampling_method const method); + ~BitextSampler(); + bool operator()(); // run sampling + sptr<pstats> stats(); + bool done() const; +}; - // Ranked sampling sorts all samples by score and then considers the top-ranked - // candidates for phrase extraction. - template<typename Token> - size_t - BitextSampler<Token>:: - perform_ranked_sampling() - { - if (m_next == m_stop) return m_ctr; - CandidateSorter sorter(*m_bias); - // below: nbest size = 20 * m_samples to allow for failed phrase extraction - NBestList<TokenPosition, CandidateSorter> nbest(20*m_samples,sorter); - ugdiss::tsa::ArrayEntry I(m_next); - while (I.next < m_stop) - { - ++m_ctr; - nbest.add(m_root->readEntry(I.next,I)); - } - size_t n = 0; - for (size_t i = 0; n < m_samples && i < nbest.size(); ++i) - if (consider_sample(nbest[i])) ++n;; - // cerr << m_ctr << " samples considered at " - // << __FILE__ << ":" << __LINE__ << endl; - return m_ctr; - } - - template<typename Token> - bool - BitextSampler<Token>:: - consider_sample(TokenPosition const& p) - { - std::vector<unsigned char> aln; - bitvector full_aln(100*100); - PhraseExtractionRecord rec(p.sid, p.offset, p.offset + m_plen, - !m_fwd, &aln, &full_aln); - int docid = m_bias ? m_bias->GetClass(p.sid) : -1; +template<typename Token> +int +BitextSampler<Token>:: +check_sample_distribution(uint64_t const& sid, uint64_t const& offset) +{ // ensure that the sampled distribution approximately matches the bias + // @return 0: SKIP this occurrence + // @return 1: consider this occurrence for sampling + // @return 2: include this occurrence in the sample by all means + + typedef boost::math::binomial_distribution<> binomial; + + // std::ostream* log = m_bias->loglevel > 1 ? m_bias->log : NULL; + std::ostream* log = NULL; + + if (!m_bias) return 1; - bool good = m_bitext->find_trg_phr_bounds(rec); + float p = (*m_bias)[sid]; + id_type docid = m_bias->GetClass(sid); + + std::map<uint32_t,uint32_t>::const_iterator m = m_stats->indoc.find(docid); + uint32_t k = m != m_stats->indoc.end() ? m->second : 0 ; + + // always consider candidates from dominating documents and + // from documents that have not been considered at all yet + bool ret = (p > .5 || k == 0); + + if (ret && !log) return 1; + + uint32_t N = m_stats->good; // number of trials + float d = cdf(complement(binomial(N, p), k)); + // d: probability that samples contains k or more instances from doc #docid + ret = ret || d >= .05; #if 0 - cerr << p.sid << " " << docid << " " - << (good ? "OK " : "bad ") - << __FILE__ << ":" << __LINE__ << endl; + if (log) + { + Token const* t = m_root->getCorpus()->sntStart(sid)+offset; + Token const* x = t - min(offset,uint64_t(3)); + Token const* e = t + 4; + if (e > m_root->getCorpus()->sntEnd(sid)) + e = m_root->getCorpus()->sntEnd(sid); + *log << docid << ":" << sid << " " << size_t(k) << "/" << N + << " @" << p << " => " << d << " ["; + std::map<uint32_t, uint32_t>::const_iterator m; + for (m = m_stats->indoc.begin(); m != m_stats->indoc.end(); ++m) + { + if (m != m_stats->indoc.begin()) *log << " "; + *log << m->first << ":" << m->second; + } + *log << "] "; + for (; x < e; ++x) *log << (*m_bitext->V1)[x->id()] << " "; + if (!ret) *log << "SKIP"; + else if (p < .5 && d > .9) *log << "FORCE"; + *log << std::endl; + } #endif - if (!good) - { // no good, probably because phrase is not coherent - m_stats->count_sample(docid, 0, rec.po_fwd, rec.po_bwd); - return false; - } + return (ret ? (p < .5 && d > .9) ? 2 : 1 : 0); +} + +template<typename Token> +bool +BitextSampler<Token>:: +flip_coin(ugdiss::id_type & sid, ushort & offset) +{ + int no_maybe_yes = m_bias ? check_sample_distribution(sid, offset) : 1; + if (no_maybe_yes == 0) return false; // no + if (no_maybe_yes > 1) return true; // yes + // ... maybe: flip a coin + size_t options_chosen = m_stats->good; + size_t options_total = max(m_stats->raw_cnt, m_ctr); + size_t options_left = (options_total - m_ctr); + size_t random_number = options_left * (m_rnd()/(m_rnd.max()+1.)); + size_t threshold; + if (m_bias_total) // we have a bias and there are candidates with non-zero prob + threshold = ((*m_bias)[sid]/m_bias_total * options_total * m_samples); + else // no bias, or all have prob 0 (can happen with a very opinionated bias) + threshold = m_samples; + return random_number + options_chosen < threshold; +} + + + + +template<typename Token> +BitextSampler<Token>:: +BitextSampler(Bitext<Token> const* const bitext, + typename bitext::iter const& phrase, + sptr<SamplingBias const> const& bias, size_t const max_samples, + sampling_method const method) + : m_bitext(bitext) + , m_plen(phrase.size()) + , m_fwd(phrase.root == bitext->I1.get()) + , m_root(m_fwd ? bitext->I1 : bitext->I2) + , m_next(phrase.lower_bound(-1)) + , m_stop(phrase.upper_bound(-1)) + , m_method(method) + , m_bias(bias) + , m_samples(max_samples) + , m_ctr(0) + , m_total_bias(0) + , m_finished(false) +{ + m_stats.reset(new pstats); + m_stats->raw_cnt = phrase.ca(); + m_stats->register_worker(); + // cerr << phrase.str(bitext->V1.get()) << " [" << HERE << "]" << endl; +} + +template<typename Token> +BitextSampler<Token>:: +BitextSampler(BitextSampler const& other) + : m_bitext(other.m_bitext) + , m_plen(other.m_plen) + , m_fwd(other.m_fwd) + , m_root(other.m_root) + , m_next(other.m_next) + , m_stop(other.m_stop) + , m_method(other.m_method) + , m_bias(other.m_bias) + , m_samples(other.m_samples) +{ + // lock both instances + boost::unique_lock<boost::mutex> mylock(m_lock); + boost::unique_lock<boost::mutex> yrlock(other.m_lock); + // actually, BitextSamplers should only copied on job submission + m_stats = other.m_stats; + m_stats->register_worker(); + m_ctr = other.m_ctr; + m_total_bias = other.m_total_bias; + m_finished = other.m_finished; +} + +// Ranked sampling sorts all samples by score and then considers the top-ranked +// candidates for phrase extraction. +template<typename Token> +size_t +BitextSampler<Token>:: +perform_ranked_sampling() +{ + if (m_next == m_stop) return m_ctr; + CandidateSorter sorter(*m_bias); + // below: nbest size = 4 * m_samples to allow for failed phrase extraction + NBestList<TokenPosition, CandidateSorter> nbest(4*m_samples,sorter); + ugdiss::tsa::ArrayEntry I(m_next); + while (I.next < m_stop) + { + ++m_ctr; + nbest.add(m_root->readEntry(I.next,I)); + } + for (size_t i = 0; m_stats->good < m_samples && i < nbest.size(); ++i) + consider_sample(nbest[i]); + return m_ctr; +} + +// Ranked sampling sorts all samples by score and then considers the top-ranked +// candidates for phrase extraction. +template<typename Token> +size_t +BitextSampler<Token>:: +perform_random_sampling() +{ + if (m_next == m_stop) return m_ctr; + m_bias_total = 0; + if (m_bias) + { + m_stats->raw_cnt = 0; + for (ugdiss::tsa::ArrayEntry I(m_next); I.next < m_stop;) + { + m_root->readEntry(I.next,I); + ++m_stats->raw_cnt; + m_bias_total += (*m_bias)[I.sid]; + } + } + + ugdiss::tsa::ArrayEntry I(m_next); + while (m_stats->good < m_samples && I.next < m_stop) + { + ++m_ctr; + m_root->readEntry(I.next,I); + if (!flip_coin(I.sid, I.offset)) continue; + consider_sample(I); + } + return m_ctr; +} + +template<typename Token> +bool +BitextSampler<Token>:: +consider_sample(TokenPosition const& p) +{ + std::vector<unsigned char> aln; + bitvector full_aln(100*100); + PhraseExtractionRecord + rec(p.sid, p.offset, p.offset + m_plen, !m_fwd, &aln, &full_aln); + int docid = m_bias ? m_bias->GetClass(p.sid) : -1; + if (!m_bitext->find_trg_phr_bounds(rec)) + { // no good, probably because phrase is not coherent + m_stats->count_sample(docid, 0, rec.po_fwd, rec.po_bwd); + return false; + } - // all good: register this sample as valid - size_t num_pairs = (rec.s2 - rec.s1 + 1) * (rec.e2 - rec.e1 + 1); - m_stats->count_sample(docid, num_pairs, rec.po_fwd, rec.po_bwd); + // all good: register this sample as valid + size_t num_pairs = (rec.s2 - rec.s1 + 1) * (rec.e2 - rec.e1 + 1); + m_stats->count_sample(docid, num_pairs, rec.po_fwd, rec.po_bwd); - float sample_weight = 1./num_pairs; - Token const* o = (m_fwd ? m_bitext->T2 : m_bitext->T1)->sntStart(rec.sid); + float sample_weight = 1./num_pairs; + Token const* o = (m_fwd ? m_bitext->T2 : m_bitext->T1)->sntStart(rec.sid); - // adjust offsets in phrase-internal aligment - for (size_t k = 1; k < aln.size(); k += 2) aln[k] += rec.s2 - rec.s1; + // adjust offsets in phrase-internal aligment + for (size_t k = 1; k < aln.size(); k += 2) + aln[k] += rec.s2 - rec.s1; - vector<uint64_t> seen; seen.reserve(10); - // It is possible that the phrase extraction extracts the same - // phrase twice, e.g., when word a co-occurs with sequence b b b - // but is aligned only to the middle word. We can only count - // each phrase pair once per source phrase occurrence, or else - // run the risk of having more joint counts than marginal - // counts. + vector<uint64_t> seen; seen.reserve(10); + // It is possible that the phrase extraction extracts the same + // phrase twice, e.g., when word a co-occurs with sequence b b b but + // is aligned only to the middle word. We can only count each phrase + // pair once per source phrase occurrence, or else run the risk of + // having more joint counts than marginal counts. - for (size_t s = rec.s1; s <= rec.s2; ++s) - { - TSA<Token> const& I = m_fwd ? *m_bitext->I2 : *m_bitext->I1; - sptr<tsa_iter> b = I.find(o + s, rec.e1 - s); - UTIL_THROW_IF2(!b || b->size() < rec.e1 - s, "target phrase not found"); + for (size_t s = rec.s1; s <= rec.s2; ++s) + { + TSA<Token> const& I = m_fwd ? *m_bitext->I2 : *m_bitext->I1; + sptr<tsa_iter> b = I.find(o + s, rec.e1 - s); + UTIL_THROW_IF2(!b || b->size() < rec.e1 - s, "target phrase not found"); - for (size_t i = rec.e1; i <= rec.e2; ++i) - { - uint64_t tpid = b->getPid(); - - // poor man's protection against over-counting - size_t s = 0; - while (s < seen.size() && seen[s] != tpid) ++s; - if (s < seen.size()) continue; - seen.push_back(tpid); - - size_t raw2 = b->approxOccurrenceCount(); - m_stats->add(tpid, sample_weight, m_bias ? (*m_bias)[p.sid] : 1, - aln, raw2, rec.po_fwd, rec.po_bwd, docid); - bool ok = (i == rec.e2) || b->extend(o[i].id()); - UTIL_THROW_IF2(!ok, "Could not extend target phrase."); - } - if (s < rec.s2) // shift phrase-internal alignments - for (size_t k = 1; k < aln.size(); k += 2) - --aln[k]; - } - return true; - } + for (size_t i = rec.e1; i <= rec.e2; ++i) + { + uint64_t tpid = b->getPid(); + if (find(seen.begin(), seen.end(), tpid) != seen.end()) + continue; // don't over-count + seen.push_back(tpid); + size_t raw2 = b->approxOccurrenceCount(); + m_stats->add(tpid, sample_weight, m_bias ? (*m_bias)[p.sid] : 1, + aln, raw2, rec.po_fwd, rec.po_bwd, docid); + bool ok = (i == rec.e2) || b->extend(o[i].id()); + UTIL_THROW_IF2(!ok, "Could not extend target phrase."); + } + if (s < rec.s2) // shift phrase-internal alignments + for (size_t k = 1; k < aln.size(); k += 2) + --aln[k]; + } + return true; +} - template<typename Token> - bool - BitextSampler<Token>:: - operator()() - { - if (m_finished) return true; - boost::unique_lock<boost::mutex> lock(m_lock); +template<typename Token> +bool +BitextSampler<Token>:: +operator()() +{ + if (m_finished) return true; + boost::unique_lock<boost::mutex> lock(m_lock); + if (m_method == ranked_sampling) perform_ranked_sampling(); - m_finished = true; - m_ready.notify_all(); - return true; - } + else if (m_method == random_sampling) + perform_random_sampling(); + else UTIL_THROW2("Unsupported sampling method."); + m_finished = true; + m_ready.notify_all(); + return true; +} - template<typename Token> - bool - BitextSampler<Token>:: - done() const - { - return m_next == m_stop; - } +template<typename Token> +bool +BitextSampler<Token>:: +done() const +{ + return m_next == m_stop; +} - template<typename Token> - sptr<pstats> - BitextSampler<Token>:: - stats() - { - // if (m_ctr == 0) (*this)(); - // boost::unique_lock<boost::mutex> lock(m_lock); - // while (!m_finished) - // m_ready.wait(lock); - return m_stats; - } +template<typename Token> +sptr<pstats> +BitextSampler<Token>:: +stats() +{ + // if (m_ctr == 0) (*this)(); + // boost::unique_lock<boost::mutex> lock(m_lock); + // while (!m_finished) + // m_ready.wait(lock); + return m_stats; +} - template<typename Token> - BitextSampler<Token>:: - ~BitextSampler() - { - m_stats->release(); - } +template<typename Token> +BitextSampler<Token>:: +~BitextSampler() +{ + m_stats->release(); +} } // end of namespace bitext } // end of namespace Moses diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 3fa76f4d8..651621f96 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -833,17 +833,19 @@ namespace Moses } if (!context->cache1) context->cache1.reset(new pstats::cache_t); if (!context->cache2) context->cache2.reset(new pstats::cache_t); - } else if (!ttask->GetContextWeights().empty()) { - if (m_bias_log) - { - *m_bias_log << HERE << endl - << "BIAS FROM MAP LOOKUP" << endl; - context->bias_log = m_bias_log; - } - context->bias - = btfix->SetupDocumentBias(ttask->GetContextWeights(), m_bias_log); - context->bias->loglevel = m_bias_loglevel; - context->bias->log = m_bias_log; + } + else if (!ttask->GetContextWeights().empty()) + { + if (m_bias_log) + { + *m_bias_log << HERE << endl + << "BIAS FROM MAP LOOKUP" << endl; + context->bias_log = m_bias_log; + } + context->bias + = btfix->SetupDocumentBias(ttask->GetContextWeights(), m_bias_log); + context->bias->loglevel = m_bias_loglevel; + context->bias->log = m_bias_log; if (!context->cache1) context->cache1.reset(new pstats::cache_t); if (!context->cache2) context->cache2.reset(new pstats::cache_t); } @@ -897,13 +899,15 @@ namespace Moses Mmsapt:: InitializeForInput(ttasksptr const& ttask) { - set_bias_for_ranking(ttask, this->btfix); - // to do: depending on method, set bias for ranking, via consulting the bias - // server, or none at al. - sptr<ContextScope> const& scope = ttask->GetScope(); sptr<ContextForQuery> context = scope->get<ContextForQuery>(btfix.get(), true); + // set sampling bias, depending on sampling method specified + if (m_sampling_method == ranked_sampling) + set_bias_for_ranking(ttask, this->btfix); + else if (m_sampling_method == random_sampling) + set_bias_via_server(ttask); + boost::unique_lock<boost::shared_mutex> mylock(m_lock); sptr<TPCollCache> localcache = scope->get<TPCollCache>(cache_key); if (!localcache) |