Added random sampling to BitextSampler.ranked-sampling-v0.1.0

author: Ulrich Germann <ugermann@inf.ed.ac.uk> 2015-07-05 15:08:57 +0300
committer: Ulrich Germann <ugermann@inf.ed.ac.uk> 2015-07-05 15:08:57 +0300
commit: 4dd2ea3117bdc156cde45d9d126e07a245c6381d (patch)
tree: 82142945e1ba423ff6eb1165485bc9ddcc5d223a
parent: e1f31666c3c9b37ff299c7d12e3be1c1cd151f07 (diff)
2 files changed, 356 insertions, 235 deletions
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h
index 9333dd879..d3845415d 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h
@@ -1,8 +1,14 @@
 // -*- mode: c++; tab-width: 2; indent-tabs-mode: nil -*-
 #pragma once
+
+#include <algorithm>
+
+#include <boost/random.hpp>
 #include <boost/thread.hpp>
 #include <boost/thread/locks.hpp>
 #include <boost/intrusive_ptr.hpp>
+#include <boost/math/distributions/binomial.hpp>
+
 #include "ug_bitext.h"
 #include "ug_bitext_pstats.h"
 #include "ug_sampling_bias.h"
@@ -16,246 +22,357 @@ namespace Moses
 namespace bitext 
 {
   
-  enum sampling_method { full_coverage, random_sampling, ranked_sampling };
+enum sampling_method { full_coverage, random_sampling, ranked_sampling };
 
-  typedef ugdiss::ttrack::Position TokenPosition;
-  class CandidateSorter
-  {
-    SamplingBias const& score;
-  public:
-    CandidateSorter(SamplingBias const& s) : score(s) {}
-    bool operator()(TokenPosition const& a, TokenPosition const& b) const
-    { return score[a.sid] > score[b.sid]; }
-  };
+typedef ugdiss::ttrack::Position TokenPosition;
+class CandidateSorter
+{
+  SamplingBias const& score;
+public:
+  CandidateSorter(SamplingBias const& s) : score(s) {}
+  bool operator()(TokenPosition const& a, TokenPosition const& b) const
+  { return score[a.sid] > score[b.sid]; }
+};
   
-  template<typename Token>
-  class
-  BitextSampler : public reference_counter
-  {
-    typedef Bitext<Token> bitext;
-    typedef TSA<Token>       tsa;
-    typedef SamplingBias    bias;
-    typedef typename Bitext<Token>::iter tsa_iter;
-    mutable boost::condition_variable   m_ready; 
-    mutable boost::mutex                 m_lock; 
-    // const members
-    // sptr<bitext const> const   m_bitext; // keep bitext alive while I am 
-    // should be an 
-    iptr<bitext const> const   m_bitext; // keep bitext alive as long as I am 
-    size_t             const     m_plen; // length of lookup phrase
-    bool               const      m_fwd; // forward or backward direction?
-    sptr<tsa const>    const     m_root; // root of suffix array
-    char               const*    m_next; // current position
-    char               const*    m_stop; // end of search range
-    sampling_method    const   m_method; /* look at all / random sample / 
-					      * ranked samples */
-    sptr<bias const>   const     m_bias; // bias over candidates
-    size_t             const  m_samples; // how many samples at most 
-    // non-const members
-    sptr<pstats>                m_stats; // destination for phrase stats
-    size_t                        m_ctr; // number of samples considered
-    float                  m_total_bias; // for random sampling with bias
-    bool                     m_finished;
-    bool   consider_sample(TokenPosition const& p);
-    size_t perform_ranked_sampling();
+template<typename Token>
+class
+BitextSampler : public reference_counter
+{
+  typedef Bitext<Token> bitext;
+  typedef TSA<Token>       tsa;
+  typedef SamplingBias    bias;
+  typedef typename Bitext<Token>::iter tsa_iter;
+  mutable boost::condition_variable   m_ready; 
+  mutable boost::mutex                 m_lock; 
+  // const members
+  // sptr<bitext const> const   m_bitext; // keep bitext alive while I am 
+  // should be an 
+  iptr<bitext const> const   m_bitext; // keep bitext alive as long as I am 
+  size_t             const     m_plen; // length of lookup phrase
+  bool               const      m_fwd; // forward or backward direction?
+  sptr<tsa const>    const     m_root; // root of suffix array
+  char               const*    m_next; // current position
+  char               const*    m_stop; // end of search range
+  sampling_method    const   m_method; // look at all/random/ranked samples 
+  sptr<bias const>   const     m_bias; // bias over candidates
+  size_t             const  m_samples; // how many samples at most 
+  // non-const members
+  sptr<pstats>                m_stats; // destination for phrase stats
+  size_t                        m_ctr; // number of samples considered
+  float                  m_total_bias; // for random sampling with bias
+  bool                     m_finished;
+
+  boost::taus88 m_rnd;  // every job has its own pseudo random generator
+  double m_rnddenom;    // denominator for scaling random sampling
+  double m_bias_total;
+
+  bool   consider_sample(TokenPosition const& p);
+  size_t perform_ranked_sampling();
+  size_t perform_random_sampling();
+
+  int check_sample_distribution(uint64_t const& sid, uint64_t const& offset);
+  bool flip_coin(ugdiss::id_type & sid, ushort & offset);
     
-  public:
-    BitextSampler(BitextSampler const& other);
-    BitextSampler const& operator=(BitextSampler const& other);
-    BitextSampler(bitext const*  const bitext, typename bitext::iter const& phrase,
-		  sptr<SamplingBias const> const& bias, size_t const max_samples,
-		  sampling_method const method); 
-    ~BitextSampler();
-    bool operator()(); // run sampling
-    sptr<pstats> stats();
-    bool done() const;
-  };
-  
-  template<typename Token>
-  BitextSampler<Token>::
-  BitextSampler(Bitext<Token> const* const bitext, 
-		typename bitext::iter const& phrase,
-		sptr<SamplingBias const> const& bias, size_t const max_samples,
-		sampling_method const method)
-    : m_bitext(bitext)
-    , m_plen(phrase.size())
-    , m_fwd(phrase.root == bitext->I1.get())
-    , m_root(m_fwd ? bitext->I1 : bitext->I2)
-    , m_next(phrase.lower_bound(-1))
-    , m_stop(phrase.upper_bound(-1))
-    , m_method(method)
-    , m_bias(bias)
-    , m_samples(max_samples)
-    , m_ctr(0)
-    , m_total_bias(0)
-    , m_finished(false)
-  {
-    m_stats.reset(new pstats);
-    m_stats->raw_cnt = phrase.ca();
-    m_stats->register_worker();
-    // cerr << phrase.str(bitext->V1.get()) << " [" << HERE << "]" << endl;
-  }
-  
-  template<typename Token>
-  BitextSampler<Token>::
-  BitextSampler(BitextSampler const& other)
-    : m_bitext(other.m_bitext)
-    , m_plen(other.m_plen)
-    , m_fwd(other.m_fwd)
-    , m_root(other.m_root)
-    , m_next(other.m_next)
-    , m_stop(other.m_stop)
-    , m_method(other.m_method)
-    , m_bias(other.m_bias)
-    , m_samples(other.m_samples)
-  {
-    // lock both instances
-    boost::unique_lock<boost::mutex> mylock(m_lock);
-    boost::unique_lock<boost::mutex> yrlock(other.m_lock);
-    // actually, BitextSamplers should only copied on job submission
-    m_stats = other.m_stats;
-    m_stats->register_worker();
-    m_ctr = other.m_ctr; 
-    m_total_bias = other.m_total_bias;
-    m_finished = other.m_finished;
-  }
+public:
+  BitextSampler(BitextSampler const& other);
+  BitextSampler const& operator=(BitextSampler const& other);
+  BitextSampler(bitext const*  const bitext, typename bitext::iter const& phrase,
+                sptr<SamplingBias const> const& bias, size_t const max_samples,
+                sampling_method const method); 
+  ~BitextSampler();
+  bool operator()(); // run sampling
+  sptr<pstats> stats();
+  bool done() const;
+};
 
-  // Ranked sampling sorts all samples by score and then considers the top-ranked 
-  // candidates for phrase extraction.
-  template<typename Token>
-  size_t
-  BitextSampler<Token>::
-  perform_ranked_sampling()
-  {
-    if (m_next == m_stop) return m_ctr;
-    CandidateSorter sorter(*m_bias);
-    // below: nbest size = 20 * m_samples to allow for failed phrase extraction
-    NBestList<TokenPosition, CandidateSorter> nbest(20*m_samples,sorter);
-    ugdiss::tsa::ArrayEntry I(m_next);
-    while (I.next < m_stop)
-      {
-        ++m_ctr;
-        nbest.add(m_root->readEntry(I.next,I));
-      }
-    size_t n = 0;
-    for (size_t i = 0; n < m_samples && i < nbest.size(); ++i)
-      if (consider_sample(nbest[i])) ++n;;
-    // cerr << m_ctr << " samples considered at " 
-    //      << __FILE__ << ":" << __LINE__ << endl;
-    return m_ctr;
-  }
-  
-  template<typename Token>
-  bool
-  BitextSampler<Token>::
-  consider_sample(TokenPosition const& p)
-  {
-    std::vector<unsigned char>  aln; 
-    bitvector full_aln(100*100);
-    PhraseExtractionRecord rec(p.sid, p.offset, p.offset + m_plen, 
-                               !m_fwd, &aln, &full_aln);
-    int docid  = m_bias ? m_bias->GetClass(p.sid) : -1;
+template<typename Token>
+int 
+BitextSampler<Token>::
+check_sample_distribution(uint64_t const& sid, uint64_t const& offset)
+{ // ensure that the sampled distribution approximately matches the bias
+  // @return 0: SKIP this occurrence
+  // @return 1: consider this occurrence for sampling
+  // @return 2: include this occurrence in the sample by all means
+
+  typedef boost::math::binomial_distribution<> binomial;
+
+  // std::ostream* log = m_bias->loglevel > 1 ? m_bias->log : NULL;
+  std::ostream* log = NULL;
+
+  if (!m_bias) return 1;
 
-    bool good = m_bitext->find_trg_phr_bounds(rec);
+  float p = (*m_bias)[sid];
+  id_type docid = m_bias->GetClass(sid);
+ 
+  std::map<uint32_t,uint32_t>::const_iterator m = m_stats->indoc.find(docid);
+  uint32_t k = m != m_stats->indoc.end() ? m->second : 0 ;
+
+  // always consider candidates from dominating documents and
+  // from documents that have not been considered at all yet
+  bool ret =  (p > .5 || k == 0);
+
+  if (ret && !log) return 1;
+
+  uint32_t N = m_stats->good; // number of trials
+  float d = cdf(complement(binomial(N, p), k));
+  // d: probability that samples contains k or more instances from doc #docid
+  ret = ret || d >= .05;
 
 #if 0
-    cerr << p.sid << " " << docid << " " 
-         << (good ? "OK " : "bad ") 
-         << __FILE__ << ":" << __LINE__ << endl;
+  if (log)
+    {
+      Token const* t = m_root->getCorpus()->sntStart(sid)+offset;
+      Token const* x = t - min(offset,uint64_t(3));
+      Token const* e = t + 4;
+      if (e > m_root->getCorpus()->sntEnd(sid))
+        e = m_root->getCorpus()->sntEnd(sid);
+      *log << docid << ":" << sid << " " << size_t(k) << "/" << N
+           << " @" << p << " => " << d << " [";
+      std::map<uint32_t, uint32_t>::const_iterator m;
+      for (m = m_stats->indoc.begin(); m != m_stats->indoc.end(); ++m)
+        {
+          if (m != m_stats->indoc.begin()) *log << " ";
+          *log << m->first << ":" << m->second;
+        }
+      *log << "] ";
+      for (; x < e; ++x) *log << (*m_bitext->V1)[x->id()] << " ";
+      if (!ret) *log << "SKIP";
+      else if (p < .5 && d > .9) *log << "FORCE";
+      *log << std::endl;
+    }
 #endif
-    if (!good)
-      { // no good, probably because phrase is not coherent
-        m_stats->count_sample(docid, 0, rec.po_fwd, rec.po_bwd);
-        return false;
-      }
+  return (ret ? (p < .5 && d > .9) ? 2 : 1 : 0);
+}
+
+template<typename Token>
+bool 
+BitextSampler<Token>::
+flip_coin(ugdiss::id_type & sid, ushort & offset)
+{
+  int no_maybe_yes = m_bias ? check_sample_distribution(sid, offset) : 1;
+  if (no_maybe_yes == 0) return false; // no
+  if (no_maybe_yes > 1)  return true;  // yes
+  // ... maybe: flip a coin
+  size_t options_chosen = m_stats->good;
+  size_t options_total  = max(m_stats->raw_cnt, m_ctr);
+  size_t options_left   = (options_total - m_ctr);
+  size_t random_number  = options_left * (m_rnd()/(m_rnd.max()+1.));
+  size_t threshold;
+  if (m_bias_total) // we have a bias and there are candidates with non-zero prob
+    threshold = ((*m_bias)[sid]/m_bias_total * options_total * m_samples);
+  else // no bias, or all have prob 0 (can happen with a very opinionated bias)
+    threshold = m_samples;
+  return random_number + options_chosen < threshold;
+}
+
+
+
+
+template<typename Token>
+BitextSampler<Token>::
+BitextSampler(Bitext<Token> const* const bitext, 
+              typename bitext::iter const& phrase,
+              sptr<SamplingBias const> const& bias, size_t const max_samples,
+              sampling_method const method)
+  : m_bitext(bitext)
+  , m_plen(phrase.size())
+  , m_fwd(phrase.root == bitext->I1.get())
+  , m_root(m_fwd ? bitext->I1 : bitext->I2)
+  , m_next(phrase.lower_bound(-1))
+  , m_stop(phrase.upper_bound(-1))
+  , m_method(method)
+  , m_bias(bias)
+  , m_samples(max_samples)
+  , m_ctr(0)
+  , m_total_bias(0)
+  , m_finished(false)
+{
+  m_stats.reset(new pstats);
+  m_stats->raw_cnt = phrase.ca();
+  m_stats->register_worker();
+  // cerr << phrase.str(bitext->V1.get()) << " [" << HERE << "]" << endl;
+}
+  
+template<typename Token>
+BitextSampler<Token>::
+BitextSampler(BitextSampler const& other)
+  : m_bitext(other.m_bitext)
+  , m_plen(other.m_plen)
+  , m_fwd(other.m_fwd)
+  , m_root(other.m_root)
+  , m_next(other.m_next)
+  , m_stop(other.m_stop)
+  , m_method(other.m_method)
+  , m_bias(other.m_bias)
+  , m_samples(other.m_samples)
+{
+  // lock both instances
+  boost::unique_lock<boost::mutex> mylock(m_lock);
+  boost::unique_lock<boost::mutex> yrlock(other.m_lock);
+  // actually, BitextSamplers should only copied on job submission
+  m_stats = other.m_stats;
+  m_stats->register_worker();
+  m_ctr = other.m_ctr; 
+  m_total_bias = other.m_total_bias;
+  m_finished = other.m_finished;
+}
+
+// Ranked sampling sorts all samples by score and then considers the top-ranked 
+// candidates for phrase extraction.
+template<typename Token>
+size_t
+BitextSampler<Token>::
+perform_ranked_sampling()
+{
+  if (m_next == m_stop) return m_ctr;
+  CandidateSorter sorter(*m_bias);
+  // below: nbest size = 4 * m_samples to allow for failed phrase extraction
+  NBestList<TokenPosition, CandidateSorter> nbest(4*m_samples,sorter);
+  ugdiss::tsa::ArrayEntry I(m_next);
+  while (I.next < m_stop)
+    {
+      ++m_ctr;
+      nbest.add(m_root->readEntry(I.next,I));
+    }
+  for (size_t i = 0; m_stats->good < m_samples && i < nbest.size(); ++i)
+    consider_sample(nbest[i]);
+  return m_ctr;
+}
+  
+// Ranked sampling sorts all samples by score and then considers the top-ranked 
+// candidates for phrase extraction.
+template<typename Token>
+size_t
+BitextSampler<Token>::
+perform_random_sampling()
+{
+  if (m_next == m_stop) return m_ctr;
+  m_bias_total = 0;
+  if (m_bias)
+    {
+      m_stats->raw_cnt = 0;
+      for (ugdiss::tsa::ArrayEntry I(m_next); I.next < m_stop;)
+        {
+          m_root->readEntry(I.next,I);
+          ++m_stats->raw_cnt;
+          m_bias_total += (*m_bias)[I.sid];
+        }
+    }
+      
+  ugdiss::tsa::ArrayEntry I(m_next);
+  while (m_stats->good < m_samples && I.next < m_stop)
+    {
+      ++m_ctr;
+      m_root->readEntry(I.next,I);
+      if (!flip_coin(I.sid, I.offset)) continue;
+      consider_sample(I);
+    }
+  return m_ctr;
+}
+
+template<typename Token>
+bool
+BitextSampler<Token>::
+consider_sample(TokenPosition const& p)
+{
+  std::vector<unsigned char> aln; 
+  bitvector full_aln(100*100);
+  PhraseExtractionRecord 
+    rec(p.sid, p.offset, p.offset + m_plen, !m_fwd, &aln, &full_aln);
+  int docid = m_bias ? m_bias->GetClass(p.sid) : -1;
+  if (!m_bitext->find_trg_phr_bounds(rec))
+    { // no good, probably because phrase is not coherent
+      m_stats->count_sample(docid, 0, rec.po_fwd, rec.po_bwd);
+      return false;
+    }
     
-    // all good: register this sample as valid
-    size_t num_pairs = (rec.s2 - rec.s1 + 1) * (rec.e2 - rec.e1 + 1);
-    m_stats->count_sample(docid, num_pairs, rec.po_fwd, rec.po_bwd);
+  // all good: register this sample as valid
+  size_t num_pairs = (rec.s2 - rec.s1 + 1) * (rec.e2 - rec.e1 + 1);
+  m_stats->count_sample(docid, num_pairs, rec.po_fwd, rec.po_bwd);
     
-    float sample_weight = 1./num_pairs;
-    Token const* o = (m_fwd ? m_bitext->T2 : m_bitext->T1)->sntStart(rec.sid);
+  float sample_weight = 1./num_pairs;
+  Token const* o = (m_fwd ? m_bitext->T2 : m_bitext->T1)->sntStart(rec.sid);
     
-    // adjust offsets in phrase-internal aligment
-    for (size_t k = 1; k < aln.size(); k += 2) aln[k] += rec.s2 - rec.s1;
+  // adjust offsets in phrase-internal aligment
+  for (size_t k = 1; k < aln.size(); k += 2) 
+    aln[k] += rec.s2 - rec.s1;
     
-    vector<uint64_t> seen; seen.reserve(10);
-    // It is possible that the phrase extraction extracts the same
-    // phrase twice, e.g., when word a co-occurs with sequence b b b
-    // but is aligned only to the middle word. We can only count
-    // each phrase pair once per source phrase occurrence, or else
-    // run the risk of having more joint counts than marginal
-    // counts.
+  vector<uint64_t> seen; seen.reserve(10);
+  // It is possible that the phrase extraction extracts the same
+  // phrase twice, e.g., when word a co-occurs with sequence b b b but
+  // is aligned only to the middle word. We can only count each phrase
+  // pair once per source phrase occurrence, or else run the risk of
+  // having more joint counts than marginal counts.
     
-    for (size_t s = rec.s1; s <= rec.s2; ++s)
-      {
-        TSA<Token> const& I = m_fwd ? *m_bitext->I2 : *m_bitext->I1;
-        sptr<tsa_iter> b = I.find(o + s, rec.e1 - s);
-        UTIL_THROW_IF2(!b || b->size() < rec.e1 - s, "target phrase not found");
+  for (size_t s = rec.s1; s <= rec.s2; ++s)
+    {
+      TSA<Token> const& I = m_fwd ? *m_bitext->I2 : *m_bitext->I1;
+      sptr<tsa_iter> b = I.find(o + s, rec.e1 - s);
+      UTIL_THROW_IF2(!b || b->size() < rec.e1 - s, "target phrase not found");
 	
-        for (size_t i = rec.e1; i <= rec.e2; ++i)
-          {
-            uint64_t tpid = b->getPid();
-            
-            // poor man's protection against over-counting
-            size_t s = 0;
-            while (s < seen.size() && seen[s] != tpid) ++s;
-            if (s < seen.size()) continue;
-            seen.push_back(tpid);
-            
-            size_t raw2 = b->approxOccurrenceCount();
-            m_stats->add(tpid, sample_weight, m_bias ? (*m_bias)[p.sid] : 1, 
-                         aln, raw2, rec.po_fwd, rec.po_bwd, docid);
-            bool ok = (i == rec.e2) || b->extend(o[i].id());
-            UTIL_THROW_IF2(!ok, "Could not extend target phrase.");
-          }
-        if (s < rec.s2) // shift phrase-internal alignments
-          for (size_t k = 1; k < aln.size(); k += 2)
-            --aln[k];
-      }
-    return true;
-  }
+      for (size_t i = rec.e1; i <= rec.e2; ++i)
+        {
+          uint64_t tpid = b->getPid();
+          if (find(seen.begin(), seen.end(), tpid) != seen.end()) 
+            continue; // don't over-count
+          seen.push_back(tpid);
+          size_t raw2 = b->approxOccurrenceCount();
+          m_stats->add(tpid, sample_weight, m_bias ? (*m_bias)[p.sid] : 1, 
+                       aln, raw2, rec.po_fwd, rec.po_bwd, docid);
+          bool ok = (i == rec.e2) || b->extend(o[i].id());
+          UTIL_THROW_IF2(!ok, "Could not extend target phrase.");
+        }
+      if (s < rec.s2) // shift phrase-internal alignments
+        for (size_t k = 1; k < aln.size(); k += 2)
+          --aln[k];
+    }
+  return true;
+}
   
-  template<typename Token>
-  bool
-  BitextSampler<Token>::
-  operator()()
-  {
-    if (m_finished) return true;
-    boost::unique_lock<boost::mutex> lock(m_lock);
+template<typename Token>
+bool
+BitextSampler<Token>::
+operator()()
+{
+  if (m_finished) return true;
+  boost::unique_lock<boost::mutex> lock(m_lock);
+  if (m_method == ranked_sampling)
     perform_ranked_sampling(); 
-    m_finished = true;
-    m_ready.notify_all();
-    return true;
-  }
+  else if (m_method == random_sampling)
+    perform_random_sampling();
+  else UTIL_THROW2("Unsupported sampling method.");
+  m_finished = true;
+  m_ready.notify_all();
+  return true;
+}
 
   
-  template<typename Token>
-  bool
-  BitextSampler<Token>::
-  done() const 
-  {
-    return m_next == m_stop;
-  }
+template<typename Token>
+bool
+BitextSampler<Token>::
+done() const 
+{
+  return m_next == m_stop;
+}
 
-  template<typename Token>
-  sptr<pstats> 
-  BitextSampler<Token>::
-  stats() 
-  {
-    // if (m_ctr == 0) (*this)();
-    // boost::unique_lock<boost::mutex> lock(m_lock);
-    // while (!m_finished)
-    // m_ready.wait(lock);
-    return m_stats;
-  }
+template<typename Token>
+sptr<pstats> 
+BitextSampler<Token>::
+stats() 
+{
+  // if (m_ctr == 0) (*this)();
+  // boost::unique_lock<boost::mutex> lock(m_lock);
+  // while (!m_finished)
+  // m_ready.wait(lock);
+  return m_stats;
+}
 
-  template<typename Token>
-  BitextSampler<Token>::
-  ~BitextSampler()
-  { 
-    m_stats->release();
-  }
+template<typename Token>
+BitextSampler<Token>::
+~BitextSampler()
+{ 
+  m_stats->release();
+}
 
 } // end of namespace bitext
 } // end of namespace Moses
diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp
index 3fa76f4d8..651621f96 100644
--- a/moses/TranslationModel/UG/mmsapt.cpp
+++ b/moses/TranslationModel/UG/mmsapt.cpp
@@ -833,17 +833,19 @@ namespace Moses
 	  }
 	if (!context->cache1) context->cache1.reset(new pstats::cache_t);
 	if (!context->cache2) context->cache2.reset(new pstats::cache_t);
-      } else if (!ttask->GetContextWeights().empty()) {
-          if (m_bias_log)
-            {
-              *m_bias_log << HERE << endl
-                          << "BIAS FROM MAP LOOKUP" << endl;
-              context->bias_log = m_bias_log;
-            }
-          context->bias
-            = btfix->SetupDocumentBias(ttask->GetContextWeights(), m_bias_log);
-          context->bias->loglevel = m_bias_loglevel;
-          context->bias->log = m_bias_log;
+      } 
+    else if (!ttask->GetContextWeights().empty()) 
+      {
+	if (m_bias_log)
+	  {
+	    *m_bias_log << HERE << endl
+			<< "BIAS FROM MAP LOOKUP" << endl;
+	    context->bias_log = m_bias_log;
+	  }
+	context->bias
+	  = btfix->SetupDocumentBias(ttask->GetContextWeights(), m_bias_log);
+	context->bias->loglevel = m_bias_loglevel;
+	context->bias->log = m_bias_log;
         if (!context->cache1) context->cache1.reset(new pstats::cache_t);
         if (!context->cache2) context->cache2.reset(new pstats::cache_t);
       }
@@ -897,13 +899,15 @@ namespace Moses
   Mmsapt::
   InitializeForInput(ttasksptr const& ttask)
   {
-    set_bias_for_ranking(ttask, this->btfix);
-    // to do: depending on method, set bias for ranking, via consulting the bias 
-    // server, or none at al.
-
     sptr<ContextScope> const& scope = ttask->GetScope();
     sptr<ContextForQuery> context = scope->get<ContextForQuery>(btfix.get(), true);
 
+    // set sampling bias, depending on sampling method specified
+    if (m_sampling_method == ranked_sampling)
+      set_bias_for_ranking(ttask, this->btfix);
+    else if (m_sampling_method == random_sampling)
+      set_bias_via_server(ttask);
+
     boost::unique_lock<boost::shared_mutex> mylock(m_lock);
     sptr<TPCollCache> localcache = scope->get<TPCollCache>(cache_key);
     if (!localcache)
author	Ulrich Germann <ugermann@inf.ed.ac.uk>	2015-07-05 15:08:57 +0300
committer	Ulrich Germann <ugermann@inf.ed.ac.uk>	2015-07-05 15:08:57 +0300
commit	4dd2ea3117bdc156cde45d9d126e07a245c6381d (patch)
tree	82142945e1ba423ff6eb1165485bc9ddcc5d223a
parent	e1f31666c3c9b37ff299c7d12e3be1c1cd151f07 (diff)