Distance feature for mmsapt

- Make ttask visible to scorers
- Only track sentence ids if using distance feature
author: Michael Denkowski <mdenkows@amazon.com> 2016-06-09 22:53:46 +0300
committer: Michael Denkowski <mdenkows@amazon.com> 2016-08-12 13:05:12 +0300
commit: 5c2b8d843c273ac27462e9522f9f67cdaa1f2959 (patch)
tree: 73595b5bc6a74d8e33b749f4b9797434ded5a792
parent: a407452d3993b88d725a7838f7c76522ac11a7c8 (diff)
28 files changed, 271 insertions, 55 deletions
diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project
index 222f19365..53d0c1eb5 100644
--- a/contrib/other-builds/moses/.project
+++ b/contrib/other-builds/moses/.project
@@ -3654,6 +3654,16 @@
 			<name>TranslationModel/UG/sapt_pscore_coherence.h</name>
 			<type>1</type>
 			<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_coherence.h</locationURI>
+        </link>
+		<link>
+			<name>TranslationModel/UG/sapt_pscore_dist.h</name>
+			<type>1</type>
+			<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_dist.h</locationURI>
+		</link>
+		<link>
+			<name>TranslationModel/UG/sapt_pscore_length_ratio.h</name>
+			<type>1</type>
+			<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_length_ratio.h</locationURI>
 		</link>
 		<link>
 			<name>TranslationModel/UG/sapt_pscore_lex1.h</name>
@@ -3699,6 +3709,11 @@
 			<name>TranslationModel/UG/sapt_pscore_wordcount.h</name>
 			<type>1</type>
 			<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_wordcount.h</locationURI>
+        </link>
+		<link>
+			<name>TranslationModel/UG/sapt_pscore_cumulative_bias.h</name>
+			<type>1</type>
+			<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h</locationURI>
 		</link>
 		<link>
 			<name>TranslationModel/UG/sim-pe.cc</name>
diff --git a/contrib/other-builds/moses/moses.project b/contrib/other-builds/moses/moses.project
index 2c78adc36..709b260e8 100644
--- a/contrib/other-builds/moses/moses.project
+++ b/contrib/other-builds/moses/moses.project
@@ -124,6 +124,8 @@
       <File Name="../../../moses/TranslationModel/UG/sapt_phrase_scorers.h"/>
       <File Name="../../../moses/TranslationModel/UG/sapt_pscore_base.h"/>
       <File Name="../../../moses/TranslationModel/UG/sapt_pscore_coherence.h"/>
+      <File Name="../../../moses/TranslationModel/UG/sapt_pscore_dist.h"/>
+      <File Name="../../../moses/TranslationModel/UG/sapt_pscore_length_ratio.h"/>
       <File Name="../../../moses/TranslationModel/UG/sapt_pscore_lex1.h"/>
       <File Name="../../../moses/TranslationModel/UG/sapt_pscore_logcnt.h"/>
       <File Name="../../../moses/TranslationModel/UG/sapt_pscore_pbwd.h"/>
@@ -133,6 +135,7 @@
       <File Name="../../../moses/TranslationModel/UG/sapt_pscore_rareness.h"/>
       <File Name="../../../moses/TranslationModel/UG/sapt_pscore_unaligned.h"/>
       <File Name="../../../moses/TranslationModel/UG/sapt_pscore_wordcount.h"/>
+      <File Name="../../../moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h"/>
       <File Name="../../../moses/TranslationModel/UG/sim-pe.cc" ExcludeProjConfig="Debug"/>
       <File Name="../../../moses/TranslationModel/UG/spe-check-coverage.cc" ExcludeProjConfig="Debug"/>
       <File Name="../../../moses/TranslationModel/UG/spe-check-coverage2.cc" ExcludeProjConfig="Debug"/>
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h
index 6c3a73457..2c835af7e 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext.h
@@ -130,7 +130,6 @@ namespace sapt
     mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
     // caches for unbiased sampling; biased sampling uses the caches that
     // are stored locally on the translation task
-
   public:
     SPTR<Ttrack<char> >  Tx; // word alignments
     SPTR<Ttrack<Token> > T1; // token track
@@ -164,7 +163,8 @@ namespace sapt
 
 #ifndef NO_MOSES
     SPTR<pstats>
-    prep2(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
+    prep2(ttasksptr const& ttask, iter const& phrase, bool const track_sids,
+          int max_sample = -1) const;
 #endif 
 
   protected:
@@ -189,7 +189,7 @@ namespace sapt
     SPTR<pstats>
     lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
 
-    void prep(ttasksptr const& ttask, iter const& phrase) const;
+    void prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const;
 #endif
 
     void   setDefaultSampleSize(size_t const max_samples);
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h
index bc038bd03..8865d4cd1 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h
@@ -33,7 +33,8 @@ public:
   SPTR<pstats>
   add_job(Bitext<Token> const* const theBitext,
 	  typename TSA<Token>::tree_iterator const& phrase,
-	  size_t const max_samples, SPTR<SamplingBias const> const& bias);
+	  size_t const max_samples, SPTR<SamplingBias const> const& bias,
+    bool const track_sids);
     // add_job(Bitext<Token> const* const theBitext,
     // 	  typename TSA<Token>::tree_iterator const& phrase,
     // 	  size_t const max_samples, SamplingBias const* const bias);
@@ -93,13 +94,14 @@ SPTR<pstats> Bitext<Token>
 ::agenda
 ::add_job(Bitext<Token> const* const theBitext,
 	  typename TSA<Token>::tree_iterator const& phrase,
-	  size_t const max_samples, SPTR<SamplingBias const> const& bias)
+	  size_t const max_samples, SPTR<SamplingBias const> const& bias,
+	  bool const track_sids)
 {
   boost::unique_lock<boost::mutex> lk(this->lock);
   static boost::posix_time::time_duration nodelay(0,0,0,0);
   bool fwd = phrase.root == bt.I1.get();
   SPTR<job> j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2,
-		      max_samples, fwd, bias));
+		      max_samples, fwd, bias, track_sids));
   j->stats->register_worker();
 
   joblist.push_back(j);
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h
index 7312ecef4..2ac7a5c35 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h
@@ -35,6 +35,8 @@ public:
   SPTR<pstats>     stats; // stores statistics collected during sampling
   SPTR<SamplingBias const> const m_bias; // sentence-level bias for sampling
   float bias_total;
+  bool m_track_sids;  // track sentence ids in sample?
+
   bool nextSample(uint64_t & sid, uint64_t & offset); // select next occurrence
 
   int
@@ -46,7 +48,7 @@ public:
   job(Bitext<Token> const* const theBitext,
       typename TSA<Token>::tree_iterator const& m,
       SPTR<TSA<Token> > const& r, size_t maxsmpl, bool isfwd,
-      SPTR<SamplingBias const> const& bias);
+      SPTR<SamplingBias const> const& bias, bool const track_sids);
   ~job();
 };
 
@@ -66,7 +68,8 @@ Bitext<Token>::agenda::job
 ::job(Bitext<Token> const* const theBitext,
       typename TSA<Token>::tree_iterator const& m,
       SPTR<TSA<Token> > const& r, size_t maxsmpl,
-      bool isfwd, SPTR<SamplingBias const> const& bias)
+      bool isfwd, SPTR<SamplingBias const> const& bias,
+      bool const track_sids)
   : m_bitext(theBitext)
   , rnd(0)
   , rnddenom(rnd.max() + 1.)
@@ -80,8 +83,9 @@ Bitext<Token>::agenda::job
   , len(m.size())
   , fwd(isfwd)
   , m_bias(bias)
+  , m_track_sids(track_sids)
 {
-  stats.reset(new pstats());
+  stats.reset(new pstats(m_track_sids));
   stats->raw_cnt = m.approxOccurrenceCount();
   bias_total = 0;
 
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc
index 628d4364c..ab707cf9d 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc
+++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc
@@ -57,7 +57,8 @@ namespace sapt
   size_t
   jstats::
   add(float w, float b, std::vector<unsigned char> const& a, uint32_t const cnt2,
-      uint32_t fwd_orient, uint32_t bwd_orient, int const docid, int const sid)
+      uint32_t fwd_orient, uint32_t bwd_orient, int const docid,
+      uint32_t const sid, bool const track_sid)
   {
     boost::lock_guard<boost::mutex> lk(this->lock);
     my_cnt2 = cnt2;
@@ -77,7 +78,11 @@ namespace sapt
       }
     ++ofwd[fwd_orient];
     ++obwd[bwd_orient];
-    sids.push_back(sid);
+    // Record sentence id if requested
+    if (track_sid)
+      {
+        sids.push_back(sid);
+      }
     if (docid >= 0)
       {
         // while (int(indoc.size()) <= docid) indoc.push_back(0);
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h
index b66aee126..1068b47b9 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h
@@ -42,7 +42,8 @@ namespace sapt
 
     size_t 
     add(float w, float b, std::vector<unsigned char> const& a, uint32_t const cnt2, 
-	uint32_t fwd_orient, uint32_t bwd_orient, int const docid, int const sid);
+	uint32_t fwd_orient, uint32_t bwd_orient, int const docid, uint32_t const sid,
+	bool const track_sid);
 
     void invalidate();
     void validate();
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_moses.h b/moses/TranslationModel/UG/mm/ug_bitext_moses.h
index c04d87bfd..c024d073a 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_moses.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_moses.h
@@ -30,9 +30,9 @@ lookup(ttasksptr const& ttask, iter const& phrase, int max_sample) const
 template<typename Token>
 void
 Bitext<Token>::
-prep(ttasksptr const& ttask, iter const& phrase) const
+prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const
 {
-  prep2(ttask, phrase, m_default_sample_size);
+  prep2(ttask, phrase, track_sids, m_default_sample_size);
 }
 
 
@@ -44,7 +44,8 @@ template<typename Token>
 SPTR<pstats>
 Bitext<Token>
 ::prep2
-( ttasksptr const& ttask, iter const& phrase, int max_sample) const
+( ttasksptr const& ttask, iter const& phrase, bool const track_sids,
+  int max_sample) const
 {
   if (max_sample < 0) max_sample = m_default_sample_size;
   SPTR<SamplingBias> bias;
@@ -74,7 +75,7 @@ Bitext<Token>
       if (m_num_workers > 1)
 	ag->add_workers(m_num_workers);
     }
-  ret = ag->add_job(this, phrase, max_sample, bias);
+  ret = ag->add_job(this, phrase, max_sample, bias, track_sids);
   if (cache) cache->set(phrase.getPid(),ret);
   UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job.");
   return ret;
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc
index e603def96..f8c93fe3c 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc
+++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc
@@ -10,7 +10,7 @@ namespace sapt
 #endif
 
   pstats::
-  pstats() : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0)
+  pstats(bool const track_sids) : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0), track_sids(track_sids)
   {
     for (int i = 0; i <= LRModel::NONE; ++i)
       ofwd[i] = obwd[i] = 0;
@@ -69,11 +69,11 @@ namespace sapt
       std::vector<unsigned char> const& a,
       uint32_t const cnt2,
       uint32_t fwd_o,
-      uint32_t bwd_o, int const docid, int const sid)
+      uint32_t bwd_o, int const docid, uint32_t const sid)
   {
     boost::lock_guard<boost::mutex> guard(this->lock);
     jstats& entry = this->trg[pid];
-    size_t ret = entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid, sid);
+    size_t ret = entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid, sid, track_sids);
     if (this->good < entry.rcnt())
       {
         UTIL_THROW(util::Exception, "more joint counts than good counts:"
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h
index e4481ee52..47ec33afb 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h
@@ -35,7 +35,8 @@ namespace sapt
 
     indoc_map_t indoc;
     trg_map_t trg;
-    pstats();
+    bool track_sids;
+    pstats(bool const track_sids);
     ~pstats();
     void release();
     void register_worker();
@@ -50,7 +51,7 @@ namespace sapt
         uint32_t fwd_o,      // fwd. phrase orientation
         uint32_t bwd_o,      // bwd. phrase orientation
         int const docid,     // document where sample was found
-        int const sid);      // index of sentence where sample was found
+        uint32_t const sid); // index of sentence where sample was found
     
     void
     count_sample(int const docid,        // document where sample was found
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h
index c94b2b149..ea27f18e8 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h
@@ -70,6 +70,7 @@ BitextSampler : public Moses::reference_counter
   size_t m_num_occurrences; // estimated number of phrase occurrences in corpus
   boost::taus88 m_rnd;  // every job has its own pseudo random generator
   double m_bias_total;
+  bool m_track_sids; // track sentence ids in stats?
 
   size_t consider_sample(TokenPosition const& p);
   size_t perform_random_sampling();
@@ -86,7 +87,8 @@ public:
                 SPTR<SamplingBias const> const& bias, 
                 size_t const min_samples, 
                 size_t const max_samples,
-                sampling_method const method); 
+                sampling_method const method,
+                bool const track_sids);
   ~BitextSampler();
   SPTR<pstats> stats();
   bool done() const;
@@ -185,7 +187,7 @@ BitextSampler<Token>::
 BitextSampler(SPTR<Bitext<Token> const> const& bitext, 
               typename bitext::iter const& phrase,
               SPTR<SamplingBias const> const& bias, size_t const min_samples, size_t const max_samples,
-              sampling_method const method)
+              sampling_method const method, bool const track_sids)
   : m_bitext(bitext)
   , m_plen(phrase.size())
   , m_fwd(phrase.root == bitext->I1.get())
@@ -201,8 +203,9 @@ BitextSampler(SPTR<Bitext<Token> const> const& bitext,
   , m_finished(false)
   , m_num_occurrences(phrase.ca())
   , m_rnd(0)
+  , m_track_sids(track_sids)
 {
-  m_stats.reset(new pstats);
+  m_stats.reset(new pstats(m_track_sids));
   m_stats->raw_cnt = phrase.ca();
   m_stats->register_worker();
 }
diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp
index 024ae44d3..224e5f91a 100644
--- a/moses/TranslationModel/UG/mmsapt.cpp
+++ b/moses/TranslationModel/UG/mmsapt.cpp
@@ -215,6 +215,7 @@ namespace Moses
     param.insert(pair<string,string>("coh",    "0"));
     param.insert(pair<string,string>("prov",   "0"));
     param.insert(pair<string,string>("cumb",   "0"));
+    param.insert(pair<string,string>("dist",   "0"));
 
     poolCounts = true;
 
@@ -291,6 +292,7 @@ namespace Moses
     known_parameters.push_back("coh");
     known_parameters.push_back("config");
     known_parameters.push_back("cumb");
+    known_parameters.push_back("dist");
     known_parameters.push_back("extra");
     known_parameters.push_back("feature-sets");
     known_parameters.push_back("input-factor");
@@ -466,6 +468,19 @@ namespace Moses
             SPTR<PScoreWC<Token> > ffwcnt(new PScoreWC<Token>("wcnt"));
             register_ff(ffwcnt,m_active_ff_common);
           }
+        // Optional distance feature
+        if(param["dist"] != "0")
+          {
+            // Now using sid coordinate list
+            // (to be populated after bitext load)
+            if(m_sid_coord == NULL) {
+              m_sid_coord.reset(new vector<vector<float> >());
+            }
+            // Track sids when sampling bitext
+            m_track_sids = true;
+            SPTR<PScoreDist<Token> > ff(new PScoreDist<Token>(m_sid_coord, param["dist"]));
+            register_ff(ff,m_active_ff_common);
+          }
       }
     // cerr << "Features: " << Join("|",m_feature_names) << endl;
     this->m_numScoreComponents = this->m_feature_names.size();
@@ -509,6 +524,28 @@ namespace Moses
     if (m_extra_data.size())
       load_extra_data(m_extra_data, false);
 
+    // A feature (such as dist) left a note that we need to populate src
+    // sentence coordinates
+    if (m_sid_coord)
+      {
+        // We know the corpus size from the bitext
+        m_sid_coord->reserve(btfix->T1->size());
+        string coordfile = m_bname + L1 + ".coord.gz";
+        string line;
+        cerr << "Loading coordinate lines from " << coordfile << endl;
+        boost::iostreams::filtering_istream in;
+        ugdiss::open_input_stream(coordfile, in);
+        while(getline(in, line))
+          {
+            m_sid_coord->push_back(Scan<float>(Tokenize(line)));
+          }
+        cerr << "Loaded " << m_sid_coord->size() << " lines" << endl;
+        UTIL_THROW_IF2(m_sid_coord->size() != btfix->T1->size(),
+                       "Coordinates file size does not match bitext size ("
+                       << m_sid_coord->size() << " != " << btfix->T1->size()
+                       << ")");
+      }
+
 #if 0
     // currently not used
     LexicalPhraseScorer2<Token>::table_t & COOC = calc_lex.scorer.COOC;
@@ -550,12 +587,12 @@ namespace Moses
     if (fix)
       {
         BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_fix)
-          (*ff)(*btfix, *fix, &fvals);
+          (*ff)(*btfix, *fix, ttask, &fvals);
       }
     if (dyn)
       {
         BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_dyn)
-          (*ff)(*dynbt, *dyn, &fvals);
+          (*ff)(*dynbt, *dyn, ttask, &fvals);
       }
 
     if (fix && dyn) { pool += *dyn; }
@@ -567,7 +604,7 @@ namespace Moses
           zilch.raw2 = m.approxOccurrenceCount();
         pool += zilch;
         BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_dyn)
-          (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals);
+          (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, ttask, &fvals);
       }
     else if (dyn)
       {
@@ -577,17 +614,17 @@ namespace Moses
           zilch.raw2 = m.approxOccurrenceCount();
         pool += zilch;
         BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_fix)
-          (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals);
+          (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, ttask, &fvals);
       }
     if (fix)
       {
         BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_common)
-          (*ff)(*btfix, pool, &fvals);
+          (*ff)(*btfix, pool, ttask, &fvals);
       }
     else
       {
         BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_common)
-          (*ff)(*dynbt, pool, &fvals);
+          (*ff)(*dynbt, pool, ttask, &fvals);
       }
 
     TargetPhrase* tp = new TargetPhrase(const_cast<ttasksptr&>(ttask), this);
@@ -730,7 +767,8 @@ namespace Moses
             BitextSampler<Token> s(btfix, mfix, context->bias, 
                                    m_min_sample_size, 
                                    m_default_sample_size, 
-                                   m_sampling_method);
+                                   m_sampling_method,
+                                   m_track_sids);
             s();
             sfix = s.stats();
           }
@@ -918,7 +956,7 @@ namespace Moses
           {
             BitextSampler<Token> s(btfix, mfix, context->bias, 
                                    m_min_sample_size, m_default_sample_size, 
-                                   m_sampling_method);
+                                   m_sampling_method, m_track_sids);
             if (*context->cache1->get(pid, s.stats()) == s.stats())
               m_thread_pool->add(s);
           }
@@ -939,7 +977,7 @@ namespace Moses
         for (size_t i = 0; mdyn.size() == i && i < myphrase.size(); ++i)
           mdyn.extend(myphrase[i]);
         // let's assume a uniform bias over the foreground corpus
-        if (mdyn.size() == myphrase.size()) dyn->prep(ttask, mdyn);
+        if (mdyn.size() == myphrase.size()) dyn->prep(ttask, mdyn, m_track_sids);
       }
     return mdyn.size() == myphrase.size();
   }
diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h
index 4a8393c11..5ece3c988 100644
--- a/moses/TranslationModel/UG/mmsapt.h
+++ b/moses/TranslationModel/UG/mmsapt.h
@@ -119,6 +119,10 @@ namespace Moses
     std::vector<SPTR<pscorer > > m_active_ff_common;
     // activated feature functions (dyn)
 
+    // Coordinates of bitext source sentences for dist feature
+    boost::shared_ptr<std::vector<std::vector<float> > > m_sid_coord;
+    bool m_track_sids; // track sids when sampling bitext?
+
     void
     parse_factor_spec(std::vector<FactorType>& flist, std::string const key);
 
diff --git a/moses/TranslationModel/UG/sapt_phrase_scorers.h b/moses/TranslationModel/UG/sapt_phrase_scorers.h
index 7fee0568d..d2012ee52 100644
--- a/moses/TranslationModel/UG/sapt_phrase_scorers.h
+++ b/moses/TranslationModel/UG/sapt_phrase_scorers.h
@@ -14,3 +14,4 @@
 #include "sapt_pscore_phrasecount.h"  // phrase count
 #include "sapt_pscore_wordcount.h"    // word count
 #include "sapt_pscore_cumulative_bias.h" // cumulative bias score
+#include "sapt_pscore_dist.h"         // sample distance score
diff --git a/moses/TranslationModel/UG/sapt_pscore_base.h b/moses/TranslationModel/UG/sapt_pscore_base.h
index 1d509dc40..3a90a051c 100644
--- a/moses/TranslationModel/UG/sapt_pscore_base.h
+++ b/moses/TranslationModel/UG/sapt_pscore_base.h
@@ -27,6 +27,7 @@
 
       virtual void
       operator()(Bitext<Token> const& pt, PhrasePair<Token>& pp,
+                 ttasksptr const& ttask,
                  std::vector<float> * dest=NULL) const = 0;
 
       void
diff --git a/moses/TranslationModel/UG/sapt_pscore_coherence.h b/moses/TranslationModel/UG/sapt_pscore_coherence.h
index a3c13fb5b..1d13f7753 100644
--- a/moses/TranslationModel/UG/sapt_pscore_coherence.h
+++ b/moses/TranslationModel/UG/sapt_pscore_coherence.h
@@ -22,6 +22,7 @@ namespace sapt
     void
     operator()(Bitext<Token> const& bt,
 	       PhrasePair<Token>& pp,
+	       ttasksptr const& ttask,
 	       std::vector<float> * dest = NULL) const
     {
       if (!dest) dest = &pp.fvals;
diff --git a/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h b/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h
index b195be290..fddc770fc 100644
--- a/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h
+++ b/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h
@@ -28,8 +28,9 @@ namespace sapt  {
     
     void
     operator()(Bitext<Token> const& bt,
-	       PhrasePair<Token>& pp,
-	       std::vector<float> * dest = NULL) const
+         PhrasePair<Token>& pp,
+         ttasksptr const& ttask,
+         std::vector<float> * dest = NULL) const
     {
       if (!dest) dest = &pp.fvals;
       (*dest)[this->m_index] = log(std::max(m_floor,pp.cum_bias));
diff --git a/moses/TranslationModel/UG/sapt_pscore_dist.h b/moses/TranslationModel/UG/sapt_pscore_dist.h
new file mode 100644
index 000000000..841842ec9
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_dist.h
@@ -0,0 +1,162 @@
+// -*- c++ -*-
+//
+// This scorer measures distance between sentences in an arbitrary N-dimensional
+// space on the source side.  It provides two scores for each phrase pair:
+// * Distance to input, the average distance between training sentences and the
+//   input sentence (are training points close to test point?)
+// * Training data consistency, the average distance between training sentences
+//   and their centroid (are training points close to each other?)
+// Here "training sentences" refers to the subset of sentences sampled from the
+// suffix array from which the phrase pair can be extracted.  The two distances
+// reported as feature scores are log-transformed.
+//
+// This requires pre-computing the coordinates of every source sentence in the
+// bitext and computing the coordinates of each input sentence at run-time.
+//
+// Specify the coordinates of bitext source sentences with a file called
+// ${CORPUS}.${L1}.coord.gz that contains lines of space-delimited floats:
+// 0.1 0.5 0.2 ...
+//
+// Specify the coordinates of input sentences (InputType m_coord) with XML input
+// using the coord tag.  See www.statmt.org/moses/?n=Advanced.Hybrid#ntoc1 for
+// turning on XML input:
+// <coord coord="0.1 0.5 0.2 ..." />
+//
+// Activate this feature with "dist=MEASURE" where MEASURE is one of:
+// euc: Euclidean distance (for spaces)
+// var: total variation distance (for distributions)
+
+#pragma once
+#include "sapt_pscore_base.h"
+#include "mmsapt.h"
+
+#include <algorithm>
+#include <cmath>
+#include <string>
+#include <vector>
+
+#include <boost/foreach.hpp>
+
+namespace sapt
+{
+  // Phrase scorer providing two log-transformed features per phrase pair: the
+  // average distance from the sampled training sentences to the input sentence
+  // ("-i") and to the centroid of those training sentences ("-c").
+  template<typename Token>
+  class
+  PScoreDist : public PhraseScorer<Token>
+  {
+    enum Measure {
+      EuclideanDistance,
+      TotalVariationDistance
+    };
+    // Coordinates of bitext source sentences, indexed by sentence id
+    boost::shared_ptr<std::vector<std::vector<float> > > m_sid_coord;
+    Measure m_measure;
+
+    // Distance under m_measure between a and b, computed over the first
+    // a.size() components.  The caller must ensure b.size() >= a.size().
+    float
+    point_distance(std::vector<float> const& a, std::vector<float> const& b) const
+    {
+      float sum = 0;
+      if (m_measure == EuclideanDistance) {
+        for (size_t i = 0; i < a.size(); ++i) {
+          // Plain multiplication instead of a general-purpose pow() call
+          float const diff = a[i] - b[i];
+          sum += diff * diff;
+        }
+        return std::sqrt(sum);
+      }
+      // Total variation distance: half of the L1 distance
+      for (size_t i = 0; i < a.size(); ++i) {
+        sum += std::abs(a[i] - b[i]);
+      }
+      return sum / 2;
+    }
+
+  public:
+    // sid_coord: shared table of source sentence coordinates; must not be NULL
+    //   (it may still be empty here, the owner can populate it later)
+    // description: distance measure, one of "euc" or "var"; also used in the
+    //   feature names "dist-DESCRIPTION-i" and "dist-DESCRIPTION-c"
+    // Throws if sid_coord is NULL or description names no known measure.
+    PScoreDist(boost::shared_ptr<std::vector<std::vector<float> > > const& sid_coord,
+        std::string const description)
+    {
+      UTIL_THROW_IF2(!sid_coord,
+          "No sentence coordinate table given to dist phrase scorer");
+      this->m_index = -1;
+      this->m_num_feats = 2;
+      this->m_feature_names.push_back("dist-" + description + "-i");
+      this->m_feature_names.push_back("dist-" + description + "-c");
+      this->m_sid_coord = sid_coord;
+      if (description == "euc") {
+        this->m_measure = EuclideanDistance;
+      } else if (description == "var") {
+        this->m_measure = TotalVariationDistance;
+      } else {
+        UTIL_THROW2("Unknown specification \""
+            << description << "\" for dist phrase scorer (one of: euc var)");
+      }
+    }
+
+    // Writes the two scores for pp to (*dest)[m_index] and (*dest)[m_index + 1]
+    // (dest defaults to pp.fvals).  Throws instead of reading out of bounds or
+    // emitting NaN when the input has no coordinates, pp carries no sentence
+    // ids, a sentence id has no coordinates, or a training point has fewer
+    // dimensions than the input.
+    void
+    operator()(Bitext<Token> const& bt,
+         PhrasePair<Token>& pp,
+         ttasksptr const& ttask,
+         std::vector<float> * dest = NULL) const
+    {
+      if (!dest) {
+        dest = &pp.fvals;
+      }
+      // Coordinates of input
+      UTIL_THROW_IF2(!ttask->GetSource()->m_coord,
+          "dist phrase scorer needs input coordinates (coord tag missing?)");
+      std::vector<float> const& input = *(ttask->GetSource()->m_coord);
+      size_t const dim = input.size();
+      size_t const num_points = pp.sids.size();
+      // Averaging over zero points would be 0/0 and yield NaN scores
+      UTIL_THROW_IF2(num_points == 0,
+          "dist phrase scorer found no sentence ids for phrase pair "
+          "(sentence id tracking disabled?)");
+      // Coordinates of training data centroid; every point is validated in
+      // this first pass so the second pass can index without checks
+      std::vector<float> centroid(dim);
+      BOOST_FOREACH(uint32_t const sid, pp.sids) {
+        UTIL_THROW_IF2(sid >= m_sid_coord->size(),
+            "Sentence id " << sid << " has no coordinates (only "
+            << m_sid_coord->size() << " coordinate lines loaded)");
+        std::vector<float> const& point = (*m_sid_coord)[sid];
+        UTIL_THROW_IF2(point.size() < dim,
+            "Sentence " << sid << " has " << point.size()
+            << " coordinates but the input has " << dim);
+        for (size_t i = 0; i < dim; ++i) {
+          centroid[i] += point[i];
+        }
+      }
+      for (size_t i = 0; i < dim; ++i) {
+        centroid[i] /= num_points;
+      }
+      // Compute log-average-distance of specified type from the training points
+      // to both the input sentence and training centroid (max distance with
+      // float epsilon to avoid domain error)
+      float input_distance = 0;
+      float centroid_distance = 0;
+      BOOST_FOREACH(uint32_t const sid, pp.sids) {
+        std::vector<float> const& point = (*m_sid_coord)[sid];
+        input_distance += point_distance(input, point);
+        centroid_distance += point_distance(centroid, point);
+      }
+      input_distance /= num_points;
+      centroid_distance /= num_points;
+      (*dest)[this->m_index] = std::log(std::max(input_distance, Moses::FLOAT_EPSILON));
+      (*dest)[this->m_index + 1] = std::log(std::max(centroid_distance, Moses::FLOAT_EPSILON));
+    }
+  };
+}
diff --git a/moses/TranslationModel/UG/sapt_pscore_length_ratio.h b/moses/TranslationModel/UG/sapt_pscore_length_ratio.h
index 356217caa..28452ad49 100644
--- a/moses/TranslationModel/UG/sapt_pscore_length_ratio.h
+++ b/moses/TranslationModel/UG/sapt_pscore_length_ratio.h
@@ -48,8 +48,9 @@ namespace sapt  {
 
     void
     operator()(Bitext<Token> const& bt,
-               PhrasePair<Token>& pp,
-               std::vector<float> * dest = NULL) const
+         PhrasePair<Token>& pp,
+         ttasksptr const& ttask,
+         std::vector<float> * dest = NULL) const
     {
       if (!dest) dest = &pp.fvals;
       float p  = float(bt.T1->numTokens());
diff --git a/moses/TranslationModel/UG/sapt_pscore_lex1.h b/moses/TranslationModel/UG/sapt_pscore_lex1.h
index 4ae94502b..8270db951 100644
--- a/moses/TranslationModel/UG/sapt_pscore_lex1.h
+++ b/moses/TranslationModel/UG/sapt_pscore_lex1.h
@@ -36,8 +36,9 @@ namespace sapt
 
     void
     operator()(Bitext<Token> const& bt,
-	       PhrasePair<Token>& pp,
-	       std::vector<float> * dest = NULL) const
+         PhrasePair<Token>& pp,
+         ttasksptr const& ttask,
+         std::vector<float> * dest = NULL) const
     {
       if (!dest) dest = &pp.fvals;
       // uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;
diff --git a/moses/TranslationModel/UG/sapt_pscore_logcnt.h b/moses/TranslationModel/UG/sapt_pscore_logcnt.h
index 592d86866..4f4a44b86 100644
--- a/moses/TranslationModel/UG/sapt_pscore_logcnt.h
+++ b/moses/TranslationModel/UG/sapt_pscore_logcnt.h
@@ -37,8 +37,9 @@ namespace sapt  {
 
     void
     operator()(Bitext<Token> const& bt,
-	       PhrasePair<Token>& pp,
-	       std::vector<float> * dest = NULL) const
+         PhrasePair<Token>& pp,
+         ttasksptr const& ttask,
+         std::vector<float> * dest = NULL) const
     {
       if (!dest) dest = &pp.fvals;
       assert(pp.raw1);
diff --git a/moses/TranslationModel/UG/sapt_pscore_pbwd.h b/moses/TranslationModel/UG/sapt_pscore_pbwd.h
index 35c7e1fa9..c990dd1ef 100644
--- a/moses/TranslationModel/UG/sapt_pscore_pbwd.h
+++ b/moses/TranslationModel/UG/sapt_pscore_pbwd.h
@@ -38,8 +38,9 @@ namespace sapt
 
     void
     operator()(Bitext<Token> const& bt,
-	       PhrasePair<Token>& pp,
-	       std::vector<float> * dest = NULL) const
+         PhrasePair<Token>& pp,
+         ttasksptr const& ttask,
+         std::vector<float> * dest = NULL) const
     {
       if (!dest) dest = &pp.fvals;
       // we use the denominator specification to scale the raw counts on the
diff --git a/moses/TranslationModel/UG/sapt_pscore_pfwd.h b/moses/TranslationModel/UG/sapt_pscore_pfwd.h
index bfa8027d1..291e4c10f 100644
--- a/moses/TranslationModel/UG/sapt_pscore_pfwd.h
+++ b/moses/TranslationModel/UG/sapt_pscore_pfwd.h
@@ -38,8 +38,10 @@ namespace sapt
     }
 
     void
-    operator()(Bitext<Token> const& bt, PhrasePair<Token> & pp,
-	       std::vector<float> * dest = NULL) const
+    operator()(Bitext<Token> const& bt,
+         PhrasePair<Token>& pp,
+         ttasksptr const& ttask,
+         std::vector<float> * dest = NULL) const
     {
       if (!dest) dest = &pp.fvals;
       if (pp.joint > pp.good1)
diff --git a/moses/TranslationModel/UG/sapt_pscore_phrasecount.h b/moses/TranslationModel/UG/sapt_pscore_phrasecount.h
index a1426426a..18b225dac 100644
--- a/moses/TranslationModel/UG/sapt_pscore_phrasecount.h
+++ b/moses/TranslationModel/UG/sapt_pscore_phrasecount.h
@@ -22,8 +22,9 @@ namespace sapt
 
     void
     operator()(Bitext<Token> const& bt,
-	       PhrasePair<Token>& pp,
-	       std::vector<float> * dest = NULL) const
+         PhrasePair<Token>& pp,
+         ttasksptr const& ttask,
+         std::vector<float> * dest = NULL) const
     {
       if (!dest) dest = &pp.fvals;
       (*dest)[this->m_index] = 1;
diff --git a/moses/TranslationModel/UG/sapt_pscore_provenance.h b/moses/TranslationModel/UG/sapt_pscore_provenance.h
index 67ee74850..4204a1314 100644
--- a/moses/TranslationModel/UG/sapt_pscore_provenance.h
+++ b/moses/TranslationModel/UG/sapt_pscore_provenance.h
@@ -28,8 +28,9 @@ namespace sapt {
 
     void
     operator()(Bitext<Token> const& bt,
-	       PhrasePair<Token>& pp,
-	       std::vector<float> * dest = NULL) const
+         PhrasePair<Token>& pp,
+         ttasksptr const& ttask,
+         std::vector<float> * dest = NULL) const
     {
       if (!dest) dest = &pp.fvals;
       size_t i = this->m_index;
diff --git a/moses/TranslationModel/UG/sapt_pscore_rareness.h b/moses/TranslationModel/UG/sapt_pscore_rareness.h
index c36da1913..aba9bbbcf 100644
--- a/moses/TranslationModel/UG/sapt_pscore_rareness.h
+++ b/moses/TranslationModel/UG/sapt_pscore_rareness.h
@@ -26,8 +26,9 @@ namespace sapt  {
 
     void
     operator()(Bitext<Token> const& bt,
-	       PhrasePair<Token>& pp,
-	       std::vector<float> * dest = NULL) const
+         PhrasePair<Token>& pp,
+         ttasksptr const& ttask,
+         std::vector<float> * dest = NULL) const
     {
       if (!dest) dest = &pp.fvals;
       size_t i = this->m_index;
diff --git a/moses/TranslationModel/UG/sapt_pscore_unaligned.h b/moses/TranslationModel/UG/sapt_pscore_unaligned.h
index 4201b839c..8bff82b1f 100644
--- a/moses/TranslationModel/UG/sapt_pscore_unaligned.h
+++ b/moses/TranslationModel/UG/sapt_pscore_unaligned.h
@@ -37,8 +37,9 @@ namespace sapt
 
     void
     operator()(Bitext<Token> const& bt,
-	       PhrasePair<Token>& pp,
-	       std::vector<float> * dest = NULL) const
+         PhrasePair<Token>& pp,
+         ttasksptr const& ttask,
+         std::vector<float> * dest = NULL) const
     {
       if (!dest) dest = &pp.fvals;
       // uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;
diff --git a/moses/TranslationModel/UG/sapt_pscore_wordcount.h b/moses/TranslationModel/UG/sapt_pscore_wordcount.h
index 6cd9e7c0c..e1747f380 100644
--- a/moses/TranslationModel/UG/sapt_pscore_wordcount.h
+++ b/moses/TranslationModel/UG/sapt_pscore_wordcount.h
@@ -22,8 +22,9 @@ namespace sapt
 
     void
     operator()(Bitext<Token> const& bt,
-	       PhrasePair<Token>& pp,
-	       std::vector<float> * dest = NULL) const
+         PhrasePair<Token>& pp,
+         ttasksptr const& ttask,
+         std::vector<float> * dest = NULL) const
     {
       if (!dest) dest = &pp.fvals;
       (*dest)[this->m_index] = pp.len2;
author	Michael Denkowski <mdenkows@amazon.com>	2016-06-09 22:53:46 +0300
committer	Michael Denkowski <mdenkows@amazon.com>	2016-08-12 13:05:12 +0300
commit	5c2b8d843c273ac27462e9522f9f67cdaa1f2959 (patch)
tree	73595b5bc6a74d8e33b749f4b9797434ded5a792
parent	a407452d3993b88d725a7838f7c76522ac11a7c8 (diff)