Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Denkowski <mdenkows@amazon.com>2016-06-09 22:53:46 +0300
committerMichael Denkowski <mdenkows@amazon.com>2016-08-12 13:05:12 +0300
commit5c2b8d843c273ac27462e9522f9f67cdaa1f2959 (patch)
tree73595b5bc6a74d8e33b749f4b9797434ded5a792
parenta407452d3993b88d725a7838f7c76522ac11a7c8 (diff)
Distance feature for mmsapt
- Make ttask visible to scorers - Only track sentence ids if using distance feature
-rw-r--r--contrib/other-builds/moses/.project15
-rw-r--r--contrib/other-builds/moses/moses.project3
-rw-r--r--moses/TranslationModel/UG/mm/ug_bitext.h6
-rw-r--r--moses/TranslationModel/UG/mm/ug_bitext_agenda.h8
-rw-r--r--moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h10
-rw-r--r--moses/TranslationModel/UG/mm/ug_bitext_jstats.cc9
-rw-r--r--moses/TranslationModel/UG/mm/ug_bitext_jstats.h3
-rw-r--r--moses/TranslationModel/UG/mm/ug_bitext_moses.h9
-rw-r--r--moses/TranslationModel/UG/mm/ug_bitext_pstats.cc6
-rw-r--r--moses/TranslationModel/UG/mm/ug_bitext_pstats.h5
-rw-r--r--moses/TranslationModel/UG/mm/ug_bitext_sampler.h9
-rw-r--r--moses/TranslationModel/UG/mmsapt.cpp56
-rw-r--r--moses/TranslationModel/UG/mmsapt.h4
-rw-r--r--moses/TranslationModel/UG/sapt_phrase_scorers.h1
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_base.h1
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_coherence.h1
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h5
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_dist.h124
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_length_ratio.h5
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_lex1.h5
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_logcnt.h5
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_pbwd.h5
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_pfwd.h6
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_phrasecount.h5
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_provenance.h5
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_rareness.h5
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_unaligned.h5
-rw-r--r--moses/TranslationModel/UG/sapt_pscore_wordcount.h5
28 files changed, 271 insertions, 55 deletions
diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project
index 222f19365..53d0c1eb5 100644
--- a/contrib/other-builds/moses/.project
+++ b/contrib/other-builds/moses/.project
@@ -3654,6 +3654,16 @@
<name>TranslationModel/UG/sapt_pscore_coherence.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_coherence.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/sapt_pscore_dist.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_dist.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/sapt_pscore_length_ratio.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_length_ratio.h</locationURI>
</link>
<link>
<name>TranslationModel/UG/sapt_pscore_lex1.h</name>
@@ -3699,6 +3709,11 @@
<name>TranslationModel/UG/sapt_pscore_wordcount.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_wordcount.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/UG/sapt_pscore_cumulative_bias.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h</locationURI>
</link>
<link>
<name>TranslationModel/UG/sim-pe.cc</name>
diff --git a/contrib/other-builds/moses/moses.project b/contrib/other-builds/moses/moses.project
index 2c78adc36..709b260e8 100644
--- a/contrib/other-builds/moses/moses.project
+++ b/contrib/other-builds/moses/moses.project
@@ -124,6 +124,8 @@
<File Name="../../../moses/TranslationModel/UG/sapt_phrase_scorers.h"/>
<File Name="../../../moses/TranslationModel/UG/sapt_pscore_base.h"/>
<File Name="../../../moses/TranslationModel/UG/sapt_pscore_coherence.h"/>
+ <File Name="../../../moses/TranslationModel/UG/sapt_pscore_dist.h"/>
+ <File Name="../../../moses/TranslationModel/UG/sapt_pscore_length_ratio.h"/>
<File Name="../../../moses/TranslationModel/UG/sapt_pscore_lex1.h"/>
<File Name="../../../moses/TranslationModel/UG/sapt_pscore_logcnt.h"/>
<File Name="../../../moses/TranslationModel/UG/sapt_pscore_pbwd.h"/>
@@ -133,6 +135,7 @@
<File Name="../../../moses/TranslationModel/UG/sapt_pscore_rareness.h"/>
<File Name="../../../moses/TranslationModel/UG/sapt_pscore_unaligned.h"/>
<File Name="../../../moses/TranslationModel/UG/sapt_pscore_wordcount.h"/>
+ <File Name="../../../moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h"/>
<File Name="../../../moses/TranslationModel/UG/sim-pe.cc" ExcludeProjConfig="Debug"/>
<File Name="../../../moses/TranslationModel/UG/spe-check-coverage.cc" ExcludeProjConfig="Debug"/>
<File Name="../../../moses/TranslationModel/UG/spe-check-coverage2.cc" ExcludeProjConfig="Debug"/>
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h
index 6c3a73457..2c835af7e 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext.h
@@ -130,7 +130,6 @@ namespace sapt
mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
// caches for unbiased sampling; biased sampling uses the caches that
// are stored locally on the translation task
-
public:
SPTR<Ttrack<char> > Tx; // word alignments
SPTR<Ttrack<Token> > T1; // token track
@@ -164,7 +163,8 @@ namespace sapt
#ifndef NO_MOSES
SPTR<pstats>
- prep2(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
+ prep2(ttasksptr const& ttask, iter const& phrase, bool const track_sids,
+ int max_sample = -1) const;
#endif
protected:
@@ -189,7 +189,7 @@ namespace sapt
SPTR<pstats>
lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
- void prep(ttasksptr const& ttask, iter const& phrase) const;
+ void prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const;
#endif
void setDefaultSampleSize(size_t const max_samples);
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h
index bc038bd03..8865d4cd1 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h
@@ -33,7 +33,8 @@ public:
SPTR<pstats>
add_job(Bitext<Token> const* const theBitext,
typename TSA<Token>::tree_iterator const& phrase,
- size_t const max_samples, SPTR<SamplingBias const> const& bias);
+ size_t const max_samples, SPTR<SamplingBias const> const& bias,
+ bool const track_sids);
// add_job(Bitext<Token> const* const theBitext,
// typename TSA<Token>::tree_iterator const& phrase,
// size_t const max_samples, SamplingBias const* const bias);
@@ -93,13 +94,14 @@ SPTR<pstats> Bitext<Token>
::agenda
::add_job(Bitext<Token> const* const theBitext,
typename TSA<Token>::tree_iterator const& phrase,
- size_t const max_samples, SPTR<SamplingBias const> const& bias)
+ size_t const max_samples, SPTR<SamplingBias const> const& bias,
+ bool const track_sids)
{
boost::unique_lock<boost::mutex> lk(this->lock);
static boost::posix_time::time_duration nodelay(0,0,0,0);
bool fwd = phrase.root == bt.I1.get();
SPTR<job> j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2,
- max_samples, fwd, bias));
+ max_samples, fwd, bias, track_sids));
j->stats->register_worker();
joblist.push_back(j);
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h
index 7312ecef4..2ac7a5c35 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h
@@ -35,6 +35,8 @@ public:
SPTR<pstats> stats; // stores statistics collected during sampling
SPTR<SamplingBias const> const m_bias; // sentence-level bias for sampling
float bias_total;
+ bool m_track_sids; // track sentence ids in sample?
+
bool nextSample(uint64_t & sid, uint64_t & offset); // select next occurrence
int
@@ -46,7 +48,7 @@ public:
job(Bitext<Token> const* const theBitext,
typename TSA<Token>::tree_iterator const& m,
SPTR<TSA<Token> > const& r, size_t maxsmpl, bool isfwd,
- SPTR<SamplingBias const> const& bias);
+ SPTR<SamplingBias const> const& bias, bool const track_sids);
~job();
};
@@ -66,7 +68,8 @@ Bitext<Token>::agenda::job
::job(Bitext<Token> const* const theBitext,
typename TSA<Token>::tree_iterator const& m,
SPTR<TSA<Token> > const& r, size_t maxsmpl,
- bool isfwd, SPTR<SamplingBias const> const& bias)
+ bool isfwd, SPTR<SamplingBias const> const& bias,
+ bool const track_sids)
: m_bitext(theBitext)
, rnd(0)
, rnddenom(rnd.max() + 1.)
@@ -80,8 +83,9 @@ Bitext<Token>::agenda::job
, len(m.size())
, fwd(isfwd)
, m_bias(bias)
+ , m_track_sids(track_sids)
{
- stats.reset(new pstats());
+ stats.reset(new pstats(m_track_sids));
stats->raw_cnt = m.approxOccurrenceCount();
bias_total = 0;
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc
index 628d4364c..ab707cf9d 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc
+++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc
@@ -57,7 +57,8 @@ namespace sapt
size_t
jstats::
add(float w, float b, std::vector<unsigned char> const& a, uint32_t const cnt2,
- uint32_t fwd_orient, uint32_t bwd_orient, int const docid, int const sid)
+ uint32_t fwd_orient, uint32_t bwd_orient, int const docid,
+ uint32_t const sid, bool const track_sid)
{
boost::lock_guard<boost::mutex> lk(this->lock);
my_cnt2 = cnt2;
@@ -77,7 +78,11 @@ namespace sapt
}
++ofwd[fwd_orient];
++obwd[bwd_orient];
- sids.push_back(sid);
+ // Record sentence id if requested
+ if (track_sid)
+ {
+ sids.push_back(sid);
+ }
if (docid >= 0)
{
// while (int(indoc.size()) <= docid) indoc.push_back(0);
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h
index b66aee126..1068b47b9 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h
@@ -42,7 +42,8 @@ namespace sapt
size_t
add(float w, float b, std::vector<unsigned char> const& a, uint32_t const cnt2,
- uint32_t fwd_orient, uint32_t bwd_orient, int const docid, int const sid);
+ uint32_t fwd_orient, uint32_t bwd_orient, int const docid, uint32_t const sid,
+ bool const track_sid);
void invalidate();
void validate();
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_moses.h b/moses/TranslationModel/UG/mm/ug_bitext_moses.h
index c04d87bfd..c024d073a 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_moses.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_moses.h
@@ -30,9 +30,9 @@ lookup(ttasksptr const& ttask, iter const& phrase, int max_sample) const
template<typename Token>
void
Bitext<Token>::
-prep(ttasksptr const& ttask, iter const& phrase) const
+prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const
{
- prep2(ttask, phrase, m_default_sample_size);
+ prep2(ttask, phrase, track_sids, m_default_sample_size);
}
@@ -44,7 +44,8 @@ template<typename Token>
SPTR<pstats>
Bitext<Token>
::prep2
-( ttasksptr const& ttask, iter const& phrase, int max_sample) const
+( ttasksptr const& ttask, iter const& phrase, bool const track_sids,
+ int max_sample) const
{
if (max_sample < 0) max_sample = m_default_sample_size;
SPTR<SamplingBias> bias;
@@ -74,7 +75,7 @@ Bitext<Token>
if (m_num_workers > 1)
ag->add_workers(m_num_workers);
}
- ret = ag->add_job(this, phrase, max_sample, bias);
+ ret = ag->add_job(this, phrase, max_sample, bias, track_sids);
if (cache) cache->set(phrase.getPid(),ret);
UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job.");
return ret;
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc
index e603def96..f8c93fe3c 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc
+++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc
@@ -10,7 +10,7 @@ namespace sapt
#endif
pstats::
- pstats() : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0)
+ pstats(bool const track_sids) : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0), track_sids(track_sids)
{
for (int i = 0; i <= LRModel::NONE; ++i)
ofwd[i] = obwd[i] = 0;
@@ -69,11 +69,11 @@ namespace sapt
std::vector<unsigned char> const& a,
uint32_t const cnt2,
uint32_t fwd_o,
- uint32_t bwd_o, int const docid, int const sid)
+ uint32_t bwd_o, int const docid, uint32_t const sid)
{
boost::lock_guard<boost::mutex> guard(this->lock);
jstats& entry = this->trg[pid];
- size_t ret = entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid, sid);
+ size_t ret = entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid, sid, track_sids);
if (this->good < entry.rcnt())
{
UTIL_THROW(util::Exception, "more joint counts than good counts:"
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h
index e4481ee52..47ec33afb 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h
@@ -35,7 +35,8 @@ namespace sapt
indoc_map_t indoc;
trg_map_t trg;
- pstats();
+ bool track_sids;
+ pstats(bool const track_sids);
~pstats();
void release();
void register_worker();
@@ -50,7 +51,7 @@ namespace sapt
uint32_t fwd_o, // fwd. phrase orientation
uint32_t bwd_o, // bwd. phrase orientation
int const docid, // document where sample was found
- int const sid); // index of sentence where sample was found
+ uint32_t const sid); // index of sentence where sample was found
void
count_sample(int const docid, // document where sample was found
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h
index c94b2b149..ea27f18e8 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h
@@ -70,6 +70,7 @@ BitextSampler : public Moses::reference_counter
size_t m_num_occurrences; // estimated number of phrase occurrences in corpus
boost::taus88 m_rnd; // every job has its own pseudo random generator
double m_bias_total;
+ bool m_track_sids; // track sentence ids in stats?
size_t consider_sample(TokenPosition const& p);
size_t perform_random_sampling();
@@ -86,7 +87,8 @@ public:
SPTR<SamplingBias const> const& bias,
size_t const min_samples,
size_t const max_samples,
- sampling_method const method);
+ sampling_method const method,
+ bool const track_sids);
~BitextSampler();
SPTR<pstats> stats();
bool done() const;
@@ -185,7 +187,7 @@ BitextSampler<Token>::
BitextSampler(SPTR<Bitext<Token> const> const& bitext,
typename bitext::iter const& phrase,
SPTR<SamplingBias const> const& bias, size_t const min_samples, size_t const max_samples,
- sampling_method const method)
+ sampling_method const method, bool const track_sids)
: m_bitext(bitext)
, m_plen(phrase.size())
, m_fwd(phrase.root == bitext->I1.get())
@@ -201,8 +203,9 @@ BitextSampler(SPTR<Bitext<Token> const> const& bitext,
, m_finished(false)
, m_num_occurrences(phrase.ca())
, m_rnd(0)
+ , m_track_sids(track_sids)
{
- m_stats.reset(new pstats);
+ m_stats.reset(new pstats(m_track_sids));
m_stats->raw_cnt = phrase.ca();
m_stats->register_worker();
}
diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp
index 024ae44d3..224e5f91a 100644
--- a/moses/TranslationModel/UG/mmsapt.cpp
+++ b/moses/TranslationModel/UG/mmsapt.cpp
@@ -215,6 +215,7 @@ namespace Moses
param.insert(pair<string,string>("coh", "0"));
param.insert(pair<string,string>("prov", "0"));
param.insert(pair<string,string>("cumb", "0"));
+ param.insert(pair<string,string>("dist", "0"));
poolCounts = true;
@@ -291,6 +292,7 @@ namespace Moses
known_parameters.push_back("coh");
known_parameters.push_back("config");
known_parameters.push_back("cumb");
+ known_parameters.push_back("dist");
known_parameters.push_back("extra");
known_parameters.push_back("feature-sets");
known_parameters.push_back("input-factor");
@@ -466,6 +468,19 @@ namespace Moses
SPTR<PScoreWC<Token> > ffwcnt(new PScoreWC<Token>("wcnt"));
register_ff(ffwcnt,m_active_ff_common);
}
+ // Optional distance feature
+ if(param["dist"] != "0")
+ {
+ // Now using sid coordinate list
+ // (to be populated after bitext load)
+ if(m_sid_coord == NULL) {
+ m_sid_coord.reset(new vector<vector<float> >());
+ }
+ // Track sids when sampling bitext
+ m_track_sids = true;
+ SPTR<PScoreDist<Token> > ff(new PScoreDist<Token>(m_sid_coord, param["dist"]));
+ register_ff(ff,m_active_ff_common);
+ }
}
// cerr << "Features: " << Join("|",m_feature_names) << endl;
this->m_numScoreComponents = this->m_feature_names.size();
@@ -509,6 +524,28 @@ namespace Moses
if (m_extra_data.size())
load_extra_data(m_extra_data, false);
+ // A feature (such as dist) left a note that we need to populate src
+ // sentence coordinates
+ if (m_sid_coord)
+ {
+ // We know the corpus size from the bitext
+ m_sid_coord->reserve(btfix->T1->size());
+ string coordfile = m_bname + L1 + ".coord.gz";
+ string line;
+ cerr << "Loading coordinate lines from " << coordfile << endl;
+ boost::iostreams::filtering_istream in;
+ ugdiss::open_input_stream(coordfile, in);
+ while(getline(in, line))
+ {
+ m_sid_coord->push_back(Scan<float>(Tokenize(line)));
+ }
+ cerr << "Loaded " << m_sid_coord->size() << " lines" << endl;
+ UTIL_THROW_IF2(m_sid_coord->size() != btfix->T1->size(),
+ "Coordinates file size does not match bitext size ("
+ << m_sid_coord->size() << " != " << btfix->T1->size()
+ << ")");
+ }
+
#if 0
// currently not used
LexicalPhraseScorer2<Token>::table_t & COOC = calc_lex.scorer.COOC;
@@ -550,12 +587,12 @@ namespace Moses
if (fix)
{
BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_fix)
- (*ff)(*btfix, *fix, &fvals);
+ (*ff)(*btfix, *fix, ttask, &fvals);
}
if (dyn)
{
BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_dyn)
- (*ff)(*dynbt, *dyn, &fvals);
+ (*ff)(*dynbt, *dyn, ttask, &fvals);
}
if (fix && dyn) { pool += *dyn; }
@@ -567,7 +604,7 @@ namespace Moses
zilch.raw2 = m.approxOccurrenceCount();
pool += zilch;
BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_dyn)
- (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals);
+ (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, ttask, &fvals);
}
else if (dyn)
{
@@ -577,17 +614,17 @@ namespace Moses
zilch.raw2 = m.approxOccurrenceCount();
pool += zilch;
BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_fix)
- (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals);
+ (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, ttask, &fvals);
}
if (fix)
{
BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_common)
- (*ff)(*btfix, pool, &fvals);
+ (*ff)(*btfix, pool, ttask, &fvals);
}
else
{
BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_common)
- (*ff)(*dynbt, pool, &fvals);
+ (*ff)(*dynbt, pool, ttask, &fvals);
}
TargetPhrase* tp = new TargetPhrase(const_cast<ttasksptr&>(ttask), this);
@@ -730,7 +767,8 @@ namespace Moses
BitextSampler<Token> s(btfix, mfix, context->bias,
m_min_sample_size,
m_default_sample_size,
- m_sampling_method);
+ m_sampling_method,
+ m_track_sids);
s();
sfix = s.stats();
}
@@ -918,7 +956,7 @@ namespace Moses
{
BitextSampler<Token> s(btfix, mfix, context->bias,
m_min_sample_size, m_default_sample_size,
- m_sampling_method);
+ m_sampling_method, m_track_sids);
if (*context->cache1->get(pid, s.stats()) == s.stats())
m_thread_pool->add(s);
}
@@ -939,7 +977,7 @@ namespace Moses
for (size_t i = 0; mdyn.size() == i && i < myphrase.size(); ++i)
mdyn.extend(myphrase[i]);
// let's assume a uniform bias over the foreground corpus
- if (mdyn.size() == myphrase.size()) dyn->prep(ttask, mdyn);
+ if (mdyn.size() == myphrase.size()) dyn->prep(ttask, mdyn, m_track_sids);
}
return mdyn.size() == myphrase.size();
}
diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h
index 4a8393c11..5ece3c988 100644
--- a/moses/TranslationModel/UG/mmsapt.h
+++ b/moses/TranslationModel/UG/mmsapt.h
@@ -119,6 +119,10 @@ namespace Moses
std::vector<SPTR<pscorer > > m_active_ff_common;
// activated feature functions (dyn)
+ // Coordinates of bitext source sentences for dist feature
+ boost::shared_ptr<std::vector<std::vector<float> > > m_sid_coord;
+ bool m_track_sids; // track sids when sampling bitext?
+
void
parse_factor_spec(std::vector<FactorType>& flist, std::string const key);
diff --git a/moses/TranslationModel/UG/sapt_phrase_scorers.h b/moses/TranslationModel/UG/sapt_phrase_scorers.h
index 7fee0568d..d2012ee52 100644
--- a/moses/TranslationModel/UG/sapt_phrase_scorers.h
+++ b/moses/TranslationModel/UG/sapt_phrase_scorers.h
@@ -14,3 +14,4 @@
#include "sapt_pscore_phrasecount.h" // phrase count
#include "sapt_pscore_wordcount.h" // word count
#include "sapt_pscore_cumulative_bias.h" // cumulative bias score
+#include "sapt_pscore_dist.h" // sample distance score
diff --git a/moses/TranslationModel/UG/sapt_pscore_base.h b/moses/TranslationModel/UG/sapt_pscore_base.h
index 1d509dc40..3a90a051c 100644
--- a/moses/TranslationModel/UG/sapt_pscore_base.h
+++ b/moses/TranslationModel/UG/sapt_pscore_base.h
@@ -27,6 +27,7 @@
virtual void
operator()(Bitext<Token> const& pt, PhrasePair<Token>& pp,
+ ttasksptr const& ttask,
std::vector<float> * dest=NULL) const = 0;
void
diff --git a/moses/TranslationModel/UG/sapt_pscore_coherence.h b/moses/TranslationModel/UG/sapt_pscore_coherence.h
index a3c13fb5b..1d13f7753 100644
--- a/moses/TranslationModel/UG/sapt_pscore_coherence.h
+++ b/moses/TranslationModel/UG/sapt_pscore_coherence.h
@@ -22,6 +22,7 @@ namespace sapt
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
+ ttasksptr const& ttask,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
diff --git a/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h b/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h
index b195be290..fddc770fc 100644
--- a/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h
+++ b/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h
@@ -28,8 +28,9 @@ namespace sapt {
void
operator()(Bitext<Token> const& bt,
- PhrasePair<Token>& pp,
- std::vector<float> * dest = NULL) const
+ PhrasePair<Token>& pp,
+ ttasksptr const& ttask,
+ std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
(*dest)[this->m_index] = log(std::max(m_floor,pp.cum_bias));
diff --git a/moses/TranslationModel/UG/sapt_pscore_dist.h b/moses/TranslationModel/UG/sapt_pscore_dist.h
new file mode 100644
index 000000000..841842ec9
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_dist.h
@@ -0,0 +1,124 @@
+// -*- c++ -*-
+//
+// This scorer measures distance between sentences in an arbitrary N-dimensional
+// space on the source side. It provides two scores for each phrase pair:
+// * Distance to input, the average distance between training sentences and the
+// input sentence (are training points close to test point?)
+// * Training data consistency, the average distance between training sentences
+// and their centroid (are training points close to each other?)
+// Here "training sentences" refers to the subset of sentences sampled from the
+// suffix array from which the phrase pair can be extracted. The two distances
+// reported as feature scores are log-transformed.
+//
+// This requires pre-computing the coordinates of every source sentence in the
+// bitext and computing the coordinates of each input sentence at run-time.
+//
+// Specify the coordinates of bitext source sentences with a file called
+// ${CORPUS}.${L1}.coord.gz that contains lines of space-delimited floats:
+// 0.1 0.5 0.2 ...
+//
+// Specify the coordinates of input sentences (InputType m_coord) with XML input
+// using the coord tag. See www.statmt.org/moses/?n=Advanced.Hybrid#ntoc1 for
+// turning on XML input:
+// <coord coord="0.1 0.5 0.2 ..." />
+//
+// Activate this feature with "dist=MEASURE" where MEASURE is one of:
+// euc: Euclidean distance (for spaces)
+// var: total variation distance (for distributions)
+
+#pragma once
+#include "sapt_pscore_base.h"
+#include "mmsapt.h"
+
+#include <boost/foreach.hpp>
+
+namespace sapt
+{
+  // Phrase scorer that reports two log-transformed average distances for a
+  // phrase pair, measured in the source-side coordinate space:
+  //   * "dist-<measure>-i": sampled training sentences vs. the input sentence
+  //   * "dist-<measure>-c": sampled training sentences vs. their own centroid
+  // The training sentences are those listed in PhrasePair::sids, so sentence
+  // ids must be tracked during sampling for this scorer to work.
+  template<typename Token>
+  class
+  PScoreDist : public PhraseScorer<Token>
+  {
+    enum Measure {
+      EuclideanDistance,
+      TotalVariationDistance,
+    };
+    // Coordinates of the bitext source sentences, indexed by sentence id.
+    // The table is shared with whoever constructs the scorer and is read
+    // (never modified) here.
+    boost::shared_ptr<std::vector<std::vector<float> > > m_sid_coord;
+    Measure m_measure;
+
+    // Look up the coordinates of training sentence /sid/, throwing instead of
+    // reading out of bounds if the id is unknown or the point does not have
+    // exactly /dims/ dimensions.
+    std::vector<float> const&
+    training_point(size_t const sid, size_t const dims) const
+    {
+      UTIL_THROW_IF2(sid >= m_sid_coord->size(),
+                     "No coordinates for sentence id " << sid
+                     << " in dist phrase scorer (" << m_sid_coord->size()
+                     << " coordinate lines available)");
+      std::vector<float> const& point = (*m_sid_coord)[sid];
+      UTIL_THROW_IF2(point.size() != dims,
+                     "Coordinates of sentence id " << sid << " have "
+                     << point.size() << " dimensions, expected " << dims);
+      return point;
+    }
+
+  public:
+    // sid_coord:   per-sentence coordinate table (may still be empty at
+    //              construction time; it is only read when scoring)
+    // description: distance measure, "euc" (Euclidean) or "var" (total
+    //              variation); anything else throws
+    PScoreDist(boost::shared_ptr<std::vector<std::vector<float> > > const& sid_coord,
+               std::string const description)
+    {
+      this->m_index = -1;
+      this->m_num_feats = 2;
+      this->m_feature_names.push_back("dist-" + description + "-i");
+      this->m_feature_names.push_back("dist-" + description + "-c");
+      this->m_sid_coord = sid_coord;
+      if (description == "euc") {
+        this->m_measure = EuclideanDistance;
+      } else if (description == "var") {
+        this->m_measure = TotalVariationDistance;
+      } else {
+        UTIL_THROW2("Unknown specification \""
+                    << description << "\" for dist phrase scorer (one of: euc var)");
+      }
+    }
+
+    // Write the two distance features for /pp/ into /dest/ (or pp.fvals if
+    // /dest/ is NULL) at positions m_index and m_index + 1.
+    //
+    // Throws if the scorer has no coordinate table, the input sentence has no
+    // coordinates, the phrase pair carries no sentence ids (the average would
+    // be 0/0), or any coordinate vector has the wrong number of dimensions.
+    void
+    operator()(Bitext<Token> const& bt,
+               PhrasePair<Token>& pp,
+               ttasksptr const& ttask,
+               std::vector<float> * dest = NULL) const
+    {
+      if (!dest) {
+        dest = &pp.fvals;
+      }
+
+      // Validate everything up front: each of these conditions would
+      // otherwise end in a null dereference, an out-of-bounds read or NaN
+      // feature values.
+      UTIL_THROW_IF2(!m_sid_coord || m_sid_coord->empty(),
+                     "No bitext sentence coordinates available for dist phrase scorer");
+      UTIL_THROW_IF2(!ttask->GetSource()->m_coord,
+                     "Input sentence has no coordinates for dist phrase scorer");
+      UTIL_THROW_IF2(pp.sids.empty(),
+                     "Phrase pair has no sentence ids for dist phrase scorer");
+
+      // Coordinates of input
+      std::vector<float> const& input = *(ttask->GetSource()->m_coord);
+      // All points are expected to live in the space of the first sentence
+      size_t const dims = (*m_sid_coord)[0].size();
+      UTIL_THROW_IF2(input.size() != dims,
+                     "Input sentence coordinates have " << input.size()
+                     << " dimensions, bitext coordinates have " << dims);
+
+      // Coordinates of training data centroid (training_point() validates
+      // every sampled sentence id once, so the loops below index directly)
+      std::vector<float> centroid(dims);
+      BOOST_FOREACH(size_t const sid, pp.sids) {
+        std::vector<float> const& point = training_point(sid, dims);
+        for (size_t i = 0; i < dims; ++i) {
+          centroid[i] += point[i];
+        }
+      }
+      for (size_t i = 0; i < dims; ++i) {
+        centroid[i] /= pp.sids.size();
+      }
+
+      // Compute log-average-distance of specified type from the training
+      // points to both the input sentence and training centroid
+      float input_distance = 0;
+      float centroid_distance = 0;
+      if (m_measure == EuclideanDistance) {
+        BOOST_FOREACH(size_t const sid, pp.sids) {
+          std::vector<float> const& point = (*m_sid_coord)[sid];
+          float input_sq_sum = 0;
+          float centroid_sq_sum = 0;
+          for (size_t i = 0; i < dims; ++i) {
+            float const to_input = input[i] - point[i];
+            float const to_centroid = centroid[i] - point[i];
+            input_sq_sum += to_input * to_input;
+            centroid_sq_sum += to_centroid * to_centroid;
+          }
+          input_distance += sqrt(input_sq_sum);
+          centroid_distance += sqrt(centroid_sq_sum);
+        }
+      } else if (m_measure == TotalVariationDistance) {
+        BOOST_FOREACH(size_t const sid, pp.sids) {
+          std::vector<float> const& point = (*m_sid_coord)[sid];
+          float input_abs_sum = 0;
+          float centroid_abs_sum = 0;
+          for (size_t i = 0; i < dims; ++i) {
+            input_abs_sum += std::abs(input[i] - point[i]);
+            centroid_abs_sum += std::abs(centroid[i] - point[i]);
+          }
+          // Total variation distance is half the L1 distance
+          input_distance += input_abs_sum / 2;
+          centroid_distance += centroid_abs_sum / 2;
+        }
+      }
+      input_distance /= pp.sids.size();
+      centroid_distance /= pp.sids.size();
+      // Floor at float epsilon so that a zero distance does not cause a
+      // domain error in log()
+      (*dest)[this->m_index] = log(std::max(input_distance, Moses::FLOAT_EPSILON));
+      (*dest)[this->m_index + 1] = log(std::max(centroid_distance, Moses::FLOAT_EPSILON));
+    }
+  };
+}
diff --git a/moses/TranslationModel/UG/sapt_pscore_length_ratio.h b/moses/TranslationModel/UG/sapt_pscore_length_ratio.h
index 356217caa..28452ad49 100644
--- a/moses/TranslationModel/UG/sapt_pscore_length_ratio.h
+++ b/moses/TranslationModel/UG/sapt_pscore_length_ratio.h
@@ -48,8 +48,9 @@ namespace sapt {
void
operator()(Bitext<Token> const& bt,
- PhrasePair<Token>& pp,
- std::vector<float> * dest = NULL) const
+ PhrasePair<Token>& pp,
+ ttasksptr const& ttask,
+ std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
float p = float(bt.T1->numTokens());
diff --git a/moses/TranslationModel/UG/sapt_pscore_lex1.h b/moses/TranslationModel/UG/sapt_pscore_lex1.h
index 4ae94502b..8270db951 100644
--- a/moses/TranslationModel/UG/sapt_pscore_lex1.h
+++ b/moses/TranslationModel/UG/sapt_pscore_lex1.h
@@ -36,8 +36,9 @@ namespace sapt
void
operator()(Bitext<Token> const& bt,
- PhrasePair<Token>& pp,
- std::vector<float> * dest = NULL) const
+ PhrasePair<Token>& pp,
+ ttasksptr const& ttask,
+ std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
// uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;
diff --git a/moses/TranslationModel/UG/sapt_pscore_logcnt.h b/moses/TranslationModel/UG/sapt_pscore_logcnt.h
index 592d86866..4f4a44b86 100644
--- a/moses/TranslationModel/UG/sapt_pscore_logcnt.h
+++ b/moses/TranslationModel/UG/sapt_pscore_logcnt.h
@@ -37,8 +37,9 @@ namespace sapt {
void
operator()(Bitext<Token> const& bt,
- PhrasePair<Token>& pp,
- std::vector<float> * dest = NULL) const
+ PhrasePair<Token>& pp,
+ ttasksptr const& ttask,
+ std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
assert(pp.raw1);
diff --git a/moses/TranslationModel/UG/sapt_pscore_pbwd.h b/moses/TranslationModel/UG/sapt_pscore_pbwd.h
index 35c7e1fa9..c990dd1ef 100644
--- a/moses/TranslationModel/UG/sapt_pscore_pbwd.h
+++ b/moses/TranslationModel/UG/sapt_pscore_pbwd.h
@@ -38,8 +38,9 @@ namespace sapt
void
operator()(Bitext<Token> const& bt,
- PhrasePair<Token>& pp,
- std::vector<float> * dest = NULL) const
+ PhrasePair<Token>& pp,
+ ttasksptr const& ttask,
+ std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
// we use the denominator specification to scale the raw counts on the
diff --git a/moses/TranslationModel/UG/sapt_pscore_pfwd.h b/moses/TranslationModel/UG/sapt_pscore_pfwd.h
index bfa8027d1..291e4c10f 100644
--- a/moses/TranslationModel/UG/sapt_pscore_pfwd.h
+++ b/moses/TranslationModel/UG/sapt_pscore_pfwd.h
@@ -38,8 +38,10 @@ namespace sapt
}
void
- operator()(Bitext<Token> const& bt, PhrasePair<Token> & pp,
- std::vector<float> * dest = NULL) const
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
+ ttasksptr const& ttask,
+ std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
if (pp.joint > pp.good1)
diff --git a/moses/TranslationModel/UG/sapt_pscore_phrasecount.h b/moses/TranslationModel/UG/sapt_pscore_phrasecount.h
index a1426426a..18b225dac 100644
--- a/moses/TranslationModel/UG/sapt_pscore_phrasecount.h
+++ b/moses/TranslationModel/UG/sapt_pscore_phrasecount.h
@@ -22,8 +22,9 @@ namespace sapt
void
operator()(Bitext<Token> const& bt,
- PhrasePair<Token>& pp,
- std::vector<float> * dest = NULL) const
+ PhrasePair<Token>& pp,
+ ttasksptr const& ttask,
+ std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
(*dest)[this->m_index] = 1;
diff --git a/moses/TranslationModel/UG/sapt_pscore_provenance.h b/moses/TranslationModel/UG/sapt_pscore_provenance.h
index 67ee74850..4204a1314 100644
--- a/moses/TranslationModel/UG/sapt_pscore_provenance.h
+++ b/moses/TranslationModel/UG/sapt_pscore_provenance.h
@@ -28,8 +28,9 @@ namespace sapt {
void
operator()(Bitext<Token> const& bt,
- PhrasePair<Token>& pp,
- std::vector<float> * dest = NULL) const
+ PhrasePair<Token>& pp,
+ ttasksptr const& ttask,
+ std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
size_t i = this->m_index;
diff --git a/moses/TranslationModel/UG/sapt_pscore_rareness.h b/moses/TranslationModel/UG/sapt_pscore_rareness.h
index c36da1913..aba9bbbcf 100644
--- a/moses/TranslationModel/UG/sapt_pscore_rareness.h
+++ b/moses/TranslationModel/UG/sapt_pscore_rareness.h
@@ -26,8 +26,9 @@ namespace sapt {
void
operator()(Bitext<Token> const& bt,
- PhrasePair<Token>& pp,
- std::vector<float> * dest = NULL) const
+ PhrasePair<Token>& pp,
+ ttasksptr const& ttask,
+ std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
size_t i = this->m_index;
diff --git a/moses/TranslationModel/UG/sapt_pscore_unaligned.h b/moses/TranslationModel/UG/sapt_pscore_unaligned.h
index 4201b839c..8bff82b1f 100644
--- a/moses/TranslationModel/UG/sapt_pscore_unaligned.h
+++ b/moses/TranslationModel/UG/sapt_pscore_unaligned.h
@@ -37,8 +37,9 @@ namespace sapt
void
operator()(Bitext<Token> const& bt,
- PhrasePair<Token>& pp,
- std::vector<float> * dest = NULL) const
+ PhrasePair<Token>& pp,
+ ttasksptr const& ttask,
+ std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
// uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;
diff --git a/moses/TranslationModel/UG/sapt_pscore_wordcount.h b/moses/TranslationModel/UG/sapt_pscore_wordcount.h
index 6cd9e7c0c..e1747f380 100644
--- a/moses/TranslationModel/UG/sapt_pscore_wordcount.h
+++ b/moses/TranslationModel/UG/sapt_pscore_wordcount.h
@@ -22,8 +22,9 @@ namespace sapt
void
operator()(Bitext<Token> const& bt,
- PhrasePair<Token>& pp,
- std::vector<float> * dest = NULL) const
+ PhrasePair<Token>& pp,
+ ttasksptr const& ttask,
+ std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
(*dest)[this->m_index] = pp.len2;