From 5c2b8d843c273ac27462e9522f9f67cdaa1f2959 Mon Sep 17 00:00:00 2001 From: Michael Denkowski Date: Thu, 9 Jun 2016 15:53:46 -0400 Subject: Distance feature for mmsapt - Make ttask visible to scorers - Only track sentence ids if using distance feature --- contrib/other-builds/moses/.project | 15 +++ contrib/other-builds/moses/moses.project | 3 + moses/TranslationModel/UG/mm/ug_bitext.h | 6 +- moses/TranslationModel/UG/mm/ug_bitext_agenda.h | 8 +- .../TranslationModel/UG/mm/ug_bitext_agenda_job.h | 10 +- moses/TranslationModel/UG/mm/ug_bitext_jstats.cc | 9 +- moses/TranslationModel/UG/mm/ug_bitext_jstats.h | 3 +- moses/TranslationModel/UG/mm/ug_bitext_moses.h | 9 +- moses/TranslationModel/UG/mm/ug_bitext_pstats.cc | 6 +- moses/TranslationModel/UG/mm/ug_bitext_pstats.h | 5 +- moses/TranslationModel/UG/mm/ug_bitext_sampler.h | 9 +- moses/TranslationModel/UG/mmsapt.cpp | 56 ++++++++-- moses/TranslationModel/UG/mmsapt.h | 4 + moses/TranslationModel/UG/sapt_phrase_scorers.h | 1 + moses/TranslationModel/UG/sapt_pscore_base.h | 1 + moses/TranslationModel/UG/sapt_pscore_coherence.h | 1 + .../UG/sapt_pscore_cumulative_bias.h | 5 +- moses/TranslationModel/UG/sapt_pscore_dist.h | 124 +++++++++++++++++++++ .../TranslationModel/UG/sapt_pscore_length_ratio.h | 5 +- moses/TranslationModel/UG/sapt_pscore_lex1.h | 5 +- moses/TranslationModel/UG/sapt_pscore_logcnt.h | 5 +- moses/TranslationModel/UG/sapt_pscore_pbwd.h | 5 +- moses/TranslationModel/UG/sapt_pscore_pfwd.h | 6 +- .../TranslationModel/UG/sapt_pscore_phrasecount.h | 5 +- moses/TranslationModel/UG/sapt_pscore_provenance.h | 5 +- moses/TranslationModel/UG/sapt_pscore_rareness.h | 5 +- moses/TranslationModel/UG/sapt_pscore_unaligned.h | 5 +- moses/TranslationModel/UG/sapt_pscore_wordcount.h | 5 +- 28 files changed, 271 insertions(+), 55 deletions(-) create mode 100644 moses/TranslationModel/UG/sapt_pscore_dist.h diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index 222f19365..53d0c1eb5 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -3654,6 +3654,16 @@ TranslationModel/UG/sapt_pscore_coherence.h 1 PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_coherence.h + + + TranslationModel/UG/sapt_pscore_dist.h + 1 + PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_dist.h + + + TranslationModel/UG/sapt_pscore_length_ratio.h + 1 + PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_length_ratio.h TranslationModel/UG/sapt_pscore_lex1.h @@ -3699,6 +3709,11 @@ TranslationModel/UG/sapt_pscore_wordcount.h 1 PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_wordcount.h + + + TranslationModel/UG/sapt_pscore_cumulative_bias.h + 1 + PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h TranslationModel/UG/sim-pe.cc diff --git a/contrib/other-builds/moses/moses.project b/contrib/other-builds/moses/moses.project index 2c78adc36..709b260e8 100644 --- a/contrib/other-builds/moses/moses.project +++ b/contrib/other-builds/moses/moses.project @@ -124,6 +124,8 @@ + + @@ -133,6 +135,7 @@ + diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index 6c3a73457..2c835af7e 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -130,7 +130,6 @@ namespace sapt mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2; // caches for unbiased sampling; biased sampling uses the caches that // are stored locally on the translation task - public: SPTR > Tx; // word alignments SPTR > T1; // token track @@ -164,7 +163,8 @@ namespace sapt #ifndef NO_MOSES SPTR - prep2(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const; + prep2(ttasksptr const& ttask, iter const& phrase, bool const track_sids, + int max_sample = -1) const; #endif protected: @@ -189,7 +189,7 @@ namespace sapt SPTR lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const; - void prep(ttasksptr const& ttask, iter const& phrase) const; + void prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const; #endif void setDefaultSampleSize(size_t const max_samples); diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h index bc038bd03..8865d4cd1 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h @@ -33,7 +33,8 @@ public: SPTR add_job(Bitext const* const theBitext, typename TSA::tree_iterator const& phrase, - size_t const max_samples, SPTR const& bias); + size_t const max_samples, SPTR const& bias, + bool const track_sids); // add_job(Bitext const* const theBitext, // typename TSA::tree_iterator const& phrase, // size_t const max_samples, SamplingBias const* const bias); @@ -93,13 +94,14 @@ SPTR Bitext ::agenda ::add_job(Bitext const* const theBitext, typename TSA::tree_iterator const& phrase, - size_t const max_samples, SPTR const& bias) + size_t const max_samples, SPTR const& bias, + bool const track_sids) { boost::unique_lock lk(this->lock); static boost::posix_time::time_duration nodelay(0,0,0,0); bool fwd = phrase.root == bt.I1.get(); SPTR j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2, - max_samples, fwd, bias)); + max_samples, fwd, bias, track_sids)); j->stats->register_worker(); joblist.push_back(j); diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h index 7312ecef4..2ac7a5c35 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h @@ -35,6 +35,8 @@ public: SPTR stats; // stores statistics collected during sampling SPTR const m_bias; // sentence-level bias for sampling float bias_total; + bool m_track_sids; // track sentence ids in sample? + bool nextSample(uint64_t & sid, uint64_t & offset); // select next occurrence int @@ -46,7 +48,7 @@ public: job(Bitext const* const theBitext, typename TSA::tree_iterator const& m, SPTR > const& r, size_t maxsmpl, bool isfwd, - SPTR const& bias); + SPTR const& bias, bool const track_sids); ~job(); }; @@ -66,7 +68,8 @@ Bitext::agenda::job ::job(Bitext const* const theBitext, typename TSA::tree_iterator const& m, SPTR > const& r, size_t maxsmpl, - bool isfwd, SPTR const& bias) + bool isfwd, SPTR const& bias, + bool const track_sids) : m_bitext(theBitext) , rnd(0) , rnddenom(rnd.max() + 1.) @@ -80,8 +83,9 @@ Bitext::agenda::job , len(m.size()) , fwd(isfwd) , m_bias(bias) + , m_track_sids(track_sids) { - stats.reset(new pstats()); + stats.reset(new pstats(m_track_sids)); stats->raw_cnt = m.approxOccurrenceCount(); bias_total = 0; diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc index 628d4364c..ab707cf9d 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc @@ -57,7 +57,8 @@ namespace sapt size_t jstats:: add(float w, float b, std::vector const& a, uint32_t const cnt2, - uint32_t fwd_orient, uint32_t bwd_orient, int const docid, int const sid) + uint32_t fwd_orient, uint32_t bwd_orient, int const docid, + uint32_t const sid, bool const track_sid) { boost::lock_guard lk(this->lock); my_cnt2 = cnt2; @@ -77,7 +78,11 @@ namespace sapt } ++ofwd[fwd_orient]; ++obwd[bwd_orient]; - sids.push_back(sid); + // Record sentence id if requested + if (track_sid) + { + sids.push_back(sid); + } if (docid >= 0) { // while (int(indoc.size()) <= docid) indoc.push_back(0); diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h index b66aee126..1068b47b9 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h @@ -42,7 +42,8 @@ namespace sapt size_t add(float w, float b, std::vector const& a, uint32_t const cnt2, - uint32_t fwd_orient, uint32_t bwd_orient, int const docid, int const sid); + uint32_t fwd_orient, uint32_t bwd_orient, int const docid, uint32_t const sid, + bool const track_sid); void invalidate(); void validate(); diff --git a/moses/TranslationModel/UG/mm/ug_bitext_moses.h b/moses/TranslationModel/UG/mm/ug_bitext_moses.h index c04d87bfd..c024d073a 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_moses.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_moses.h @@ -30,9 +30,9 @@ lookup(ttasksptr const& ttask, iter const& phrase, int max_sample) const template void Bitext:: -prep(ttasksptr const& ttask, iter const& phrase) const +prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const { - prep2(ttask, phrase, m_default_sample_size); + prep2(ttask, phrase, track_sids, m_default_sample_size); } @@ -44,7 +44,8 @@ template SPTR Bitext ::prep2 -( ttasksptr const& ttask, iter const& phrase, int max_sample) const +( ttasksptr const& ttask, iter const& phrase, bool const track_sids, + int max_sample) const { if (max_sample < 0) max_sample = m_default_sample_size; SPTR bias; @@ -74,7 +75,7 @@ Bitext if (m_num_workers > 1) ag->add_workers(m_num_workers); } - ret = ag->add_job(this, phrase, max_sample, bias); + ret = ag->add_job(this, phrase, max_sample, bias, track_sids); if (cache) cache->set(phrase.getPid(),ret); UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job."); return ret; diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc index e603def96..f8c93fe3c 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc @@ -10,7 +10,7 @@ namespace sapt #endif pstats:: - pstats() : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0) + pstats(bool const track_sids) : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0), track_sids(track_sids) { for (int i = 0; i <= LRModel::NONE; ++i) ofwd[i] = obwd[i] = 0; @@ -69,11 +69,11 @@ namespace sapt std::vector const& a, uint32_t const cnt2, uint32_t fwd_o, - uint32_t bwd_o, int const docid, int const sid) + uint32_t bwd_o, int const docid, uint32_t const sid) { boost::lock_guard guard(this->lock); jstats& entry = this->trg[pid]; - size_t ret = entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid, sid); + size_t ret = entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid, sid, track_sids); if (this->good < entry.rcnt()) { UTIL_THROW(util::Exception, "more joint counts than good counts:" diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h index e4481ee52..47ec33afb 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h @@ -35,7 +35,8 @@ namespace sapt indoc_map_t indoc; trg_map_t trg; - pstats(); + bool track_sids; + pstats(bool const track_sids); ~pstats(); void release(); void register_worker(); @@ -50,7 +51,7 @@ namespace sapt uint32_t fwd_o, // fwd. phrase orientation uint32_t bwd_o, // bwd. phrase orientation int const docid, // document where sample was found - int const sid); // index of sentence where sample was found + uint32_t const sid); // index of sentence where sample was found void count_sample(int const docid, // document where sample was found diff --git a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h index c94b2b149..ea27f18e8 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h @@ -70,6 +70,7 @@ BitextSampler : public Moses::reference_counter size_t m_num_occurrences; // estimated number of phrase occurrences in corpus boost::taus88 m_rnd; // every job has its own pseudo random generator double m_bias_total; + bool m_track_sids; // track sentence ids in stats? size_t consider_sample(TokenPosition const& p); size_t perform_random_sampling(); @@ -86,7 +87,8 @@ public: SPTR const& bias, size_t const min_samples, size_t const max_samples, - sampling_method const method); + sampling_method const method, + bool const track_sids); ~BitextSampler(); SPTR stats(); bool done() const; @@ -185,7 +187,7 @@ BitextSampler:: BitextSampler(SPTR const> const& bitext, typename bitext::iter const& phrase, SPTR const& bias, size_t const min_samples, size_t const max_samples, - sampling_method const method) + sampling_method const method, bool const track_sids) : m_bitext(bitext) , m_plen(phrase.size()) , m_fwd(phrase.root == bitext->I1.get()) @@ -201,8 +203,9 @@ BitextSampler(SPTR const> const& bitext, , m_finished(false) , m_num_occurrences(phrase.ca()) , m_rnd(0) + , m_track_sids(track_sids) { - m_stats.reset(new pstats); + m_stats.reset(new pstats(m_track_sids)); m_stats->raw_cnt = phrase.ca(); m_stats->register_worker(); } diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 024ae44d3..224e5f91a 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -215,6 +215,7 @@ namespace Moses param.insert(pair("coh", "0")); param.insert(pair("prov", "0")); param.insert(pair("cumb", "0")); + param.insert(pair("dist", "0")); poolCounts = true; @@ -291,6 +292,7 @@ namespace Moses known_parameters.push_back("coh"); known_parameters.push_back("config"); known_parameters.push_back("cumb"); + known_parameters.push_back("dist"); known_parameters.push_back("extra"); known_parameters.push_back("feature-sets"); known_parameters.push_back("input-factor"); @@ -466,6 +468,19 @@ namespace Moses SPTR > ffwcnt(new PScoreWC("wcnt")); register_ff(ffwcnt,m_active_ff_common); } + // Optional distance feature + if(param["dist"] != "0") + { + // Now using sid coordinate list + // (to be populated after bitext load) + if(m_sid_coord == NULL) { + m_sid_coord.reset(new vector >()); + } + // Track sids when sampling bitext + m_track_sids = true; + SPTR > ff(new PScoreDist(m_sid_coord, param["dist"])); + register_ff(ff,m_active_ff_common); + } } // cerr << "Features: " << Join("|",m_feature_names) << endl; this->m_numScoreComponents = this->m_feature_names.size(); @@ -509,6 +524,28 @@ namespace Moses if (m_extra_data.size()) load_extra_data(m_extra_data, false); + // A feature (such as dist) left a note that we need to populate src + // sentence coordinates + if (m_sid_coord) + { + // We know the corpus size from the bitext + m_sid_coord->reserve(btfix->T1->size()); + string coordfile = m_bname + L1 + ".coord.gz"; + string line; + cerr << "Loading coordinate lines from " << coordfile << endl; + boost::iostreams::filtering_istream in; + ugdiss::open_input_stream(coordfile, in); + while(getline(in, line)) + { + m_sid_coord->push_back(Scan(Tokenize(line))); + } + cerr << "Loaded " << m_sid_coord->size() << " lines" << endl; + UTIL_THROW_IF2(m_sid_coord->size() != btfix->T1->size(), + "Coordinates file size does not match bitext size (" + << m_sid_coord->size() << " != " << btfix->T1->size() + << ")"); + } + #if 0 // currently not used LexicalPhraseScorer2::table_t & COOC = calc_lex.scorer.COOC; @@ -550,12 +587,12 @@ namespace Moses if (fix) { BOOST_FOREACH(SPTR const& ff, m_active_ff_fix) - (*ff)(*btfix, *fix, &fvals); + (*ff)(*btfix, *fix, ttask, &fvals); } if (dyn) { BOOST_FOREACH(SPTR const& ff, m_active_ff_dyn) - (*ff)(*dynbt, *dyn, &fvals); + (*ff)(*dynbt, *dyn, ttask, &fvals); } if (fix && dyn) { pool += *dyn; } @@ -567,7 +604,7 @@ namespace Moses zilch.raw2 = m.approxOccurrenceCount(); pool += zilch; BOOST_FOREACH(SPTR const& ff, m_active_ff_dyn) - (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals); + (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, ttask, &fvals); } else if (dyn) { @@ -577,17 +614,17 @@ namespace Moses zilch.raw2 = m.approxOccurrenceCount(); pool += zilch; BOOST_FOREACH(SPTR const& ff, m_active_ff_fix) - (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals); + (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, ttask, &fvals); } if (fix) { BOOST_FOREACH(SPTR const& ff, m_active_ff_common) - (*ff)(*btfix, pool, &fvals); + (*ff)(*btfix, pool, ttask, &fvals); } else { BOOST_FOREACH(SPTR const& ff, m_active_ff_common) - (*ff)(*dynbt, pool, &fvals); + (*ff)(*dynbt, pool, ttask, &fvals); } TargetPhrase* tp = new TargetPhrase(const_cast(ttask), this); @@ -730,7 +767,8 @@ namespace Moses BitextSampler s(btfix, mfix, context->bias, m_min_sample_size, m_default_sample_size, - m_sampling_method); + m_sampling_method, + m_track_sids); s(); sfix = s.stats(); } @@ -918,7 +956,7 @@ namespace Moses { BitextSampler s(btfix, mfix, context->bias, m_min_sample_size, m_default_sample_size, - m_sampling_method); + m_sampling_method, m_track_sids); if (*context->cache1->get(pid, s.stats()) == s.stats()) m_thread_pool->add(s); } @@ -939,7 +977,7 @@ namespace Moses for (size_t i = 0; mdyn.size() == i && i < myphrase.size(); ++i) mdyn.extend(myphrase[i]); // let's assume a uniform bias over the foreground corpus - if (mdyn.size() == myphrase.size()) dyn->prep(ttask, mdyn); + if (mdyn.size() == myphrase.size()) dyn->prep(ttask, mdyn, m_track_sids); } return mdyn.size() == myphrase.size(); } diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h index 4a8393c11..5ece3c988 100644 --- a/moses/TranslationModel/UG/mmsapt.h +++ b/moses/TranslationModel/UG/mmsapt.h @@ -119,6 +119,10 @@ namespace Moses std::vector > m_active_ff_common; // activated feature functions (dyn) + // Coordinates of bitext source sentences for dist feature + boost::shared_ptr > > m_sid_coord; + bool m_track_sids; // track sids when sampling bitext? + void parse_factor_spec(std::vector& flist, std::string const key); diff --git a/moses/TranslationModel/UG/sapt_phrase_scorers.h b/moses/TranslationModel/UG/sapt_phrase_scorers.h index 7fee0568d..d2012ee52 100644 --- a/moses/TranslationModel/UG/sapt_phrase_scorers.h +++ b/moses/TranslationModel/UG/sapt_phrase_scorers.h @@ -14,3 +14,4 @@ #include "sapt_pscore_phrasecount.h" // phrase count #include "sapt_pscore_wordcount.h" // word count #include "sapt_pscore_cumulative_bias.h" // cumulative bias score +#include "sapt_pscore_dist.h" // sample distance score diff --git a/moses/TranslationModel/UG/sapt_pscore_base.h b/moses/TranslationModel/UG/sapt_pscore_base.h index 1d509dc40..3a90a051c 100644 --- a/moses/TranslationModel/UG/sapt_pscore_base.h +++ b/moses/TranslationModel/UG/sapt_pscore_base.h @@ -27,6 +27,7 @@ virtual void operator()(Bitext const& pt, PhrasePair& pp, + ttasksptr const& ttask, std::vector * dest=NULL) const = 0; void diff --git a/moses/TranslationModel/UG/sapt_pscore_coherence.h b/moses/TranslationModel/UG/sapt_pscore_coherence.h index a3c13fb5b..1d13f7753 100644 --- a/moses/TranslationModel/UG/sapt_pscore_coherence.h +++ b/moses/TranslationModel/UG/sapt_pscore_coherence.h @@ -22,6 +22,7 @@ namespace sapt void operator()(Bitext const& bt, PhrasePair& pp, + ttasksptr const& ttask, std::vector * dest = NULL) const { if (!dest) dest = &pp.fvals; diff --git a/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h b/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h index b195be290..fddc770fc 100644 --- a/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h +++ b/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h @@ -28,8 +28,9 @@ namespace sapt { void operator()(Bitext const& bt, - PhrasePair& pp, - std::vector * dest = NULL) const + PhrasePair& pp, + ttasksptr const& ttask, + std::vector * dest = NULL) const { if (!dest) dest = &pp.fvals; (*dest)[this->m_index] = log(std::max(m_floor,pp.cum_bias)); diff --git a/moses/TranslationModel/UG/sapt_pscore_dist.h b/moses/TranslationModel/UG/sapt_pscore_dist.h new file mode 100644 index 000000000..841842ec9 --- /dev/null +++ b/moses/TranslationModel/UG/sapt_pscore_dist.h @@ -0,0 +1,124 @@ +// -*- c++ -*- +// +// This scorer measures distance between sentences in an arbitrary N-dimensional +// space on the source side. It provides two scores for each phrase pair: +// * Distance to input, the average distance between training sentences and the +// input sentence (are training points close to test point?) +// * Training data consistency, the average distance between training sentences +// and their centroid (are training points close to each other?) +// Here "training sentences" refers to the subset of sentences sampled from the +// suffix array from which the phrase pair can be extracted. The two distances +// reported as feature scores are log-transformed. +// +// This requires pre-computing the coordinates of every source sentence in the +// bitext and computing the coordinates of each input sentence at run-time. +// +// Specify the coordinates of bitext source sentences with a file called +// ${CORPUS}.${L1}.coord.gz that contains lines of space-delimited floats: +// 0.1 0.5 0.2 ... +// +// Specify the coordinates of input sentences (InputType m_coord) with XML input +// using the coord tag. See www.statmt.org/moses/?n=Advanced.Hybrid#ntoc1 for +// turning on XML input: +// +// +// Activate this feature with "dist=MEASURE" where MEASURE is one of: +// euc: Euclidean distance (for spaces) +// var: total variation distance (for distributions) + +#pragma once +#include "sapt_pscore_base.h" +#include "mmsapt.h" + +#include + +namespace sapt +{ + template + class + PScoreDist : public PhraseScorer + { + enum Measure { + EuclideanDistance, + TotalVariationDistance, + }; + boost::shared_ptr > > m_sid_coord; + Measure m_measure; + public: + PScoreDist(boost::shared_ptr > > const& sid_coord, + std::string const description) + { + this->m_index = -1; + this->m_num_feats = 2; + this->m_feature_names.push_back("dist-" + description + "-i"); + this->m_feature_names.push_back("dist-" + description + "-c"); + this->m_sid_coord = sid_coord; + if (description == "euc") { + this->m_measure = EuclideanDistance; + } else if (description == "var") { + this->m_measure = TotalVariationDistance; + } else { + UTIL_THROW2("Unknown specification \"" + << description << "\" for dist phrase scorer (one of: euc var)"); + } + } + + void + operator()(Bitext const& bt, + PhrasePair& pp, + ttasksptr const& ttask, + std::vector * dest = NULL) const + { + if (!dest) { + dest = &pp.fvals; + } + // Coordinates of input + std::vector const& input = *(ttask->GetSource()->m_coord); + // Coordinates of training data centroid + std::vector centroid = std::vector((*m_sid_coord)[0].size()); + BOOST_FOREACH(int const sid, pp.sids) { + std::vector const& point = (*m_sid_coord)[sid]; + for (size_t i = 0; i < centroid.size(); ++i) { + centroid[i] += point[i]; + } + } + for (size_t i = 0; i < centroid.size(); ++i) { + centroid[i] /= pp.sids.size(); + } + // Compute log-average-distance of specified type from the training points + // to both the input sentence and training centroid (max distance with + // float epsilon to avoid domain error) + float input_distance = 0; + float centroid_distance = 0; + if (m_measure == EuclideanDistance) { + BOOST_FOREACH(int const sid, pp.sids) { + std::vector const& point = (*m_sid_coord)[sid]; + float input_point_distance = 0; + float centroid_point_distance = 0; + for (size_t i = 0; i < input.size(); ++i) { + input_point_distance += pow(input[i] - point[i], 2); + centroid_point_distance += pow(centroid[i] - point[i], 2); + } + input_distance += sqrt(input_point_distance); + centroid_distance += sqrt(centroid_point_distance); + } + } else if (m_measure == TotalVariationDistance) { + BOOST_FOREACH(int const sid, pp.sids) { + std::vector const& point = (*m_sid_coord)[sid]; + float input_point_distance = 0; + float centroid_point_distance = 0; + for (size_t i = 0; i < input.size(); ++i) { + input_point_distance += std::abs(input[i] - point[i]); + centroid_point_distance += std::abs(centroid[i] - point[i]); + } + input_distance += input_point_distance / 2; + centroid_distance += centroid_point_distance / 2; + } + } + input_distance /= pp.sids.size(); + centroid_distance /= pp.sids.size(); + (*dest)[this->m_index] = log(std::max(input_distance, Moses::FLOAT_EPSILON)); + (*dest)[this->m_index + 1] = log(std::max(centroid_distance, Moses::FLOAT_EPSILON)); + } + }; +} diff --git a/moses/TranslationModel/UG/sapt_pscore_length_ratio.h b/moses/TranslationModel/UG/sapt_pscore_length_ratio.h index 356217caa..28452ad49 100644 --- a/moses/TranslationModel/UG/sapt_pscore_length_ratio.h +++ b/moses/TranslationModel/UG/sapt_pscore_length_ratio.h @@ -48,8 +48,9 @@ namespace sapt { void operator()(Bitext const& bt, - PhrasePair& pp, - std::vector * dest = NULL) const + PhrasePair& pp, + ttasksptr const& ttask, + std::vector * dest = NULL) const { if (!dest) dest = &pp.fvals; float p = float(bt.T1->numTokens()); diff --git a/moses/TranslationModel/UG/sapt_pscore_lex1.h b/moses/TranslationModel/UG/sapt_pscore_lex1.h index 4ae94502b..8270db951 100644 --- a/moses/TranslationModel/UG/sapt_pscore_lex1.h +++ b/moses/TranslationModel/UG/sapt_pscore_lex1.h @@ -36,8 +36,9 @@ namespace sapt void operator()(Bitext const& bt, - PhrasePair& pp, - std::vector * dest = NULL) const + PhrasePair& pp, + ttasksptr const& ttask, + std::vector * dest = NULL) const { if (!dest) dest = &pp.fvals; // uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0; diff --git a/moses/TranslationModel/UG/sapt_pscore_logcnt.h b/moses/TranslationModel/UG/sapt_pscore_logcnt.h index 592d86866..4f4a44b86 100644 --- a/moses/TranslationModel/UG/sapt_pscore_logcnt.h +++ b/moses/TranslationModel/UG/sapt_pscore_logcnt.h @@ -37,8 +37,9 @@ namespace sapt { void operator()(Bitext const& bt, - PhrasePair& pp, - std::vector * dest = NULL) const + PhrasePair& pp, + ttasksptr const& ttask, + std::vector * dest = NULL) const { if (!dest) dest = &pp.fvals; assert(pp.raw1); diff --git a/moses/TranslationModel/UG/sapt_pscore_pbwd.h b/moses/TranslationModel/UG/sapt_pscore_pbwd.h index 35c7e1fa9..c990dd1ef 100644 --- a/moses/TranslationModel/UG/sapt_pscore_pbwd.h +++ b/moses/TranslationModel/UG/sapt_pscore_pbwd.h @@ -38,8 +38,9 @@ namespace sapt void operator()(Bitext const& bt, - PhrasePair& pp, - std::vector * dest = NULL) const + PhrasePair& pp, + ttasksptr const& ttask, + std::vector * dest = NULL) const { if (!dest) dest = &pp.fvals; // we use the denominator specification to scale the raw counts on the diff --git a/moses/TranslationModel/UG/sapt_pscore_pfwd.h b/moses/TranslationModel/UG/sapt_pscore_pfwd.h index bfa8027d1..291e4c10f 100644 --- a/moses/TranslationModel/UG/sapt_pscore_pfwd.h +++ b/moses/TranslationModel/UG/sapt_pscore_pfwd.h @@ -38,8 +38,10 @@ namespace sapt } void - operator()(Bitext const& bt, PhrasePair & pp, - std::vector * dest = NULL) const + operator()(Bitext const& bt, + PhrasePair& pp, + ttasksptr const& ttask, + std::vector * dest = NULL) const { if (!dest) dest = &pp.fvals; if (pp.joint > pp.good1) diff --git a/moses/TranslationModel/UG/sapt_pscore_phrasecount.h b/moses/TranslationModel/UG/sapt_pscore_phrasecount.h index a1426426a..18b225dac 100644 --- a/moses/TranslationModel/UG/sapt_pscore_phrasecount.h +++ b/moses/TranslationModel/UG/sapt_pscore_phrasecount.h @@ -22,8 +22,9 @@ namespace sapt void operator()(Bitext const& bt, - PhrasePair& pp, - std::vector * dest = NULL) const + PhrasePair& pp, + ttasksptr const& ttask, + std::vector * dest = NULL) const { if (!dest) dest = &pp.fvals; (*dest)[this->m_index] = 1; diff --git a/moses/TranslationModel/UG/sapt_pscore_provenance.h b/moses/TranslationModel/UG/sapt_pscore_provenance.h index 67ee74850..4204a1314 100644 --- a/moses/TranslationModel/UG/sapt_pscore_provenance.h +++ b/moses/TranslationModel/UG/sapt_pscore_provenance.h @@ -28,8 +28,9 @@ namespace sapt { void operator()(Bitext const& bt, - PhrasePair& pp, - std::vector * dest = NULL) const + PhrasePair& pp, + ttasksptr const& ttask, + std::vector * dest = NULL) const { if (!dest) dest = &pp.fvals; size_t i = this->m_index; diff --git a/moses/TranslationModel/UG/sapt_pscore_rareness.h b/moses/TranslationModel/UG/sapt_pscore_rareness.h index c36da1913..aba9bbbcf 100644 --- a/moses/TranslationModel/UG/sapt_pscore_rareness.h +++ b/moses/TranslationModel/UG/sapt_pscore_rareness.h @@ -26,8 +26,9 @@ namespace sapt { void operator()(Bitext const& bt, - PhrasePair& pp, - std::vector * dest = NULL) const + PhrasePair& pp, + ttasksptr const& ttask, + std::vector * dest = NULL) const { if (!dest) dest = &pp.fvals; size_t i = this->m_index; diff --git a/moses/TranslationModel/UG/sapt_pscore_unaligned.h b/moses/TranslationModel/UG/sapt_pscore_unaligned.h index 4201b839c..8bff82b1f 100644 --- a/moses/TranslationModel/UG/sapt_pscore_unaligned.h +++ b/moses/TranslationModel/UG/sapt_pscore_unaligned.h @@ -37,8 +37,9 @@ namespace sapt void operator()(Bitext const& bt, - PhrasePair& pp, - std::vector * dest = NULL) const + PhrasePair& pp, + ttasksptr const& ttask, + std::vector * dest = NULL) const { if (!dest) dest = &pp.fvals; // uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0; diff --git a/moses/TranslationModel/UG/sapt_pscore_wordcount.h b/moses/TranslationModel/UG/sapt_pscore_wordcount.h index 6cd9e7c0c..e1747f380 100644 --- a/moses/TranslationModel/UG/sapt_pscore_wordcount.h +++ b/moses/TranslationModel/UG/sapt_pscore_wordcount.h @@ -22,8 +22,9 @@ namespace sapt void operator()(Bitext const& bt, - PhrasePair& pp, - std::vector * dest = NULL) const + PhrasePair& pp, + ttasksptr const& ttask, + std::vector * dest = NULL) const { if (!dest) dest = &pp.fvals; (*dest)[this->m_index] = pp.len2; -- cgit v1.2.3