Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'moses/TranslationModel/UG/mm')
-rw-r--r-- moses/TranslationModel/UG/mm/ug_bitext.h | 6
-rw-r--r-- moses/TranslationModel/UG/mm/ug_bitext_agenda.h | 8
-rw-r--r-- moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h | 10
-rw-r--r-- moses/TranslationModel/UG/mm/ug_bitext_jstats.cc | 9
-rw-r--r-- moses/TranslationModel/UG/mm/ug_bitext_jstats.h | 3
-rw-r--r-- moses/TranslationModel/UG/mm/ug_bitext_moses.h | 9
-rw-r--r-- moses/TranslationModel/UG/mm/ug_bitext_pstats.cc | 6
-rw-r--r-- moses/TranslationModel/UG/mm/ug_bitext_pstats.h | 5
-rw-r--r-- moses/TranslationModel/UG/mm/ug_bitext_sampler.h | 9
9 files changed, 41 insertions, 24 deletions
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h
index 6c3a73457..2c835af7e 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext.h
@@ -130,7 +130,6 @@ namespace sapt
mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
// caches for unbiased sampling; biased sampling uses the caches that
// are stored locally on the translation task
-
public:
SPTR<Ttrack<char> > Tx; // word alignments
SPTR<Ttrack<Token> > T1; // token track
@@ -164,7 +163,8 @@ namespace sapt
#ifndef NO_MOSES
SPTR<pstats>
- prep2(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
+ prep2(ttasksptr const& ttask, iter const& phrase, bool const track_sids,
+ int max_sample = -1) const;
#endif
protected:
@@ -189,7 +189,7 @@ namespace sapt
SPTR<pstats>
lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
- void prep(ttasksptr const& ttask, iter const& phrase) const;
+ void prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const;
#endif
void setDefaultSampleSize(size_t const max_samples);
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h
index bc038bd03..8865d4cd1 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h
@@ -33,7 +33,8 @@ public:
SPTR<pstats>
add_job(Bitext<Token> const* const theBitext,
typename TSA<Token>::tree_iterator const& phrase,
- size_t const max_samples, SPTR<SamplingBias const> const& bias);
+ size_t const max_samples, SPTR<SamplingBias const> const& bias,
+ bool const track_sids);
// add_job(Bitext<Token> const* const theBitext,
// typename TSA<Token>::tree_iterator const& phrase,
// size_t const max_samples, SamplingBias const* const bias);
@@ -93,13 +94,14 @@ SPTR<pstats> Bitext<Token>
::agenda
::add_job(Bitext<Token> const* const theBitext,
typename TSA<Token>::tree_iterator const& phrase,
- size_t const max_samples, SPTR<SamplingBias const> const& bias)
+ size_t const max_samples, SPTR<SamplingBias const> const& bias,
+ bool const track_sids)
{
boost::unique_lock<boost::mutex> lk(this->lock);
static boost::posix_time::time_duration nodelay(0,0,0,0);
bool fwd = phrase.root == bt.I1.get();
SPTR<job> j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2,
- max_samples, fwd, bias));
+ max_samples, fwd, bias, track_sids));
j->stats->register_worker();
joblist.push_back(j);
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h
index 7312ecef4..2ac7a5c35 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h
@@ -35,6 +35,8 @@ public:
SPTR<pstats> stats; // stores statistics collected during sampling
SPTR<SamplingBias const> const m_bias; // sentence-level bias for sampling
float bias_total;
+ bool m_track_sids; // track sentence ids in sample?
+
bool nextSample(uint64_t & sid, uint64_t & offset); // select next occurrence
int
@@ -46,7 +48,7 @@ public:
job(Bitext<Token> const* const theBitext,
typename TSA<Token>::tree_iterator const& m,
SPTR<TSA<Token> > const& r, size_t maxsmpl, bool isfwd,
- SPTR<SamplingBias const> const& bias);
+ SPTR<SamplingBias const> const& bias, bool const track_sids);
~job();
};
@@ -66,7 +68,8 @@ Bitext<Token>::agenda::job
::job(Bitext<Token> const* const theBitext,
typename TSA<Token>::tree_iterator const& m,
SPTR<TSA<Token> > const& r, size_t maxsmpl,
- bool isfwd, SPTR<SamplingBias const> const& bias)
+ bool isfwd, SPTR<SamplingBias const> const& bias,
+ bool const track_sids)
: m_bitext(theBitext)
, rnd(0)
, rnddenom(rnd.max() + 1.)
@@ -80,8 +83,9 @@ Bitext<Token>::agenda::job
, len(m.size())
, fwd(isfwd)
, m_bias(bias)
+ , m_track_sids(track_sids)
{
- stats.reset(new pstats());
+ stats.reset(new pstats(m_track_sids));
stats->raw_cnt = m.approxOccurrenceCount();
bias_total = 0;
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc
index 628d4364c..ab707cf9d 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc
+++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc
@@ -57,7 +57,8 @@ namespace sapt
size_t
jstats::
add(float w, float b, std::vector<unsigned char> const& a, uint32_t const cnt2,
- uint32_t fwd_orient, uint32_t bwd_orient, int const docid, int const sid)
+ uint32_t fwd_orient, uint32_t bwd_orient, int const docid,
+ uint32_t const sid, bool const track_sid)
{
boost::lock_guard<boost::mutex> lk(this->lock);
my_cnt2 = cnt2;
@@ -77,7 +78,11 @@ namespace sapt
}
++ofwd[fwd_orient];
++obwd[bwd_orient];
- sids.push_back(sid);
+ // Record sentence id if requested
+ if (track_sid)
+ {
+ sids.push_back(sid);
+ }
if (docid >= 0)
{
// while (int(indoc.size()) <= docid) indoc.push_back(0);
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h
index b66aee126..1068b47b9 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h
@@ -42,7 +42,8 @@ namespace sapt
size_t
add(float w, float b, std::vector<unsigned char> const& a, uint32_t const cnt2,
- uint32_t fwd_orient, uint32_t bwd_orient, int const docid, int const sid);
+ uint32_t fwd_orient, uint32_t bwd_orient, int const docid, uint32_t const sid,
+ bool const track_sid);
void invalidate();
void validate();
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_moses.h b/moses/TranslationModel/UG/mm/ug_bitext_moses.h
index c04d87bfd..c024d073a 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_moses.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_moses.h
@@ -30,9 +30,9 @@ lookup(ttasksptr const& ttask, iter const& phrase, int max_sample) const
template<typename Token>
void
Bitext<Token>::
-prep(ttasksptr const& ttask, iter const& phrase) const
+prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const
{
- prep2(ttask, phrase, m_default_sample_size);
+ prep2(ttask, phrase, track_sids, m_default_sample_size);
}
@@ -44,7 +44,8 @@ template<typename Token>
SPTR<pstats>
Bitext<Token>
::prep2
-( ttasksptr const& ttask, iter const& phrase, int max_sample) const
+( ttasksptr const& ttask, iter const& phrase, bool const track_sids,
+ int max_sample) const
{
if (max_sample < 0) max_sample = m_default_sample_size;
SPTR<SamplingBias> bias;
@@ -74,7 +75,7 @@ Bitext<Token>
if (m_num_workers > 1)
ag->add_workers(m_num_workers);
}
- ret = ag->add_job(this, phrase, max_sample, bias);
+ ret = ag->add_job(this, phrase, max_sample, bias, track_sids);
if (cache) cache->set(phrase.getPid(),ret);
UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job.");
return ret;
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc
index e603def96..f8c93fe3c 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc
+++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc
@@ -10,7 +10,7 @@ namespace sapt
#endif
pstats::
- pstats() : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0)
+ pstats(bool const track_sids) : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0), track_sids(track_sids)
{
for (int i = 0; i <= LRModel::NONE; ++i)
ofwd[i] = obwd[i] = 0;
@@ -69,11 +69,11 @@ namespace sapt
std::vector<unsigned char> const& a,
uint32_t const cnt2,
uint32_t fwd_o,
- uint32_t bwd_o, int const docid, int const sid)
+ uint32_t bwd_o, int const docid, uint32_t const sid)
{
boost::lock_guard<boost::mutex> guard(this->lock);
jstats& entry = this->trg[pid];
- size_t ret = entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid, sid);
+ size_t ret = entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid, sid, track_sids);
if (this->good < entry.rcnt())
{
UTIL_THROW(util::Exception, "more joint counts than good counts:"
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h
index e4481ee52..47ec33afb 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h
@@ -35,7 +35,8 @@ namespace sapt
indoc_map_t indoc;
trg_map_t trg;
- pstats();
+ bool track_sids;
+ pstats(bool const track_sids);
~pstats();
void release();
void register_worker();
@@ -50,7 +51,7 @@ namespace sapt
uint32_t fwd_o, // fwd. phrase orientation
uint32_t bwd_o, // bwd. phrase orientation
int const docid, // document where sample was found
- int const sid); // index of sentence where sample was found
+ uint32_t const sid); // index of sentence where sample was found
void
count_sample(int const docid, // document where sample was found
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h
index c94b2b149..ea27f18e8 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h
@@ -70,6 +70,7 @@ BitextSampler : public Moses::reference_counter
size_t m_num_occurrences; // estimated number of phrase occurrences in corpus
boost::taus88 m_rnd; // every job has its own pseudo random generator
double m_bias_total;
+ bool m_track_sids; // track sentence ids in stats?
size_t consider_sample(TokenPosition const& p);
size_t perform_random_sampling();
@@ -86,7 +87,8 @@ public:
SPTR<SamplingBias const> const& bias,
size_t const min_samples,
size_t const max_samples,
- sampling_method const method);
+ sampling_method const method,
+ bool const track_sids);
~BitextSampler();
SPTR<pstats> stats();
bool done() const;
@@ -185,7 +187,7 @@ BitextSampler<Token>::
BitextSampler(SPTR<Bitext<Token> const> const& bitext,
typename bitext::iter const& phrase,
SPTR<SamplingBias const> const& bias, size_t const min_samples, size_t const max_samples,
- sampling_method const method)
+ sampling_method const method, bool const track_sids)
: m_bitext(bitext)
, m_plen(phrase.size())
, m_fwd(phrase.root == bitext->I1.get())
@@ -201,8 +203,9 @@ BitextSampler(SPTR<Bitext<Token> const> const& bitext,
, m_finished(false)
, m_num_occurrences(phrase.ca())
, m_rnd(0)
+ , m_track_sids(track_sids)
{
- m_stats.reset(new pstats);
+ m_stats.reset(new pstats(m_track_sids));
m_stats->raw_cnt = phrase.ca();
m_stats->register_worker();
}