Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Michael Denkowski <mdenkows@amazon.com> 2016-06-07 07:54:06 +0300
committer: Michael Denkowski <mdenkows@amazon.com> 2016-08-12 13:05:11 +0300
commit: 7db3fedc3b6a2498fec82ea648661140c56e1040 (patch)
tree: 05a914511d897c2464e9f6ef7a721b66c4e4819d /moses/TranslationModel
parent: a8325a3e8e24ade0806bec97492553d0550904b6 (diff)
Track sentence ids in mmsapt sampling
Diffstat (limited to 'moses/TranslationModel')
-rw-r--r-- moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h | 2
-rw-r--r-- moses/TranslationModel/UG/mm/ug_bitext_jstats.cc | 4
-rw-r--r-- moses/TranslationModel/UG/mm/ug_bitext_jstats.h | 3
-rw-r--r-- moses/TranslationModel/UG/mm/ug_bitext_pstats.cc | 4
-rw-r--r-- moses/TranslationModel/UG/mm/ug_bitext_pstats.h | 3
-rw-r--r-- moses/TranslationModel/UG/mm/ug_bitext_sampler.h | 3
-rw-r--r-- moses/TranslationModel/UG/mm/ug_phrasepair.h | 4
7 files changed, 16 insertions, 7 deletions
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h
index af862bc2d..4ab02648a 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h
@@ -90,7 +90,7 @@ Bitext<Token>::agenda
size_t raw2 = b->approxOccurrenceCount();
float bwgt = j->m_bias ? (*j->m_bias)[sid] : 1;
j->stats->add(tpid, sample_weight, bwgt, aln, raw2,
- po_fwd, po_bwd, docid);
+ po_fwd, po_bwd, docid, sid);
bool ok = (i == e2) || b->extend(o[i].id());
UTIL_THROW_IF2(!ok, "Could not extend target phrase.");
}
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc
index 2fa9a49f5..628d4364c 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc
+++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc
@@ -29,6 +29,7 @@ namespace sapt
my_wcnt = other.wcnt();
my_bcnt = other.bcnt();
my_aln = other.aln();
+ sids = other.sids;
indoc = other.indoc;
for (int i = 0; i <= LRModel::NONE; i++)
{
@@ -56,7 +57,7 @@ namespace sapt
size_t
jstats::
add(float w, float b, std::vector<unsigned char> const& a, uint32_t const cnt2,
- uint32_t fwd_orient, uint32_t bwd_orient, int const docid)
+ uint32_t fwd_orient, uint32_t bwd_orient, int const docid, int const sid)
{
boost::lock_guard<boost::mutex> lk(this->lock);
my_cnt2 = cnt2;
@@ -76,6 +77,7 @@ namespace sapt
}
++ofwd[fwd_orient];
++obwd[bwd_orient];
+ sids.push_back(sid);
if (docid >= 0)
{
// while (int(indoc.size()) <= docid) indoc.push_back(0);
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h
index d8e0bb18a..b66aee126 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h
@@ -28,6 +28,7 @@ namespace sapt
uint32_t obwd[LRModel::NONE+1]; // backward distortion type counts
public:
+ std::vector<uint32_t> sids; // list of sentence ids in this sample
std::map<uint32_t,uint32_t> indoc;
// std::vector<uint32_t> indoc; // counts origin of samples (for biased sampling)
jstats();
@@ -41,7 +42,7 @@ namespace sapt
size_t
add(float w, float b, std::vector<unsigned char> const& a, uint32_t const cnt2,
- uint32_t fwd_orient, uint32_t bwd_orient, int const docid);
+ uint32_t fwd_orient, uint32_t bwd_orient, int const docid, int const sid);
void invalidate();
void validate();
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc
index f1602ab96..e603def96 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc
+++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc
@@ -69,11 +69,11 @@ namespace sapt
std::vector<unsigned char> const& a,
uint32_t const cnt2,
uint32_t fwd_o,
- uint32_t bwd_o, int const docid)
+ uint32_t bwd_o, int const docid, int const sid)
{
boost::lock_guard<boost::mutex> guard(this->lock);
jstats& entry = this->trg[pid];
- size_t ret = entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid);
+ size_t ret = entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid, sid);
if (this->good < entry.rcnt())
{
UTIL_THROW(util::Exception, "more joint counts than good counts:"
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h
index cdc4f0c3d..e4481ee52 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h
@@ -49,7 +49,8 @@ namespace sapt
uint32_t const cnt2, // raw target phrase count
uint32_t fwd_o, // fwd. phrase orientation
uint32_t bwd_o, // bwd. phrase orientation
- int const docid); // document where sample was found
+ int const docid, // document where sample was found
+ int const sid); // index of sentence where sample was found
void
count_sample(int const docid, // document where sample was found
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h
index e62d32e48..c94b2b149 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h
@@ -332,7 +332,8 @@ consider_sample(TokenPosition const& p)
size_t raw2 = b->approxOccurrenceCount();
size_t evid = m_stats->add(tpid, sample_weight,
m_bias ? (*m_bias)[p.sid] : 1,
- aln, raw2, rec.po_fwd, rec.po_bwd, docid);
+ aln, raw2, rec.po_fwd, rec.po_bwd, docid,
+ p.sid);
max_evidence = std::max(max_evidence, evid);
bool ok = (i == rec.e2) || b->extend(o[i].id());
UTIL_THROW_IF2(!ok, "Could not extend target phrase.");
diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h
index 7087c20af..d3fc63ce4 100644
--- a/moses/TranslationModel/UG/mm/ug_phrasepair.h
+++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h
@@ -31,6 +31,8 @@ namespace sapt
std::vector<unsigned char> aln;
float score;
bool inverse;
+ std::vector<uint32_t> sids; // list of sampled sentence ids where this
+ // phrase pair was found
// std::vector<uint32_t> indoc;
std::map<uint32_t,uint32_t> indoc;
PhrasePair() { };
@@ -132,6 +134,7 @@ namespace sapt
dbwd[i] = js.dcnt_bwd(po);
}
+ sids = js.sids;
indoc = js.indoc;
return *this;
}
@@ -199,6 +202,7 @@ namespace sapt
, aln(o.aln)
, score(o.score)
, inverse(o.inverse)
+ , sids(o.sids)
, indoc(o.indoc)
{
for (int i = 0; i <= LRModel::NONE; ++i)