Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/training/memscore/phrasetable.cpp')
-rw-r--r--scripts/training/memscore/phrasetable.cpp348
1 files changed, 0 insertions, 348 deletions
diff --git a/scripts/training/memscore/phrasetable.cpp b/scripts/training/memscore/phrasetable.cpp
deleted file mode 100644
index 9f430c289..000000000
--- a/scripts/training/memscore/phrasetable.cpp
+++ /dev/null
@@ -1,348 +0,0 @@
-// memscore - in-memory phrase scoring for Statistical Machine Translation
-// Christian Hardmeier, FBK-irst, Trento, 2010
-// $Id$
-
-#include "phrasetable.h"
-#include "statistic.h"
-#include "timestamp.h"
-
-#include <iostream>
-#include <limits>
-#include <sstream>
-#include <string>
-
-/* PhraseText */
-
-PhraseText::DictionaryType_ PhraseText::dictionary_;
-Count PhraseText::last_id_ = 1;
-
-PhraseText::PhraseText(const String &s)
-{
- IStringStream is(s);
- while(is.good()) {
- String w;
- getline(is, w, ' ');
- Count *id = boost::fast_pool_allocator<Count>::allocate(1);
- *id = index_word(w);
- word_list_.push_back(id);
- }
-}
-
-std::ostream &operator<<(std::ostream &os, const PhraseText &pt)
-{
- bool print_space = false;
- for(PhraseText::const_string_iterator it = pt.string_begin(); it != pt.string_end(); it++) {
- if(print_space)
- os << ' ';
- else
- print_space = true;
-
- os << *it;
- }
-
- return os;
-}
-
-/* PhraseAlignment */
-
-PhraseAlignment::Alignment::AlignmentMapType_ PhraseAlignment::Alignment::alignment_map_;
-PhraseAlignment::Alignment::AlignmentVectorType_ PhraseAlignment::Alignment::alignment_vector_;
-
-PhraseAlignment::Alignment::Alignment(Count slen, Count tlen, const String &alignment) :
- slen_(slen), tlen_(tlen), matrix_(slen * tlen, false)
-{
- assert(slen_ > 0 && slen_ < 10);
- IStringStream is(alignment);
- while(is.good()) {
- String a;
- getline(is, a, ' ');
- IStringStream ap(a);
- Count s, t;
- char dash;
- ap >> s >> dash >> t;
- assert(s < slen && t < tlen);
- assert(dash == '-');
- matrix_[t * slen + s] = true;
- }
-}
-
-Count PhraseAlignment::Alignment::index_alignment(Count slen, Count tlen, const String &alignment)
-{
- AlignmentTuple_ tup = boost::make_tuple(slen, tlen, alignment);
- AlignmentMapType_::const_iterator it = alignment_map_.find(tup);
-
- if(it == alignment_map_.end()) {
- const Alignment *pa = new Alignment(slen, tlen, alignment);
- Count index = alignment_vector_.size();
- alignment_map_.insert(std::make_pair(tup, index));
- alignment_vector_.push_back(pa);
- return index;
- } else
- return it->second;
-}
-
-std::ostream &operator<<(std::ostream &os, const PhraseAlignment::Alignment &pa)
-{
- bool print_space = false;
- for(Count i = 0; i < pa.matrix_.size(); i++) {
- if(print_space)
- os << ' ';
- else
- print_space = true;
-
- os << (i / pa.slen_) << '-' << (i % pa.slen_);
- }
-
- return os;
-}
-
-std::ostream &operator<<(std::ostream &os, const PhraseAlignment &pa)
-{
- for(Count s = 0; s < pa.get_source_length(); s++) {
- os << '(';
- bool print_comma = false;
- for(Count t = 0; t < pa.get_target_length(); t++) {
- if(pa.is_aligned(s, t)) {
- if(print_comma)
- os << ',';
- else
- print_comma = true;
-
- os << t;
- }
- }
- os << ") ";
- }
-
- os << "|||";
-
- for(Count t = 0; t < pa.get_target_length(); t++) {
- os << " (";
- bool print_comma = false;
- for(Count s = 0; s < pa.get_source_length(); s++) {
- if(pa.is_aligned(s, t)) {
- if(print_comma)
- os << ',';
- else
- print_comma = true;
-
- os << s;
- }
- }
- os << ')';
- }
-
- return os;
-}
-
-/* PhrasePairInfo */
-
-bool PhrasePairInfo::init_phase_ = true;
-Count PhrasePairInfo::data_ncounts_ = COUNT_FREE_IDX;
-Count PhrasePairInfo::data_nscores_ = SCORE_FREE_IDX;
-const Count PhrasePairInfo::CONTINUATION_BIT = 1 << (std::numeric_limits<Count>::digits - 1);
-
-PhrasePairInfo::PhrasePairInfo(Count src, Count tgt, Count alignment, Count count) :
- src_(src), tgt_(tgt), data_(NULL), reverse_(false)
-{
- init_phase_ = false;
- realloc_data(1);
- count_data(COUNT_COUNT_IDX) = count;
- Count *aligd = alignment_data(0);
- aligd[0] = alignment;
- aligd[1] = count;
-}
-
-DataIndex PhrasePairInfo::register_score_data(Count size)
-{
- assert(init_phase_);
-
- Count start = data_nscores_;
- data_nscores_ += size;
- return start;
-}
-
-DataIndex PhrasePairInfo::register_count_data(Count size)
-{
- assert(init_phase_);
-
- Count start = data_ncounts_;
- data_ncounts_ += size;
- return start;
-}
-
-PhrasePairInfo::AlignmentVector PhrasePairInfo::get_alignments() const
-{
- PhrasePairInfo::AlignmentVector vec;
-
- Count i = 0;
- bool last;
- do {
- const Count *aligd = alignment_data(i++);
- last = !(aligd[0] & CONTINUATION_BIT);
- Count alig = aligd[0] & ~CONTINUATION_BIT;
- vec.push_back(std::make_pair(PhraseAlignment(alig, reverse_), aligd[1]));
- } while(!last);
-
- return vec;
-}
-
-void PhrasePairInfo::add_alignment(Count new_alignment)
-{
- Count i = 0;
- bool last;
- do {
- Count *aligd = alignment_data(i++);
- last = !(aligd[0] & CONTINUATION_BIT);
- Count alig = aligd[0] & ~CONTINUATION_BIT;
- if(alig == new_alignment) {
- aligd[1]++;
- return;
- }
- } while(!last);
-
- realloc_data(i + 1);
-
- Count *last_aligd = alignment_data(i - 1);
- last_aligd[0] |= CONTINUATION_BIT;
-
- Count *this_aligd = alignment_data(i);
- this_aligd[0] = new_alignment;
- this_aligd[1] = 1;
-}
-
-void PhrasePairInfo::realloc_data(Count nalignments)
-{
- static boost::pool<> *pool[3] = { NULL, NULL, NULL };
-
- size_t fixed_size = data_nscores_ * sizeof(Score) + data_ncounts_ * sizeof(Count);
- size_t new_data_size = fixed_size + COUNTS_PER_ALIGNMENT * nalignments * sizeof(Count);
-
- PhrasePairData new_data;
- if(nalignments <= 3) {
- if(!pool[nalignments - 1])
- pool[nalignments - 1] = new boost::pool<>(new_data_size);
-
- new_data = reinterpret_cast<PhrasePairData>(pool[nalignments - 1]->malloc());
- } else
- new_data = new char[new_data_size];
-
- if(data_) {
- memcpy(new_data, data_, fixed_size);
- Count i = 0;
- Count *old_aligd, *new_aligd;
- do {
- assert(i < nalignments);
- old_aligd = alignment_data(data_, i);
- new_aligd = alignment_data(new_data, i);
- new_aligd[0] = old_aligd[0];
- new_aligd[1] = old_aligd[1];
- i++;
- } while(old_aligd[0] & CONTINUATION_BIT);
- if(nalignments <= 4)
- pool[nalignments - 2]->free(data_);
- else
- delete[] data_;
- }
-
- data_ = new_data;
-}
-
-/* PhraseInfoList */
-
-Phrase PhraseInfoList::index_phrase(const String &s_phr)
-{
- IDMapType_::const_iterator it = idmap_.find(s_phr);
- if(it != idmap_.end())
- return it->second;
-
- PhraseInfo *pi = phrase_info_pool_.construct(data_size_, s_phr);
-
- list_.push_back(pi);
- idmap_[s_phr] = list_.size() - 1;
- return idmap_[s_phr];
-}
-
-DataIndex PhraseInfoList::register_data(Count size)
-{
- DataIndex start = data_size_;
- data_size_ += size;
- return start;
-}
-
-void PhraseInfoList::attach_statistic(PhraseStatistic &s)
-{
- statistics_.push_back(&s);
- s.attach(*this);
-}
-
-void PhraseInfoList::compute_statistics()
-{
- while(!statistics_.empty()) {
- statistics_.front()->compute_statistic();
- statistics_.pop_front();
- }
-}
-
-/* PhraseTable */
-
-void MemoryPhraseTable::load_data(std::istream &instream)
-{
- Count total_count = 0;
-
- Timestamp t_load;
- Count nlines = 1;
- String line;
- while(getline(instream, line)) {
- size_t sep1 = line.find(" ||| ");
- if(sep1 == line.npos) {
- std::cerr << "Phrase separator not found in: " << line << std::endl;
- abort();
- }
- size_t sep2 = line.find(" ||| ", sep1 + 1);
- String s_src(line, 0, sep1);
- String s_tgt(line, sep1 + 5, sep2 - sep1 - 5);
- String s_alignment(line, sep2 + 5);
-
- Phrase src = src_info_.index_phrase(s_src);
- Phrase tgt = tgt_info_.index_phrase(s_tgt);
- Count alignment = PhraseAlignment::index_alignment(src_info_[src].get_phrase().size(), tgt_info_[tgt].get_phrase().size(), s_alignment);
-
- src_info_[src].inc_count();
- tgt_info_[tgt].inc_count();
- total_count++;
-
- PhrasePair stpair(src, tgt);
- PhrasePairCounts::iterator it = joint_counts_.find(stpair);
-
- if(it == joint_counts_.end()) {
- src_info_[src].inc_distinct();
- tgt_info_[tgt].inc_distinct();
- joint_counts_.insert(std::make_pair(stpair, PhrasePairInfo(src, tgt, alignment, 1).get_phrase_pair_data()));
- } else {
- PhrasePairInfo pi(src, tgt, it->second);
- pi.inc_count();
- pi.add_alignment(alignment);
- it->second = pi.get_phrase_pair_data(); // may have changed by adding the alignment
- }
- if(nlines % 50000 == 0)
- std:: cerr << "Read " << nlines << " lines in " << (t_load.elapsed_time() / 1000) << " ms." << std::endl;
- nlines++;
- }
-}
-
-void MemoryPhraseTable::attach_src_statistic(PhraseStatistic &s)
-{
- src_info_.attach_statistic(s);
-}
-
-void MemoryPhraseTable::attach_tgt_statistic(PhraseStatistic &s)
-{
- tgt_info_.attach_statistic(s);
-}
-
-void MemoryPhraseTable::compute_phrase_statistics()
-{
- src_info_.compute_statistics();
- tgt_info_.compute_statistics();
-}