diff options
Diffstat (limited to 'scripts/training/memscore/phrasetable.h')
-rw-r--r-- | scripts/training/memscore/phrasetable.h | 710 |
1 files changed, 0 insertions, 710 deletions
diff --git a/scripts/training/memscore/phrasetable.h b/scripts/training/memscore/phrasetable.h deleted file mode 100644 index 14d68d702..000000000 --- a/scripts/training/memscore/phrasetable.h +++ /dev/null @@ -1,710 +0,0 @@ -// memscore - in-memory phrase scoring for Statistical Machine Translation -// Christian Hardmeier, FBK-irst, Trento, 2010 -// $Id$ - -#ifndef PHRASETABLE_H -#define PHRASETABLE_H - -#include <cassert> -#include <iostream> -#include <iterator> -#include <list> -#include <map> -#include <set> -#include <string> -#include <vector> - -#include <boost/bimap.hpp> -#include <boost/dynamic_bitset.hpp> -#include <boost/iterator/transform_iterator.hpp> -#include <boost/pool/object_pool.hpp> -#include <boost/pool/pool_alloc.hpp> -#include <boost/ptr_container/ptr_vector.hpp> -#include <boost/tuple/tuple.hpp> -#include <boost/tuple/tuple_comparison.hpp> - -#include "datastorage.h" -#include "memscore.h" - -class PhraseText -{ - friend std::ostream &operator<<(std::ostream &os, const PhraseText &pt); - -private: - typedef boost::ptr_vector<Count,boost::view_clone_allocator> WordListType_; - typedef boost::bimap<String,Count> DictionaryType_; - - WordListType_ word_list_; - - static DictionaryType_ dictionary_; - static Count last_id_; - - typedef const String &(*LookupFunction_)(Count id); - -public: - typedef WordListType_::const_iterator const_iterator; - typedef boost::transform_iterator<LookupFunction_,WordListType_::const_iterator> const_string_iterator; - typedef WordListType_::size_type size_type; - - PhraseText(const String &s); - - const_iterator begin() const { - return word_list_.begin(); - } - - const_iterator end() const { - return word_list_.end(); - } - - const_string_iterator string_begin() const { - return boost::make_transform_iterator(word_list_.begin(), dictionary_lookup); - } - - const_string_iterator string_end() const { - return boost::make_transform_iterator(word_list_.end(), dictionary_lookup); - } - - Count operator[](size_type i) const { - return word_list_[i]; - } - - const String &word(size_type i) const { - return dictionary_lookup(operator[](i)); - } - - size_type size() const { - return word_list_.size(); - } - - static const String &dictionary_lookup(Count id) { - DictionaryType_::right_const_iterator it = dictionary_.right.find(id); - assert(it != dictionary_.right.end()); - return it->second; - } - - static Count index_word(const String &word) { - Count id; - DictionaryType_::left_const_iterator it = dictionary_.left.find(word); - if(it != dictionary_.left.end()) - id = it->second; - else { - id = last_id_++; - dictionary_.insert(DictionaryType_::value_type(word, id)); - } - return id; - } -}; - -class PhraseInfo -{ - friend class boost::object_pool<PhraseInfo>; - friend std::ostream &operator<<(std::ostream &os, const PhraseInfo &pt); - -protected: - Count data_size_; - - Count count_; - Count distinct_; - PhraseText phrase_; - Score *data_; - - Count n1_; - Count n2_; - Count n3plus_; - - PhraseInfo(Count data_size, const String &phrase) : - data_size_(data_size), count_(0), distinct_(0), phrase_(phrase), n1_(0), n2_(0), n3plus_(0) { - data_ = DataStorage<Score>::get_instance().alloc(data_size_); - } - -public: - Score &data(Count base, Count i = 0) { - assert(base + i < data_size_); - return *(data_ + base + i); - } - - const Score &data(Count base, Count i = 0) const { - assert(base + i < data_size_); - return *(data_ + base + i); - } - - Count get_count() const { - return count_; - } - - void inc_count() { - count_++; - } - - Count get_distinct() const { - return distinct_; - } - - void inc_distinct() { - distinct_++; - } - - const PhraseText &get_phrase() const { - return phrase_; - } - - void inc_n1() { - n1_++; - } - - Count get_n1() { - return n1_; - } - - void inc_n2() { - n2_++; - } - - Count get_n2() { - return n2_; - } - - void inc_n3plus() { - n3plus_++; - } - - Count get_n3plus() { - return n3plus_; - } - -}; - -inline std::ostream &operator<<(std::ostream &os, const PhraseInfo &pt) -{ - return os << pt.get_phrase(); -} - -class PhraseInfoList -{ -protected: - typedef std::map<String,Phrase> IDMapType_; - typedef boost::ptr_vector<PhraseInfo,boost::view_clone_allocator> ListType_; - //typedef std::vector<PhraseInfo *,boost::pool_allocator<PhraseInfo *> > ListType_; - //typedef std::vector<PhraseInfo> ListType_; - typedef std::list<PhraseStatistic *> StatListType_; - - IDMapType_ idmap_; - ListType_ list_; - StatListType_ statistics_; - boost::object_pool<PhraseInfo> phrase_info_pool_; - - Count data_size_; -public: - typedef ListType_::iterator iterator; - typedef ListType_::const_iterator const_iterator; - typedef ListType_::size_type size_type; - - PhraseInfoList() : data_size_(0) {} - - Phrase index_phrase(const String &s_phr); - DataIndex register_data(Count size); - void attach_statistic(PhraseStatistic &s); - void compute_statistics(); - - PhraseInfo &operator[](Phrase phr) { - return list_[phr]; - } - - iterator begin() { - return list_.begin(); - } - - iterator end() { - return list_.end(); - } - - const_iterator begin() const { - return list_.begin(); - } - - const_iterator end() const { - return list_.end(); - } - - size_type size() const { - return list_.size(); - } - -}; - -class PhraseAlignment -{ - friend std::ostream &operator<<(std::ostream &os, const PhraseAlignment &pa); - -private: - class Alignment - { - friend std::ostream &operator<<(std::ostream &os, const Alignment &pa); - - private: - typedef boost::tuple<Count,Count,String> AlignmentTuple_; - typedef std::map<AlignmentTuple_,Count> AlignmentMapType_; - typedef std::vector<const Alignment *> AlignmentVectorType_; - - static AlignmentMapType_ alignment_map_; - static AlignmentVectorType_ alignment_vector_; - - Count slen_, tlen_; - boost::dynamic_bitset<unsigned int> matrix_; - - Alignment(Count slen, Count tlen, const String &alignment); - - public: - bool is_aligned(Count s, Count t) const { - assert(t < tlen_); - assert(s < slen_); - return matrix_[t * slen_ + s]; - } - - Count get_source_length() const { - return slen_; - } - - Count get_target_length() const { - return tlen_; - } - - bool operator<(const Alignment &pa) const { - if(slen_ < pa.slen_) return true; - if(tlen_ < pa.tlen_) return true; - return (matrix_ < pa.matrix_); - } - - static Count index_alignment(Count slen, Count tlen, const String &alignment); - - static const Alignment *find(Count index) { - return alignment_vector_[index]; - } - }; - friend std::ostream &operator<<(std::ostream &os, const Alignment &pa); - - const Alignment *alignment_; - bool reverse_; - -public: - PhraseAlignment(Count index, bool reverse = false) : - alignment_(Alignment::find(index)), reverse_(reverse) {} - - bool is_aligned(Count s, Count t) const { - if(!reverse_) - return alignment_->is_aligned(s, t); - else - return alignment_->is_aligned(t, s); - } - - Count get_source_length() const { - if(!reverse_) - return alignment_->get_source_length(); - else - return alignment_->get_target_length(); - } - - Count get_target_length() const { - if(!reverse_) - return alignment_->get_target_length(); - else - return alignment_->get_source_length(); - } - - static Count index_alignment(Count slen, Count tlen, const String &alignment) { - return Alignment::index_alignment(slen, tlen, alignment); - } -}; - -typedef std::map<PhrasePair,PhrasePairData> PhrasePairCounts; - -class PhrasePairInfo -{ -protected: - static const Count CONTINUATION_BIT; - - static bool init_phase_; - static Count data_nscores_; - static Count data_ncounts_; - - enum { COUNT_COUNT_IDX = 0, COUNT_FREE_IDX }; // COUNT_FREE_IDX must remain last! - enum { SCORE_FREE_IDX = 0 }; // SCORE_FREE_IDX must remain last! - - Phrase src_, tgt_; - PhrasePairData data_; - bool reverse_; - - void realloc_data(Count nalignments); - -public: - typedef std::vector<std::pair<PhraseAlignment,Count> > AlignmentVector; - - static DataIndex register_score_data(Count size); - static DataIndex register_count_data(Count size); - - PhrasePairInfo(Count src, Count tgt, Count alignment, Count count); - - PhrasePairInfo(Count src, Count tgt, PhrasePairData data, bool reverse = false) : src_(src), tgt_(tgt), data_(data), reverse_(reverse) { - init_phase_ = false; - } - - PhrasePairInfo(const PhrasePairCounts::const_iterator &in) : - src_(in->first.first), tgt_(in->first.second), data_(in->second), reverse_(false) {} - - PhrasePairData get_phrase_pair_data() { - return data_; - } - - Phrase get_src() const { - return !reverse_ ? src_ : tgt_; - } - - Phrase get_tgt() const { - return !reverse_ ? tgt_ : src_; - } - - Count get_count() const { - return count_data(COUNT_COUNT_IDX); - } - - Score &score_data(DataIndex base, DataIndex index = 0) { - return score_data(data_, base, index); - } - - const Score &score_data(DataIndex base, DataIndex index = 0) const { - return score_data(data_, base, index); - } - - Count &count_data(DataIndex base, DataIndex index = 0) { - return count_data(data_, base, index); - } - - const Count &count_data(DataIndex base, DataIndex index = 0) const { - return count_data(data_, base, index); - } - - void inc_count() { - count_data(data_, COUNT_COUNT_IDX)++; - } - - AlignmentVector get_alignments() const; - void add_alignment(Count alignment); - -private: - static Score &score_data(PhrasePairData data, DataIndex base, DataIndex index = 0) { - return *reinterpret_cast<Score *>(data + (base + index) * sizeof(Score)); - } - - static Count &count_data(PhrasePairData data, DataIndex base, DataIndex index = 0) { - return *reinterpret_cast<Count *>(data + data_nscores_ * sizeof(Score) + (base + index) * sizeof(Count)); - } - - static const Count COUNTS_PER_ALIGNMENT = 2; - - static Count *alignment_data(PhrasePairData data, Count index) { - return reinterpret_cast<Count *>(data + data_nscores_ * sizeof(Score) + data_ncounts_ * sizeof(Count) + COUNTS_PER_ALIGNMENT * index * sizeof(Count)); - } - - Count *alignment_data(Count index) { - return alignment_data(data_, index); - } - - const Count *alignment_data(Count index) const { - return alignment_data(data_, index); - } -}; - -class PhraseTable -{ -public: - typedef PhrasePairInfo value_type; - -protected: - typedef std::iterator_traits<PhrasePairCounts::iterator>::value_type InputEntry_; - typedef value_type (*EntryTransformer_)(InputEntry_); - - static value_type pass_entry(InputEntry_ in) { - return PhrasePairInfo(in.first.first, in.first.second, in.second, false); - } - - static value_type swap_src_tgt(InputEntry_ in) { - return PhrasePairInfo(in.first.first, in.first.second, in.second, true); - } - -public: - typedef boost::transform_iterator<EntryTransformer_,PhrasePairCounts::iterator> iterator; - typedef boost::transform_iterator<EntryTransformer_,PhrasePairCounts::const_iterator> const_iterator; - - virtual ~PhraseTable() {} - - virtual PhraseInfo &get_src_phrase(Phrase src) = 0; - virtual Count n_src_phrases() const = 0; - virtual PhraseInfo &get_tgt_phrase(Phrase tgt) = 0; - virtual Count n_tgt_phrases() const = 0; - virtual PhrasePairCounts &get_joint_counts() = 0; - virtual void attach_src_statistic(PhraseStatistic &s) = 0; - virtual void attach_tgt_statistic(PhraseStatistic &s) = 0; - virtual void compute_phrase_statistics() = 0; - virtual DataIndex register_src_data(Count n) = 0; - virtual DataIndex register_tgt_data(Count n) = 0; - virtual PhraseTable &reverse() = 0; - - virtual iterator begin() = 0; - virtual iterator end() = 0; - virtual iterator find(PhrasePair p) = 0; - virtual iterator find(const PhrasePairCounts::iterator &it) = 0; - - virtual const_iterator begin() const = 0; - virtual const_iterator end() const = 0; - virtual const_iterator find(PhrasePair p) const = 0; - virtual const_iterator find(const PhrasePairCounts::const_iterator &it) const = 0; - - virtual PhrasePairCounts::iterator raw_begin() = 0; - virtual PhrasePairCounts::iterator raw_end() = 0; - virtual PhrasePairCounts::iterator raw_find(PhrasePair p) = 0; - - virtual PhrasePairCounts::const_iterator raw_begin() const = 0; - virtual PhrasePairCounts::const_iterator raw_end() const = 0; - virtual PhrasePairCounts::const_iterator raw_find(PhrasePair p) const = 0; - - /* - static iterator swap_iterator(const iterator &it) { - if(it.functor() == swap_src_tgt) - return boost::make_transform_iterator(it.base(), pass_entry); - else if(it.functor() == pass_entry) - return boost::make_transform_iterator(it.base(), swap_src_tgt); - else - abort(); - } - - static const_iterator swap_iterator(const const_iterator &it) { - if(it.functor() == swap_src_tgt) - return boost::make_transform_iterator(it.base(), pass_entry); - else if(it.functor() == pass_entry) - return boost::make_transform_iterator(it.base(), swap_src_tgt); - else - abort(); - } - */ -}; - -class ReversePhraseTable : public PhraseTable -{ -protected: - PhraseTable &phrase_table_; - -public: - ReversePhraseTable(PhraseTable &phrase_table) : - phrase_table_(phrase_table) {} - - virtual PhraseInfo &get_src_phrase(Phrase src) { - return phrase_table_.get_tgt_phrase(src); - } - - virtual Count n_src_phrases() const { - return phrase_table_.n_tgt_phrases(); - } - - virtual PhraseInfo &get_tgt_phrase(Phrase tgt) { - return phrase_table_.get_src_phrase(tgt); - } - - virtual Count n_tgt_phrases() const { - return phrase_table_.n_src_phrases(); - } - - virtual PhrasePairCounts &get_joint_counts() { - return phrase_table_.get_joint_counts(); - } - - virtual void attach_src_statistic(PhraseStatistic &s) { - return phrase_table_.attach_tgt_statistic(s); - } - - virtual void attach_tgt_statistic(PhraseStatistic &s) { - return phrase_table_.attach_src_statistic(s); - } - - virtual void compute_phrase_statistics() { - phrase_table_.compute_phrase_statistics(); - } - - virtual DataIndex register_src_data(Count n) { - return phrase_table_.register_tgt_data(n); - } - - virtual DataIndex register_tgt_data(Count n) { - return phrase_table_.register_src_data(n); - } - - virtual PhraseTable &reverse() { - return phrase_table_; - } - - virtual iterator begin() { - return boost::make_transform_iterator(phrase_table_.raw_begin(), swap_src_tgt); - } - - virtual iterator end() { - return boost::make_transform_iterator(raw_end(), swap_src_tgt); - } - - virtual iterator find(PhrasePair p) { - return boost::make_transform_iterator(raw_find(p), swap_src_tgt); - } - - virtual iterator find(const PhrasePairCounts::iterator &it) { - return boost::make_transform_iterator(it, swap_src_tgt); - } - - virtual const_iterator begin() const { - return boost::make_transform_iterator(phrase_table_.raw_begin(), swap_src_tgt); - } - - virtual const_iterator end() const { - return boost::make_transform_iterator(raw_end(), swap_src_tgt); - } - - virtual const_iterator find(PhrasePair p) const { - return boost::make_transform_iterator(raw_find(p), swap_src_tgt); - } - - virtual const_iterator find(const PhrasePairCounts::const_iterator &it) const { - return boost::make_transform_iterator(it, swap_src_tgt); - } - - virtual PhrasePairCounts::iterator raw_begin() { - return phrase_table_.raw_begin(); - } - - virtual PhrasePairCounts::iterator raw_end() { - return phrase_table_.raw_end(); - } - - virtual PhrasePairCounts::iterator raw_find(PhrasePair p) { - return phrase_table_.raw_find(std::make_pair(p.second, p.first)); - } - - virtual PhrasePairCounts::const_iterator raw_begin() const { - return phrase_table_.raw_begin(); - } - - virtual PhrasePairCounts::const_iterator raw_end() const { - return phrase_table_.raw_end(); - } - - virtual PhrasePairCounts::const_iterator raw_find(PhrasePair p) const { - return phrase_table_.raw_find(std::make_pair(p.second, p.first)); - } -}; - -class MemoryPhraseTable : public PhraseTable -{ -protected: - PhraseInfoList src_info_; - PhraseInfoList tgt_info_; - PhrasePairCounts joint_counts_; - - ReversePhraseTable reverse_; - -public: - MemoryPhraseTable() : reverse_(*this) {} - - void load_data(std::istream &instream); - - virtual PhraseInfo &get_src_phrase(Phrase src) { - assert(src < src_info_.size()); - return src_info_[src]; - } - - virtual Count n_src_phrases() const { - return src_info_.size(); - } - - virtual PhraseInfo &get_tgt_phrase(Phrase tgt) { - assert(tgt < tgt_info_.size()); - return tgt_info_[tgt]; - } - - virtual Count n_tgt_phrases() const { - return tgt_info_.size(); - } - - virtual PhrasePairCounts &get_joint_counts() { - return joint_counts_; - } - - virtual void attach_src_statistic(PhraseStatistic &s); - virtual void attach_tgt_statistic(PhraseStatistic &s); - virtual void compute_phrase_statistics(); - - virtual DataIndex register_src_data(Count n) { - return src_info_.register_data(n); - } - - virtual DataIndex register_tgt_data(Count n) { - return tgt_info_.register_data(n); - } - - virtual PhraseTable &reverse() { - return reverse_; - } - - virtual iterator begin() { - return boost::make_transform_iterator(raw_begin(), pass_entry); - } - - virtual iterator end() { - return boost::make_transform_iterator(raw_end(), pass_entry); - } - - virtual iterator find(PhrasePair p) { - return boost::make_transform_iterator(raw_find(p), pass_entry); - } - - virtual iterator find(const PhrasePairCounts::iterator &it) { - return boost::make_transform_iterator(it, pass_entry); - } - - virtual const_iterator begin() const { - return boost::make_transform_iterator(raw_begin(), pass_entry); - } - - virtual const_iterator end() const { - return boost::make_transform_iterator(raw_end(), pass_entry); - } - - virtual const_iterator find(PhrasePair p) const { - return boost::make_transform_iterator(raw_find(p), pass_entry); - } - - virtual const_iterator find(const PhrasePairCounts::const_iterator &it) const { - return boost::make_transform_iterator(it, pass_entry); - } - - virtual PhrasePairCounts::iterator raw_begin() { - return joint_counts_.begin(); - } - - virtual PhrasePairCounts::iterator raw_end() { - return joint_counts_.end(); - } - - virtual PhrasePairCounts::iterator raw_find(PhrasePair p) { - return joint_counts_.find(p); - } - - virtual PhrasePairCounts::const_iterator raw_begin() const { - return joint_counts_.begin(); - } - - virtual PhrasePairCounts::const_iterator raw_end() const { - return joint_counts_.end(); - } - - virtual PhrasePairCounts::const_iterator raw_find(PhrasePair p) const { - return joint_counts_.find(p); - } -}; - -#endif |