#ifndef moses_BilingualDynSuffixArray_h #define moses_BilingualDynSuffixArray_h #include "TargetPhrase.h" #include "DynSuffixArray.h" #include "DynSAInclude/vocab.h" #include "DynSAInclude/types.h" #include "DynSAInclude/utils.h" #include "InputFileStream.h" #include "FactorTypeSet.h" namespace Moses { /** @todo ask Abbey Levenberg */ class SAPhrase { public: std::vector words; SAPhrase(size_t phraseSize) :words(phraseSize) {} void SetId(size_t pos, wordID_t id) { CHECK(pos < words.size()); words[pos] = id; } bool operator<(const SAPhrase& phr2) const { return words < phr2.words; } }; /** @todo ask Abbey Levenberg */ class PhrasePair { public: int m_startTarget, m_endTarget, m_startSource, m_endSource, m_sntIndex; PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex) : m_startTarget(startTarget) , m_endTarget(endTarget) , m_startSource(startSource) , m_endSource(endSource) , m_sntIndex(sntIndex) {} size_t GetTargetSize() const { return m_endTarget - m_startTarget + 1; } }; /** @todo ask Abbey Levenberg */ class SentenceAlignment { public: SentenceAlignment(int sntIndex, int sourceSize, int targetSize); int m_sntIndex; std::vector* trgSnt; std::vector* srcSnt; std::vector numberAligned; std::vector< std::vector > alignedList; bool Extract(int maxPhraseLength, std::vector &ret, int startSource, int endSource) const; }; class ScoresComp { public: ScoresComp(const std::vector& weights): m_weights(weights) {} bool operator()(const Scores& s1, const Scores& s2) const { return s1[0] < s2[0]; // just p(e|f) as approximation /*float score1(0), score2(0); int idx1(0), idx2(0); for (Scores::const_iterator itr = s1.begin(); itr != s1.end(); ++itr) { score1 += log(*itr * m_weights.at(idx1++)); } for (Scores::const_iterator itr = s2.begin(); itr != s2.end(); ++itr) { score2 += log(*itr * m_weights.at(idx2++)); } return score1 < score2;*/ } private: const std::vector& m_weights; }; /** @todo ask Abbey Levenberg */ class BilingualDynSuffixArray { public: BilingualDynSuffixArray(); ~BilingualDynSuffixArray(); bool Load( const std::vector& inputFactors, const std::vector& outputTactors, std::string source, std::string target, std::string alignments, const std::vector &weight); void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair >& target) const; void CleanUp(); void addSntPair(string& source, string& target, string& alignment); private: DynSuffixArray* m_srcSA; DynSuffixArray* m_trgSA; std::vector* m_srcCorpus; std::vector* m_trgCorpus; std::vector m_inputFactors; std::vector m_outputFactors; std::vector m_srcSntBreaks, m_trgSntBreaks; Vocab* m_srcVocab, *m_trgVocab; ScoresComp* m_scoreCmp; std::vector m_alignments; std::vector > m_rawAlignments; mutable std::map, std::pair > m_wordPairCache; mutable std::set m_freqWordsCached; const size_t m_maxPhraseLength, m_maxSampleSize; int LoadCorpus(InputFileStream&, const std::vector& factors, std::vector&, std::vector&, Vocab*); int LoadAlignments(InputFileStream& aligs); int LoadRawAlignments(InputFileStream& aligs); int LoadRawAlignments(string& aligs); bool ExtractPhrases(const int&, const int&, const int&, std::vector&, bool=false) const; SentenceAlignment GetSentenceAlignment(const int, bool=false) const; int SampleSelection(std::vector&, int = 300) const; std::vector GetSntIndexes(std::vector&, int, const std::vector&) const; TargetPhrase* GetMosesFactorIDs(const SAPhrase&) const; SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const; bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const; void CacheWordProbs(wordID_t) const; void CacheFreqWords() const; void ClearWordInCache(wordID_t); std::pair GetLexicalWeight(const PhrasePair&) const; int GetSourceSentenceSize(size_t sentenceId) const { return (sentenceId==m_srcSntBreaks.size()-1) ? m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) : m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId); } int GetTargetSentenceSize(size_t sentenceId) const { return (sentenceId==m_trgSntBreaks.size()-1) ? m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) : m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId); } }; } // end namespace #endif