moses/src/BilingualDynSuffixArray.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145

#ifndef moses_BilingualDynSuffixArray_h
#define moses_BilingualDynSuffixArray_h

#include "TargetPhrase.h"
#include "DynSuffixArray.h" 
#include "DynSAInclude/vocab.h"
#include "DynSAInclude/types.h"
#include "DynSAInclude/utils.h"
#include "InputFileStream.h"
#include "FactorTypeSet.h"

namespace Moses {

/**
 * A phrase represented as a sequence of word ids (wordID_t).
 *
 * The phrase length is fixed at construction; individual positions are
 * filled in afterwards through SetId().
 */
class SAPhrase
{
public:
	/// Word ids of the phrase, one entry per position.
	std::vector<wordID_t> words;

	/// Builds a phrase holding phraseSize value-initialised word ids.
	SAPhrase(size_t phraseSize)
		: words(phraseSize)
	{
	}

	/**
	 * Stores id at position pos.
	 * pos is validated with CHECK before the write; what happens on failure
	 * depends on the CHECK macro, which is defined outside this header.
	 */
	void SetId(size_t pos, wordID_t id)
	{
		CHECK(pos < words.size());
		words[pos] = id;
	}

	/// Orders phrases by comparing their id vectors (std::vector's
	/// lexicographic operator<), so SAPhrase can be used as an ordered key.
	bool operator<(const SAPhrase& phr2) const
	{
		return words < phr2.words;
	}
};

/**
 * Plain record describing one extracted phrase pair: a target span, a
 * source span and the index of the sentence it belongs to.
 *
 * GetTargetSize() treats the target span as inclusive on both ends.
 */
class PhrasePair
{
public:
	int m_startTarget;  ///< first target position of the span
	int m_endTarget;    ///< last target position of the span (inclusive)
	int m_startSource;  ///< first source position of the span
	int m_endSource;    ///< last source position of the span
	int m_sntIndex;     ///< index of the sentence the pair belongs to

	/// Stores the five values unchanged; no validation is performed.
	PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex)
		: m_startTarget(startTarget),
		  m_endTarget(endTarget),
		  m_startSource(startSource),
		  m_endSource(endSource),
		  m_sntIndex(sntIndex)
	{
	}

	/// Number of target positions covered by the (inclusive) target span.
	/// The arithmetic is done in int and then converted to size_t.
	size_t GetTargetSize() const
	{
		const int lastMinusFirst = m_endTarget - m_startTarget;
		return lastMinusFirst + 1;
	}
};
	
/**
 * Alignment information for a single sentence pair.
 *
 * Only the declarations live in this header; the constructor and Extract()
 * are defined elsewhere, so the notes below on member meaning are
 * assumptions drawn from the names -- TODO confirm against the definitions.
 */
class SentenceAlignment
{
public:
	SentenceAlignment(int sntIndex, int sourceSize, int targetSize);

	/// Index of the sentence pair this alignment belongs to.
	int m_sntIndex;

	/// Raw pointers to the target / source sentence word ids.
	/// NOTE(review): ownership is not visible from this header -- confirm
	/// who allocates and frees these vectors.
	std::vector<wordID_t>* trgSnt;
	std::vector<wordID_t>* srcSnt;

	/// Presumably a per-word count of alignment links -- TODO confirm.
	std::vector<int> numberAligned;

	/// Presumably, for each word, the list of positions aligned to it --
	/// TODO confirm which side indexes the outer vector.
	std::vector< std::vector<int> > alignedList;

	/// Appends extracted phrase pairs to ret for the source span
	/// [startSource, endSource]; see the definition for the exact rules
	/// and for the meaning of the boolean result.
	bool Extract(int maxPhraseLength,
	             std::vector<PhrasePair*> &ret,
	             int startSource,
	             int endSource) const;
};
/**
 * Strict-weak-ordering functor over Scores objects.
 *
 * The comparison currently looks only at the first score of each operand.
 * The weight vector is stored by reference and is not read by the active
 * code, so the vector passed to the constructor must outlive this object.
 */
class ScoresComp
{
public:
	ScoresComp(const std::vector<float>& weights)
		: m_weights(weights)
	{
	}

	/// Returns true when the first score of s1 is smaller than that of s2.
	/// Both operands are indexed at 0 without a size check.
	bool operator()(const Scores& s1, const Scores& s2) const
	{
		// just p(e|f) as approximation
		return s1[0] < s2[0];

		// Disabled alternative kept for reference: weighted log-score sum.
		/*float score1(0), score2(0);
		int idx1(0), idx2(0);
		for (Scores::const_iterator itr = s1.begin();
		        itr != s1.end(); ++itr) {
		    score1 += log(*itr * m_weights.at(idx1++));
		}
		for (Scores::const_iterator itr = s2.begin();
		    itr != s2.end(); ++itr) {
		    score2 += log(*itr * m_weights.at(idx2++));
		}
		return score1 < score2;*/
	}

private:
	/// Non-owning reference to the caller's weights (unused by operator()).
	const std::vector<float>& m_weights;
};
	
class BilingualDynSuffixArray {
public: 
	BilingualDynSuffixArray();
	~BilingualDynSuffixArray();
	bool Load( const std::vector<FactorType>& inputFactors,
		const std::vector<FactorType>& outputTactors,
		std::string source, std::string target, std::string alignments, 
		const std::vector<float> &weight);
	void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const;
	void CleanUp();
  void addSntPair(string& source, string& target, string& alignment);
private:
	DynSuffixArray* m_srcSA;
	DynSuffixArray* m_trgSA;
	std::vector<wordID_t>* m_srcCorpus;
	std::vector<wordID_t>* m_trgCorpus;
  std::vector<FactorType> m_inputFactors;
  std::vector<FactorType> m_outputFactors;

	std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;

	Vocab* m_srcVocab, *m_trgVocab;
	ScoresComp* m_scoreCmp;

	std::vector<SentenceAlignment> m_alignments;
	std::vector<std::vector<short> > m_rawAlignments;

	mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache; 
  mutable std::set<wordID_t> m_freqWordsCached;
	const size_t m_maxPhraseLength, m_maxSampleSize;

	int LoadCorpus(InputFileStream&, const std::vector<FactorType>& factors, 
		std::vector<wordID_t>&, std::vector<wordID_t>&,
    Vocab*);
	int LoadAlignments(InputFileStream& aligs);
	int LoadRawAlignments(InputFileStream& aligs);
	int LoadRawAlignments(string& aligs);

	bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
	SentenceAlignment GetSentenceAlignment(const int, bool=false) const; 
	int SampleSelection(std::vector<unsigned>&, int = 300) const;

	std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;	
	TargetPhrase* GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const;
	SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
	bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
	void CacheWordProbs(wordID_t) const;
  void CacheFreqWords() const;
  void ClearWordInCache(wordID_t);
	std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;

	int GetSourceSentenceSize(size_t sentenceId) const
	{ 
		return (sentenceId==m_srcSntBreaks.size()-1) ? 
			m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) : 
			m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId); 
	}
	int GetTargetSentenceSize(size_t sentenceId) const
	{ 
		return (sentenceId==m_trgSntBreaks.size()-1) ?
			m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) : 
			m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId); 
	}
};
} // end namespace
#endif