#ifndef moses_BilingualDynSuffixArray_h
#define moses_BilingualDynSuffixArray_h

#include "DynSuffixArray.h"
#include "moses/TranslationModel/DynSAInclude/vocab.h"
#include "moses/TranslationModel/DynSAInclude/types.h"
#include "moses/TranslationModel/DynSAInclude/utils.h"
#include "moses/TranslationModel/WordCoocTable.h"
#include "moses/InputFileStream.h"
#include "moses/FactorTypeSet.h"
#include "moses/TargetPhrase.h"
#include <boost/dynamic_bitset.hpp>
#include "moses/TargetPhraseCollection.h"
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

using namespace std;
namespace Moses
{
class PhraseDictionaryDynSuffixArray;
class PhraseDictionary;

/** Phrase represented as a sequence of vocabulary word IDs.
 *  @todo ask Abbey Levenberg
 */
class SAPhrase
{
public:
  vector<wordID_t> words;

  SAPhrase(size_t phraseSize)
    :words(phraseSize) {
  }

  void SetId(size_t pos, wordID_t id) {
    words.at(pos) = id;
  }
  bool operator<(const SAPhrase& phr2) const {
    return words < phr2.words;
  }
};

/** Source/target phrase pair, stored as inclusive word spans
 *  (start..end) within one sentence pair of the corpus.
 *  @todo ask Abbey Levenberg
 */
class PhrasePair
{
public:
  int m_startTarget, m_endTarget, m_startSource, m_endSource, m_sntIndex;
  PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex)
    : m_startTarget(startTarget)
    , m_endTarget(endTarget)
    , m_startSource(startSource)
    , m_endSource(endSource)
    , m_sntIndex(sntIndex) {
  }

  size_t GetTargetSize() const {
    return m_endTarget - m_startTarget + 1;
  }

  size_t GetSourceSize() const {
    return m_endSource - m_startSource + 1;
  }
};
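
// Illustrative example (not from the original source): spans are inclusive, so
// PhrasePair(2, 4, 1, 3, 7) covers target words 2..4 and source words 1..3 of
// sentence 7, giving GetTargetSize() == 3 and GetSourceSize() == 3.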

/** Word alignment for one sentence pair; Extract() collects the phrase
 *  pairs consistent with the alignment for a given source span.
 *  @todo ask Abbey Levenberg
 */
class SentenceAlignment
{
public:
  SentenceAlignment(int sntIndex, int sourceSize, int targetSize);
  int m_sntIndex;
  vector<wordID_t>* trgSnt;
  vector<wordID_t>* srcSnt;
  vector<int> numberAligned;
  vector< vector<int> > alignedList;
  bool Extract(int maxPhraseLength, vector<PhrasePair*> &ret,
               int startSource, int endSource) const;
};

/** Comparator over score vectors: currently compares only the first
 *  component, p(e|f), as an approximation (see the commented-out
 *  weighted variant below).
 */
class ScoresComp
{
public:
  ScoresComp(const vector<float>& weights) {
    // Note: the weights are currently unused by the comparison below.
  }
  bool operator()(const Scores& s1, const Scores& s2) const {
    return s1[0] < s2[0]; // just p(e|f) as approximation
    // float score1(0), score2(0);
    // int idx1(0), idx2(0);
    // for (Scores::const_iterator itr = s1.begin();
    // 	 itr != s1.end(); ++itr) {
    //   score1 += log(*itr * m_weights.at(idx1++));
    // }
    // for (Scores::const_iterator itr = s2.begin();
    // 	 itr != s2.end(); ++itr) {
    //   score2 += log(*itr * m_weights.at(idx2++));
    // }
    // return score1 < score2;
  }
};

/** Orders (Scores, SAPhrase*) candidate pairs using a ScoresComp. */
struct BetterPhrase {
  ScoresComp const& cmp;
  BetterPhrase(ScoresComp const& sc);
  // bool operator()(pair<Scores, TargetPhrase const*> const& a,
  // pair<Scores, TargetPhrase const*> const& b) const;
  bool operator()(pair<Scores, SAPhrase const*> const& a,
                  pair<Scores, SAPhrase const*> const& b) const;
};

/** Phrase table backed by dynamic suffix arrays over the source and target
 *  sides of a word-aligned parallel corpus; translation options are sampled,
 *  extracted and scored on demand rather than precomputed.
 *  @todo ask Abbey Levenberg
 */
class BilingualDynSuffixArray
{
public:
  BilingualDynSuffixArray();
  ~BilingualDynSuffixArray();
  bool Load( const vector<FactorType>& inputFactors,
             const vector<FactorType>& outputFactors,
             string source, string target, string alignments,
             const vector<float> &weight);
  // bool LoadTM( const vector<FactorType>& inputFactors,
  // 	     const vector<FactorType>& outputTactors,
  // 	     string source, string target, string alignments,
  // 	     const vector<float> &weight);
  void GetTargetPhrasesByLexicalWeight(const Phrase& src, vector< pair<Scores, TargetPhrase*> >& target) const;

  void CleanUp(const InputType& source);
  void addSntPair(string& source, string& target, string& alignment);
  pair<float,float>
  GatherCands(Phrase const& src, map<SAPhrase, vector<float> >& pstats) const;

  TargetPhrase*
  GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase, const PhraseDictionary *pt) const;

private:

  mutable WordCoocTable m_wrd_cooc;  // source/target word co-occurrence counts
  DynSuffixArray * m_srcSA;          // suffix array over the source corpus
  DynSuffixArray * m_trgSA;          // suffix array over the target corpus
  vector<wordID_t>* m_srcCorpus;     // source corpus as a flat stream of word IDs
  vector<wordID_t>* m_trgCorpus;     // target corpus as a flat stream of word IDs
  vector<FactorType> m_inputFactors;
  vector<FactorType> m_outputFactors;

  vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;  // sentence start offsets into the corpora

  Vocab* m_srcVocab, *m_trgVocab;
  ScoresComp* m_scoreCmp;

  vector<SentenceAlignment> m_alignments;
  vector<vector<short> > m_rawAlignments;

  // cache of per-word-pair lexical scores (filled by CacheWordProbs)
  mutable map<pair<wordID_t, wordID_t>, pair<float, float> > m_wordPairCache;
  mutable set<wordID_t> m_freqWordsCached;
  const size_t m_maxPhraseLength, m_maxSampleSize;
  const size_t m_maxPTEntries;
  int LoadCorpus(FactorDirection direction,
                 InputFileStream&, const vector<FactorType>& factors,
                 vector<wordID_t>&, vector<wordID_t>&,
                 Vocab*);
  int LoadAlignments(InputFileStream& aligs);
  int LoadRawAlignments(InputFileStream& aligs);
  int LoadRawAlignments(string& aligs);

  bool ExtractPhrases(const int&, const int&, const int&, vector<PhrasePair*>&, bool=false) const;
  SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
  int SampleSelection(vector<unsigned>&, int = 300) const;

  vector<int> GetSntIndexes(vector<unsigned>&, int, const vector<unsigned>&) const;
  SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
  bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
  void CacheWordProbs(wordID_t) const;
  void CacheFreqWords() const;
  void ClearWordInCache(wordID_t);
  pair<float, float> GetLexicalWeight(const PhrasePair&) const;

  int GetSourceSentenceSize(size_t sentenceId) const;
  int GetTargetSentenceSize(size_t sentenceId) const;

};
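
/* Illustrative usage sketch (not part of the original header); the file names,
 * factor lists, and weights below are placeholders:
 *
 *   BilingualDynSuffixArray bdsa;
 *   std::vector<FactorType> inFactors(1, 0), outFactors(1, 0);
 *   std::vector<float> weights(5, 0.2f);
 *   bdsa.Load(inFactors, outFactors,
 *             "corpus.src", "corpus.trg", "corpus.align", weights);
 *
 *   std::map<SAPhrase, std::vector<float> > pstats;
 *   bdsa.GatherCands(srcPhrase, pstats);  // srcPhrase: a Moses Phrase built elsewhere
 */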
} // end namespace Moses
#endif