Welcome to mirror list, hosted at ThFree Co, Russian Federation.

BilingualDynSuffixArray.h « TranslationModel « moses - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary refs log tree commit diff
blob: 08637d095ef17d01ad1383ad83584b53b47780a8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#ifndef moses_BilingualDynSuffixArray_h
#define moses_BilingualDynSuffixArray_h

#include "DynSuffixArray.h"
#include "moses/TranslationModel/DynSAInclude/vocab.h"
#include "moses/TranslationModel/DynSAInclude/types.h"
#include "moses/TranslationModel/DynSAInclude/utils.h"
#include "moses/InputFileStream.h"
#include "moses/FactorTypeSet.h"
#include "moses/TargetPhrase.h"

namespace Moses
{

/** A phrase held as a fixed-length sequence of wordID_t values.
 *  @todo ask Abbey Levenberg
 */
class SAPhrase
{
public:
  /** Builds a phrase with phraseSize value-initialised slots. */
  SAPhrase(size_t phraseSize) : words(phraseSize) {}

  /** Writes id into slot pos; the CHECK fails when pos is not a valid index. */
  void SetId(size_t pos, wordID_t id) {
    CHECK(pos < words.size());
    words[pos] = id;
  }

  /** Strict ordering of phrases, delegated to std::vector's operator<
   *  on the two id sequences.
   */
  bool operator<(const SAPhrase& phr2) const {
    const std::vector<wordID_t>& mine = words;
    const std::vector<wordID_t>& theirs = phr2.words;
    return mine < theirs;
  }

  // The ids themselves; public so callers can read them directly.
  std::vector<wordID_t> words;
};

/** Plain record of one phrase pair: the start/end positions of its target
 *  and source spans plus the index of the sentence it came from.
 *  @todo ask Abbey Levenberg
 */
class PhrasePair
{
public:
  // Span boundaries and sentence index, stored exactly as given to the constructor.
  int m_startTarget;
  int m_endTarget;
  int m_startSource;
  int m_endSource;
  int m_sntIndex;

  PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex)
    : m_startTarget(startTarget), m_endTarget(endTarget),
      m_startSource(startSource), m_endSource(endSource),
      m_sntIndex(sntIndex)
  {}

  /** Length of the target span, counting both end positions. */
  size_t GetTargetSize() const {
    const int inclusiveLength = m_endTarget - m_startTarget + 1;
    return inclusiveLength;
  }
};

/** Alignment record for a single sentence pair.
 *  Both the constructor and Extract() are defined outside this header.
 *  @todo ask Abbey Levenberg
 */
class SentenceAlignment
{
public:
  SentenceAlignment(int sntIndex, int sourceSize, int targetSize);

  /** Phrase-pair extraction limited by maxPhraseLength for the source
   *  positions startSource/endSource; results are delivered through ret.
   *  NOTE(review): ret holds raw PhrasePair pointers -- presumably the caller
   *  is responsible for freeing them; confirm against the definition.
   */
  bool Extract(int maxPhraseLength,
               std::vector<PhrasePair*> &ret,
               int startSource,
               int endSource) const;

  // Data members keep their original relative order (layout and
  // initialisation order are unchanged).
  int m_sntIndex;
  // Raw pointers to the target / source id sequences.
  // NOTE(review): presumably non-owning views into storage held elsewhere -- confirm.
  std::vector<wordID_t>* trgSnt;
  std::vector<wordID_t>* srcSnt;
  // NOTE(review): looks like a per-word count of alignment links -- confirm.
  std::vector<int> numberAligned;
  // NOTE(review): looks like, per word, the list of positions aligned to it -- confirm.
  std::vector< std::vector<int> > alignedList;
};
/** Comparison functor over score vectors.
 *
 *  Only the first score component takes part in the comparison. The weights
 *  handed to the constructor are retained by reference (so the caller's vector
 *  must stay alive as long as this object does) but operator() does not
 *  currently read them.
 */
class ScoresComp
{
public:
  ScoresComp(const std::vector<float>& weights)
    : m_weights(weights)
  {}

  /** @return true when the first component of s1 is smaller than that of s2. */
  bool operator()(const Scores& s1, const Scores& s2) const {
    // Approximation: rank on p(e|f) alone.
    //
    // Disabled weighted alternative, kept for reference:
    /*float score1(0), score2(0);
    int idx1(0), idx2(0);
    for (Scores::const_iterator itr = s1.begin();
            itr != s1.end(); ++itr) {
        score1 += log(*itr * m_weights.at(idx1++));
    }
    for (Scores::const_iterator itr = s2.begin();
        itr != s2.end(); ++itr) {
        score2 += log(*itr * m_weights.at(idx2++));
    }
    return score1 < score2;*/
    const bool firstIsLower = s1[0] < s2[0];
    return firstIsLower;
  }

private:
  const std::vector<float>& m_weights;
};

/** @todo ask Abbey Levenberg
 */
class BilingualDynSuffixArray
{
public:
  BilingualDynSuffixArray();
  ~BilingualDynSuffixArray();
  bool Load( const std::vector<FactorType>& inputFactors,
             const std::vector<FactorType>& outputTactors,
             std::string source, std::string target, std::string alignments,
             const std::vector<float> &weight);
  bool LoadTM( const std::vector<FactorType>& inputFactors,
               const std::vector<FactorType>& outputTactors,
               std::string source, std::string target, std::string alignments,
               const std::vector<float> &weight);
  void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const;
  void addSntPair(string& source, string& target, string& alignment);
private:
  DynSuffixArray* m_srcSA;
  DynSuffixArray* m_trgSA;
  std::vector<wordID_t>* m_srcCorpus;
  std::vector<wordID_t>* m_trgCorpus;
  std::vector<FactorType> m_inputFactors;
  std::vector<FactorType> m_outputFactors;

  std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;

  Vocab* m_srcVocab, *m_trgVocab;
  ScoresComp* m_scoreCmp;

  std::vector<SentenceAlignment> m_alignments;
  std::vector<std::vector<short> > m_rawAlignments;

  mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache;
  mutable std::set<wordID_t> m_freqWordsCached;
  const size_t m_maxPhraseLength, m_maxSampleSize;

  int LoadCorpus(FactorDirection direction, InputFileStream&, const std::vector<FactorType>& factors,
                 std::vector<wordID_t>&, std::vector<wordID_t>&,
                 Vocab*);
  int LoadAlignments(InputFileStream& aligs);
  int LoadRawAlignments(InputFileStream& aligs);
  int LoadRawAlignments(string& aligs);

  bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
  SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
  int SampleSelection(std::vector<unsigned>&, int = 300) const;

  std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;
  TargetPhrase* GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const;
  SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
  bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
  void CacheWordProbs(wordID_t) const;
  void CacheFreqWords() const;
  void ClearWordInCache(wordID_t);
  std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;

  int GetSourceSentenceSize(size_t sentenceId) const {
    return (sentenceId==m_srcSntBreaks.size()-1) ?
           m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
           m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
  }
  int GetTargetSentenceSize(size_t sentenceId) const {
    return (sentenceId==m_trgSntBreaks.size()-1) ?
           m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
           m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
  }
};
} // end namespace
#endif