phrase-extract/extract-lex.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123

#pragma once

#include <map>
#include <set>
#include <sstream>
#include <fstream>
#include <iostream>

namespace MosesTraining
{


//! convert string to variable of type T. Used to reading floats, int etc from files
template<typename T>
inline T Scan(const std::string &input)
{
  std::stringstream stream(input);
  T ret;
  stream >> ret;
  return ret;
}


//! speeded up version of above
template<typename T>
inline void Scan(std::vector<T> &output, const std::vector< std::string > &input)
{
  output.resize(input.size());
  for (size_t i = 0 ; i < input.size() ; i++) {
    output[i] = Scan<T>( input[i] );
  }
}


inline void Tokenize(std::vector<std::string> &output
                     , const std::string& str
                     , const std::string& delimiters = " \t")
{
  // Skip delimiters at beginning.
  std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
  // Find first "non-delimiter".
  std::string::size_type pos     = str.find_first_of(delimiters, lastPos);

  while (std::string::npos != pos || std::string::npos != lastPos) {
    // Found a token, add it to the vector.
    output.push_back(str.substr(lastPos, pos - lastPos));
    // Skip delimiters.  Note the "not_of"
    lastPos = str.find_first_not_of(delimiters, pos);
    // Find next "non-delimiter"
    pos = str.find_first_of(delimiters, lastPos);
  }
}

// speeded up version of above
template<typename T>
inline void Tokenize( std::vector<T> &output
                      , const std::string &input
                      , const std::string& delimiters = " \t")
{
  std::vector<std::string> stringVector;
  Tokenize(stringVector, input, delimiters);
  return Scan<T>(output, stringVector );
}

class WordCount
{
  friend std::ostream& operator<<(std::ostream&, const WordCount&);
public:
  float m_count;

  std::map<const std::string*, WordCount> m_coll;

  WordCount()
    :m_count(0) {
  }

  //WordCount(const WordCount &copy);

  WordCount(float count)
    :m_count(count) {
  }

  void AddCount(float incr);

  std::map<const std::string*, WordCount> &GetColl() {
    return m_coll;
  }
  const std::map<const std::string*, WordCount> &GetColl() const {
    return m_coll;
  }

  const float GetCount() const {
    return m_count;
  }

};

class Vocab
{
  std::set<std::string> m_coll;
public:
  const std::string *GetOrAdd(const std::string &word);
};

class ExtractLex
{
  Vocab m_vocab;
  std::map<const std::string*, WordCount> m_collS2T, m_collT2S;

  void Process(const std::string *target, const std::string *source);
  void Process(WordCount &wcIn, const std::string *out);
  void ProcessUnaligned(std::vector<std::string> &toksTarget, std::vector<std::string> &toksSource
                        , const std::vector<bool> &m_sourceAligned, const std::vector<bool> &m_targetAligned);

  void Output(const std::map<const std::string*, WordCount> &coll, std::ofstream &outStream);

public:
  void Process(std::vector<std::string> &toksTarget, std::vector<std::string> &toksSource, std::vector<std::string> &toksAlign, size_t lineCount);
  void Output(std::ofstream &streamLexS2T, std::ofstream &streamLexT2S);

};

} // namespace