// vocab.h -- declares Moses::Vocab, a two-way mapping between Word objects
// and integer word ids (wordID_t).
#ifndef moses_DynSAInclude_vocab_h
#define moses_DynSAInclude_vocab_h
#include <map>
#include <string>
#include "types.h"
#include "file.h"
#include "utils.h"
#include "../TypeDef.h"
#include "../Word.h"
namespace Moses
{
// Vocab maps between strings and uint32 ids.
class Vocab
{
public:
typedef std::map<Word, wordID_t> Word2Id;
typedef std::map<wordID_t, Word> Id2Word;
Vocab(bool sntMarkers = true):
m_closed(false),
m_kOOVWordID(0),
m_kBOSWordID(1) {
InitSpecialWords();
if(sntMarkers) {
GetWordID(m_kBOSWord); // added in case not observed in corpus
GetWordID(m_kEOSWord);
}
}
// if no file then must allow new words
// specify whether more words can be added via 'closed'
// assume that if a vocab is loaded from file then it should be closed.
Vocab(const std::string & vocab_path, const FactorDirection& direction,
const FactorList& factors, bool closed = true):
m_kOOVWordID(0),
m_kBOSWordID(1) {
InitSpecialWords();
bool ret = Load(vocab_path, direction, factors, closed);
CHECK(ret);
}
Vocab(FileHandler * fin, const FactorDirection& direction,
const FactorList& factors, bool closed = true):
m_kOOVWordID(0),
m_kBOSWordID(1) {
InitSpecialWords();
bool ret = Load(fin, direction, factors, closed);
CHECK(ret);
}
Vocab(FileHandler *fin):
m_kOOVWordID(0),
m_kBOSWordID(1) {
Load(fin);
}
~Vocab() {}
// parse 'word' into factored Word and get id
wordID_t GetWordID(const std::string& word, const FactorDirection& direction,
const FactorList& factors, bool isNonTerminal);
wordID_t GetWordID(const Word& word);
wordID_t GetWordID(const string& word);
Word& GetWord(wordID_t id);
inline wordID_t GetkOOVWordID() {
return m_kOOVWordID;
}
inline wordID_t GetBOSWordID() {
return m_kBOSWordID;
}
inline const Word& GetkOOVWord() {
return m_kOOVWord;
}
inline const Word& GetkBOSWord() {
return m_kBOSWord;
}
inline const Word& GetkEOSWord() {
return m_kEOSWord;
}
bool InVocab(wordID_t id);
bool InVocab(const Word& word);
uint32_t Size() {
return m_words2ids.size();
}
void MakeClosed() {
m_closed = true;
}
void MakeOpen() {
m_closed = false;
}
bool IsClosed() {
return m_closed;
}
bool Save(const std::string & vocab_path);
bool Save(FileHandler* fout);
bool Load(const std::string & vocab_path, const FactorDirection& direction,
const FactorList& factors, bool closed = true);
bool Load(FileHandler* fin, const FactorDirection& direction,
const FactorList& factors, bool closed = true);
bool Load(FileHandler* fin);
void PrintVocab();
Word2Id::const_iterator VocabStart() {
return m_words2ids.begin();
}
Word2Id::const_iterator VocabEnd() {
return m_words2ids.end();
}
protected:
bool m_closed; // can more words be added
const wordID_t m_kOOVWordID; // out of vocabulary word id
const wordID_t m_kBOSWordID;
Word m_kBOSWord; // beginning of sentence marker
Word m_kEOSWord; // end of sentence marker
Word m_kOOVWord; // <unk>
const Word InitSpecialWord( const string& type); // initialize special word like kBOS, kEOS
void InitSpecialWords();
Word2Id m_words2ids; // map from words to word ids
Id2Word m_ids2words; // map from ids to words
};
}
#endif
// end of include guard: moses_DynSAInclude_vocab_h