1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
|
/******************************************************************************
IrstLM: IRST Language Model Toolkit
Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
******************************************************************************/
/*
IrstLM: IRST Language Model Toolkit
Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef MF_DICTIONARY_H
#define MF_DICTIONARY_H
#include <string.h>
#include <iostream>
// Word-length bound (terminating NUL included) used by dictionary::isprintable().
#define MAX_WORD 100
// NOTE(review): not referenced in this header; presumably the ratio between
// dictionary size and hash-table size -- confirm in the implementation file.
#define LOAD_FACTOR 5
// NOTE(review): not referenced in this header; presumably the number of
// entries added by dictionary::grow() -- confirm in the implementation file.
// May be overridden by defining it before this header is included.
#ifndef GROWTH_STEP
#define GROWTH_STEP 100000
#endif
// Default for the `size` argument of the dictionary constructor.
// May be overridden by defining it before this header is included.
#ifndef DICT_INITSIZE
#define DICT_INITSIZE 100000
#endif
// Begin-of-sentence symbol (returned by dictionary::BoS())
#ifndef BOS_
#define BOS_ "<s>"
#endif
// End-of-sentence symbol (returned by dictionary::EoS())
#ifndef EOS_
#define EOS_ "</s>"
#endif
// Out-of-vocabulary symbol (returned by dictionary::OOV())
#ifndef OOV_
#define OOV_ "_unk_"
#endif
/// One record of the dictionary entry table.
/// Field order is kept as-is so the in-memory layout does not change.
struct dict_entry {
  char *word;  ///< word string of this entry
  int code;    ///< integer code of the word
  int freq;    ///< frequency count of the word
};
class strstack;
class htable;
/**
 * Dictionary of words with an integer code and a frequency count per entry.
 *
 * Entries live in the table `tb`, which the inline accessors below index by
 * word code. Construction, loading/saving, lookup (getcode/encode/decode)
 * and growth are defined outside this header.
 *
 * NOTE(review): the class holds raw pointers and relies on the implicitly
 * generated copy operations; ownership is decided in the implementation
 * file -- confirm before copying instances by value.
 */
class dictionary{
  strstack *st;       //!< stack of strings
  dict_entry *tb;     //!< entry table
  htable *htb;        //!< hash table
  int n;              //!< number of entries
  int N;              //!< total frequency
  int lim;            //!< limit of entries
  int oov_code;       //!< code assigned to oov words
  char* is;           //!< interruption symbol list
  char ifl;           //!< increment flag
  int dubv;           //!< dictionary size upper bound
  int in_oov_lex;     //!< flag reported by inoovlex()
  int oov_lex_code;   //!< code reported by oovlexcode()
  char* oov_str;      //!< oov string

public:
  friend class dictionary_iter;

  dictionary* oovlex; //!< additional dictionary

  /// Returns the dictionary size upper bound.
  int dub(){return dubv;}

  /// Sets the dictionary size upper bound and returns the new value.
  int dub(int value){return (dubv=value);}

  // The three symbol accessors keep their historical `char*` return type.
  // String literals are const, so the cast is needed for the code to be
  // well-formed in C++11 and later; callers must never write through the
  // returned pointers.

  /// Returns the out-of-vocabulary symbol.
  char *OOV(){return const_cast<char*>(OOV_);}

  /// Returns the begin-of-sentence symbol.
  char *BoS(){return const_cast<char*>(BOS_);}

  /// Returns the end-of-sentence symbol.
  char *EoS(){return const_cast<char*>(EOS_);}

  /// Gets the OOV code; when `v` is non-negative it is stored first.
  int oovcode(int v=-1){return oov_code=(v>=0?v:oov_code);}

  /**
   * Gets or sets the interruption symbol list.
   *
   * @param isymb NULL to query; otherwise a NUL-terminated string that is
   *              copied into a buffer owned by this object.
   * @return the currently stored list (the internal copy after a set).
   *
   * The copy is sized strlen(isymb)+1 so the terminator fits, and the
   * internal pointer keeps referring to that copy rather than to the
   * caller's buffer, so a later call never frees caller memory.
   * NOTE(review): assumes `is` is NULL or a new[]-allocated buffer when
   * this is called -- confirm against the constructors.
   */
  char *intsymb(char* isymb=NULL){
    if (isymb==NULL || isymb==is) return is;
    // Allocate and fill the copy before releasing the old buffer so the
    // object stays valid if allocation throws.
    char* copy=new char[strlen(isymb)+1];
    strcpy(copy,isymb);
    delete [] is;
    is=copy;
    return is;
  }

  /// Returns the increment flag.
  int incflag(){return ifl;}

  /// Sets the increment flag (stored in a char) and returns the stored value.
  int incflag(int v){return ifl=v;}

  /// Number of entries of the additional OOV dictionary, 0 if there is none.
  int oovlexsize(){return oovlex?oovlex->n:0;}

  /// Returns the in_oov_lex flag.
  int inoovlex(){return in_oov_lex;}

  /// Returns oov_lex_code.
  int oovlexcode(){return oov_lex_code;}

  /**
   * Tells whether `w` fits, terminator included, in MAX_WORD bytes.
   *
   * The former implementation copied `w` with an unbounded sprintf into a
   * MAX_WORD-byte stack buffer and compared the copy with `w`: that is
   * always true for words that fit and a buffer overflow for the others.
   * The length test gives the same answer for fitting words and a defined
   * 0 for over-long (or NULL) ones. Despite the name, no character-class
   * check is performed.
   *
   * @return 1 if `w` is non-NULL and shorter than MAX_WORD, 0 otherwise.
   */
  int isprintable(char* w){
    if (w==NULL) return 0;
    return strlen(w)<MAX_WORD ? 1 : 0;
  }

  /// Encodes the OOV symbol, logs its code on stderr and stores it as OOV code.
  void genoovcode(){
    int c=encode(OOV());
    std::cerr << "OOV code is "<< c << std::endl;
    oovcode(c);
  }

  /**
   * Gets or replaces the additional OOV dictionary.
   *
   * @param fname NULL to query; otherwise the previous dictionary is
   *              deleted and a new one is constructed from `fname` with
   *              DICT_INITSIZE as size.
   * @return the current additional dictionary (may be NULL).
   */
  dictionary* oovlexp(char *fname=NULL){
    if (fname==NULL) return oovlex;
    delete oovlex;  // deleting a null pointer is a no-op
    oovlex=new dictionary(fname,DICT_INITSIZE);
    return oovlex;
  }

  /**
   * Sets the frequency of the OOV word to `oovrate` times the total
   * frequency (truncated to int) and logs the values on stderr.
   *
   * @return the frequency now stored for the OOV code.
   */
  int setoovrate(double oovrate){
    encode(OOV()); //be sure OOV code exists
    int oovfreq=(int)(oovrate * totfreq());
    std::cerr << "setting OOV rate to: " << oovrate << " -- freq= " << oovfreq << std::endl;
    return freq(oovcode(),oovfreq);
  }

  /// Adds `value` to the frequency of `code` and to the total; returns the new frequency.
  int incfreq(int code,int value){N+=value;return tb[code].freq+=value;}

  /// Multiplies the frequency of `code` by `value` (truncated to int),
  /// adjusts the total accordingly and returns the new frequency.
  int multfreq(int code,double value){
    const int scaled=(int)(value * tb[code].freq);
    N+=scaled-tb[code].freq;
    return tb[code].freq=scaled;
  }

  /// Gets the frequency of `code`; a non-negative `value` is stored first
  /// and the total frequency is adjusted by the difference.
  int freq(int code,int value=-1){
    if (value>=0){
      N+=value-tb[code].freq;
      tb[code].freq=value;
    }
    return tb[code].freq;
  }

  /// Returns the total frequency.
  int totfreq(){return N;}

  void grow();

  //dictionary(int size=400,char* isym=NULL,char* oovlex=NULL);
  dictionary(char *filename=NULL,int size=DICT_INITSIZE,char* isymb=NULL,char* oovlex=NULL);
  dictionary(dictionary* d);
  ~dictionary();

  void generate(char *filename);
  void load(char *filename);
  void save(char *filename,int freqflag=0);
  void load(std::istream& fd);
  void save(std::ostream& fd);

  /// Returns the number of entries.
  int size(){return n;}

  int getcode(const char *w);
  int encode(const char *w);
  char *decode(int c);
  void stat();

  /// Resets every entry frequency and the total frequency to zero.
  void cleanfreq(){
    for (int i=0;i<n;++i) tb[i].freq=0;
    N=0;
  }
};
/// Iterator-style helper bound to a dictionary; the dictionary class
/// declares it as a friend. Both members are defined outside this header.
class dictionary_iter {
  dictionary* m_dict;  ///< dictionary this iterator refers to

public:
  /// Builds an iterator for `dict`.
  dictionary_iter(dictionary *dict);

  /// Returns a pointer to a dictionary entry.
  /// NOTE(review): presumably the next entry in sequence, with NULL at
  /// the end -- confirm in the implementation file.
  dict_entry* next();
};
#endif
|