Welcome to mirror list, hosted at ThFree Co, Russian Federation.

dictionary.h « src « irstlm - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 494b240d267fd1a766582f4ce1f39e4f0154764b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
/******************************************************************************
 IrstLM: IRST Language Model Toolkit
 Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA

******************************************************************************/

/*
 IrstLM: IRST Language Model Toolkit 
 Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
 
 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.
 
 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 Lesser General Public License for more details.
 
 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */

#ifndef MF_DICTIONARY_H
#define MF_DICTIONARY_H

#include <stdio.h>
#include <string.h>
#include <iostream>

// Size in bytes of the scratch buffer used by dictionary::isprintable().
#define MAX_WORD 100
// NOTE(review): not referenced in this header; presumably the hash-table
// load factor used by the implementation -- confirm.
#define LOAD_FACTOR  5

// NOTE(review): not referenced in this header; presumably the number of
// entries added by dictionary::grow() -- confirm. Overridable at compile time.
#ifndef GROWTH_STEP 
#define GROWTH_STEP 100000
#endif

// Default "size" argument of the dictionary constructor; also used when an
// OOV lexicon is created through dictionary::oovlexp(). Overridable at
// compile time.
#ifndef DICT_INITSIZE
#define DICT_INITSIZE 100000
#endif


//Begin of sentence symbol
#ifndef BOS_
#define BOS_ "<s>"
#endif


//End of sentence symbol
#ifndef EOS_ 
#define EOS_ "</s>"
#endif

//Out-of-vocabulary (OOV) word symbol
#ifndef OOV_ 
#define OOV_ "_unk_"
#endif


//! One record of the dictionary entry table.
struct dict_entry{
  char *word;  //!< word string
  int   code;  //!< integer code of the word
  int   freq;  //!< frequency count of the word
};

class strstack;
class htable;

/**
 * Word dictionary: associates words with integer codes and keeps a
 * frequency count per entry plus the running total of all frequencies.
 *
 * Only the small accessors are defined inline here; constructors,
 * destructor, load/save, encode/decode, grow and stat are implemented
 * elsewhere.
 *
 * NOTE(review): the class holds raw pointers and declares a destructor but
 * no copy constructor / assignment operator, so copying a dictionary by
 * value is presumably unsafe -- confirm against the implementation file.
 */
class dictionary{
  strstack   *st;  //!< stack of strings
  dict_entry *tb;  //!< entry table
  htable    *htb;  //!< hash table
  int          n;  //!< number of entries
  int          N;  //!< total frequency
  int        lim;  //!< limit of entries
  int   oov_code;  //!< code assigned to oov words
  char*       is;  //!< interruption symbol list
  char       ifl;  //!< increment flag
  int        dubv; //!< dictionary size upper bound
  int in_oov_lex;  //!< flag
  int oov_lex_code; //!< dictionary
  char* oov_str;   //!< oov string

 public:

  friend class dictionary_iter;

  dictionary* oovlex; //!< additional dictionary

  //! Returns the dictionary size upper bound.
  inline int dub(){return dubv;}
  //! Sets the dictionary size upper bound and returns the new value.
  inline int dub(int value){return (dubv=value);}

  // The three symbol accessors keep their historical char* return type for
  // the sake of existing callers. The const_cast makes the conversion from a
  // string literal explicit (the implicit one is ill-formed since C++11);
  // the returned storage is a literal and must never be written to.

  //! Returns the out-of-vocabulary symbol (read-only storage).
  inline char *OOV(){return const_cast<char*>(OOV_);}
  //! Returns the begin-of-sentence symbol (read-only storage).
  inline char *BoS(){return const_cast<char*>(BOS_);}
  //! Returns the end-of-sentence symbol (read-only storage).
  inline char *EoS(){return const_cast<char*>(EOS_);}

  //! Stores v as the OOV code when v>=0; always returns the current OOV code.
  inline int oovcode(int v=-1){return oov_code=(v>=0?v:oov_code);}

  /**
   * Gets or sets the interruption symbol list.
   *
   * @param isymb NULL to query; otherwise a NUL-terminated string that is
   *              copied into storage owned by this object.
   * @return the stored list (the internal copy, not the caller's pointer).
   */
  inline char *intsymb(char* isymb=NULL){
    if (isymb==NULL) return is;
    // Make the copy (with room for the terminator) before releasing the old
    // buffer, so that passing the currently stored pointer back in is safe.
    char* copy=new char[strlen(isymb)+1];
    strcpy(copy,isymb);
    if (is!=NULL) delete [] is;
    is=copy;
    return is;
  }

  //! Returns the increment flag.
  inline int incflag(){return ifl;}
  //! Sets the increment flag (stored in a char) and returns the stored value.
  inline int incflag(int v){return ifl=v;}
  //! Returns the number of entries of the OOV lexicon, or 0 if there is none.
  inline int oovlexsize(){return oovlex?oovlex->n:0;}
  //! Returns the in_oov_lex flag.
  inline int inoovlex(){return in_oov_lex;}
  //! Returns oov_lex_code.
  inline int oovlexcode(){return oov_lex_code;}


  /**
   * Returns non-zero iff w survives a "%s" round trip through a buffer of
   * MAX_WORD bytes, i.e. iff it is shorter than MAX_WORD characters.
   */
  int isprintable(char* w){
    char buffer[MAX_WORD];
    // snprintf never writes past the buffer; an over-long word is truncated
    // and therefore compares unequal below.
    snprintf(buffer,sizeof(buffer),"%s",w);
    return strcmp(w,buffer)==0;
  }

  //! Encodes the OOV symbol, reports its code on stderr and stores it.
  inline void genoovcode(){
    int c=encode(OOV());
    std::cerr << "OOV code is "<< c << std::endl;
    oovcode(c);
  }

  /**
   * Gets or replaces the additional OOV lexicon.
   *
   * @param fname NULL to query; otherwise the current lexicon (if any) is
   *              deleted and a new dictionary is constructed from fname
   *              with DICT_INITSIZE as size argument.
   * @return the current OOV lexicon (may be NULL).
   */
  inline dictionary* oovlexp(char *fname=NULL){
    if (fname==NULL) return oovlex;
    if (oovlex!=NULL) delete oovlex;
    oovlex=new dictionary(fname,DICT_INITSIZE);
    return oovlex;
  }

  /**
   * Sets the frequency of the OOV entry to oovrate * totfreq(), truncated
   * to int.
   *
   * @return the frequency stored for the OOV entry.
   *
   * NOTE(review): the entry updated is the one at oovcode(), not the code
   * returned by encode(); this presumably relies on the OOV code having
   * been stored beforehand (e.g. via genoovcode()) -- confirm.
   */
  inline int setoovrate(double oovrate){
    encode(OOV()); //be sure OOV code exists
    int oovfreq=(int)(oovrate * totfreq());
    std::cerr << "setting OOV rate to: " << oovrate << " -- freq= " << oovfreq << std::endl;
    return freq(oovcode(),oovfreq);
  }


  //! Adds value to the frequency of code and to the total; returns the new frequency.
  inline int incfreq(int code,int value){N+=value;return tb[code].freq+=value;}

  //! Scales the frequency of code by value (truncated to int), keeping the
  //! total in sync; returns the new frequency.
  inline int multfreq(int code,double value){
    N+=(int)(value * tb[code].freq)-tb[code].freq;
    return tb[code].freq=(int)(value * tb[code].freq);
  }

  //! Returns the frequency of code; when value>=0 it is first stored as the
  //! new frequency and the total is adjusted by the difference.
  inline int freq(int code,int value=-1){
    if (value>=0){
      N+=value-tb[code].freq;
      tb[code].freq=value;
    }
    return tb[code].freq;
  }

  //! Returns the total frequency.
  inline int totfreq(){return N;}

  void grow();
  //dictionary(int size=400,char* isym=NULL,char* oovlex=NULL);
  dictionary(char *filename=NULL,int size=DICT_INITSIZE,char* isymb=NULL,char* oovlex=NULL);
  dictionary(dictionary* d);

  ~dictionary();
  void generate(char *filename);
  void load(char *filename);
  void save(char *filename,int freqflag=0);
  void load(std::istream& fd);
  void save(std::ostream& fd);

  //! Returns the number of entries.
  int size(){return n;}
  int getcode(const char *w);
  int encode(const char *w);
  char *decode(int c);
  void stat();

  //! Resets every entry frequency and the total frequency to zero.
  void cleanfreq(){
    for (int i=0;i<n;i++) tb[i].freq=0;
    N=0;
  }

};

/**
 * Helper for walking the entries of a dictionary; declared a friend of
 * dictionary. Both member functions are implemented elsewhere.
 */
class dictionary_iter {
  dictionary* m_dict;  //!< presumably the dictionary given to the constructor -- confirm

 public:
  //! Builds an iterator for dict.
  dictionary_iter(dictionary *dict);

  //! Returns a pointer to a dictionary entry.
  //! NOTE(review): presumably the next entry, with NULL at the end -- confirm.
  dict_entry* next();
};

#endif