1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
|
/******************************************************************************
IrstLM: IRST Language Model Toolkit
Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
******************************************************************************/
// n-gram tables
// by M. Federico
// Copyright Marcello Federico, ITC-irst, 1998
#ifndef MF_NGRAM_H
#define MF_NGRAM_H
#include <fstream>
#include <cassert>
#include "dictionary.h"
// MAX_NGRAM is the capacity of the fixed-size arrays declared inside ngram
// (word[] and midx[]). It defaults to 20; define MYMAXNGRAM when compiling
// to select a different value.
#ifndef MYMAXNGRAM
#define MAX_NGRAM 20
#else
#define MAX_NGRAM MYMAXNGRAM
#endif
class dictionary;
//typedef int code;
// Fixed-capacity n-gram of integer word codes, together with the bookkeeping
// fields used while scanning an n-gram tree.
//
// The codes live in word[]; wordp(k) hands out a pointer to word[MAX_NGRAM-k],
// so the k-word view of the n-gram starts k slots before the end of the array.
class ngram {
  int word[MAX_NGRAM];      // encoded ngram

 public:
  dictionary *dict;         // dictionary used to encode words
  char* link;               // ngram-tree pointer
  int midx[MAX_NGRAM];      // ngram-tree scan pointer
  int lev;                  // ngram-tree level
  int size;                 // ngram size
  int freq;                 // ngram frequency or integer prob
  int succ;                 // number of successors
  int bow;                  // back-off weight
  int prob;                 // probability
  unsigned char info;       // ngram-tree info flags
  unsigned char pinfo;      // ngram-tree parent info flags
  int isym;                 // last interruption symbol

  // Constructors are defined out of line.
  ngram(dictionary* d,int sz=0);
  ngram(ngram& ng);

  // Pointer to the start of the current `size`-word view
  // (same slot wordp(size) would return).
  int *wordp()
  {
    return &word[MAX_NGRAM - size];
  }

  // Pointer to word[MAX_NGRAM-k] when the n-gram holds at least k words,
  // otherwise a null pointer.
  int *wordp(int k)
  {
    if (size >= k) {
      return &word[MAX_NGRAM - k];
    }
    return 0;
  }

  // Read-only counterpart of wordp().
  const int *wordp() const
  {
    return &word[MAX_NGRAM - size];
  }

  // Read-only counterpart of wordp(int).
  const int *wordp(int k) const
  {
    if (size >= k) {
      return &word[MAX_NGRAM - k];
    }
    return 0;
  }

  // Moves every slot of word[] one position towards the end of the array
  // (the last slot is overwritten, slot 0 keeps its value) and decrements
  // size. Always returns 1. size is not checked before being decremented.
  int shift()
  {
    int slot = MAX_NGRAM - 1;
    while (slot > 0) {
      word[slot] = word[slot - 1];
      --slot;
    }
    --size;
    return 1;
  }

  // Returns 1 if the code that dict assigns to `s` equals one of the `lev`
  // slots starting at wordp(size), and 0 otherwise. Also returns 0 when
  // dict->encode(s) yields -1. Requires lev <= size (asserted).
  // Note: the parameter `lev` hides the data member of the same name.
  int containsWord(char* s,int lev)
  {
    const int target = dict->encode(s);
    if (target == -1) {
      return 0;
    }
    assert(lev <= size);
    // Walk k = size, size-1, ..., size-lev+1, i.e. wordp(size) first.
    for (int k = size; k > size - lev; --k) {
      if (*wordp(k) == target) {
        return 1;
      }
    }
    return 0;
  }

  // Defined out of line.
  void trans(const ngram& ng);

  // Stream I/O operators, defined out of line.
  friend std::ifstream& operator>> (std::ifstream& fi,ngram& ng);
  friend std::ofstream& operator<< (std::ofstream& fi,ngram& ng);
  friend std::istream& operator>> (std::istream& fi,ngram& ng);
  friend std::ostream& operator<< (std::ostream& fi,ngram& ng);

  // Returns 0 if any of *wordp(sz), *wordp(sz-1), ..., *wordp(2) equals
  // dict->oovcode(), and 1 otherwise (including when sz <= 1, in which case
  // nothing is inspected). The caller must ensure sz <= size: wordp() returns
  // a null pointer for larger values and it is dereferenced unchecked here.
  inline int ckhisto(int sz)
  {
    int k = sz;
    while (k > 1) {
      if (*wordp(k)==dict->oovcode()) {
        return 0;
      }
      --k;
    }
    return 1;
  }

  // Push a word code / a word string; both are defined out of line.
  int pushc(int c);
  int pushw(char* w);

  //~ngram();
};
#endif
|