/*
EGYPT Toolkit for Statistical Machine Translation
Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.
*/
#include "vocab.h"
void vcbList::readVocabList()
// reads a vocabulary file from fname. It expects the following format:
//
// token_id token_string frequency
{
int freq=0;
WordIndex word_id ;
WordEntry entry("NULL",0) ;
string line, word ;
cerr << "Reading vocabulary file from:" << fname << "\n";
// total = 0 ;
ifstream vFile(fname);
if(!vFile){
cerr << "\nCannot open vocabulary file " << fname << "file";
exit(1);
}
list.push_back(entry);
s2i[entry.word]=list.size()-1;
while(getline(vFile, line)){
istringstream buffer(line);
if(!(buffer >> word_id >> word >> freq))
cerr << "ERROR: reading vocabulary; " << word_id << ' ' << word << ' ' << freq << endl;
if (word_id == 0){
cerr << "ERROR: TOKEN ID 0 is reserved for special token NULL, in line: \n"<< line<<"\n" ;
exit(-1);
}
else if (word_id >= MAX_VOCAB_SIZE){
cerr << "ERROR: TOKEN ID is greater than maximum vocabulary size "
<< MAX_VOCAB_SIZE << " in line :\n"<< line <<"\n" ;
exit(-1);
}
else if (freq < 0){
cerr << "ERROR: frequency must be a positive integer, in line :\n"
<< line <<"\n";
exit(-1);
}
else if(word_id >= list.size()){
list.resize(word_id+1);
list[word_id].word = word ;
s2i[word]=word_id;
list[word_id].freq = 0 ;
noUniqueTokens = word_id + 1 ;
// noUniqueTokens++ ;
// total += freq ;
}
else if(list[word_id].word != "\0"){
cerr << "ERROR: TOKEN ID must be unique for each token, in line :\n"
<< line <<"\n";
cerr << "TOKEN ID " << word_id << " has already been assigned to: " <<
list[word_id].word << "\n";
exit(-1);
}
else { // line has valid information
list[word_id].word = word ;
s2i[word]=word_id;
list[word_id].freq = 0 ;
// noUniqueTokens++ ;
noUniqueTokens = word_id + 1 ;
// total += freq ;
}
} // end of while
}
|