// GIZA++-v2/vocab.cpp

/*

EGYPT Toolkit for Statistical Machine Translation
Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, 
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 
USA.

*/
#include "vocab.h" 

void vcbList::readVocabList()
     // reads a vocabulary file from fname. It expects the following format:
     // 
     // token_id token_string frequency
{

  int freq=0;
  WordIndex word_id ;
  WordEntry entry("NULL",0) ;

  string line, word ;
  cerr << "Reading vocabulary file from:" << fname << "\n";    
  //  total = 0 ;
  ifstream vFile(fname);
  if(!vFile){
    cerr <<  "\nCannot open vocabulary file " << fname << "file";
    exit(1);
  }
  
  list.push_back(entry);
  s2i[entry.word]=list.size()-1;

  while(getline(vFile, line)){
    istringstream buffer(line);
    if(!(buffer >> word_id >> word >> freq))
      cerr << "ERROR: reading vocabulary; " << word_id << ' ' << word << ' ' << freq << endl;
    if (word_id == 0){
      cerr << "ERROR: TOKEN ID 0 is reserved for special token NULL, in line: \n"<< line<<"\n" ;
      exit(-1);
    }
    else if (word_id >= MAX_VOCAB_SIZE){
      cerr << "ERROR: TOKEN ID is greater than maximum vocabulary size "
	   << MAX_VOCAB_SIZE << " in line :\n"<< line <<"\n" ;
      exit(-1);
    }	
    else if (freq < 0){
      cerr << "ERROR: frequency must be a positive integer, in line :\n"
	   << line <<"\n";
      exit(-1);
    }
    else if(word_id >= list.size()){
      list.resize(word_id+1);
      list[word_id].word = word ;
      s2i[word]=word_id;
      list[word_id].freq = 0 ;
      noUniqueTokens = word_id + 1 ;
      //      noUniqueTokens++ ;
      //      total += freq ;
    }      
    else if(list[word_id].word != "\0"){
      cerr << "ERROR: TOKEN ID must be unique for each token, in line :\n"
	   << line <<"\n";
      cerr << "TOKEN ID " << word_id << " has already been assigned to: " <<
	list[word_id].word << "\n";
      exit(-1);
    }
    else { // line  has valid information
      list[word_id].word = word ;
      s2i[word]=word_id;
      list[word_id].freq = 0 ;
      //      noUniqueTokens++ ;
      noUniqueTokens  = word_id + 1 ;
      //      total += freq ;
    }
  } // end of while
}