author: edwardgao <edwardgao@9a26d1b7-1c8f-445c-8fdd-6576f508279d> 2010-01-23 16:31:42 +0300
committer: edwardgao <edwardgao@9a26d1b7-1c8f-445c-8fdd-6576f508279d> 2010-01-23 16:31:42 +0300
commit: e6691700cb8c925ef44942054c76141a682a5e22 (patch)
tree: 4cb112a1ac78c0f35a28aa94a2503fa8310d4053 /mgizapp/src/vocab.cpp
1 files changed, 120 insertions, 0 deletions
diff --git a/mgizapp/src/vocab.cpp b/mgizapp/src/vocab.cpp
new file mode 100644
index 0000000..e7bf13a
--- /dev/null
+++ b/mgizapp/src/vocab.cpp
@@ -0,0 +1,120 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful, 
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 
+USA.
+
+*/
+#include "vocab.h" 
+
+void vcbList::readVocabList()
+     // Reads the vocabulary file named by `fname` into `list` (indexed by
+     // token id) and `s2i` (token string -> token id). Expected line format:
+     // token_id token_string frequency
+     // A "NULL" entry is stored ahead of the file entries and id 0 is rejected
+     // in the file. The frequency column is validated but not stored (every
+     // freq is set to 0). An unreadable file, a malformed line, an id that is
+     // 0, not below MAX_VOCAB_SIZE or already used, or a negative frequency is
+     // reported on cerr and terminates the program.
+{
+  int freq=0;
+  WordIndex word_id=0;
+  WordEntry entry("NULL",0) ;
+  string line, word ;
+
+  cerr << "Reading vocabulary file from:" << fname << "\n";
+  // First pass: count the lines so that `list` can be sized up front.
+  ifstream ifs(fname);
+  if(!ifs){
+    cerr <<  "\nCannot open vocabulary file " << fname << "\n";
+    exit(1);
+  }
+  size_t sline = 0;
+  while(getline(ifs, line)){
+    sline ++;
+  }
+  ifs.close();
+
+  // Second pass: parse the entries.
+  ifstream vFile(fname);
+  if(!vFile){
+    cerr <<  "\nCannot open vocabulary file " << fname << "\n";
+    exit(1);
+  }
+  list.reserve(sline+100); // Reserve space to prevent re-allocating
+  list.push_back(entry);
+  s2i[entry.word]=list.size()-1;
+
+  while(getline(vFile, line)){
+    istrstream buffer(line.c_str());
+    if(!(buffer >> word_id >> word >> freq)){
+      // Stop here: continuing would register stale or partially parsed
+      // values, or trip one of the checks below with a misleading message.
+      cerr << "ERROR: reading vocabulary; malformed line (expected "
+           << "\"token_id token_string frequency\"):\n" << line << "\n";
+      exit(-1);
+    }
+    if (word_id == 0){
+      cerr << "ERROR: TOKEN ID 0 is reserved for special token NULL, in line: \n"<< line<<"\n" ;
+      exit(-1);
+    }
+    if (word_id >= MAX_VOCAB_SIZE){
+      cerr << "ERROR: TOKEN ID is greater than maximum vocabulary size "
+           << MAX_VOCAB_SIZE << " in line :\n"<< line <<"\n" ;
+      exit(-1);
+    }
+    if (freq < 0){
+      cerr << "ERROR: frequency must be a positive integer, in line :\n"
+           << line <<"\n";
+      exit(-1);
+    }
+    if(word_id >= list.size()){
+      // Ids may skip values; the skipped slots are default-constructed.
+      list.resize(word_id+1);
+    }
+    else if(list[word_id].word != ""){
+      cerr << "ERROR: TOKEN ID must be unique for each token, in line :\n"
+           << line <<"\n";
+      cerr << "TOKEN ID " << word_id << " has already been assigned to: " <<
+        list[word_id].word << "\n";
+      exit(-1);
+    }
+    // line has valid information
+    list[word_id].word = word ;
+    s2i[word]=word_id;
+    list[word_id].freq = 0 ;
+    // Keep the count at highest id + 1 whatever the line order; assigning
+    // word_id + 1 here would lower it when a smaller id follows a larger one.
+    noUniqueTokens = list.size() ;
+  } // end of while
+}
+
+
+void vcbList::compact(const std::set<WordIndex>& evoc){
+	// Blanks each non-empty entry whose id is missing from `evoc` and erases its
+	// word from `s2i`; `list` is not resized. Reports the number blanked on cerr.
+	int del = 0;
+	for(WordIndex i=0; i< list.size() ; i++){ // unsigned index, same type as the ids in evoc
+		if(evoc.find(i)!=evoc.end() || list[i].word == "") continue; // in use, or nothing to remove
+		s2i.erase(list[i].word);
+		list[i].word = "";
+		del++;
+	}
+	cerr << "Compacted Vocabulary, eliminated " << del << " entries " << s2i.size() << " remains " << endl;
+}
+
+
author	edwardgao <edwardgao@9a26d1b7-1c8f-445c-8fdd-6576f508279d>	2010-01-23 16:31:42 +0300
committer	edwardgao <edwardgao@9a26d1b7-1c8f-445c-8fdd-6576f508279d>	2010-01-23 16:31:42 +0300
commit	e6691700cb8c925ef44942054c76141a682a5e22 (patch)
tree	4cb112a1ac78c0f35a28aa94a2503fa8310d4053 /mgizapp/src/vocab.cpp