1 files changed, 110 insertions, 0 deletions
diff --git a/mgizapp/src/vocab.h b/mgizapp/src/vocab.h
new file mode 100644
index 0000000..8bf5de7
--- /dev/null
+++ b/mgizapp/src/vocab.h
@@ -0,0 +1,110 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful, 
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 
+USA.
+
+*/
+#ifndef _vocab_h
+#define _vocab_h 1
+
+#include "defs.h" 
+#include "Vector.h" 
+
+#include <fstream>
+#include <strstream>
+#include <map>
+#include <set>
+
+class WordEntry {   // one vocabulary record: a word together with its frequency
+ public:
+  string word ;     // text of the word
+  double freq ;     // frequency; the two-argument constructor receives it as an int
+  WordEntry() : word(), freq(0.0) {}   // empty word, zero frequency
+  WordEntry(string w, int f) : word(w), freq(static_cast<double>(f)) {}
+};
+
+// Vocabulary wrapper around a caller-supplied Vector<WordEntry>, with frequency bookkeeping and word <-> index lookup.
+class vcbList{
+ private:
+  Vector<WordEntry>& list ;          // reference to the caller's entries (not an owned copy)
+  map<string,int> s2i;               // word -> index table read by has_word() and operator()(const string&)
+  double total;                      // running frequency sum: grown by incFreq(), reset by clearAllFreq()
+  WordIndex noUniqueTokens ;
+  WordIndex noUniqueTokensInCorpus ; // bumped by incFreq() each time it touches an entry whose freq is 0
+  const char* fname ;                // file name pointer, stored as given (the string is not copied)
+ public:
+  vcbList(Vector<WordEntry>& vcb,const char* f=0):list(vcb), total(0), noUniqueTokens(0), noUniqueTokensInCorpus(0), fname(f){}
+  void setName(const char*f) { fname=f; }
+  // NOTE(review): the copy shares list, but s2i is left empty and noUniqueTokensInCorpus restarts at 0 - confirm intended.
+  vcbList(const vcbList& a):list(a.list), total(a.total), noUniqueTokens(a.noUniqueTokens), noUniqueTokensInCorpus(0), fname(a.fname){}
+  void compact(const std::set<WordIndex>& evoc);
+  inline WordIndex size()const {return (list.size());}
+  inline WordIndex uniqTokens()const {return noUniqueTokens;}
+  inline WordIndex uniqTokensInCorpus()const {return noUniqueTokensInCorpus;}
+  inline double totalVocab() const {return total;}
+  inline Vector<WordEntry>& getVocabList() { return(list);}
+  inline const Vector<WordEntry>& getVocabList()const { return(list);}
+  void readVocabList();
+  // Adds f to the frequency of entry id and to the running total; an id past the end of the list is ignored.
+  void incFreq(WordIndex id , double f){
+    if(id < list.size()){
+      if (list[id].freq == 0)
+        noUniqueTokensInCorpus++;
+      list[id].freq += f ;
+      total += f ;
+    }
+  }
+  // Resets every entry's frequency, the running total and the in-corpus token count to zero.
+  void clearAllFreq(){
+    for (WordIndex id = 0 ; id < list.size() ; id++)
+      list[id].freq = 0 ;
+    total = 0 ;
+    noUniqueTokensInCorpus = 0 ;
+  }
+  // Returns true when x has an entry in the word -> index table.
+  bool has_word(const string& x) const{ return s2i.find(x)!=s2i.end(); }
+  // Returns the index recorded for word x; an unknown word is reported on cerr and 0 is returned.
+  int operator()(const string&x)const {
+      map<string,int>::const_iterator i=s2i.find(x);
+      if( i!=s2i.end() )
+        return i->second;
+      cerr << "ERROR: no word index for '"<<x<<"'\n";
+      return 0;
+    }
+  // Returns the word stored at index id, or an empty string when id is out of range.
+  // (A literal 0 must not be returned here: it would construct a std::string from a null pointer.)
+  const string operator()(WordIndex id) const { // Yaser - 2000-12-13
+    if (id < list.size())
+      return list[id].word ;
+    return string() ;
+  }
+  // Subscript form of the index -> word lookup; out-of-range ids likewise give an empty string.
+  const string operator[](WordIndex id) const { // Yaser - 2000-12-13
+    if (id < list.size())
+      return list[id].word ;
+    return string() ;
+  }
+  // Prints "index word freq" for each index >= 1 whose word is non-empty and whose frequency is positive.
+  void printVocabList(ostream& of){
+    for (WordIndex i = 1 ; i < list.size() ; i++){
+      if (list[i].word != "" && list[i].freq > 0)
+        of << i << ' ' << list[i].word << ' ' << list[i].freq << '\n';
+    }
+  }
+};
+#endif