diff options
Diffstat (limited to 'mgizapp/src/vocab.h')
-rw-r--r-- | mgizapp/src/vocab.h | 110 |
1 files changed, 110 insertions, 0 deletions
diff --git a/mgizapp/src/vocab.h b/mgizapp/src/vocab.h new file mode 100644 index 0000000..8bf5de7 --- /dev/null +++ b/mgizapp/src/vocab.h @@ -0,0 +1,110 @@ +/* + +EGYPT Toolkit for Statistical Machine Translation +Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +USA. + +*/ +#ifndef _vocab_h +#define _vocab_h 1 + +#include "defs.h" +#include "Vector.h" + +#include <fstream> +#include <strstream> +#include <map> +#include <set> + +class WordEntry { + public: + string word ; + double freq ; + WordEntry():word("\0"), freq(0){}; + WordEntry(string w, int f):word(w), freq(f){}; +}; + +class vcbList{ + private: + Vector<WordEntry>& list ; + map<string,int> s2i; + double total; + WordIndex noUniqueTokens ; + WordIndex noUniqueTokensInCorpus ; + const char* fname ; + public: + vcbList(Vector<WordEntry>& vcb,const char* f=0):list(vcb), total(0), noUniqueTokens(0), noUniqueTokensInCorpus(0), fname(f){}; + void setName(const char*f) + { fname=f; } + vcbList(const vcbList& a):list(a.list), total(a.total), noUniqueTokens(a.noUniqueTokens), noUniqueTokensInCorpus(0), fname(a.fname){}; + void compact(const std::set<WordIndex>& evoc); + inline WordIndex size()const {return (list.size());}; + inline WordIndex uniqTokens()const {return noUniqueTokens;}; + inline WordIndex uniqTokensInCorpus()const {return noUniqueTokensInCorpus;}; + inline double totalVocab() const {return total;}; + inline Vector<WordEntry>& getVocabList() { return(list);}; + inline const Vector<WordEntry>& getVocabList()const { return(list);}; + void readVocabList(); + void incFreq(WordIndex id , double f){ + if(id < list.size()){ + if (list[id].freq == 0) + noUniqueTokensInCorpus++; + list[id].freq += f ; + total += f ; + } + }; + void clearAllFreq(){ + for (WordIndex id = 0 ; id < list.size() ; id++) + list[id].freq = 0 ; + total = 0 ; + noUniqueTokensInCorpus = 0 ; + }; + + const bool has_word(const string& x) const{ + map<string,int>::const_iterator i=s2i.find(x); + return i!=s2i.end(); + } + int operator()(const string&x)const + { + map<string,int>::const_iterator i=s2i.find(x); + if( i!=s2i.end() ) + return i->second; + else + { + cerr << "ERROR: no word index for '"<<x<<"'\n"; + return 0; + } + } + const string operator()(WordIndex id) const { // Yaser - 2000-12-13 + if (id < list.size()) + return list[id].word ; + else return 0 ; + } + const string operator[](WordIndex id) const { // Yaser - 2000-12-13 + if (id < list.size()) + return list[id].word ; + else return 0 ; + } + void printVocabList(ostream& of){ + for (WordIndex i = 1 ; i < list.size() ; i++){ + if (list[i].word != "" && list[i].freq > 0) + of << i << ' ' << list[i].word << ' ' << list[i].freq << '\n'; + } + } + +}; +#endif |