Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mgiza.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'mgizapp/src/vocab.h')
-rw-r--r-- mgizapp/src/vocab.h 110
1 files changed, 110 insertions, 0 deletions
diff --git a/mgizapp/src/vocab.h b/mgizapp/src/vocab.h
new file mode 100644
index 0000000..8bf5de7
--- /dev/null
+++ b/mgizapp/src/vocab.h
@@ -0,0 +1,110 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef _vocab_h
+#define _vocab_h 1
+
+#include "defs.h"
+#include "Vector.h"
+
+#include <fstream>
+#include <strstream>
+#include <map>
+#include <set>
+
+// A single vocabulary entry: the word's string form together with its
+// frequency. Both members are public and are read and written directly
+// by vcbList.
+class WordEntry {
+ public:
+  string word ;   // the word itself
+  double freq ;   // frequency, held as a double
+
+  // Default entry: empty word and zero frequency. The literal "\0" is a
+  // zero-length C string, so `word` starts out empty.
+  WordEntry()
+    : word("\0"),
+      freq(0)
+  {}
+
+  // Entry for word `w` with integer frequency `f`; `f` is converted to
+  // double when it is stored in `freq`.
+  WordEntry(string w, int f)
+    : word(w),
+      freq(f)
+  {}
+};
+
+// Vocabulary wrapper around an externally owned Vector<WordEntry>.
+//
+// The object keeps a *reference* to the word list handed to its constructor
+// (the list is not copied), a word -> index map used for lookups by string,
+// and a few running counters. Lookups by index are bounds-checked against
+// the current size of the referenced list.
+class vcbList{
+ private:
+  Vector<WordEntry>& list ;          // borrowed word list; must outlive this object
+  map<string,int> s2i;               // word -> index; not filled by any code in this header
+                                     // (presumably by readVocabList() -- confirm in vocab.cpp)
+  double total;                      // sum of the amounts added through incFreq()
+  WordIndex noUniqueTokens ;         // only initialised/read here; maintained elsewhere
+  WordIndex noUniqueTokensInCorpus ; // entries whose freq was 0 when incFreq() first hit them
+  const char* fname ;                // vocabulary file name; the pointer is stored, not the text
+ public:
+  // Wraps `vcb` (by reference). `f` is the optional vocabulary file name;
+  // only the pointer is kept, so the string it points to must stay alive.
+  vcbList(Vector<WordEntry>& vcb,const char* f=0)
+    : list(vcb), total(0), noUniqueTokens(0), noUniqueTokensInCorpus(0), fname(f) {}
+
+  // Replaces the stored file-name pointer.
+  void setName(const char*f)
+  { fname=f; }
+
+  // Copy constructor. The copy refers to the SAME underlying word list as
+  // `a`. The word -> index map is copied as well so that has_word() and
+  // operator()(const string&) give the same answers on the copy as on the
+  // source (it was previously left empty, making every string lookup on a
+  // copy fail).
+  // NOTE(review): noUniqueTokensInCorpus is reset to 0 here although `total`
+  // is carried over; this is kept unchanged -- confirm it is intentional.
+  vcbList(const vcbList& a)
+    : list(a.list), s2i(a.s2i), total(a.total), noUniqueTokens(a.noUniqueTokens),
+      noUniqueTokensInCorpus(0), fname(a.fname) {}
+
+  // Declared here, defined elsewhere.
+  void compact(const std::set<WordIndex>& evoc);
+
+  // Number of entries in the referenced word list.
+  WordIndex size()const {return (list.size());}
+  WordIndex uniqTokens()const {return noUniqueTokens;}
+  WordIndex uniqTokensInCorpus()const {return noUniqueTokensInCorpus;}
+  // Running frequency total (see incFreq / clearAllFreq).
+  double totalVocab() const {return total;}
+  // Direct access to the referenced word list.
+  Vector<WordEntry>& getVocabList() { return(list);}
+  const Vector<WordEntry>& getVocabList()const { return(list);}
+
+  // Declared here, defined elsewhere.
+  void readVocabList();
+
+  // Adds `f` to the frequency of entry `id` and to the running total.
+  // An entry whose frequency is exactly 0 before the update is counted as a
+  // newly seen token. Out-of-range ids are silently ignored.
+  void incFreq(WordIndex id , double f){
+    if(id < list.size()){
+      if (list[id].freq == 0)
+        noUniqueTokensInCorpus++;
+      list[id].freq += f ;
+      total += f ;
+    }
+  }
+
+  // Zeroes every entry's frequency, the running total and the
+  // seen-in-corpus counter. noUniqueTokens is left untouched.
+  void clearAllFreq(){
+    for (WordIndex id = 0 ; id < list.size() ; id++)
+      list[id].freq = 0 ;
+    total = 0 ;
+    noUniqueTokensInCorpus = 0 ;
+  }
+
+  // True if `x` is present in the word -> index map.
+  const bool has_word(const string& x) const{
+    return s2i.find(x) != s2i.end();
+  }
+
+  // Index of word `x`. If the word is unknown, an error line is written to
+  // cerr and 0 is returned.
+  int operator()(const string&x)const
+  {
+    map<string,int>::const_iterator i=s2i.find(x);
+    if( i!=s2i.end() )
+      return i->second;
+    cerr << "ERROR: no word index for '"<<x<<"'\n";
+    return 0;
+  }
+
+  // Word stored at index `id`, or an empty string when `id` is out of range.
+  // (The out-of-range branch used to `return 0`, which constructs a
+  // std::string from a null pointer -- undefined behaviour.)
+  const string operator()(WordIndex id) const { // Yaser - 2000-12-13
+    if (id < list.size())
+      return list[id].word ;
+    return string() ;
+  }
+
+  // Same contract as operator()(WordIndex).
+  const string operator[](WordIndex id) const { // Yaser - 2000-12-13
+    return (*this)(id) ;
+  }
+
+  // Writes "<index> <word> <freq>" lines to `of` for every entry from index
+  // 1 upward that has a non-empty word and a positive frequency. Index 0 is
+  // never printed (presumably a reserved entry -- confirm).
+  void printVocabList(ostream& of){
+    for (WordIndex i = 1 ; i < list.size() ; i++){
+      if (list[i].word != "" && list[i].freq > 0)
+        of << i << ' ' << list[i].word << ' ' << list[i].freq << '\n';
+    }
+  }
+
+};
+#endif