Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mgiza.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'mgizapp/src/vocab.cpp')
-rw-r--r-- mgizapp/src/vocab.cpp 120
1 files changed, 120 insertions, 0 deletions
diff --git a/mgizapp/src/vocab.cpp b/mgizapp/src/vocab.cpp
new file mode 100644
index 0000000..e7bf13a
--- /dev/null
+++ b/mgizapp/src/vocab.cpp
@@ -0,0 +1,120 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "vocab.h"
+
+#include <sstream>
+
+// Reads the vocabulary file named by `fname` into `list` and `s2i`.
+//
+// Each line is expected to have the form:
+//
+//   token_id token_string frequency
+//
+// Before any line is parsed a "NULL" entry is appended to `list` (index 0
+// when the list starts out empty) and registered in `s2i`; token id 0 is
+// therefore rejected in the file itself.
+//
+// The frequency column is parsed and validated, but every stored entry gets
+// freq = 0 (presumably the real counts are filled in later -- TODO confirm
+// against the callers).
+//
+// Error handling:
+//   * file cannot be opened / rewound        -> message on cerr, exit(1)
+//   * id 0, id >= MAX_VOCAB_SIZE, freq < 0,
+//     or an id that is already assigned      -> message on cerr, exit(-1)
+//   * line that does not parse as three fields -> message on cerr, the line
+//     is skipped (its partially parsed values are never stored)
+void vcbList::readVocabList()
+{
+  cerr << "Reading vocabulary file from:" << fname << "\n";
+
+  ifstream vFile(fname);
+  if (!vFile) {
+    cerr << "\nCannot open vocabulary file " << fname << "\n";
+    exit(1);
+  }
+
+  string line;
+
+  // First pass: count the lines so that `list` can be sized once up front.
+  size_t sline = 0;
+  while (getline(vFile, line)) {
+    ++sline;
+  }
+
+  // Rewind for the parsing pass. clear() must come first: the counting loop
+  // left eofbit/failbit set, and seekg() is a no-op on a failed stream.
+  vFile.clear();
+  vFile.seekg(0);
+  if (!vFile) {
+    cerr << "\nCannot open vocabulary file " << fname << "\n";
+    exit(1);
+  }
+
+  list.reserve(sline + 100); // Reserve space to prevent re-allocating
+
+  WordEntry entry("NULL", 0);
+  list.push_back(entry);
+  s2i[entry.word] = list.size() - 1;
+
+  // Second pass: parse and validate one entry per line.
+  while (getline(vFile, line)) {
+    // Fresh per-line values, so a failed parse can never leak the previous
+    // line's id/word/frequency into the checks below.
+    WordIndex word_id = 0;
+    string word;
+    int freq = 0;
+
+    std::istringstream buffer(line);
+    if (!(buffer >> word_id >> word >> freq)) {
+      cerr << "ERROR: reading vocabulary; " << word_id << ' ' << word << ' ' << freq << endl;
+      continue; // malformed line: report it and move on
+    }
+
+    if (word_id == 0) {
+      cerr << "ERROR: TOKEN ID 0 is reserved for special token NULL, in line: \n" << line << "\n";
+      exit(-1);
+    }
+    if (word_id >= MAX_VOCAB_SIZE) {
+      cerr << "ERROR: TOKEN ID is greater than maximum vocabulary size "
+           << MAX_VOCAB_SIZE << " in line :\n" << line << "\n";
+      exit(-1);
+    }
+    if (freq < 0) {
+      cerr << "ERROR: frequency must be a positive integer, in line :\n"
+           << line << "\n";
+      exit(-1);
+    }
+
+    if (word_id >= list.size()) {
+      // Id beyond the current end: grow so that list[word_id] exists.
+      list.resize(word_id + 1);
+    } else if (list[word_id].word != "") {
+      // Slot already holds a token: ids must be unique.
+      cerr << "ERROR: TOKEN ID must be unique for each token, in line :\n"
+           << line << "\n";
+      cerr << "TOKEN ID " << word_id << " has already been assigned to: "
+           << list[word_id].word << "\n";
+      exit(-1);
+    }
+
+    // Line has valid information: store it.
+    list[word_id].word = word;
+    s2i[word] = word_id;
+    list[word_id].freq = 0;
+
+    // Track one past the highest slot in `list`. Using the list size rather
+    // than word_id + 1 keeps the count from shrinking when a smaller id
+    // arrives after a larger one and merely fills a gap.
+    noUniqueTokens = list.size();
+  } // end of while
+}
+
+
+// Removes from the vocabulary every entry whose index is not in `evoc`.
+//
+// For each index i of `list` that is absent from `evoc`, the entry's word is
+// erased from `s2i` and list[i].word is set to the empty string; `list`
+// itself keeps its size, so the remaining indices are unchanged. Every index
+// is treated alike -- index 0 is not special-cased here.
+//
+// A summary is written to cerr. Only slots that actually held a word are
+// counted as eliminated, so gaps in the id space and slots emptied by an
+// earlier call do not inflate the figure.
+void vcbList::compact(const std::set<WordIndex>& evoc){
+  size_t del = 0;
+  for (size_t i = 0; i < list.size(); ++i) {
+    const WordIndex id = static_cast<WordIndex>(i);
+    if (evoc.find(id) != evoc.end()) {
+      continue; // appears in corpus: keep
+    }
+    if (list[i].word != "") {
+      ++del;
+    }
+    s2i.erase(list[i].word);
+    list[i].word = "";
+  }
+  cerr << "Compacted Vocabulary, eliminated " << del << " entries "
+       << s2i.size() << " remains " << endl;
+}
+
+