diff options
author | edwardgao <edwardgao@9a26d1b7-1c8f-445c-8fdd-6576f508279d> | 2010-01-23 16:31:42 +0300 |
---|---|---|
committer | edwardgao <edwardgao@9a26d1b7-1c8f-445c-8fdd-6576f508279d> | 2010-01-23 16:31:42 +0300 |
commit | e6691700cb8c925ef44942054c76141a682a5e22 (patch) | |
tree | 4cb112a1ac78c0f35a28aa94a2503fa8310d4053 /mgizapp/src/vocab.cpp |
Diffstat (limited to 'mgizapp/src/vocab.cpp')
-rw-r--r-- | mgizapp/src/vocab.cpp | 120 |
1 files changed, 120 insertions, 0 deletions
diff --git a/mgizapp/src/vocab.cpp b/mgizapp/src/vocab.cpp new file mode 100644 index 0000000..e7bf13a --- /dev/null +++ b/mgizapp/src/vocab.cpp @@ -0,0 +1,120 @@ +/* + +EGYPT Toolkit for Statistical Machine Translation +Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +USA. + +*/ +#include "vocab.h" + +void vcbList::readVocabList() + // reads a vocabulary file from fname. It expects the following format: + // + // token_id token_string frequency +{ + + int freq=0; + WordIndex word_id ; + WordEntry entry("NULL",0) ; + + string line, word ; + cerr << "Reading vocabulary file from:" << fname << "\n"; + // total = 0 ; + ifstream ifs(fname); + + if(!ifs){ + cerr << "\nCannot open vocabulary file " << fname << "file"; + exit(1); + } + size_t sline = 0; + while(getline(ifs, line)){ + sline ++; + } + + ifs.close(); + + ifstream vFile(fname); + if(!vFile){ + cerr << "\nCannot open vocabulary file " << fname << "file"; + exit(1); + } + + list.reserve(sline+100); // Reserve space to prevent re-allocating + + list.push_back(entry); + s2i[entry.word]=list.size()-1; + + while(getline(vFile, line)){ + istrstream buffer(line.c_str()); + if(!(buffer >> word_id >> word >> freq)) + cerr << "ERROR: reading vocabulary; " << word_id << ' ' << word << ' ' << freq << endl; + if (word_id == 0){ + cerr << "ERROR: TOKEN ID 0 is reserved for special token NULL, in line: \n"<< line<<"\n" ; + exit(-1); + } + else if (word_id >= MAX_VOCAB_SIZE){ + cerr << "ERROR: TOKEN ID is greater than maximum vocabulary size " + << MAX_VOCAB_SIZE << " in line :\n"<< line <<"\n" ; + exit(-1); + } + else if (freq < 0){ + cerr << "ERROR: frequency must be a positive integer, in line :\n" + << line <<"\n"; + exit(-1); + } + else if(word_id >= list.size()){ + list.resize(word_id+1); + list[word_id].word = word ; + s2i[word]=word_id; + list[word_id].freq = 0 ; + noUniqueTokens = word_id + 1 ; + // noUniqueTokens++ ; + // total += freq ; + } + else if(list[word_id].word != "\0"){ + cerr << "ERROR: TOKEN ID must be unique for each token, in line :\n" + << line <<"\n"; + cerr << "TOKEN ID " << word_id << " has already been assigned to: " << + list[word_id].word << "\n"; + exit(-1); + } + else { // line has valid information + list[word_id].word = word ; + s2i[word]=word_id; + list[word_id].freq = 0 ; + // noUniqueTokens++ ; + noUniqueTokens = word_id + 1 ; + // total += freq ; + } + } // end of while +} + + +void vcbList::compact(const std::set<WordIndex>& evoc){ + int del = 0; + for(int i=0; i< list.size() ; i++){ + if(evoc.find(i)==evoc.end()){ // Not appear in corpus + s2i.erase(list[i].word); + list[i].word = ""; + del++; + } + } + cerr << "Compacted Vocabulary, eliminated " << del << " entries " + << s2i.size() << " remains " << endl; +} + + |