diff options
Diffstat (limited to 'GIZA++-v2/getSentence.cpp')
-rw-r--r-- | GIZA++-v2/getSentence.cpp | 340 |
1 files changed, 340 insertions, 0 deletions
diff --git a/GIZA++-v2/getSentence.cpp b/GIZA++-v2/getSentence.cpp new file mode 100644 index 0000000..78aafcf --- /dev/null +++ b/GIZA++-v2/getSentence.cpp @@ -0,0 +1,340 @@ +/* + +EGYPT Toolkit for Statistical Machine Translation +Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +USA. + +*/ +/* --------------------------------------------------------------------------* + * * + * Module : getSentece * + * * + * Method Definitions File: getSentence.cc * + * * + * Objective: Defines clases and methods for handling I/O for the parallel * + * corpus. * + *****************************************************************************/ + + +#include "getSentence.h" +#include <iostream> +#include <strstream> +#include "Parameter.h" +#include "errno.h" + +int PrintedTooLong=0; + +/* -------------- Method Defnitions for Class sentenceHandler ---------------*/ + +GLOBAL_PARAMETER(double,ManlexMAX_MULTIPLICITY,"manlexMAX_MULTIPLICITY","",PARLEV_EM,20.0); +GLOBAL_PARAMETER(double,Manlexfactor1,"manlexfactor1","",PARLEV_EM,0.0); +GLOBAL_PARAMETER(double,Manlexfactor2,"manlexfactor2","",PARLEV_EM,0.0); + +sentenceHandler::sentenceHandler(const char* filename, vcbList* elist, + vcbList* flist) : realCount(0) + // This method is the constructor of the class, it also intitializes the + // sentence pair sequential number (count) to zero. + +{ + readflag = false ; + allInMemory = false ; + inputFilename = filename ; + inputFile = new ifstream(filename); + pair_no = 0 ; + if(!(*inputFile)){ + cerr << "\nERROR:(a) Cannot open " << filename; + exit(1); + } + currentSentence = 0; + totalPairs1 = 0 ; + totalPairs2 =0; + pair_no = 0 ; + noSentInBuffer = 0 ; + Buffer.clear(); + bool isNegative=0; + if (elist && flist){ + cout << "Calculating vocabulary frequencies from corpus " << filename << '\n'; + sentPair s ; + while (getNextSentence(s, elist, flist)) + { + totalPairs1++; + totalPairs2+=s.realCount; + // NOTE: this value might change during training + // for words from the manual dictionary, yet this is ignored! + + if( s.noOcc<0 ) + isNegative=1; + } + } + if( isNegative==1 ) + { + cerr << "WARNING: corpus contains negative occurrency frequencies => these are interpreted as entries of a manual dictionary.\n"; + realCount=new Vector<double>(totalPairs1,1.0); + } + else + realCount=0; +} + +void sentenceHandler::rewind() +{ + currentSentence = 0; + readflag = false ; + if (!allInMemory || + !(Buffer.size() >= 1 && Buffer[currentSentence].sentenceNo == 1)){ + // check if the buffer doe not already has the first chunk of pairs + if (Buffer.size() > 0) + cerr << ' ' << Buffer[currentSentence].sentenceNo << '\n'; + // totalPairs = 0 ; + pair_no = 0 ; + noSentInBuffer = 0 ; + Buffer.clear(); + } + if (!allInMemory){ + delete inputFile; + inputFile = new ifstream(inputFilename); + if(!(*inputFile)){ + cerr << "\nERROR:(b) Cannot open " << inputFilename << " " << (int)errno; + } + } +} + + +bool sentenceHandler::getNextSentence(sentPair& sent, vcbList* elist, vcbList* flist) +{ + sentPair s ; + if (readflag){ + cerr << "Attempting to read from the end of corpus, rewinding\n"; + rewind(); + return(false); + } + if (currentSentence >= noSentInBuffer){ + if (allInMemory) + return(false); + /* no more sentences in buffer */ + noSentInBuffer = 0 ; + currentSentence = 0 ; + Buffer.clear(); + cout << "Reading more sentence pairs into memory ... \n"; + while((noSentInBuffer < TRAIN_BUFFER_SIZE) && readNextSentence(s)){ + if ((s.fSent.size()-1) > (MAX_FERTILITY-1) * (s.eSent.size()-1)){ + cerr << "WARNING: The following sentence pair has source/target sentence length ration more than\n"<< + "the maximum allowed limit for a source word fertility\n"<< + " source length = " << s.eSent.size()-1 << " target length = " << s.fSent.size()-1 << + " ratio " << double(s.fSent.size()-1)/ (s.eSent.size()-1) << " ferility limit : " << + MAX_FERTILITY-1 << '\n'; + cerr << "Shortening sentence \n"; + cerr << s; + s.eSent.resize(min(s.eSent.size(),s.fSent.size())); + s.fSent.resize(min(s.eSent.size(),s.fSent.size())); + } + Buffer.push_back(s) ; + if (elist && flist){ + if ((*elist).size() > 0) + for (WordIndex i= 0 ; i < s.eSent.size() ; i++){ + if (s.eSent[i] >= (*elist).uniqTokens()){ + if( PrintedTooLong++<100) + cerr << "ERROR: source word " << s.eSent[i] << " is not in the vocabulary list \n"; + exit(-1); + } + (*elist).incFreq(s.eSent[i], s.realCount); + } + if ((*flist).size() > 0) + for (WordIndex j= 1 ; j < s.fSent.size() ; j++){ + if (s.fSent[j] >= (*flist).uniqTokens()){ + cerr << "ERROR: target word " << s.fSent[j] << " is not in the vocabulary list \n"; + exit(-1); + } + (*flist).incFreq(s.fSent[j], s.realCount); + } + } + noSentInBuffer++; + } + if (inputFile->eof()){ + allInMemory = (Buffer.size() >= 1 && + Buffer[currentSentence].sentenceNo == 1) ; + if (allInMemory) + cout << "Corpus fits in memory, corpus has: " << Buffer.size() << + " sentence pairs.\n"; + } + } + if(noSentInBuffer <= 0 ){ + //cerr << "# sent in buffer " << noSentInBuffer << '\n'; + readflag = true ; + return(false); + } + sent = Buffer[currentSentence++] ; + if( sent.noOcc<0 && realCount ) + { + if( Manlexfactor1 && sent.noOcc==-1.0 ) + sent.realCount=Manlexfactor1; + else if( Manlexfactor2 && sent.noOcc==-2.0 ) + sent.realCount=Manlexfactor2; + else + sent.realCount=(*realCount)[sent.getSentenceNo()-1]; + } + return true ; +} +bool sentenceHandler::readNextSentence(sentPair& sent) + /* This method reads in a new pair of sentences, each pair is read from the + corpus file as line triples. The first line the no of times this line + pair occured in the corpus, the second line is the source sentence and + the third is the target sentence. The sentences are represented by a space + separated positive integer token ids. */ +{ + + string line; + bool fail(false) ; + + sent.clear(); + if (getline(*inputFile, line)){ + istrstream buffer(line.c_str()); + buffer >> sent.noOcc; + if( sent.noOcc<0 ) + { + if( realCount ) + { + if( Manlexfactor1 && sent.noOcc==-1.0 ) + sent.realCount=Manlexfactor1; + else if( Manlexfactor2 && sent.noOcc==-2.0 ) + sent.realCount=Manlexfactor2; + else + { + sent.realCount=(*realCount)[pair_no]; + } + } + else + sent.realCount=1.0; + } + else + sent.realCount=sent.noOcc; + } + else { + fail = true ;; + } + if (getline(*inputFile, line)){ + istrstream buffer(line.c_str()); + WordIndex w; // w is a local variabe for token id + sent.eSent.push_back(0); // each source word is assumed to have 0 == + // a null word (id 0) at the begining of the sentence. + while(buffer>>w){ // read source sentece , word by word . + if (sent.eSent.size() < MAX_SENTENCE_LENGTH) + sent.eSent.push_back(w); + else { + if( PrintedTooLong++<100) + cerr << "{WARNING:(a)truncated sentence "<<pair_no<<"}"; + //cerr << "ERROR: getSentence.cc:getNextSentence(): sentence exceeds preset length limit of : " << MAX_SENTENCE_LENGTH << '\n'; + //cerr << "The following sentence will be truncated\n" << line; + break ; + } + } + } + else { + fail = true ; + } + if (getline(*inputFile, line)){ + istrstream buffer(line.c_str()); + WordIndex w; // w is a local variabe for token id + sent.fSent.push_back(0); //0 is inserted for program uniformity + while(buffer>>w){ // read target sentece , word by word . + if (sent.fSent.size() < MAX_SENTENCE_LENGTH) + sent.fSent.push_back(w); + else { + if( PrintedTooLong++<100) + cerr << "{WARNING:(b)truncated sentence "<<pair_no<<"}"; + //cerr << "ERROR: getSentence.cc:getNextSentence(): sentence exceeds preset length limit of : " << MAX_SENTENCE_LENGTH << '\n'; + //cerr << "The following sentence will be truncated\n" << line; + break ; + } + } + } + else { + fail = true ; + } + if (fail){ + sent.eSent.clear(); + sent.fSent.clear(); + sent.sentenceNo = 0 ; + sent.noOcc = 0 ; + sent.realCount=0; + return(false); + } + if( sent.eSent.size()==1||sent.fSent.size()==1 ) + cerr << "ERROR: Forbidden zero sentence length " << sent.sentenceNo << endl; + sent.sentenceNo = ++pair_no; + if(pair_no % 100000 == 0) + cout << "[sent:" << sent.sentenceNo << "]"<< '\n'; + return true; +} + +double optimize_lambda(Vector<double>&vd) +{ + Vector<double> l; + for(double lambda=1.0;lambda<ManlexMAX_MULTIPLICITY;lambda+=0.33) + { + double prod=0.0; + for(unsigned int i=0;i<vd.size();++i) + { + prod += vd[i]*exp(lambda*vd[i])/(exp(lambda*vd[i])-1.0); + } + l.push_back(fabs(prod-1.0)); + } + double lam=double(min_element(l.begin(),l.end())-l.begin())*0.33+1.0; + if( lam<1.0 ) + { + cerr << "ERROR: lambda is smaller than one: " << lam << endl; + for(unsigned int i=0;i<vd.size();++i) + cerr << vd[i] << ' '; + cerr << endl; + } + return lam; +} + +void sentenceHandler::setProbOfSentence(const sentPair&s,double d) +{ + if( realCount==0 ) + return; + else + { + if( s.noOcc<=0 ) + { + double ed=exp(d); + if( oldPairs.size()>0&&(oldPairs.back().get_eSent()!=s.get_eSent()||oldPairs.back().getSentenceNo()>=s.getSentenceNo()) ) + { + double lambda=optimize_lambda(oldProbs); + for(unsigned int i=0;i<oldPairs.size();++i) + { + if( oldProbs[i]<1e-5 ) + (*realCount)[oldPairs[i].getSentenceNo()-1]=1.0; + else + (*realCount)[oldPairs[i].getSentenceNo()-1]=lambda*oldProbs[i]/(1-exp(-lambda*oldProbs[i])); + } + oldPairs.clear(); + oldProbs.clear(); + } + oldPairs.push_back(s); + oldProbs.push_back(ed); + } + } +} + +/* ------------- End of Method Definition of Class sentenceHandler ----------*/ + + + + + + |