1 files changed, 340 insertions, 0 deletions
diff --git a/GIZA++-v2/getSentence.cpp b/GIZA++-v2/getSentence.cpp
new file mode 100644
index 0000000..78aafcf
--- /dev/null
+++ b/GIZA++-v2/getSentence.cpp
@@ -0,0 +1,340 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful, 
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 
+USA.
+
+*/
+/* --------------------------------------------------------------------------*
+ *                                                                           *
+ * Module : getSentece                                                       *
+ *                                                                           *
+ * Method Definitions File: getSentence.cc                                   *
+ *                                                                           *
+ * Objective: Defines clases and methods for handling I/O for the parallel   *
+ *            corpus.                                                        *
+ *****************************************************************************/
+
+
+#include "getSentence.h"
+#include <iostream>
+#include <strstream>
+#include "Parameter.h"
+#include "errno.h"
+
+int PrintedTooLong=0;
+
+/* -------------- Method Defnitions for Class sentenceHandler ---------------*/
+
+GLOBAL_PARAMETER(double,ManlexMAX_MULTIPLICITY,"manlexMAX_MULTIPLICITY","",PARLEV_EM,20.0);
+GLOBAL_PARAMETER(double,Manlexfactor1,"manlexfactor1","",PARLEV_EM,0.0);
+GLOBAL_PARAMETER(double,Manlexfactor2,"manlexfactor2","",PARLEV_EM,0.0);
+
+sentenceHandler::sentenceHandler(const char*  filename, vcbList* elist, 
+				 vcbList*  flist) : realCount(0)
+  // This method is the constructor of the class, it also intitializes the 
+  // sentence pair sequential number (count) to zero.
+
+{
+  readflag = false ;
+  allInMemory = false ;
+  inputFilename = filename ;
+  inputFile = new ifstream(filename);
+  pair_no = 0 ;
+  if(!(*inputFile)){
+    cerr << "\nERROR:(a) Cannot open " << filename;
+    exit(1);
+  }
+  currentSentence = 0;
+  totalPairs1 = 0 ;
+  totalPairs2 =0;
+  pair_no = 0 ;
+  noSentInBuffer = 0 ;
+  Buffer.clear();
+  bool isNegative=0;
+  if (elist && flist){
+    cout << "Calculating vocabulary frequencies from corpus " << filename << '\n';
+    sentPair s ;
+    while (getNextSentence(s, elist, flist))
+      {
+	totalPairs1++;
+	totalPairs2+=s.realCount; 
+	// NOTE: this value might change during training 
+	// for words from the manual dictionary, yet this is ignored!
+
+	if( s.noOcc<0 )
+	  isNegative=1;
+      }
+  }
+  if( isNegative==1 )
+    {
+      cerr << "WARNING: corpus contains negative occurrency frequencies => these are interpreted as entries of a manual dictionary.\n";
+      realCount=new Vector<double>(totalPairs1,1.0);
+    }
+  else
+    realCount=0;
+}
+
+void sentenceHandler::rewind()
+{
+  currentSentence = 0;
+  readflag = false ;
+  if (!allInMemory || 
+      !(Buffer.size() >= 1 && Buffer[currentSentence].sentenceNo == 1)){
+    // check if the buffer doe not already has the first chunk of pairs 
+    if (Buffer.size() > 0)
+      cerr << ' ' <<  Buffer[currentSentence].sentenceNo << '\n';
+    //    totalPairs = 0 ;
+    pair_no = 0 ;
+    noSentInBuffer = 0 ;
+    Buffer.clear();
+  }
+  if (!allInMemory){
+    delete inputFile;
+    inputFile = new ifstream(inputFilename);
+    if(!(*inputFile)){
+      cerr << "\nERROR:(b) Cannot open " << inputFilename << " " << (int)errno;
+    }
+  }
+}
+
+  
+bool sentenceHandler::getNextSentence(sentPair& sent, vcbList* elist, vcbList* flist)
+{
+  sentPair s ;
+  if (readflag){
+    cerr << "Attempting to read from the end of corpus, rewinding\n";
+    rewind();
+    return(false);
+  } 
+  if (currentSentence >= noSentInBuffer){
+    if (allInMemory)
+      return(false);
+    /* no more sentences in buffer */
+    noSentInBuffer = 0 ;
+    currentSentence = 0 ;
+    Buffer.clear();
+    cout << "Reading more sentence pairs into memory ... \n";
+    while((noSentInBuffer < TRAIN_BUFFER_SIZE) && readNextSentence(s)){
+      if ((s.fSent.size()-1) > (MAX_FERTILITY-1) * (s.eSent.size()-1)){
+	cerr << "WARNING: The following sentence pair has source/target sentence length ration more than\n"<<
+	  "the maximum allowed limit for a source word fertility\n"<<
+	  " source length = " << s.eSent.size()-1 << " target length = " << s.fSent.size()-1 <<
+	  " ratio " << double(s.fSent.size()-1)/  (s.eSent.size()-1) << " ferility limit : " <<
+	  MAX_FERTILITY-1 << '\n';
+	cerr << "Shortening sentence \n";
+	cerr << s;
+	s.eSent.resize(min(s.eSent.size(),s.fSent.size()));
+	s.fSent.resize(min(s.eSent.size(),s.fSent.size()));
+      }
+      Buffer.push_back(s) ;
+      if (elist && flist){
+	if ((*elist).size() > 0)
+	  for (WordIndex i= 0 ; i < s.eSent.size() ; i++){
+	    if (s.eSent[i] >= (*elist).uniqTokens()){
+	      if( PrintedTooLong++<100)
+		cerr << "ERROR: source word " << s.eSent[i] << " is not in the vocabulary list \n";
+	      exit(-1);
+	    }
+	    (*elist).incFreq(s.eSent[i], s.realCount);
+	  }
+	if ((*flist).size() > 0)
+	  for (WordIndex j= 1 ; j < s.fSent.size() ; j++){
+	    if (s.fSent[j] >= (*flist).uniqTokens()){
+	      cerr << "ERROR: target word " << s.fSent[j] << " is not in the vocabulary list \n";
+	      exit(-1);
+	    }
+	    (*flist).incFreq(s.fSent[j], s.realCount);
+	  }
+      }
+      noSentInBuffer++;
+    }
+    if (inputFile->eof()){
+      allInMemory = (Buffer.size() >= 1 && 
+		     Buffer[currentSentence].sentenceNo == 1) ;
+      if (allInMemory)
+	cout << "Corpus fits in memory, corpus has: " << Buffer.size() <<
+	  " sentence pairs.\n";
+    }
+  }
+  if(noSentInBuffer <= 0 ){
+    //cerr << "# sent in buffer " << noSentInBuffer << '\n';
+    readflag = true ;
+    return(false);
+  }
+  sent = Buffer[currentSentence++] ;
+  if( sent.noOcc<0 && realCount )
+    {
+      if( Manlexfactor1 && sent.noOcc==-1.0 )
+	sent.realCount=Manlexfactor1;
+      else if( Manlexfactor2 && sent.noOcc==-2.0 )
+	sent.realCount=Manlexfactor2;
+      else
+	sent.realCount=(*realCount)[sent.getSentenceNo()-1];
+    }
+  return true ;
+}
+bool sentenceHandler::readNextSentence(sentPair& sent)
+  /* This method reads in a new pair of sentences, each pair is read from the 
+     corpus file as line triples. The first line the no of times this line 
+     pair occured in the corpus, the second line is the source sentence and 
+     the third is the target sentence. The sentences are represented by a space
+     separated positive integer token ids. */
+{
+
+  string line;
+  bool fail(false) ;
+  
+  sent.clear();
+  if (getline(*inputFile, line)){
+    istrstream buffer(line.c_str());
+    buffer >> sent.noOcc;
+    if( sent.noOcc<0 )
+      {
+	if( realCount )
+	  {
+	    if( Manlexfactor1 && sent.noOcc==-1.0 )
+	      sent.realCount=Manlexfactor1;
+	    else if( Manlexfactor2 && sent.noOcc==-2.0 )
+	      sent.realCount=Manlexfactor2;
+	    else
+	      {
+		sent.realCount=(*realCount)[pair_no];
+	      }
+	  }
+	else
+	  sent.realCount=1.0;
+      }
+    else
+      sent.realCount=sent.noOcc;
+  }
+  else {
+    fail = true ;;
+  }
+  if (getline(*inputFile, line)){
+    istrstream buffer(line.c_str());
+    WordIndex w;  // w is a local variabe for token id
+    sent.eSent.push_back(0); // each source word is assumed to have 0 == 
+    // a null word (id 0) at the begining of the sentence. 
+    while(buffer>>w){ // read source sentece , word by word .
+      if (sent.eSent.size() < MAX_SENTENCE_LENGTH)
+	sent.eSent.push_back(w);
+      else {
+	if( PrintedTooLong++<100)
+	  cerr << "{WARNING:(a)truncated sentence "<<pair_no<<"}";
+	//cerr << "ERROR: getSentence.cc:getNextSentence(): sentence exceeds preset length limit of : " << MAX_SENTENCE_LENGTH << '\n';
+	//cerr << "The following sentence will be truncated\n" << line;
+	break ;
+      }
+    }
+  }
+  else {
+    fail = true ;
+  }
+  if (getline(*inputFile, line)){
+    istrstream buffer(line.c_str());
+    WordIndex w;  // w is a local variabe for token id
+    sent.fSent.push_back(0); //0 is inserted for program uniformity
+    while(buffer>>w){ // read target sentece , word by word .
+      if (sent.fSent.size() < MAX_SENTENCE_LENGTH)
+	sent.fSent.push_back(w);
+      else {
+	if( PrintedTooLong++<100)
+	  cerr << "{WARNING:(b)truncated sentence "<<pair_no<<"}";
+	//cerr << "ERROR: getSentence.cc:getNextSentence(): sentence exceeds preset length limit of : " << MAX_SENTENCE_LENGTH << '\n';
+	//cerr << "The following sentence will be truncated\n" << line;
+	break ;
+      }
+    }
+  }
+  else {
+    fail = true ;
+  }
+  if (fail){
+    sent.eSent.clear();
+    sent.fSent.clear();
+    sent.sentenceNo = 0 ;
+    sent.noOcc = 0 ;
+    sent.realCount=0;
+    return(false);
+  }  
+  if( sent.eSent.size()==1||sent.fSent.size()==1 )
+    cerr << "ERROR: Forbidden zero sentence length " << sent.sentenceNo << endl;
+  sent.sentenceNo = ++pair_no;
+  if(pair_no % 100000 == 0) 
+    cout << "[sent:" << sent.sentenceNo  << "]"<< '\n';
+  return true;
+}
+
+double optimize_lambda(Vector<double>&vd)
+{
+  Vector<double> l;
+  for(double lambda=1.0;lambda<ManlexMAX_MULTIPLICITY;lambda+=0.33)
+    {
+      double prod=0.0;
+      for(unsigned int i=0;i<vd.size();++i)
+	{
+	  prod += vd[i]*exp(lambda*vd[i])/(exp(lambda*vd[i])-1.0);
+	}
+      l.push_back(fabs(prod-1.0));
+    }
+  double lam=double(min_element(l.begin(),l.end())-l.begin())*0.33+1.0;
+  if( lam<1.0 )
+    {
+      cerr << "ERROR: lambda is smaller than one: " << lam << endl;
+      for(unsigned int i=0;i<vd.size();++i)
+	cerr << vd[i] << ' ';
+      cerr << endl;
+    }
+  return lam;
+}
+
+void sentenceHandler::setProbOfSentence(const sentPair&s,double d)
+{
+  if( realCount==0 )
+    return;
+  else
+    {
+      if( s.noOcc<=0 )
+	{
+	  double ed=exp(d);
+	  if( oldPairs.size()>0&&(oldPairs.back().get_eSent()!=s.get_eSent()||oldPairs.back().getSentenceNo()>=s.getSentenceNo()) )
+	    {
+	      double lambda=optimize_lambda(oldProbs);
+	      for(unsigned int i=0;i<oldPairs.size();++i)
+		{
+		  if( oldProbs[i]<1e-5 )
+		    (*realCount)[oldPairs[i].getSentenceNo()-1]=1.0;
+		  else
+		    (*realCount)[oldPairs[i].getSentenceNo()-1]=lambda*oldProbs[i]/(1-exp(-lambda*oldProbs[i]));		    
+		}
+	      oldPairs.clear();
+	      oldProbs.clear();
+	    }
+	  oldPairs.push_back(s);
+	  oldProbs.push_back(ed);
+	}
+    }
+}
+
+/* ------------- End of Method Definition of Class sentenceHandler ----------*/
+
+
+
+
+
+