Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/moses/FF
diff options
context:
space:
mode:
authormsalameh83 <msalameh83@gmail.com>2016-04-01 20:53:54 +0300
committermsalameh83 <msalameh83@gmail.com>2016-04-01 20:53:54 +0300
commit9b260d48c99851a4ab11c69cb569178ba182799f (patch)
tree1d9250f6c1b12dae995a89a2802b9023f6d3d3b8 /moses/FF
parentf23beaa27dbc73c6eb3af4213ca4eb66f592cc53 (diff)
deseg initial commit
Diffstat (limited to 'moses/FF')
-rw-r--r--moses/FF/Dsg-Feature/Desegmenter.cpp133
-rw-r--r--moses/FF/Dsg-Feature/Desegmenter.h52
-rw-r--r--moses/FF/Dsg-Feature/DsgModel.cpp155
-rw-r--r--moses/FF/Dsg-Feature/DsgModel.h63
-rw-r--r--moses/FF/Dsg-Feature/KenDsg.cpp34
-rw-r--r--moses/FF/Dsg-Feature/KenDsg.h62
-rw-r--r--moses/FF/Dsg-Feature/dsgHyp.cpp424
-rw-r--r--moses/FF/Dsg-Feature/dsgHyp.h109
8 files changed, 1032 insertions, 0 deletions
diff --git a/moses/FF/Dsg-Feature/Desegmenter.cpp b/moses/FF/Dsg-Feature/Desegmenter.cpp
new file mode 100644
index 000000000..93b6db528
--- /dev/null
+++ b/moses/FF/Dsg-Feature/Desegmenter.cpp
@@ -0,0 +1,133 @@
+#include <iostream> // std::cout
+#include <fstream> // std::ifstream
+#include<string>
+#include<sstream>
+#include<vector>
+#include<map>
+
+#include "Desegmenter.h"
+#include <boost/algorithm/string/replace.hpp>
+
+using namespace std;
+
+namespace Moses
+{
+// Loads the desegmentation table from a tab-separated file: column 3 holds
+// the segmented form (key) and column 2 the desegmented surface form (value).
+// Malformed rows with fewer than 3 columns are now skipped instead of
+// triggering an out-of-range access on myline[2].
+void Desegmenter::Load(const string filename){
+
+ std::ifstream myFile(filename.c_str() );//, std::ifstream::in);
+ if (myFile.is_open()){
+ cerr << "Desegmentation File open successful." << endl;
+ string line;
+ while (getline(myFile, line)){
+ stringstream ss(line);
+ string token;
+ vector<string> myline;
+ while (getline(ss, token, '\t')){
+ myline.push_back(token);
+ }
+ // BUG FIX: a short/empty row used to index past the end of myline.
+ if (myline.size() < 3)
+ continue;
+ mmDesegTable.insert(pair<string, string>(myline[2], myline[1] ));
+ }
+ myFile.close();
+ }
+ else
+ cerr << "open() failed: check if Desegmentation file is in right folder" << endl;
+}
+
+
+
+
+// Returns every desegmented candidate stored for a segmented key. When the
+// key is absent from the table, falls back to rule-based desegmentation and
+// returns a single-element vector.
+vector<string> Desegmenter::Search(string myKey){
+ vector<string> result;
+ // equal_range visits all entries sharing the key in one pass; this replaces
+ // the original find() + count() + manual-increment walk.
+ pair<multimap<string, string>::const_iterator,
+ multimap<string, string>::const_iterator> range = mmDesegTable.equal_range(myKey);
+ if (range.first != range.second){
+ for (multimap<string, string>::const_iterator it = range.first; it != range.second; ++it){
+ result.push_back(it->second);
+ }
+ }
+ else{
+ // Key unseen in the table: apply the deterministic concatenation rules.
+ result.push_back(ApplyRules(myKey));
+ }
+ return result;
+}
+
+// Rule-based fallback desegmentation for keys missing from the table.
+// Applies Arabic (Buckwalter-transliterated) orthographic rewrite rules in a
+// fixed order; order matters because later rules consume the generic '+'
+// separators left over after the specific rules have fired.
+string Desegmenter::ApplyRules(string & segToken){
+ string desegToken=segToken;
+
+ boost::replace_all(desegToken, "l+ All", "ll");
+ boost::replace_all(desegToken, "l+ Al", "ll");
+ boost::replace_all(desegToken, "y+ y ", "y");
+ boost::replace_all(desegToken, "p+ ", "t");
+ boost::replace_all(desegToken, "' +", "}");
+ boost::replace_all(desegToken, "y +", "A");
+ boost::replace_all(desegToken, "n +n", "n");
+ boost::replace_all(desegToken, "mn +m", "mm");
+ boost::replace_all(desegToken, "En +m", "Em");
+ // NOTE(review): "An +lA" -> "Em" mirrors the previous line's replacement
+ // string; possibly a copy-paste slip ("AlA"?) — confirm with the authors.
+ boost::replace_all(desegToken, "An +lA", "Em");
+ boost::replace_all(desegToken, "-LRB-", "(");
+ boost::replace_all(desegToken, "-RRB-", ")");
+ boost::replace_all(desegToken, "+ +", "");
+
+ // Strip any remaining segmentation separators.
+ boost::replace_all(desegToken, "+ ", "");
+ boost::replace_all(desegToken, " +", "");
+
+ return desegToken;
+}
+
+
+// Nothing to release: the only state is a value-owned std::multimap.
+Desegmenter::~Desegmenter()
+{}
+
+
+
+/*
+void Completer::Load(const string filename){
+
+ std::ifstream myFile(filename.c_str() );
+ if (myFile.is_open()){
+ cerr << "Completer File open successful." << endl;
+ string line;
+ while (getline(myFile, line)){
+ stringstream ss(line);
+ string token;
+ vector<string> myline;
+ while (getline(ss, token, '\t')){
+ myline.push_back(token);
+ }
+ mmDetok.insert(pair<string, string>(myline[0], myline[1] ));
+ }
+ myFile.close();
+ }
+ else
+ cerr << "open() failed: check if Desegmentation file is in right folder" << endl;
+ //return mmDetok;
+}
+
+string Completer::Search(string myKey){
+
+ //unordered_multimap<string, string>::const_iterator mmiPairFound = mmDetok.find(myKey);
+ map<string, string>::const_iterator mi = mmDetok.find(myKey);
+ //vector<string> result;
+ string result="";
+ if (mi != mmDetok.end()){
+ result=mi->second;
+ return result;
+ }
+ else{
+ return result;
+ }
+}
+
+Completer::~Completer()
+{}
+*/
+
+}
diff --git a/moses/FF/Dsg-Feature/Desegmenter.h b/moses/FF/Dsg-Feature/Desegmenter.h
new file mode 100644
index 000000000..133bcee4d
--- /dev/null
+++ b/moses/FF/Dsg-Feature/Desegmenter.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include<string>
+#include<map>
+
+
+using namespace std;
+
+namespace Moses
+{
+// Maps segmented (morpheme-split) tokens back to surface word forms, backed
+// by a tab-separated table loaded at construction; unseen keys fall back to
+// the rewrite rules in ApplyRules().
+class Desegmenter
+{
+private:
+ std::multimap<string, string> mmDesegTable; // segmented form -> surface form(s)
+ std::string filename; // path of the loaded table
+ void Load(const string filename);
+
+public:
+ // Loads the desegmentation table eagerly; file I/O happens here.
+ Desegmenter(const std::string& file){
+ filename = file;
+ Load(filename);//, mmDetok);
+ }
+ string getFileName(){ return filename; }
+
+ // All table entries for myKey, or one rule-derived fallback entry.
+ vector<string> Search(string myKey);
+ string ApplyRules(string &);
+
+ ~Desegmenter();
+};
+
+
+/*class Completer
+{
+private:
+ //std::multimap<string, string,std::less< std::string > > mmDetok;
+ std::map<string, string> mmDetok;
+ std::string filename;
+ void Load(const string filename);
+
+public:
+ Completer(const std::string& file){
+ filename = file;
+ Load(filename);//, mmDetok);
+ }
+ string getFileName(){ return filename; }
+ string Search(string myKey);
+
+ ~Completer();
+};
+*/
+
+}
diff --git a/moses/FF/Dsg-Feature/DsgModel.cpp b/moses/FF/Dsg-Feature/DsgModel.cpp
new file mode 100644
index 000000000..635109774
--- /dev/null
+++ b/moses/FF/Dsg-Feature/DsgModel.cpp
@@ -0,0 +1,155 @@
+#include <fstream>
+#include "DsgModel.h"
+#include "dsgHyp.h"
+#include "moses/Util.h"
+#include "util/exception.hh"
+
+using namespace std;
+using namespace lm::ngram;
+
+namespace Moses
+{
+
+  // Registers 5 dense feature scores (desegmented LM + 3 contiguity features
+  // + unsegmented word penalty) and parses the feature-line parameters.
+  DesegModel::DesegModel(const std::string &line)
+    :StatefulFeatureFunction(5, line )
+  {
+    tFactor = 0;
+    order=5;
+    numFeatures = 5;
+    optimistic = 1;
+    // BUG FIX: null-init owned pointers so the destructor is well-defined
+    // even if Load()/readLanguageModel() was never called.
+    DSGM = NULL;
+    desegT = NULL;
+    ReadParameters();
+  }
+
+  DesegModel::~DesegModel()
+  {
+    delete DSGM;
+    // BUG FIX: desegT (allocated in readLanguageModel) was leaked.
+    delete desegT;
+  }
+
+  // Constructs the KenLM-backed desegmented language model and loads the
+  // desegmentation table.
+  void DesegModel :: readLanguageModel(const char *lmFile)
+  {
+    // Use the argument callers actually pass (Load() hands us m_lmPath)
+    // rather than silently ignoring it; the unused local KenLM state that
+    // was fetched here has been dropped.
+    DSGM = ConstructDsgLM(lmFile);
+    desegT=new Desegmenter(m_desegPath);// Desegmentation Table
+  }
+
+
+  // Framework entry point: stores decoder options and builds the LM and the
+  // desegmentation table from the configured paths.
+  void DesegModel::Load(AllOptions::ptr const& opts)
+  {
+    m_options = opts; //ADDED
+    readLanguageModel(m_lmPath.c_str());
+  }
+
+
+
+  // Precomputes estimated scores for a target phrase in isolation (no
+  // surrounding hypothesis context): scores each phrase from KenLM's null
+  // context and writes the result into estimatedScores.
+  void DesegModel:: EvaluateInIsolation(const Phrase &source
+                                        , const TargetPhrase &targetPhrase
+                                        , ScoreComponentCollection &scoreBreakdown
+                                        , ScoreComponentCollection &estimatedScores) const
+  {
+
+    dsgHypothesis obj;
+    vector <string> myTargetPhrase; // NOTE(review): unused local — candidate for removal
+    vector<float> scores;
+    vector<string> targ_phrase; //stores the segmented tokens in the target phrase
+    const AlignmentInfo &align = targetPhrase.GetAlignTerm();
+
+    // Collect the chosen output factor's surface string for each target word.
+    for (int i = 0; i < targetPhrase.GetSize(); i++) {
+      targ_phrase.push_back(targetPhrase.GetWord(i).GetFactor(tFactor)->GetString().as_string());
+    }
+
+    // Isolation scoring starts from the null LM context, not BOS.
+    obj.setState(DSGM->NullContextState());
+    obj.setPhrases(targ_phrase);
+    obj.calculateDsgProbinIsol(*DSGM,*desegT,align);
+    obj.populateScores(scores,numFeatures);
+    estimatedScores.PlusEquals(this, scores);
+
+  }
+
+
+  // Stateful scoring during phrase-based decoding: restores the previous
+  // hypothesis' state (dangling affixes, spans, LM state, delta), scores the
+  // newly appended target phrase, and returns the new recombination state.
+  FFState* DesegModel::EvaluateWhenApplied(
+    const Hypothesis& cur_hypo,
+    const FFState* prev_state,
+    ScoreComponentCollection* accumulator) const
+  {
+
+    const TargetPhrase &target = cur_hypo.GetCurrTargetPhrase();
+    const Range &src_rng =cur_hypo.GetCurrSourceWordsRange();
+    const AlignmentInfo &align = cur_hypo.GetCurrTargetPhrase().GetAlignTerm();
+    // Offset turns phrase-local alignment points into sentence positions,
+    // needed by the contiguity features.
+    size_t sourceOffset = src_rng.GetStartPos();
+
+    dsgHypothesis obj;
+    vector<float> scores;
+    vector<string> targ_phrase; //stores the segmented tokens in the target phrase
+    bool isCompleted;
+
+    isCompleted=cur_hypo.IsSourceCompleted();
+    for (int i = 0; i < cur_hypo.GetCurrTargetLength(); i++) {
+      targ_phrase.push_back(target.GetWord(i).GetFactor(tFactor)->GetString().as_string());
+    }
+
+    obj.setState(prev_state);
+    obj.setPhrases( targ_phrase );
+    obj.calculateDsgProb(*DSGM,*desegT,isCompleted,align, sourceOffset, optimistic);
+    obj.populateScores(scores,numFeatures);
+    accumulator->PlusEquals(this, scores);
+    return obj.saveState();
+
+  }
+
+  // Chart (hierarchical) decoding is unsupported: this feature maintains
+  // strictly left-to-right state across hypotheses.
+  FFState* DesegModel::EvaluateWhenApplied(
+    const ChartHypothesis& /* cur_hypo */,
+    int /* featureID - used to index the state in the previous hypotheses */,
+    ScoreComponentCollection* accumulator) const
+  {
+    // BUG FIX: the original message read "not support by UTIL_THROW2",
+    // blaming the throw macro instead of naming this feature.
+    UTIL_THROW2("Chart decoding not supported by DesegModel");
+
+  }
+
+  // Initial decoder state: seed the feature with KenLM's begin-of-sentence
+  // context and empty buffer/span/delta.
+  const FFState* DesegModel::EmptyHypothesisState(const InputType &input) const
+  {
+    VERBOSE(3,"DesegModel::EmptyHypothesisState()" << endl);
+    return new dsgState(DSGM->BeginSentenceState());
+  }
+
+  // Weight short name for config files; idx is unused because all five
+  // features share the "dsg" prefix.
+  std::string DesegModel::GetScoreProducerWeightShortName(unsigned idx) const
+  {
+    return "dsg";
+  }
+
+
+  // Consumes key=value options from the feature line; any unrecognised key
+  // is forwarded to the base class.
+  void DesegModel::SetParameter(const std::string& key, const std::string& value)
+  {
+    if (key == "path") {          // desegmented LM file
+      m_lmPath = value;
+      return;
+    }
+    if (key == "contiguity-features") {
+      numFeatures = (value == "no") ? 1 : 5;
+      return;
+    }
+    if (key == "output-factor") {
+      tFactor = Scan<int>(value);
+      return;
+    }
+    if (key == "optimistic") {
+      optimistic = (value == "n") ? 0 : 1;
+      return;
+    }
+    if (key == "deseg-path") {    // desegmentation table file
+      m_desegPath = value;
+      return;
+    }
+    if (key == "order") {
+      order = Scan<int>(value);
+      return;
+    }
+    StatefulFeatureFunction::SetParameter(key, value);
+  }
+
+  // The feature only reads factor 0 of each target word.
+  bool DesegModel::IsUseable(const FactorMask &mask) const
+  {
+    return mask[0];
+  }
+
+} // namespace
diff --git a/moses/FF/Dsg-Feature/DsgModel.h b/moses/FF/Dsg-Feature/DsgModel.h
new file mode 100644
index 000000000..8db0eea05
--- /dev/null
+++ b/moses/FF/Dsg-Feature/DsgModel.h
@@ -0,0 +1,63 @@
+#pragma once
+
+#include <string>
+#include <map>
+#include <vector>
+#include "moses/FF/StatefulFeatureFunction.h"
+#include "moses/Manager.h"
+#include "moses/FF/Dsg-Feature/dsgHyp.h"
+#include "moses/FF/Dsg-Feature/Desegmenter.h"
+#include "KenDsg.h"
+
+
+namespace Moses
+{
+
+  // Stateful feature function scoring desegmented (whole-word) LM
+  // probabilities over morpheme-segmented target output, plus contiguity
+  // features over the source alignments of the morphemes forming each word.
+  class DesegModel : public StatefulFeatureFunction
+  {
+  public:
+
+    DsgLM * DSGM;            // KenLM wrapper over the desegmented LM (owned)
+    Desegmenter* desegT;     // segmented->surface lookup table (owned) //MSAL
+    int tFactor;// Target Factor ...
+    int order; //MSAL — LM order (informational; KenLM carries its own order)
+    int numFeatures; // Number of feature scores: 1 (LM only) or 5 (LM + 3 contiguity + unsegmented word penalty)
+    bool optimistic; // score incomplete words eagerly and correct via delta later
+
+    DesegModel(const std::string &line);
+    ~DesegModel();
+
+    // Builds DSGM and desegT from the configured paths.
+    void readLanguageModel(const char *);
+    void Load(AllOptions::ptr const& opts);
+
+    FFState* EvaluateWhenApplied(
+      const Hypothesis& cur_hypo,
+      const FFState* prev_state,
+      ScoreComponentCollection* accumulator) const;
+
+    // Chart decoding is unsupported and throws.
+    virtual FFState* EvaluateWhenApplied(
+      const ChartHypothesis& /* cur_hypo */,
+      int /* featureID - used to index the state in the previous hypotheses */,
+      ScoreComponentCollection* accumulator) const;
+
+    void EvaluateInIsolation(const Phrase &source
+                             , const TargetPhrase &targetPhrase
+                             , ScoreComponentCollection &scoreBreakdown
+                             , ScoreComponentCollection &estimatedScores) const;
+
+    virtual const FFState* EmptyHypothesisState(const InputType &input) const;
+
+    virtual std::string GetScoreProducerWeightShortName(unsigned idx=0) const;
+
+    void SetParameter(const std::string& key, const std::string& value);
+
+    bool IsUseable(const FactorMask &mask) const;
+
+  protected:
+    typedef std::vector<float> Scores;
+    std::string m_lmPath;     // "path" option: desegmented LM file
+    std::string m_desegPath;  // "deseg-path" option: desegmentation table
+  };
+
+
+}
diff --git a/moses/FF/Dsg-Feature/KenDsg.cpp b/moses/FF/Dsg-Feature/KenDsg.cpp
new file mode 100644
index 000000000..08a8dd0ed
--- /dev/null
+++ b/moses/FF/Dsg-Feature/KenDsg.cpp
@@ -0,0 +1,34 @@
+#include "KenDsg.h"
+
+namespace Moses
+{
+
+  // Factory: sniffs the KenLM binary format of `file` and instantiates the
+  // matching KenDsg specialization; ARPA/unrecognized files fall back to the
+  // probing model. Caller owns the returned pointer.
+  DsgLM* ConstructDsgLM(const char *file)
+  {
+    lm::ngram::ModelType model_type;
+    lm::ngram::Config config;
+    if (lm::ngram::RecognizeBinary(file, model_type)) {
+      switch(model_type) {
+      case lm::ngram::PROBING:
+        return new KenDsg<lm::ngram::ProbingModel>(file, config);
+      case lm::ngram::REST_PROBING:
+        return new KenDsg<lm::ngram::RestProbingModel>(file, config);
+      case lm::ngram::TRIE:
+        return new KenDsg<lm::ngram::TrieModel>(file, config);
+      case lm::ngram::QUANT_TRIE:
+        return new KenDsg<lm::ngram::QuantTrieModel>(file, config);
+      case lm::ngram::ARRAY_TRIE:
+        return new KenDsg<lm::ngram::ArrayTrieModel>(file, config);
+      case lm::ngram::QUANT_ARRAY_TRIE:
+        return new KenDsg<lm::ngram::QuantArrayTrieModel>(file, config);
+      default:
+        UTIL_THROW2("Unrecognized kenlm model type " << model_type);
+      }
+    } else {
+      // Not a recognized binary — assume text/ARPA, loadable by ProbingModel.
+      return new KenDsg<lm::ngram::ProbingModel>(file, config);
+    }
+  }
+
+} // namespace
+
+
diff --git a/moses/FF/Dsg-Feature/KenDsg.h b/moses/FF/Dsg-Feature/KenDsg.h
new file mode 100644
index 000000000..44d7ea6eb
--- /dev/null
+++ b/moses/FF/Dsg-Feature/KenDsg.h
@@ -0,0 +1,62 @@
+#pragma once
+
+#include <string>
+#include "lm/model.hh"
+//#include <boost/shared_ptr.hpp>
+
+namespace Moses
+{
+
+// Type-erasing interface over the concrete KenLM model templates so the
+// feature can hold one pointer regardless of the binary format on disk.
+class KenDsgBase
+{
+  public:
+  virtual ~KenDsgBase() {}
+
+  // Log-prob of `word` given in-state; writes the successor state.
+  virtual float Score(const lm::ngram::State&, StringPiece,
+                      lm::ngram::State&) const = 0;
+
+  virtual const lm::ngram::State &BeginSentenceState() const = 0;
+
+  virtual const lm::ngram::State &NullContextState() const = 0;
+
+  // Log-prob of the end-of-sentence token from in_state.
+  virtual float ScoreEndSentence(const lm::ngram::State &in_state, lm::ngram::State &out_state) const = 0;
+};
+
+// Concrete adapter binding KenDsgBase to one KenLM model type; words are
+// mapped to vocabulary ids on every call.
+template <class KenModel>
+  class KenDsg : public KenDsgBase
+{
+  public:
+  KenDsg(const char *file, const lm::ngram::Config &config)
+    : m_kenlm(file, config) {}
+
+  float Score(const lm::ngram::State &in_state,
+              StringPiece word,
+              lm::ngram::State &out_state) const {
+    return m_kenlm.Score(in_state, m_kenlm.GetVocabulary().Index(word),
+                         out_state);
+  }
+
+  const lm::ngram::State &BeginSentenceState() const {
+    return m_kenlm.BeginSentenceState();
+  }
+
+  const lm::ngram::State &NullContextState() const {
+    return m_kenlm.NullContextState();
+  }
+
+  float ScoreEndSentence(const lm::ngram::State &in_state, lm::ngram::State &out_state) const {
+    return m_kenlm.Score(in_state, m_kenlm.GetVocabulary().EndSentence(), out_state);
+  }
+
+
+  private:
+  // boost::shared_ptr<KenModel> m_kenlm;
+  KenModel m_kenlm; // owned by value; KenModel manages its own file mapping
+};
+
+ typedef KenDsgBase DsgLM;
+
+ DsgLM* ConstructDsgLM(const char *file);
+
+
+} // namespace
diff --git a/moses/FF/Dsg-Feature/dsgHyp.cpp b/moses/FF/Dsg-Feature/dsgHyp.cpp
new file mode 100644
index 000000000..6329b5ebd
--- /dev/null
+++ b/moses/FF/Dsg-Feature/dsgHyp.cpp
@@ -0,0 +1,424 @@
+#include "dsgHyp.h"
+#include <sstream>
+#include <boost/algorithm/string.hpp>
+#include <algorithm>
+#include <cstdlib> //NEW
+#include <math.h> //NEW
+#include <map> //NEW
+
+
+using namespace std;
+using namespace lm::ngram;
+
+namespace Moses
+{
+  // Wraps a KenLM state; buffer/span/delta start empty and are filled via
+  // saveState().
+  dsgState::dsgState(const State & val)
+  {
+    lmState = val;
+  }
+
+  // Stores the carry-over information for the next hypothesis: dangling
+  // (word-incomplete) tokens, their source positions, and the optimistic
+  // score correction delta.
+  void dsgState::saveState( std::vector<std::string> danglingTok, std::vector<int> srcSpans,float deltaValue)
+  {
+    //gap.clear();
+    buffer = danglingTok;
+    span=srcSpans;
+    delta=deltaValue;//NEW
+  }
+
+
+  // Recombination hash. Only the KenLM state participates, mirroring
+  // operator== which also compares lmState alone.
+  size_t dsgState::hash() const
+  {
+    size_t ret = 0;
+    boost::hash_combine(ret, lmState);
+    // BUG FIX: the original fell off the end of this non-void function
+    // (undefined behaviour); the combined hash is now returned.
+    return ret;
+  }
+
+  // Recombination equality: two states compare equal iff their KenLM states
+  // are equal. NOTE(review): buffer, span and delta are deliberately ignored
+  // here (the stricter comparison is kept below, commented out) — confirm
+  // that recombining hypotheses with different dangling affixes is intended.
+  bool dsgState::operator==(const FFState& otherBase) const //CHECK
+  {
+    const dsgState &other = static_cast<const dsgState&>(otherBase);
+
+    // Expresses (lmState == other.lmState) via KenLM's < and == operators.
+    if (lmState < other.lmState) return false;
+    if (lmState == other.lmState) return true;
+    return false;
+
+    /*if (buffer.size()!=other.buffer.size()){return false;}
+    if (span.size()!=other.span.size()){return false;};
+    if (delta!=other.delta){return false;}
+    if (lmState.length!=other.lmState.length){return false;}
+    //if (lmState == other.lmState) {return true;}
+    return true;*/
+
+  }
+
+  // Debug label for this state type.
+  std::string dsgState :: getName() const
+  {
+    return "done";
+  }
+
+ //////////////////////////////////////////////////
+
+  // Zero-initialises all score accumulators and the carry-over buffer.
+  dsgHypothesis :: dsgHypothesis()
+  {
+    lmProb = 0;
+    discontig0 = 0;
+    discontig1 = 0;
+    discontig2 = 0;
+    UnsegWP = 0;
+    m_buffer.clear();//="";
+    // BUG FIX: delta was left uninitialised; calculateDsgProb() subtracts it
+    // (lmProb -= delta) before setState() necessarily ran, reading garbage.
+    delta = 0.0;
+  }
+
+  // Restores buffer, source spans, KenLM state and optimistic delta from the
+  // previous hypothesis' feature state (no-op on a null state).
+  void dsgHypothesis :: setState(const FFState* prev_state)
+  {
+    const dsgState * prev = static_cast <const dsgState *> (prev_state);
+    if (prev != NULL) {
+      m_buffer = prev->getBuffer();
+      m_span   = prev->getSpan();
+      lmState  = prev->getLMState();
+      delta    = prev->getDelta();
+    }
+  }
+
+  // Packages the hypothesis' carry-over data into a freshly allocated
+  // dsgState; ownership passes to the decoder.
+  dsgState * dsgHypothesis :: saveState()
+  {
+    dsgState * statePtr = new dsgState(lmState);
+    statePtr->saveState(m_buffer, m_span, delta);
+    //statePtr->saveState(gap,span,0.0);
+    return statePtr;
+  }
+
+  // Emits the feature scores in the fixed order registered with the decoder:
+  // [lmProb] when numFeatures==1, otherwise
+  // [lmProb, discontig0, discontig1, discontig2, UnsegWP].
+  void dsgHypothesis :: populateScores(vector <float> & scores , const int numFeatures)
+  {
+    scores.clear();
+    scores.push_back(lmProb); //TODAY
+
+    if (numFeatures == 1)
+      return;
+    scores.push_back(discontig0);
+    scores.push_back(discontig1);
+    scores.push_back(discontig2);
+    scores.push_back(UnsegWP);
+  }
+
+
+
+  // A prefix morpheme ends in '+' (e.g. "w+"); a bare "+" is not an affix.
+  // BUG FIX: an empty token used to throw from at(); it is now rejected.
+  bool dsgHypothesis::isPrefix(const std::string &tok){
+    if (tok.empty()) { return false; }
+    return (tok[tok.size() - 1] == '+') && (tok != "+");
+  }
+
+  // A suffix morpheme starts with '+' (e.g. "+hm"); a bare "+" is not an
+  // affix. BUG FIX: an empty token used to throw from at(); now rejected.
+  bool dsgHypothesis::isSuffix(const std::string &tok){
+    if (tok.empty()) { return false; }
+    return (tok[0] == '+') && (tok != "+");
+  }
+
+  // A stem carries no '+' on either edge. BUG FIX: an empty token used to
+  // throw from at(); it is now classified as not-a-stem.
+  bool dsgHypothesis::isStem(const std::string &tok){
+    if (tok.empty()) { return false; }
+    return (tok[0] != '+') && (tok[tok.size() - 1] != '+');
+  }
+
+
+
+ /**
+ * chain stores segmented tokens that are in process of building a word
+ * The function checks if tok contributes to the word being formed in chain
+ *
+ */
+  bool dsgHypothesis::isValidChain(const std::string &tok, std::vector<std::string> &chain){
+    std::string last_tok;
+    if (chain.size() >= 1){
+      last_tok = chain[chain.size() - 1];
+    }
+    else{
+      last_tok = "NULL"; // sentinel: chain is empty
+    }
+    // A bare "+" never extends a chain.
+    if(tok=="+"){return false;}
+    // Prefixes may start a word or follow another prefix.
+    if (isPrefix(tok) && (chain.size() == 0 || isPrefix(last_tok))) { return true; }
+    // A suffix must close a stem or prefix; this variant allows one suffix ONLY.
+    else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok)))) { return true; } // allows one suffix ONLY
+    //else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok) || isSuffix(last_tok) ))) { return true; } // allows multiple suffixes
+    // A stem may start a word or follow prefixes; stem-after-stem starts a new word.
+    else if (isStem(tok) && (chain.size() == 0 || isPrefix(last_tok))) { return true; }
+    else { return false; }
+  }
+
+ /**
+ * grouper function groups tokens that form a word together
+ */
+  /**
+   * grouper function groups tokens that form a word together.
+   * Returns one space-joined string per (possibly partial) word; in parallel,
+   * allchain_ids receives the sentence-level source positions aligned to each
+   * word. The previous hypothesis' dangling tokens (m_buffer/m_span) seed the
+   * first chain unless `isolation` is set.
+   */
+  vector<string> dsgHypothesis::grouper(std::vector<std::string> &phr_vec,vector<vector<int> > &allchain_ids, int sourceOffset,const AlignmentInfo &align, bool isolation){
+
+    std::vector<std::string> chain;      // morphemes of the word being built
+    std::vector<int> chain_ids;          // their source positions
+    std::vector<std::string> allchains;  // completed (or trailing) words
+    chain_ids=m_span;//MSAL — inherit spans of the carried-over affixes
+
+    if (!m_buffer.empty() && !isolation){// if evaluate in isolation is called, then do not add buffer content
+      for (int i = 0; i < m_buffer.size(); i++){ // initialize chain with the content of the buffer
+        chain.push_back(m_buffer[i]);
+      }
+    }
+
+    for (int i = 0; i < phr_vec.size(); i++){
+      std::set<std::size_t> sourcePosSet = align.GetAlignmentsForTarget(i);
+
+      if (isValidChain(phr_vec[i], chain)){
+        // Token extends the current word: absorb it and its source positions.
+        chain.push_back(phr_vec[i]);
+        if (sourcePosSet.empty()==false){
+          for (std::set<size_t>::iterator it(sourcePosSet.begin());it != sourcePosSet.end(); it++) {
+            int cur=*it;
+            chain_ids.push_back(cur+sourceOffset); //MSAL
+          }
+        }
+      }
+
+      else if (chain.size() == 0) { // start of a suffix at hypothesis0
+        allchains.push_back(phr_vec[i]);
+        allchain_ids.push_back(chain_ids);
+        chain_ids.clear();//={};
+      }
+
+      else { // tokens formed a complete word; add tokens segmented by space to allchains
+        std::string joined = boost::algorithm::join(chain, " ");
+        allchains.push_back(joined);
+        allchain_ids.push_back(chain_ids);
+
+        chain.clear();// = {};
+        chain_ids.clear();//={};
+
+        // The current token starts the next word.
+        chain.push_back(phr_vec[i]);
+        if (sourcePosSet.empty()==false){
+          for (std::set<size_t>::iterator it(sourcePosSet.begin());it != sourcePosSet.end(); it++) {
+            int cur=*it;
+            chain_ids.push_back(cur+sourceOffset); //MSAL
+          }
+        }
+        /*else {
+        //chain_ids.push_back(sourceOffset);
+        //std::cout << sourceOffset <<" $ ";
+        //chain_ids.push_back({});
+        std::cout << "NONE $ ";
+        }*/
+        //chain_ids.push_back(i+sourceOffset);//MSAL
+      }
+
+    }
+
+    // Flush the trailing, possibly incomplete, word.
+    if (!chain.empty()){
+      std::string joined = boost::algorithm::join(chain, " ");
+      allchains.push_back(joined);
+      allchain_ids.push_back(chain_ids);
+    }
+    return allchains;
+  }
+
+
+
+  // Scores the current phrase in isolation (for future-cost estimation):
+  // groups morphemes into words, desegments each multi-morpheme word via the
+  // table, and accumulates KenLM log-probs. Contiguity counters stay zero;
+  // only the unsegmented-word-penalty and LM score are estimated here.
+  void dsgHypothesis :: calculateDsgProbinIsol(DsgLM & ptrDsgLM, Desegmenter &desegT, const AlignmentInfo &align ){
+    lmProb = 0;
+    State currState = lmState;
+    State temp;
+    string desegmented="";
+    vector <string> words;
+    vector <string> currFVec;
+
+    discontig0=0;
+    discontig1=0;
+    discontig2=0;
+    UnsegWP=0;
+
+    currFVec = m_buffer;
+
+    /*
+    std::cout << "GAP: ";
+    for (int j=0 ; j< m_buffer.size();j++){cout << " " << m_buffer[j];}
+    std::cout << endl;
+    std::cout << "Phrase: ";
+    for (int j=0 ; j< m_curr_phr.size();j++){cout << " " << m_curr_phr[j];}
+    std::cout << endl; */
+
+    currFVec.insert( currFVec.end(), m_curr_phr.begin(), m_curr_phr.end() );
+
+    //std::cout << "First: ";
+    //for (int j=0 ; j< currFVec.size();j++){cout << " " << currFVec[j];}
+    //std::cout << endl;
+
+    int vecSize=currFVec.size();
+
+    // phrases with suffix-starts and prefix-end: half-penalise words that
+    // will be completed by a neighbouring phrase
+    if (currFVec.size()>0 && isPrefix (currFVec.back())) {
+      UnsegWP-=0.5;}
+    if (currFVec.size()>0 && isSuffix (currFVec.front())) {
+      UnsegWP-=0.5;}
+
+
+    /* //Dropping prefix-end and suffix-start
+    while (currFVec.size()>0 && isPrefix (currFVec.back())){
+    currFVec.pop_back(); //drop prefix appearing at end of phrase
+    }
+
+    while (currFVec.size()>0 && isSuffix (currFVec.front())){
+    currFVec.erase (currFVec.begin()); //drop suffix appearning at start of a phrase
+    } */
+
+
+    vector<vector<int> > chain_ids;
+    // isolation flag = 1: ignore any stale buffer carried in m_buffer.
+    words = grouper(currFVec,chain_ids,0,align,1);
+
+    for (int i = 0; i<words.size(); i++) {
+      UnsegWP+=1;
+      temp = currState;
+      if (words[i].find(" ")!=std::string::npos){
+        // Multi-morpheme word: look up (or rule-derive) the surface form.
+        desegmented=desegT.Search(words[i])[0];
+        lmProb += ptrDsgLM.Score(temp,desegmented,currState);
+      }
+      else{
+        boost::replace_all(words[i], "-LRB-", "(");
+        boost::replace_all(words[i], "-RRB-", ")");
+        lmProb += ptrDsgLM.Score(temp,words[i],currState);
+      }
+    }
+    //opProb=TransformLMScore(opProb);
+    lmState = currState;
+  }
+
+  // Full stateful scoring: groups the phrase's morphemes (seeded by the
+  // previous hypothesis' dangling buffer) into words, scores each desegmented
+  // word with KenLM, and fills the three contiguity counters from source-span
+  // gaps. In optimistic mode the trailing incomplete word is scored eagerly
+  // and `delta` records that provisional score so the next hypothesis (or the
+  // end-of-sentence step) can subtract it back out.
+  void dsgHypothesis :: calculateDsgProb(DsgLM& ptrDsgLM, Desegmenter &desegT, bool isCompleted , const AlignmentInfo &align, int sourceOffset, bool optimistic)
+  {
+    lmProb = 0;
+    discontig0=0;
+    discontig1=0;
+    discontig2=0;
+    UnsegWP=0;
+
+    State currState = lmState;
+    State temp;
+    string desegmented="";
+    vector <string> words;
+    vector <string> currFVec;
+    bool completePhraseSuffixEnd = false;
+    vector<vector<int> > all_chain_ids;
+    double pscore;
+    currFVec=m_curr_phr;
+
+    // Check if the the phrase ends in a suffix, which means that it completes a full word;Make sure to change the isValidChain
+    // NOTE(review): currFVec.back() assumes a non-empty target phrase — confirm
+    // the decoder never passes an empty one.
+    if (isSuffix (currFVec.back()) && (currFVec.back()!="+")){completePhraseSuffixEnd=true;}
+
+    words = grouper(currFVec,all_chain_ids,sourceOffset,align,0);
+
+    for (int i = 0; i < words.size(); i++) {
+      temp = currState; //NEW ADDED
+
+      // Special handling for the last word of the phrase, which may be an
+      // incomplete word that the next phrase will extend.
+      if (i==words.size()-1){
+        if (completePhraseSuffixEnd){ //i.e if phrase ends with suffix, which marks an end of a word
+          m_buffer.clear();// ="";
+          m_span.clear();// ={};//MSAL
+          //delta=0.0; //Dont enable this, wrong
+        }
+        else if (!isCompleted) { // not end of sentence( or final hypothesis), and probably the last token is not a complete word
+          m_buffer.clear();
+          if (optimistic == 1){
+            // (2)Comment the below if you want delayed scoring
+            if ( isPrefix (currFVec.back())){ // this will delay scoring of prefix in prefix-ending phrases until the next hypothesis arrives
+
+              // NOTE(review): `desegmented` is still "" here, so this scores an
+              // empty token — verify this matches the intended delayed scoring.
+              pscore = ptrDsgLM.Score(temp,desegmented,currState);
+
+              // enable the 3 lines below with (1) and disable lines below it
+              //opProb = opProb + pscore - delta; //NEW
+              //delta=pscore;
+              //currState=temp;
+
+              // Undo the previous hypothesis' provisional score; nothing new
+              // is claimed for the dangling prefix yet.
+              lmProb -= delta;
+              delta = 0.0;
+            }//*/
+
+            //Comments these else statements below with (2) if you want to delay prefix-end scoring
+            else if (words[i].find(" ")!=std::string::npos){ //NEW
+              desegmented=desegT.Search(words[i])[0]; //NEW
+              pscore=ptrDsgLM.Score(temp,desegmented,currState);
+              //opProb += pscore-delta;
+              // Replace the old provisional score with the new one.
+              lmProb = lmProb + pscore - delta; //NEW
+              delta=pscore;
+              // Keep the LM context at the word boundary so the word can be
+              // re-scored once completed.
+              currState=temp;
+            }
+            else{
+              boost::replace_all(words[i], "-LRB-", "("); //NEW CHECK
+              boost::replace_all(words[i], "-RRB-", ")"); //NEW CHECK
+              pscore=ptrDsgLM.Score(temp,words[i],currState);
+              //opProb += pscore-delta; //NEW
+              lmProb = lmProb + pscore - delta; //NEW
+              delta=pscore; //NEW
+              currState=temp;
+            }}//*/
+
+          // Carry the incomplete word (and its spans) to the next hypothesis.
+          m_buffer.push_back(words.back());
+          //gap=words.back();
+          m_span=all_chain_ids.back();//MSAL
+          //opProb=TransformLMScore(opProb);
+          //lmState = currState;
+          break;
+        }
+      }
+
+      //temp = currState; NEW COMMENTED
+      if (words[i].find(" ")!=std::string::npos){
+        UnsegWP+=1;
+        desegmented=desegT.Search(words[i])[0];
+        // Classify the word's source-position gaps: max adjacent distance
+        // <=1 -> contiguous (discontig0), ==2 -> discontig1, >=3 -> discontig2.
+        std::set<int> cur_chain_ids(all_chain_ids[i].begin(),all_chain_ids[i].end());
+        if (cur_chain_ids.size()>1){
+          vector<int> dsc;
+          for (std::set<int>::iterator it(cur_chain_ids.begin()), next(it);it != cur_chain_ids.end() && ++next != cur_chain_ids.end(); it = next) {
+            int cur=*it;
+            int mynext=*next;
+            if (std::abs(cur - mynext)>= 3) {
+              dsc.push_back(3);
+              //discontig2+=1;
+              //break;
+            }
+            else if (std::abs(cur - mynext)== 2){
+              //discontig1+=1;
+              dsc.push_back(2);
+              //break;
+            }
+            else if (std::abs(cur - mynext)<= 1){
+              //discontig0+=1;
+              dsc.push_back(1);
+            }
+          }
+          int mymax=*std::max_element(dsc.begin(),dsc.end());
+          if (mymax==3){discontig2+=1;}
+          else if (mymax==2){discontig1+=1;}
+          else{discontig0+=1;}
+        }
+        else{
+          discontig0 += 1;
+        }
+
+        //opProb += ptrDsgLM.Score(temp,ptrDsgLM.GetVocabulary().Index(desegmented),currState);
+        lmProb += ptrDsgLM.Score(temp,desegmented,currState);
+      }
+      else{
+        UnsegWP+=1;
+        boost::replace_all(words[i], "-LRB-", "(");
+        boost::replace_all(words[i], "-RRB-", ")");
+        lmProb += ptrDsgLM.Score(temp,words[i],currState);
+      }
+    }
+
+    if (isCompleted){
+      // Close the sentence: score </s> and remove any provisional delta.
+      temp = currState;
+      lmProb = lmProb + ptrDsgLM.ScoreEndSentence(temp,currState) - delta;
+    }
+    //opProb=TransformLMScore(opProb);
+    lmState = currState;
+  }
+
+
+  // Debug hook; intentionally a no-op.
+  void dsgHypothesis :: print()
+  {}
+
+
+} // namespace
diff --git a/moses/FF/Dsg-Feature/dsgHyp.h b/moses/FF/Dsg-Feature/dsgHyp.h
new file mode 100644
index 000000000..a609b7fb6
--- /dev/null
+++ b/moses/FF/Dsg-Feature/dsgHyp.h
@@ -0,0 +1,109 @@
+#pragma once
+
+
+# include "moses/FF/FFState.h"
+# include "moses/Manager.h"
+# include <set>
+# include <map>
+# include <string>
+# include <vector>
+# include "moses/FF/Dsg-Feature/Desegmenter.h"
+# include "KenDsg.h"
+
+
+namespace Moses
+{
+
+  // Feature state carried between hypotheses: the KenLM context plus the
+  // dangling (word-incomplete) morphemes, their source positions, and the
+  // provisional "optimistic" score delta.
+  class dsgState : public FFState
+  {
+  public:
+
+    dsgState(const lm::ngram::State & val);
+    //int Compare(const FFState& other) const;
+    // Recombination equality; compares only the KenLM state. //CHECK
+    virtual bool operator==(const FFState& other) const; //CHECK
+    void saveState( std::vector<std::string> bufferVal,std::vector<int> spanVal, float deltaValue);
+
+    std::vector<std::string> getBuffer() const {
+      return buffer;
+    }
+
+    std::vector<int> getSpan() const {
+      return span;
+    }
+
+    lm::ngram::State getLMState() const {
+      return lmState;
+    }
+
+    float getDelta() const { //NEW
+      return delta;
+    }
+
+    void setDelta(double val1 ) { //NEWWWW
+      delta = val1;
+    }
+
+    void print() const;
+    std::string getName() const;
+
+    // Recombination hash; must stay consistent with operator==.
+    virtual size_t hash() const;
+
+
+  protected:
+    std::vector<std::string> buffer; // dangling morphemes awaiting completion
+    std::vector<int> span;           // source positions of the dangling morphemes
+    lm::ngram::State lmState;        // KenLM context
+    double delta;                    // provisional score to subtract later //NEW
+  };
+
+
+
+// Working object for scoring one hypothesis extension: restores state, groups
+// morphemes into words, queries the desegmented LM, and exposes the resulting
+// feature scores plus the successor dsgState.
+class dsgHypothesis
+{
+
+  private:
+  std::vector<std::string> m_buffer;// maintains dangling affix from previous hypothesis
+  std::vector<int> m_span;// maintains source alignment for dangling affix from previous hypothesis
+  lm::ngram::State lmState;  // KenLM's Model State ...
+  std::vector<std::string> m_curr_phr; //phrase from current hypothesis
+  double delta; // provisional (optimistic) score to be corrected later //NEW
+
+  double lmProb;     // accumulated desegmented-LM log-prob
+  int discontig0;    // words whose source morphemes are contiguous
+  int discontig1;    // words with a max source gap of 2
+  int discontig2;    // words with a max source gap of >=3
+  double UnsegWP;    // unsegmented-word-penalty accumulator
+
+  public:
+
+  dsgHypothesis();
+  ~dsgHypothesis() {};
+  void calculateDsgProb(DsgLM& ptrDsgLM, Desegmenter &, bool isCompleted, const AlignmentInfo &align, int sourceOffset, bool optimistic);
+  void calculateDsgProbinIsol(DsgLM& ptrDsgLM, Desegmenter &, const AlignmentInfo &align);
+
+  void setPhrases(std::vector<std::string> & val1 ) {//MSAL
+    m_curr_phr = val1;
+  }
+
+  void setDelta(double val1 ) { //NEW
+    delta = val1;
+  }
+
+  // Restore carried-over state from the previous hypothesis.
+  void setState(const FFState* prev_state);
+  dsgState * saveState();
+  void print();
+  void populateScores(std::vector <float> & scores , const int numFeatures);
+  void setState(const lm::ngram::State & val) {
+    lmState = val;
+  }
+
+  // Morpheme classification helpers ('+'-edge conventions).
+  bool isPrefix(const std::string &);
+  bool isSuffix(const std::string &);
+  bool isStem(const std::string &);
+  bool isValidChain(const std::string &, std::vector<std::string> &chain);
+  // Groups morphemes into words and collects their source positions.
+  vector<string> grouper(std::vector<std::string> &,std::vector<std::vector<int> > &,int,const AlignmentInfo &align,bool);
+
+};
+} // namespace
+
+