Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/moses/FF
diff options
context:
space:
mode:
authormsalameh83 <msalameh83@gmail.com>2016-04-01 20:53:54 +0300
committermsalameh83 <msalameh83@gmail.com>2016-04-01 20:53:54 +0300
commit9b260d48c99851a4ab11c69cb569178ba182799f (patch)
tree1d9250f6c1b12dae995a89a2802b9023f6d3d3b8 /moses/FF
parentf23beaa27dbc73c6eb3af4213ca4eb66f592cc53 (diff)
deseg initial commit
Diffstat (limited to 'moses/FF')
-rw-r--r--moses/FF/Dsg-Feature/Desegmenter.cpp133
-rw-r--r--moses/FF/Dsg-Feature/Desegmenter.h52
-rw-r--r--moses/FF/Dsg-Feature/DsgModel.cpp155
-rw-r--r--moses/FF/Dsg-Feature/DsgModel.h63
-rw-r--r--moses/FF/Dsg-Feature/KenDsg.cpp34
-rw-r--r--moses/FF/Dsg-Feature/KenDsg.h62
-rw-r--r--moses/FF/Dsg-Feature/dsgHyp.cpp424
-rw-r--r--moses/FF/Dsg-Feature/dsgHyp.h109
8 files changed, 1032 insertions, 0 deletions
diff --git a/moses/FF/Dsg-Feature/Desegmenter.cpp b/moses/FF/Dsg-Feature/Desegmenter.cpp
new file mode 100644
index 000000000..93b6db528
--- /dev/null
+++ b/moses/FF/Dsg-Feature/Desegmenter.cpp
@@ -0,0 +1,133 @@
+#include <iostream> // std::cout
+#include <fstream> // std::ifstream
+#include<string>
+#include<sstream>
+#include<vector>
+#include<map>
+
+#include "Desegmenter.h"
+#include <boost/algorithm/string/replace.hpp>
+
+using namespace std;
+
+namespace Moses
+{
+// Loads the desegmentation table from a tab-separated file: column 3 holds
+// the segmented form (key) and column 2 the desegmented surface form (value).
+// Malformed rows with fewer than 3 columns are now skipped instead of
+// triggering an out-of-range access on myline[2].
+void Desegmenter::Load(const string filename){
+
+ std::ifstream myFile(filename.c_str() );//, std::ifstream::in);
+ if (myFile.is_open()){
+ cerr << "Desegmentation File open successful." << endl;
+ string line;
+ while (getline(myFile, line)){
+ stringstream ss(line);
+ string token;
+ vector<string> myline;
+ while (getline(ss, token, '\t')){
+ myline.push_back(token);
+ }
+ // BUG FIX: a short/empty row used to index past the end of myline.
+ if (myline.size() < 3)
+ continue;
+ mmDesegTable.insert(pair<string, string>(myline[2], myline[1] ));
+ }
+ myFile.close();
+ }
+ else
+ cerr << "open() failed: check if Desegmentation file is in right folder" << endl;
+}
+
+
+
+
+// Returns every desegmented candidate stored for a segmented key. When the
+// key is absent from the table, falls back to rule-based desegmentation and
+// returns a single-element vector.
+vector<string> Desegmenter::Search(string myKey){
+ vector<string> result;
+ // equal_range visits all entries sharing the key in one pass; this replaces
+ // the original find() + count() + manual-increment walk.
+ pair<multimap<string, string>::const_iterator,
+ multimap<string, string>::const_iterator> range = mmDesegTable.equal_range(myKey);
+ if (range.first != range.second){
+ for (multimap<string, string>::const_iterator it = range.first; it != range.second; ++it){
+ result.push_back(it->second);
+ }
+ }
+ else{
+ // Key unseen in the table: apply the deterministic concatenation rules.
+ result.push_back(ApplyRules(myKey));
+ }
+ return result;
+}
+
+// Rule-based fallback desegmentation for keys missing from the table.
+// Applies Arabic (Buckwalter-transliterated) orthographic rewrite rules in a
+// fixed order; order matters because later rules consume the generic '+'
+// separators left over after the specific rules have fired.
+string Desegmenter::ApplyRules(string & segToken){
+ string desegToken=segToken;
+
+ boost::replace_all(desegToken, "l+ All", "ll");
+ boost::replace_all(desegToken, "l+ Al", "ll");
+ boost::replace_all(desegToken, "y+ y ", "y");
+ boost::replace_all(desegToken, "p+ ", "t");
+ boost::replace_all(desegToken, "' +", "}");
+ boost::replace_all(desegToken, "y +", "A");
+ boost::replace_all(desegToken, "n +n", "n");
+ boost::replace_all(desegToken, "mn +m", "mm");
+ boost::replace_all(desegToken, "En +m", "Em");
+ // NOTE(review): "An +lA" -> "Em" mirrors the previous line's replacement
+ // string; possibly a copy-paste slip ("AlA"?) — confirm with the authors.
+ boost::replace_all(desegToken, "An +lA", "Em");
+ boost::replace_all(desegToken, "-LRB-", "(");
+ boost::replace_all(desegToken, "-RRB-", ")");
+ boost::replace_all(desegToken, "+ +", "");
+
+ // Strip any remaining segmentation separators.
+ boost::replace_all(desegToken, "+ ", "");
+ boost::replace_all(desegToken, " +", "");
+
+ return desegToken;
+}
+
+
+// Nothing to release: the only state is a value-owned std::multimap.
+Desegmenter::~Desegmenter()
+{}
+
+
+
+/*
+void Completer::Load(const string filename){
+
+ std::ifstream myFile(filename.c_str() );
+ if (myFile.is_open()){
+ cerr << "Completer File open successful." << endl;
+ string line;
+ while (getline(myFile, line)){
+ stringstream ss(line);
+ string token;
+ vector<string> myline;
+ while (getline(ss, token, '\t')){
+ myline.push_back(token);
+ }
+ mmDetok.insert(pair<string, string>(myline[0], myline[1] ));
+ }
+ myFile.close();
+ }
+ else
+ cerr << "open() failed: check if Desegmentation file is in right folder" << endl;
+ //return mmDetok;
+}
+
+string Completer::Search(string myKey){
+
+ //unordered_multimap<string, string>::const_iterator mmiPairFound = mmDetok.find(myKey);
+ map<string, string>::const_iterator mi = mmDetok.find(myKey);
+ //vector<string> result;
+ string result="";
+ if (mi != mmDetok.end()){
+ result=mi->second;
+ return result;
+ }
+ else{
+ return result;
+ }
+}
+
+Completer::~Completer()
+{}
+*/
+
+}
diff --git a/moses/FF/Dsg-Feature/Desegmenter.h b/moses/FF/Dsg-Feature/Desegmenter.h
new file mode 100644
index 000000000..133bcee4d
--- /dev/null
+++ b/moses/FF/Dsg-Feature/Desegmenter.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include<string>
+#include<map>
+
+
+using namespace std;
+
+namespace Moses
+{
+// Maps segmented (morpheme-split) tokens back to surface word forms, backed
+// by a tab-separated table loaded at construction; unseen keys fall back to
+// the rewrite rules in ApplyRules().
+class Desegmenter
+{
+private:
+ std::multimap<string, string> mmDesegTable; // segmented form -> surface form(s)
+ std::string filename; // path of the loaded table
+ void Load(const string filename);
+
+public:
+ // Loads the desegmentation table eagerly; file I/O happens here.
+ Desegmenter(const std::string& file){
+ filename = file;
+ Load(filename);//, mmDetok);
+ }
+ string getFileName(){ return filename; }
+
+ // All table entries for myKey, or one rule-derived fallback entry.
+ vector<string> Search(string myKey);
+ string ApplyRules(string &);
+
+ ~Desegmenter();
+};
+
+
+/*class Completer
+{
+private:
+ //std::multimap<string, string,std::less< std::string > > mmDetok;
+ std::map<string, string> mmDetok;
+ std::string filename;
+ void Load(const string filename);
+
+public:
+ Completer(const std::string& file){
+ filename = file;
+ Load(filename);//, mmDetok);
+ }
+ string getFileName(){ return filename; }
+ string Search(string myKey);
+
+ ~Completer();
+};
+*/
+
+}
diff --git a/moses/FF/Dsg-Feature/DsgModel.cpp b/moses/FF/Dsg-Feature/DsgModel.cpp
new file mode 100644
index 000000000..635109774
--- /dev/null
+++ b/moses/FF/Dsg-Feature/DsgModel.cpp
@@ -0,0 +1,155 @@
+#include <fstream>
+#include "DsgModel.h"
+#include "dsgHyp.h"
+#include "moses/Util.h"
+#include "util/exception.hh"
+
+using namespace std;
+using namespace lm::ngram;
+
+namespace Moses
+{
+
+  // Registers 5 dense feature scores (desegmented LM + 3 contiguity features
+  // + unsegmented word penalty) and parses the feature-line parameters.
+  DesegModel::DesegModel(const std::string &line)
+    :StatefulFeatureFunction(5, line )
+  {
+    tFactor = 0;
+    order=5;
+    numFeatures = 5;
+    optimistic = 1;
+    // BUG FIX: null-init owned pointers so the destructor is well-defined
+    // even if Load()/readLanguageModel() was never called.
+    DSGM = NULL;
+    desegT = NULL;
+    ReadParameters();
+  }
+
+  DesegModel::~DesegModel()
+  {
+    delete DSGM;
+    // BUG FIX: desegT (allocated in readLanguageModel) was leaked.
+    delete desegT;
+  }
+
+  // Constructs the KenLM-backed desegmented language model and loads the
+  // desegmentation table.
+  void DesegModel :: readLanguageModel(const char *lmFile)
+  {
+    // Use the argument callers actually pass (Load() hands us m_lmPath)
+    // rather than silently ignoring it; the unused local KenLM state that
+    // was fetched here has been dropped.
+    DSGM = ConstructDsgLM(lmFile);
+    desegT=new Desegmenter(m_desegPath);// Desegmentation Table
+  }
+
+
+  // Framework entry point: stores decoder options and builds the LM and the
+  // desegmentation table from the configured paths.
+  void DesegModel::Load(AllOptions::ptr const& opts)
+  {
+    m_options = opts; //ADDED
+    readLanguageModel(m_lmPath.c_str());
+  }
+
+
+
+  // Precomputes estimated scores for a target phrase in isolation (no
+  // surrounding hypothesis context): scores each phrase from KenLM's null
+  // context and writes the result into estimatedScores.
+  void DesegModel:: EvaluateInIsolation(const Phrase &source
+                                        , const TargetPhrase &targetPhrase
+                                        , ScoreComponentCollection &scoreBreakdown
+                                        , ScoreComponentCollection &estimatedScores) const
+  {
+
+    dsgHypothesis obj;
+    vector <string> myTargetPhrase; // NOTE(review): unused local — candidate for removal
+    vector<float> scores;
+    vector<string> targ_phrase; //stores the segmented tokens in the target phrase
+    const AlignmentInfo &align = targetPhrase.GetAlignTerm();
+
+    // Collect the chosen output factor's surface string for each target word.
+    for (int i = 0; i < targetPhrase.GetSize(); i++) {
+      targ_phrase.push_back(targetPhrase.GetWord(i).GetFactor(tFactor)->GetString().as_string());
+    }
+
+    // Isolation scoring starts from the null LM context, not BOS.
+    obj.setState(DSGM->NullContextState());
+    obj.setPhrases(targ_phrase);
+    obj.calculateDsgProbinIsol(*DSGM,*desegT,align);
+    obj.populateScores(scores,numFeatures);
+    estimatedScores.PlusEquals(this, scores);
+
+  }
+
+
+  // Stateful scoring during phrase-based decoding: restores the previous
+  // hypothesis' state (dangling affixes, spans, LM state, delta), scores the
+  // newly appended target phrase, and returns the new recombination state.
+  FFState* DesegModel::EvaluateWhenApplied(
+    const Hypothesis& cur_hypo,
+    const FFState* prev_state,
+    ScoreComponentCollection* accumulator) const
+  {
+
+    const TargetPhrase &target = cur_hypo.GetCurrTargetPhrase();
+    const Range &src_rng =cur_hypo.GetCurrSourceWordsRange();
+    const AlignmentInfo &align = cur_hypo.GetCurrTargetPhrase().GetAlignTerm();
+    // Offset turns phrase-local alignment points into sentence positions,
+    // needed by the contiguity features.
+    size_t sourceOffset = src_rng.GetStartPos();
+
+    dsgHypothesis obj;
+    vector<float> scores;
+    vector<string> targ_phrase; //stores the segmented tokens in the target phrase
+    bool isCompleted;
+
+    isCompleted=cur_hypo.IsSourceCompleted();
+    for (int i = 0; i < cur_hypo.GetCurrTargetLength(); i++) {
+      targ_phrase.push_back(target.GetWord(i).GetFactor(tFactor)->GetString().as_string());
+    }
+
+    obj.setState(prev_state);
+    obj.setPhrases( targ_phrase );
+    obj.calculateDsgProb(*DSGM,*desegT,isCompleted,align, sourceOffset, optimistic);
+    obj.populateScores(scores,numFeatures);
+    accumulator->PlusEquals(this, scores);
+    return obj.saveState();
+
+  }
+
+  // Chart (hierarchical) decoding is unsupported: this feature maintains
+  // strictly left-to-right state across hypotheses.
+  FFState* DesegModel::EvaluateWhenApplied(
+    const ChartHypothesis& /* cur_hypo */,
+    int /* featureID - used to index the state in the previous hypotheses */,
+    ScoreComponentCollection* accumulator) const
+  {
+    // BUG FIX: the original message read "not support by UTIL_THROW2",
+    // blaming the throw macro instead of naming this feature.
+    UTIL_THROW2("Chart decoding not supported by DesegModel");
+
+  }
+
+  // Initial decoder state: seed the feature with KenLM's begin-of-sentence
+  // context and empty buffer/span/delta.
+  const FFState* DesegModel::EmptyHypothesisState(const InputType &input) const
+  {
+    VERBOSE(3,"DesegModel::EmptyHypothesisState()" << endl);
+    return new dsgState(DSGM->BeginSentenceState());
+  }
+
+  // Weight short name for config files; idx is unused because all five
+  // features share the "dsg" prefix.
+  std::string DesegModel::GetScoreProducerWeightShortName(unsigned idx) const
+  {
+    return "dsg";
+  }
+
+
+  // Consumes key=value options from the feature line; any unrecognised key
+  // is forwarded to the base class.
+  void DesegModel::SetParameter(const std::string& key, const std::string& value)
+  {
+    if (key == "path") {          // desegmented LM file
+      m_lmPath = value;
+      return;
+    }
+    if (key == "contiguity-features") {
+      numFeatures = (value == "no") ? 1 : 5;
+      return;
+    }
+    if (key == "output-factor") {
+      tFactor = Scan<int>(value);
+      return;
+    }
+    if (key == "optimistic") {
+      optimistic = (value == "n") ? 0 : 1;
+      return;
+    }
+    if (key == "deseg-path") {    // desegmentation table file
+      m_desegPath = value;
+      return;
+    }
+    if (key == "order") {
+      order = Scan<int>(value);
+      return;
+    }
+    StatefulFeatureFunction::SetParameter(key, value);
+  }
+
+  // The feature only reads factor 0 of each target word.
+  bool DesegModel::IsUseable(const FactorMask &mask) const
+  {
+    return mask[0];
+  }
+
+} // namespace
diff --git a/moses/FF/Dsg-Feature/DsgModel.h b/moses/FF/Dsg-Feature/DsgModel.h
new file mode 100644
index 000000000..8db0eea05
--- /dev/null
+++ b/moses/FF/Dsg-Feature/DsgModel.h
@@ -0,0 +1,63 @@
+#pragma once
+
+#include <string>
+#include <map>
+#include <vector>
+#include "moses/FF/StatefulFeatureFunction.h"
+#include "moses/Manager.h"
+#include "moses/FF/Dsg-Feature/dsgHyp.h"
+#include "moses/FF/Dsg-Feature/Desegmenter.h"
+#include "KenDsg.h"
+
+
+namespace Moses
+{
+
+  // Stateful feature function scoring desegmented (whole-word) LM
+  // probabilities over morpheme-segmented target output, plus contiguity
+  // features over the source alignments of the morphemes forming each word.
+  class DesegModel : public StatefulFeatureFunction
+  {
+  public:
+
+    DsgLM * DSGM;            // KenLM wrapper over the desegmented LM (owned)
+    Desegmenter* desegT;     // segmented->surface lookup table (owned) //MSAL
+    int tFactor;// Target Factor ...
+    int order; //MSAL — LM order (informational; KenLM carries its own order)
+    int numFeatures; // Number of feature scores: 1 (LM only) or 5 (LM + 3 contiguity + unsegmented word penalty)
+    bool optimistic; // score incomplete words eagerly and correct via delta later
+
+    DesegModel(const std::string &line);
+    ~DesegModel();
+
+    // Builds DSGM and desegT from the configured paths.
+    void readLanguageModel(const char *);
+    void Load(AllOptions::ptr const& opts);
+
+    FFState* EvaluateWhenApplied(
+      const Hypothesis& cur_hypo,
+      const FFState* prev_state,
+      ScoreComponentCollection* accumulator) const;
+
+    // Chart decoding is unsupported and throws.
+    virtual FFState* EvaluateWhenApplied(
+      const ChartHypothesis& /* cur_hypo */,
+      int /* featureID - used to index the state in the previous hypotheses */,
+      ScoreComponentCollection* accumulator) const;
+
+    void EvaluateInIsolation(const Phrase &source
+                             , const TargetPhrase &targetPhrase
+                             , ScoreComponentCollection &scoreBreakdown
+                             , ScoreComponentCollection &estimatedScores) const;
+
+    virtual const FFState* EmptyHypothesisState(const InputType &input) const;
+
+    virtual std::string GetScoreProducerWeightShortName(unsigned idx=0) const;
+
+    void SetParameter(const std::string& key, const std::string& value);
+
+    bool IsUseable(const FactorMask &mask) const;
+
+  protected:
+    typedef std::vector<float> Scores;
+    std::string m_lmPath;     // "path" option: desegmented LM file
+    std::string m_desegPath;  // "deseg-path" option: desegmentation table
+  };
+
+
+}
diff --git a/moses/FF/Dsg-Feature/KenDsg.cpp b/moses/FF/Dsg-Feature/KenDsg.cpp
new file mode 100644
index 000000000..08a8dd0ed
--- /dev/null
+++ b/moses/FF/Dsg-Feature/KenDsg.cpp
@@ -0,0 +1,34 @@
+#include "KenDsg.h"
+
+namespace Moses
+{
+
+  // Factory: sniffs the KenLM binary format of `file` and instantiates the
+  // matching KenDsg specialization; ARPA/unrecognized files fall back to the
+  // probing model. Caller owns the returned pointer.
+  DsgLM* ConstructDsgLM(const char *file)
+  {
+    lm::ngram::ModelType model_type;
+    lm::ngram::Config config;
+    if (lm::ngram::RecognizeBinary(file, model_type)) {
+      switch(model_type) {
+      case lm::ngram::PROBING:
+        return new KenDsg<lm::ngram::ProbingModel>(file, config);
+      case lm::ngram::REST_PROBING:
+        return new KenDsg<lm::ngram::RestProbingModel>(file, config);
+      case lm::ngram::TRIE:
+        return new KenDsg<lm::ngram::TrieModel>(file, config);
+      case lm::ngram::QUANT_TRIE:
+        return new KenDsg<lm::ngram::QuantTrieModel>(file, config);
+      case lm::ngram::ARRAY_TRIE:
+        return new KenDsg<lm::ngram::ArrayTrieModel>(file, config);
+      case lm::ngram::QUANT_ARRAY_TRIE:
+        return new KenDsg<lm::ngram::QuantArrayTrieModel>(file, config);
+      default:
+        UTIL_THROW2("Unrecognized kenlm model type " << model_type);
+      }
+    } else {
+      // Not a recognized binary — assume text/ARPA, loadable by ProbingModel.
+      return new KenDsg<lm::ngram::ProbingModel>(file, config);
+    }
+  }
+
+} // namespace
+
+
diff --git a/moses/FF/Dsg-Feature/KenDsg.h b/moses/FF/Dsg-Feature/KenDsg.h
new file mode 100644
index 000000000..44d7ea6eb
--- /dev/null
+++ b/moses/FF/Dsg-Feature/KenDsg.h
@@ -0,0 +1,62 @@
+#pragma once
+
+#include <string>
+#include "lm/model.hh"
+//#include <boost/shared_ptr.hpp>
+
+namespace Moses
+{
+
+// Type-erasing interface over the concrete KenLM model templates so the
+// feature can hold one pointer regardless of the binary format on disk.
+class KenDsgBase
+{
+  public:
+  virtual ~KenDsgBase() {}
+
+  // Log-prob of `word` given in-state; writes the successor state.
+  virtual float Score(const lm::ngram::State&, StringPiece,
+                      lm::ngram::State&) const = 0;
+
+  virtual const lm::ngram::State &BeginSentenceState() const = 0;
+
+  virtual const lm::ngram::State &NullContextState() const = 0;
+
+  // Log-prob of the end-of-sentence token from in_state.
+  virtual float ScoreEndSentence(const lm::ngram::State &in_state, lm::ngram::State &out_state) const = 0;
+};
+
+// Concrete adapter binding KenDsgBase to one KenLM model type; words are
+// mapped to vocabulary ids on every call.
+template <class KenModel>
+  class KenDsg : public KenDsgBase
+{
+  public:
+  KenDsg(const char *file, const lm::ngram::Config &config)
+    : m_kenlm(file, config) {}
+
+  float Score(const lm::ngram::State &in_state,
+              StringPiece word,
+              lm::ngram::State &out_state) const {
+    return m_kenlm.Score(in_state, m_kenlm.GetVocabulary().Index(word),
+                         out_state);
+  }
+
+  const lm::ngram::State &BeginSentenceState() const {
+    return m_kenlm.BeginSentenceState();
+  }
+
+  const lm::ngram::State &NullContextState() const {
+    return m_kenlm.NullContextState();
+  }
+
+  float ScoreEndSentence(const lm::ngram::State &in_state, lm::ngram::State &out_state) const {
+    return m_kenlm.Score(in_state, m_kenlm.GetVocabulary().EndSentence(), out_state);
+  }
+
+
+  private:
+  // boost::shared_ptr<KenModel> m_kenlm;
+  KenModel m_kenlm; // owned by value; KenModel manages its own file mapping
+};
+
+ typedef KenDsgBase DsgLM;
+
+ DsgLM* ConstructDsgLM(const char *file);
+
+
+} // namespace
diff --git a/moses/FF/Dsg-Feature/dsgHyp.cpp b/moses/FF/Dsg-Feature/dsgHyp.cpp
new file mode 100644
index 000000000..6329b5ebd
--- /dev/null
+++ b/moses/FF/Dsg-Feature/dsgHyp.cpp
@@ -0,0 +1,424 @@
+#include "dsgHyp.h"
+#include <sstream>
+#include <boost/algorithm/string.hpp>
+#include <algorithm>
+#include <cstdlib> //NEW
+#include <math.h> //NEW
+#include <map> //NEW
+
+
+using namespace std;
+using namespace lm::ngram;
+
+namespace Moses
+{
+  // Wraps a KenLM state; buffer/span/delta start empty and are filled via
+  // saveState().
+  dsgState::dsgState(const State & val)
+  {
+    lmState = val;
+  }
+
+  // Stores the carry-over information for the next hypothesis: dangling
+  // (word-incomplete) tokens, their source positions, and the optimistic
+  // score correction delta.
+  void dsgState::saveState( std::vector<std::string> danglingTok, std::vector<int> srcSpans,float deltaValue)
+  {
+    //gap.clear();
+    buffer = danglingTok;
+    span=srcSpans;
+    delta=deltaValue;//NEW
+  }
+
+
+  // Recombination hash. Only the KenLM state participates, mirroring
+  // operator== which also compares lmState alone.
+  size_t dsgState::hash() const
+  {
+    size_t ret = 0;
+    boost::hash_combine(ret, lmState);
+    // BUG FIX: the original fell off the end of this non-void function
+    // (undefined behaviour); the combined hash is now returned.
+    return ret;
+  }
+
+  // Recombination equality: two states compare equal iff their KenLM states
+  // are equal. NOTE(review): buffer, span and delta are deliberately ignored
+  // here (the stricter comparison is kept below, commented out) — confirm
+  // that recombining hypotheses with different dangling affixes is intended.
+  bool dsgState::operator==(const FFState& otherBase) const //CHECK
+  {
+    const dsgState &other = static_cast<const dsgState&>(otherBase);
+
+    // Expresses (lmState == other.lmState) via KenLM's < and == operators.
+    if (lmState < other.lmState) return false;
+    if (lmState == other.lmState) return true;
+    return false;
+
+    /*if (buffer.size()!=other.buffer.size()){return false;}
+    if (span.size()!=other.span.size()){return false;};
+    if (delta!=other.delta){return false;}
+    if (lmState.length!=other.lmState.length){return false;}
+    //if (lmState == other.lmState) {return true;}
+    return true;*/
+
+  }
+
+  // Debug label for this state type.
+  std::string dsgState :: getName() const
+  {
+    return "done";
+  }
+
+ //////////////////////////////////////////////////
+
+  // Zero-initialises all score accumulators and the carry-over buffer.
+  dsgHypothesis :: dsgHypothesis()
+  {
+    lmProb = 0;
+    discontig0 = 0;
+    discontig1 = 0;
+    discontig2 = 0;
+    UnsegWP = 0;
+    m_buffer.clear();//="";
+    // BUG FIX: delta was left uninitialised; calculateDsgProb() subtracts it
+    // (lmProb -= delta) before setState() necessarily ran, reading garbage.
+    delta = 0.0;
+  }
+
+  // Restores buffer, source spans, KenLM state and optimistic delta from the
+  // previous hypothesis' feature state (no-op on a null state).
+  void dsgHypothesis :: setState(const FFState* prev_state)
+  {
+    const dsgState * prev = static_cast <const dsgState *> (prev_state);
+    if (prev != NULL) {
+      m_buffer = prev->getBuffer();
+      m_span   = prev->getSpan();
+      lmState  = prev->getLMState();
+      delta    = prev->getDelta();
+    }
+  }
+
+  // Packages the hypothesis' carry-over data into a freshly allocated
+  // dsgState; ownership passes to the decoder.
+  dsgState * dsgHypothesis :: saveState()
+  {
+    dsgState * statePtr = new dsgState(lmState);
+    statePtr->saveState(m_buffer, m_span, delta);
+    //statePtr->saveState(gap,span,0.0);
+    return statePtr;
+  }
+
+  // Emits the feature scores in the fixed order registered with the decoder:
+  // [lmProb] when numFeatures==1, otherwise
+  // [lmProb, discontig0, discontig1, discontig2, UnsegWP].
+  void dsgHypothesis :: populateScores(vector <float> & scores , const int numFeatures)
+  {
+    scores.clear();
+    scores.push_back(lmProb); //TODAY
+
+    if (numFeatures == 1)
+      return;
+    scores.push_back(discontig0);
+    scores.push_back(discontig1);
+    scores.push_back(discontig2);
+    scores.push_back(UnsegWP);
+  }
+
+
+
+  // A prefix morpheme ends in '+' (e.g. "w+"); a bare "+" is not an affix.
+  // BUG FIX: an empty token used to throw from at(); it is now rejected.
+  bool dsgHypothesis::isPrefix(const std::string &tok){
+    if (tok.empty()) { return false; }
+    return (tok[tok.size() - 1] == '+') && (tok != "+");
+  }
+
+  // A suffix morpheme starts with '+' (e.g. "+hm"); a bare "+" is not an
+  // affix. BUG FIX: an empty token used to throw from at(); now rejected.
+  bool dsgHypothesis::isSuffix(const std::string &tok){
+    if (tok.empty()) { return false; }
+    return (tok[0] == '+') && (tok != "+");
+  }
+
+  // A stem carries no '+' on either edge. BUG FIX: an empty token used to
+  // throw from at(); it is now classified as not-a-stem.
+  bool dsgHypothesis::isStem(const std::string &tok){
+    if (tok.empty()) { return false; }
+    return (tok[0] != '+') && (tok[tok.size() - 1] != '+');
+  }
+
+
+
+ /**
+ * chain stores segmented tokens that are in process of building a word
+ * The function checks if tok contributes to the word being formed in chain
+ *
+ */
+  bool dsgHypothesis::isValidChain(const std::string &tok, std::vector<std::string> &chain){
+    std::string last_tok;
+    if (chain.size() >= 1){
+      last_tok = chain[chain.size() - 1];
+    }
+    else{
+      last_tok = "NULL"; // sentinel: chain is empty
+    }
+    // A bare "+" never extends a chain.
+    if(tok=="+"){return false;}
+    // Prefixes may start a word or follow another prefix.
+    if (isPrefix(tok) && (chain.size() == 0 || isPrefix(last_tok))) { return true; }
+    // A suffix must close a stem or prefix; this variant allows one suffix ONLY.
+    else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok)))) { return true; } // allows one suffix ONLY
+    //else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok) || isSuffix(last_tok) ))) { return true; } // allows multiple suffixes
+    // A stem may start a word or follow prefixes; stem-after-stem starts a new word.
+    else if (isStem(tok) && (chain.size() == 0 || isPrefix(last_tok))) { return true; }
+    else { return false; }
+  }
+
+ /**
+ * grouper function groups tokens that form a word together
+ */
+  /**
+   * grouper function groups tokens that form a word together.
+   * Returns one space-joined string per (possibly partial) word; in parallel,
+   * allchain_ids receives the sentence-level source positions aligned to each
+   * word. The previous hypothesis' dangling tokens (m_buffer/m_span) seed the
+   * first chain unless `isolation` is set.
+   */
+  vector<string> dsgHypothesis::grouper(std::vector<std::string> &phr_vec,vector<vector<int> > &allchain_ids, int sourceOffset,const AlignmentInfo &align, bool isolation){
+
+    std::vector<std::string> chain;      // morphemes of the word being built
+    std::vector<int> chain_ids;          // their source positions
+    std::vector<std::string> allchains;  // completed (or trailing) words
+    chain_ids=m_span;//MSAL — inherit spans of the carried-over affixes
+
+    if (!m_buffer.empty() && !isolation){// if evaluate in isolation is called, then do not add buffer content
+      for (int i = 0; i < m_buffer.size(); i++){ // initialize chain with the content of the buffer
+        chain.push_back(m_buffer[i]);
+      }
+    }
+
+    for (int i = 0; i < phr_vec.size(); i++){
+      std::set<std::size_t> sourcePosSet = align.GetAlignmentsForTarget(i);
+
+      if (isValidChain(phr_vec[i], chain)){
+        // Token extends the current word: absorb it and its source positions.
+        chain.push_back(phr_vec[i]);
+        if (sourcePosSet.empty()==false){
+          for (std::set<size_t>::iterator it(sourcePosSet.begin());it != sourcePosSet.end(); it++) {
+            int cur=*it;
+            chain_ids.push_back(cur+sourceOffset); //MSAL
+          }
+        }
+      }
+
+      else if (chain.size() == 0) { // start of a suffix at hypothesis0
+        allchains.push_back(phr_vec[i]);
+        allchain_ids.push_back(chain_ids);
+        chain_ids.clear();//={};
+      }
+
+      else { // tokens formed a complete word; add tokens segmented by space to allchains
+        std::string joined = boost::algorithm::join(chain, " ");
+        allchains.push_back(joined);
+        allchain_ids.push_back(chain_ids);
+
+        chain.clear();// = {};
+        chain_ids.clear();//={};
+
+        // The current token starts the next word.
+        chain.push_back(phr_vec[i]);
+        if (sourcePosSet.empty()==false){
+          for (std::set<size_t>::iterator it(sourcePosSet.begin());it != sourcePosSet.end(); it++) {
+            int cur=*it;
+            chain_ids.push_back(cur+sourceOffset); //MSAL
+          }
+        }
+        /*else {
+        //chain_ids.push_back(sourceOffset);
+        //std::cout << sourceOffset <<" $ ";
+        //chain_ids.push_back({});
+        std::cout << "NONE $ ";
+        }*/
+        //chain_ids.push_back(i+sourceOffset);//MSAL
+      }
+
+    }
+
+    // Flush the trailing, possibly incomplete, word.
+    if (!chain.empty()){
+      std::string joined = boost::algorithm::join(chain, " ");
+      allchains.push_back(joined);
+      allchain_ids.push_back(chain_ids);
+    }
+    return allchains;
+  }
+
+
+
+  // Scores the current phrase in isolation (for future-cost estimation):
+  // groups morphemes into words, desegments each multi-morpheme word via the
+  // table, and accumulates KenLM log-probs. Contiguity counters stay zero;
+  // only the unsegmented-word-penalty and LM score are estimated here.
+  void dsgHypothesis :: calculateDsgProbinIsol(DsgLM & ptrDsgLM, Desegmenter &desegT, const AlignmentInfo &align ){
+    lmProb = 0;
+    State currState = lmState;
+    State temp;
+    string desegmented="";
+    vector <string> words;
+    vector <string> currFVec;
+
+    discontig0=0;
+    discontig1=0;
+    discontig2=0;
+    UnsegWP=0;
+
+    currFVec = m_buffer;
+
+    /*
+    std::cout << "GAP: ";
+    for (int j=0 ; j< m_buffer.size();j++){cout << " " << m_buffer[j];}
+    std::cout << endl;
+    std::cout << "Phrase: ";
+    for (int j=0 ; j< m_curr_phr.size();j++){cout << " " << m_curr_phr[j];}
+    std::cout << endl; */
+
+    currFVec.insert( currFVec.end(), m_curr_phr.begin(), m_curr_phr.end() );
+
+    //std::cout << "First: ";
+    //for (int j=0 ; j< currFVec.size();j++){cout << " " << currFVec[j];}
+    //std::cout << endl;
+
+    int vecSize=currFVec.size();
+
+    // phrases with suffix-starts and prefix-end: half-penalise words that
+    // will be completed by a neighbouring phrase
+    if (currFVec.size()>0 && isPrefix (currFVec.back())) {
+      UnsegWP-=0.5;}
+    if (currFVec.size()>0 && isSuffix (currFVec.front())) {
+      UnsegWP-=0.5;}
+
+
+    /* //Dropping prefix-end and suffix-start
+    while (currFVec.size()>0 && isPrefix (currFVec.back())){
+    currFVec.pop_back(); //drop prefix appearing at end of phrase
+    }
+
+    while (currFVec.size()>0 && isSuffix (currFVec.front())){
+    currFVec.erase (currFVec.begin()); //drop suffix appearning at start of a phrase
+    } */
+
+
+    vector<vector<int> > chain_ids;
+    // isolation flag = 1: ignore any stale buffer carried in m_buffer.
+    words = grouper(currFVec,chain_ids,0,align,1);
+
+    for (int i = 0; i<words.size(); i++) {
+      UnsegWP+=1;
+      temp = currState;
+      if (words[i].find(" ")!=std::string::npos){
+        // Multi-morpheme word: look up (or rule-derive) the surface form.
+        desegmented=desegT.Search(words[i])[0];
+        lmProb += ptrDsgLM.Score(temp,desegmented,currState);
+      }
+      else{
+        boost::replace_all(words[i], "-LRB-", "(");
+        boost::replace_all(words[i], "-RRB-", ")");
+        lmProb += ptrDsgLM.Score(temp,words[i],currState);
+      }
+    }
+    //opProb=TransformLMScore(opProb);
+    lmState = currState;
+  }
+
+  // Full stateful scoring: groups the phrase's morphemes (seeded by the
+  // previous hypothesis' dangling buffer) into words, scores each desegmented
+  // word with KenLM, and fills the three contiguity counters from source-span
+  // gaps. In optimistic mode the trailing incomplete word is scored eagerly
+  // and `delta` records that provisional score so the next hypothesis (or the
+  // end-of-sentence step) can subtract it back out.
+  void dsgHypothesis :: calculateDsgProb(DsgLM& ptrDsgLM, Desegmenter &desegT, bool isCompleted , const AlignmentInfo &align, int sourceOffset, bool optimistic)
+  {
+    lmProb = 0;
+    discontig0=0;
+    discontig1=0;
+    discontig2=0;
+    UnsegWP=0;
+
+    State currState = lmState;
+    State temp;
+    string desegmented="";
+    vector <string> words;
+    vector <string> currFVec;
+    bool completePhraseSuffixEnd = false;
+    vector<vector<int> > all_chain_ids;
+    double pscore;
+    currFVec=m_curr_phr;
+
+    // Check if the the phrase ends in a suffix, which means that it completes a full word;Make sure to change the isValidChain
+    // NOTE(review): currFVec.back() assumes a non-empty target phrase — confirm
+    // the decoder never passes an empty one.
+    if (isSuffix (currFVec.back()) && (currFVec.back()!="+")){completePhraseSuffixEnd=true;}
+
+    words = grouper(currFVec,all_chain_ids,sourceOffset,align,0);
+
+    for (int i = 0; i < words.size(); i++) {
+      temp = currState; //NEW ADDED
+
+      // Special handling for the last word of the phrase, which may be an
+      // incomplete word that the next phrase will extend.
+      if (i==words.size()-1){
+        if (completePhraseSuffixEnd){ //i.e if phrase ends with suffix, which marks an end of a word
+          m_buffer.clear();// ="";
+          m_span.clear();// ={};//MSAL
+          //delta=0.0; //Dont enable this, wrong
+        }
+        else if (!isCompleted) { // not end of sentence( or final hypothesis), and probably the last token is not a complete word
+          m_buffer.clear();
+          if (optimistic == 1){
+            // (2)Comment the below if you want delayed scoring
+            if ( isPrefix (currFVec.back())){ // this will delay scoring of prefix in prefix-ending phrases until the next hypothesis arrives
+
+              // NOTE(review): `desegmented` is still "" here, so this scores an
+              // empty token — verify this matches the intended delayed scoring.
+              pscore = ptrDsgLM.Score(temp,desegmented,currState);
+
+              // enable the 3 lines below with (1) and disable lines below it
+              //opProb = opProb + pscore - delta; //NEW
+              //delta=pscore;
+              //currState=temp;
+
+              // Undo the previous hypothesis' provisional score; nothing new
+              // is claimed for the dangling prefix yet.
+              lmProb -= delta;
+              delta = 0.0;
+            }//*/
+
+            //Comments these else statements below with (2) if you want to delay prefix-end scoring
+            else if (words[i].find(" ")!=std::string::npos){ //NEW
+              desegmented=desegT.Search(words[i])[0]; //NEW
+              pscore=ptrDsgLM.Score(temp,desegmented,currState);
+              //opProb += pscore-delta;
+              // Replace the old provisional score with the new one.
+              lmProb = lmProb + pscore - delta; //NEW
+              delta=pscore;
+              // Keep the LM context at the word boundary so the word can be
+              // re-scored once completed.
+              currState=temp;
+            }
+            else{
+              boost::replace_all(words[i], "-LRB-", "("); //NEW CHECK
+              boost::replace_all(words[i], "-RRB-", ")"); //NEW CHECK
+              pscore=ptrDsgLM.Score(temp,words[i],currState);
+              //opProb += pscore-delta; //NEW
+              lmProb = lmProb + pscore - delta; //NEW
+              delta=pscore; //NEW
+              currState=temp;
+            }}//*/
+
+          // Carry the incomplete word (and its spans) to the next hypothesis.
+          m_buffer.push_back(words.back());
+          //gap=words.back();
+          m_span=all_chain_ids.back();//MSAL
+          //opProb=TransformLMScore(opProb);
+          //lmState = currState;
+          break;
+        }
+      }
+
+      //temp = currState; NEW COMMENTED
+      if (words[i].find(" ")!=std::string::npos){
+        UnsegWP+=1;
+        desegmented=desegT.Search(words[i])[0];
+        // Classify the word's source-position gaps: max adjacent distance
+        // <=1 -> contiguous (discontig0), ==2 -> discontig1, >=3 -> discontig2.
+        std::set<int> cur_chain_ids(all_chain_ids[i].begin(),all_chain_ids[i].end());
+        if (cur_chain_ids.size()>1){
+          vector<int> dsc;
+          for (std::set<int>::iterator it(cur_chain_ids.begin()), next(it);it != cur_chain_ids.end() && ++next != cur_chain_ids.end(); it = next) {
+            int cur=*it;
+            int mynext=*next;
+            if (std::abs(cur - mynext)>= 3) {
+              dsc.push_back(3);
+              //discontig2+=1;
+              //break;
+            }
+            else if (std::abs(cur - mynext)== 2){
+              //discontig1+=1;
+              dsc.push_back(2);
+              //break;
+            }
+            else if (std::abs(cur - mynext)<= 1){
+              //discontig0+=1;
+              dsc.push_back(1);
+            }
+          }
+          int mymax=*std::max_element(dsc.begin(),dsc.end());
+          if (mymax==3){discontig2+=1;}
+          else if (mymax==2){discontig1+=1;}
+          else{discontig0+=1;}
+        }
+        else{
+          discontig0 += 1;
+        }
+
+        //opProb += ptrDsgLM.Score(temp,ptrDsgLM.GetVocabulary().Index(desegmented),currState);
+        lmProb += ptrDsgLM.Score(temp,desegmented,currState);
+      }
+      else{
+        UnsegWP+=1;
+        boost::replace_all(words[i], "-LRB-", "(");
+        boost::replace_all(words[i], "-RRB-", ")");
+        lmProb += ptrDsgLM.Score(temp,words[i],currState);
+      }
+    }
+
+    if (isCompleted){
+      // Close the sentence: score </s> and remove any provisional delta.
+      temp = currState;
+      lmProb = lmProb + ptrDsgLM.ScoreEndSentence(temp,currState) - delta;
+    }
+    //opProb=TransformLMScore(opProb);
+    lmState = currState;
+  }
+
+
+  // Debug hook; intentionally a no-op.
+  void dsgHypothesis :: print()
+  {}
+
+
+} // namespace
diff --git a/moses/FF/Dsg-Feature/dsgHyp.h b/moses/FF/Dsg-Feature/dsgHyp.h
new file mode 100644
index 000000000..a609b7fb6
--- /dev/null
+++ b/moses/FF/Dsg-Feature/dsgHyp.h
@@ -0,0 +1,109 @@
+#pragma once
+
+
+# include "moses/FF/FFState.h"
+# include "moses/Manager.h"
+# include <set>
+# include <map>
+# include <string>
+# include <vector>
+# include "moses/FF/Dsg-Feature/Desegmenter.h"
+# include "KenDsg.h"
+
+
+namespace Moses
+{
+
+  // Feature state carried between hypotheses: the KenLM context plus the
+  // dangling (word-incomplete) morphemes, their source positions, and the
+  // provisional "optimistic" score delta.
+  class dsgState : public FFState
+  {
+  public:
+
+    dsgState(const lm::ngram::State & val);
+    //int Compare(const FFState& other) const;
+    // Recombination equality; compares only the KenLM state. //CHECK
+    virtual bool operator==(const FFState& other) const; //CHECK
+    void saveState( std::vector<std::string> bufferVal,std::vector<int> spanVal, float deltaValue);
+
+    std::vector<std::string> getBuffer() const {
+      return buffer;
+    }
+
+    std::vector<int> getSpan() const {
+      return span;
+    }
+
+    lm::ngram::State getLMState() const {
+      return lmState;
+    }
+
+    float getDelta() const { //NEW
+      return delta;
+    }
+
+    void setDelta(double val1 ) { //NEWWWW
+      delta = val1;
+    }
+
+    void print() const;
+    std::string getName() const;
+
+    // Recombination hash; must stay consistent with operator==.
+    virtual size_t hash() const;
+
+
+  protected:
+    std::vector<std::string> buffer; // dangling morphemes awaiting completion
+    std::vector<int> span;           // source positions of the dangling morphemes
+    lm::ngram::State lmState;        // KenLM context
+    double delta;                    // provisional score to subtract later //NEW
+  };
+
+
+
+// Working object for scoring one hypothesis extension: restores state, groups
+// morphemes into words, queries the desegmented LM, and exposes the resulting
+// feature scores plus the successor dsgState.
+class dsgHypothesis
+{
+
+  private:
+  std::vector<std::string> m_buffer;// maintains dangling affix from previous hypothesis
+  std::vector<int> m_span;// maintains source alignment for dangling affix from previous hypothesis
+  lm::ngram::State lmState;  // KenLM's Model State ...
+  std::vector<std::string> m_curr_phr; //phrase from current hypothesis
+  double delta; // provisional (optimistic) score to be corrected later //NEW
+
+  double lmProb;     // accumulated desegmented-LM log-prob
+  int discontig0;    // words whose source morphemes are contiguous
+  int discontig1;    // words with a max source gap of 2
+  int discontig2;    // words with a max source gap of >=3
+  double UnsegWP;    // unsegmented-word-penalty accumulator
+
+  public:
+
+  dsgHypothesis();
+  ~dsgHypothesis() {};
+  void calculateDsgProb(DsgLM& ptrDsgLM, Desegmenter &, bool isCompleted, const AlignmentInfo &align, int sourceOffset, bool optimistic);
+  void calculateDsgProbinIsol(DsgLM& ptrDsgLM, Desegmenter &, const AlignmentInfo &align);
+
+  void setPhrases(std::vector<std::string> & val1 ) {//MSAL
+    m_curr_phr = val1;
+  }
+
+  void setDelta(double val1 ) { //NEW
+    delta = val1;
+  }
+
+  // Restore carried-over state from the previous hypothesis.
+  void setState(const FFState* prev_state);
+  dsgState * saveState();
+  void print();
+  void populateScores(std::vector <float> & scores , const int numFeatures);
+  void setState(const lm::ngram::State & val) {
+    lmState = val;
+  }
+
+  // Morpheme classification helpers ('+'-edge conventions).
+  bool isPrefix(const std::string &);
+  bool isSuffix(const std::string &);
+  bool isStem(const std::string &);
+  bool isValidChain(const std::string &, std::vector<std::string> &chain);
+  // Groups morphemes into words and collects their source positions.
+  vector<string> grouper(std::vector<std::string> &,std::vector<std::vector<int> > &,int,const AlignmentInfo &align,bool);
+
+};
+} // namespace
+
+