Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Hieu Hoang <hieuhoang@gmail.com> 2016-09-26 17:58:03 +0300
committer: Hieu Hoang <hieuhoang@gmail.com> 2016-09-26 17:58:03 +0300
commit: 79117bb80d9d572b6e0e3f20eeb8754d4b3f4f6f (patch)
tree: f5fbf484bd5307615ea9302c1965c631dab26d96
parent: e33ecf34bb5be6e9a64fefbf8b0c49b682377ff6 (diff)
add osmHyp, 2
-rw-r--r-- contrib/moses2/FF/OSM/OpSequenceModel.cpp 3
-rw-r--r-- contrib/moses2/FF/OSM/osmHyp.cpp 604
-rw-r--r-- contrib/moses2/FF/OSM/osmHyp.h 107
-rw-r--r-- contrib/moses2/Jamfile 1
4 files changed, 714 insertions, 1 deletions
diff --git a/contrib/moses2/FF/OSM/OpSequenceModel.cpp b/contrib/moses2/FF/OSM/OpSequenceModel.cpp
index 515b438ad..acd6ad74f 100644
--- a/contrib/moses2/FF/OSM/OpSequenceModel.cpp
+++ b/contrib/moses2/FF/OSM/OpSequenceModel.cpp
@@ -1,5 +1,6 @@
#include <sstream>
#include "OpSequenceModel.h"
+#include "osmHyp.h"
#include "../../PhraseBased/Manager.h"
#include "../../PhraseBased/Hypothesis.h"
#include "lm/state.hh"
@@ -78,9 +79,9 @@ void OpSequenceModel::EvaluateInIsolation(MemPool &pool,
const TargetPhraseImpl &targetPhrase, Scores &scores,
SCORE &estimatedScore) const
{
- /*
osmHypothesis obj;
obj.setState(OSM->NullContextState());
+ /*
Bitmap myBitmap(source.GetSize());
vector <string> mySourcePhrase;
vector <string> myTargetPhrase;
diff --git a/contrib/moses2/FF/OSM/osmHyp.cpp b/contrib/moses2/FF/OSM/osmHyp.cpp
index e69de29bb..b78d33ea6 100644
--- a/contrib/moses2/FF/OSM/osmHyp.cpp
+++ b/contrib/moses2/FF/OSM/osmHyp.cpp
@@ -0,0 +1,604 @@
+#include "osmHyp.h"
+#include <sstream>
+
+using namespace std;
+using namespace lm::ngram;
+
+namespace Moses2
+{
+// Builds a state carrying the given KenLM state. Both source positions
+// (j and E) start at 0 and the gap map starts out empty.
+osmState::osmState(const State & val)
+  : j(0)
+  , E(0)
+  , lmState(val)
+{
+}
+
+// Overwrites the stored source positions and gap map.
+//   jVal   -> j
+//   eVal   -> E
+//   gapVal -> gap (copied)
+void osmState::saveState(int jVal, int eVal, map <int , string> & gapVal)
+{
+  // Assignment already replaces the previous contents. Clearing the map first
+  // was redundant and, if gapVal referred to this object's own gap member,
+  // wiped the data before it could be copied.
+  gap = gapVal;
+  j = jVal;
+  E = eVal;
+}
+
+// Hash over j, E, the gap map and the length of the LM state, folded together
+// with boost::hash_combine in exactly that order.
+size_t osmState::hash() const
+{
+  size_t seed = j;
+  boost::hash_combine(seed, E);
+  boost::hash_combine(seed, gap);
+  boost::hash_combine(seed, lmState.length);
+  return seed;
+}
+
+// Two states are equal when j, E, the gap map and the LM state length all
+// match. otherBase is downcast to osmState without a runtime check.
+bool osmState::operator==(const FFState& otherBase) const
+{
+  const osmState &rhs = static_cast<const osmState&>(otherBase);
+  return j == rhs.j
+         && E == rhs.E
+         && gap == rhs.gap
+         && lmState.length == rhs.lmState.length;
+}
+
+// Returns the fixed label "done".
+std::string osmState::getName() const
+{
+  const std::string label("done");
+  return label;
+}
+
+//////////////////////////////////////////////////
+
+// Creates an empty hypothesis: positions and all counters are zero, the
+// operation list and gap map are empty.
+osmHypothesis::osmHypothesis()
+  : j(0)
+  , E(0)
+  // Value-initialise the LM state so it never holds indeterminate data when it
+  // is read (e.g. through saveState()) before any setState() call. This is a
+  // no-op if State defines its own default constructor.
+  , lmState()
+  , gapCount(0)
+  , deletionCount(0)
+  , openGapCount(0)
+  , gapWidth(0)
+  , opProb(0)
+{
+}
+
+// Restores j, E, the gap map and the LM state from a previous osmState.
+// A NULL prev_state leaves the hypothesis untouched. The pointer is downcast
+// to osmState without a runtime check.
+void osmHypothesis::setState(const FFState* prev_state)
+{
+  if (prev_state == NULL)
+    return;
+
+  const osmState * const prev = static_cast <const osmState *> (prev_state);
+  j = prev->getJ();
+  E = prev->getE();
+  gap = prev->getGap();
+  lmState = prev->getLMState();
+}
+
+// Allocates (with new) an osmState holding the current LM state, j, E and gap
+// map, and returns the raw pointer to it.
+osmState * osmHypothesis::saveState()
+{
+  osmState * const snapshot = new osmState(lmState);
+  snapshot->saveState(j, E, gap);
+  return snapshot;
+}
+
+// Classifies operations[x].
+// Returns 0 when the operation text contains one of the reordering markers
+// (_JMP_BCK_, _JMP_FWD_, _CONT_CEPT_, _INS_GAP_), 1 otherwise.
+// x is used as an index without a bounds check.
+int osmHypothesis::isTranslationOperation(int x)
+{
+  static const char * const kReorderingMarkers[] = {
+    "_JMP_BCK_", "_JMP_FWD_", "_CONT_CEPT_", "_INS_GAP_"
+  };
+  const size_t numMarkers = sizeof(kReorderingMarkers) / sizeof(kReorderingMarkers[0]);
+
+  const std::string &op = operations[x];
+  for (size_t m = 0; m < numMarkers; ++m) {
+    // std::string::find reports "not found" as npos (an unsigned value);
+    // compare against it rather than against the signed literal -1.
+    if (op.find(kReorderingMarkers[m]) != std::string::npos)
+      return 0;
+  }
+
+  return 1;
+}
+
+// Resets the gap/deletion counters to zero and drops every operation for
+// which isTranslationOperation() does not return 1, keeping the remaining
+// operations in their original order.
+void osmHypothesis::removeReorderingOperations()
+{
+  gapCount = 0;
+  deletionCount = 0;
+  openGapCount = 0;
+  gapWidth = 0;
+
+  std::vector <std::string> tupleSequence;
+  tupleSequence.reserve(operations.size());
+
+  // Unsigned index: avoids comparing a signed int with size().
+  for (size_t x = 0; x < operations.size(); ++x) {
+    if (isTranslationOperation(static_cast<int>(x)) == 1)
+      tupleSequence.push_back(operations[x]);
+  }
+
+  // Swap instead of clear-then-copy: no second copy of the kept strings.
+  operations.swap(tupleSequence);
+}
+
+// Scores the operation sequence with the given model, starting from the
+// current LM state. opProb is reset and then accumulates the value returned
+// by ptrOp.Score for each operation; lmState ends up as the state produced
+// for the last operation.
+void osmHypothesis::calculateOSMProb(OSMLM& ptrOp)
+{
+  State outState = lmState;
+  opProb = 0;
+
+  const size_t numOps = operations.size();
+  for (size_t idx = 0; idx != numOps; ++idx) {
+    // Score reads the incoming state and writes the successor state, so the
+    // input needs its own copy.
+    State inState = outState;
+    opProb += ptrOp.Score(inState, operations[idx], outState);
+  }
+
+  lmState = outState;
+}
+
+
+// Returns the index of the first element of coverageVector that equals 0,
+// or -1 when there is none (including an empty vector).
+int osmHypothesis::firstOpenGap(vector <int> & coverageVector)
+{
+  // Unsigned index: avoids comparing a signed int with size().
+  for (size_t pos = 0; pos < coverageVector.size(); ++pos) {
+    if (coverageVector[pos] == 0)
+      return static_cast<int>(pos);
+  }
+
+  return -1;
+}
+
+// Converts num to its textual form by delegating to SPrint.
+string osmHypothesis::intToString(int num)
+{
+  const string rendered = SPrint(num);
+  return rendered;
+}
+
+// Appends to `operations` the operation(s) needed to generate the source word
+// at position j1, then updates j, E, the gap map, the coverage bitmap and the
+// gap/deletion counters.
+//
+// contFlag selects the operation emitted for the word itself:
+//   0               -> "_TRANS_SLF_" or "_TRANS_<english>_TO_<german>"
+//   2               -> "_INS_<german>" (also increments deletionCount)
+//   any other value -> "_CONT_CEPT_"
+//
+// If the position after the generated word is uncovered and listed in
+// targetNullWords, the function calls itself for that position with
+// contFlag 2 and german taken from currF[position - startIndex].
+// Note: the targetNullWords and currF parameters hide the members of the
+// same name inside this function.
+void osmHypothesis :: generateOperations(int & startIndex , int j1 , int contFlag , Bitmap & coverageVector , string english , string german , set <int> & targetNullWords , vector <string> & currF)
+{
+
+ int gFlag = 0; // number of _INS_GAP_ operations emitted by this call
+ int gp = 0; // filled in by closestGap(); appended to _JMP_BCK_
+ int ans; // first gap position reported by the coverage bitmap
+
+
+ if ( j < j1) { // j1 is the index of the source word we are about to generate ...
+ //if(coverageVector[j]==0) // if source word at j is not generated yet ...
+ if(coverageVector.GetValue(j)==0) { // if source word at j is not generated yet ...
+ operations.push_back("_INS_GAP_");
+ gFlag++;
+ gap[j]="Unfilled";
+ }
+ // Already at the right-most generated position: move straight to j1.
+ // Otherwise jump forward to E first.
+ if (j == E) {
+ j = j1;
+ } else {
+ operations.push_back("_JMP_FWD_");
+ j=E;
+ }
+ }
+
+ // Target position lies to the left of the current one: jump back to a gap.
+ if (j1 < j) {
+ // if(j < E && coverageVector[j]==0)
+ if(j < E && coverageVector.GetValue(j)==0) {
+ operations.push_back("_INS_GAP_");
+ gFlag++;
+ gap[j]="Unfilled";
+ }
+
+ // NOTE(review): closestGap() can return -1; j is used as given - confirm
+ // callers guarantee that an open gap exists here.
+ j=closestGap(gap,j1,gp);
+ operations.push_back("_JMP_BCK_"+ intToString(gp));
+
+ //cout<<"I am j "<<j<<endl;
+ //cout<<"I am j1 "<<j1<<endl;
+
+ if(j==j1)
+ gap[j]="Filled";
+ }
+
+ // Still left of j1 after the moves above: open a gap at j and move to j1.
+ if (j < j1) {
+ operations.push_back("_INS_GAP_");
+ gap[j] = "Unfilled";
+ gFlag++;
+ j=j1;
+ }
+
+ if(contFlag == 0) { // First words of the multi-word cept ...
+
+ if(english == "_TRANS_SLF_") { // Unknown word ...
+ operations.push_back("_TRANS_SLF_");
+ } else {
+ operations.push_back("_TRANS_" + english + "_TO_" + german);
+ }
+
+ //ans = firstOpenGap(coverageVector);
+ ans = coverageVector.GetFirstGapPos();
+
+ // NOTE(review): -1 is presumably GetFirstGapPos()'s "no gap" value - confirm against Bitmap.
+ if (ans != -1)
+ gapWidth += j - ans;
+
+ } else if (contFlag == 2) {
+
+ operations.push_back("_INS_" + german);
+ ans = coverageVector.GetFirstGapPos();
+
+ if (ans != -1)
+ gapWidth += j - ans;
+ deletionCount++;
+ } else {
+ operations.push_back("_CONT_CEPT_");
+ }
+
+ //coverageVector[j]=1;
+ coverageVector.SetValue(j,1);
+ j+=1;
+
+ // E tracks the right-most position reached so far.
+ if(E<j)
+ E=j;
+
+ // gapCount grows by one per call that inserted at least one gap.
+ if (gFlag > 0)
+ gapCount++;
+
+ openGapCount += getOpenGaps();
+
+ //if (coverageVector[j] == 0 && targetNullWords.find(j) != targetNullWords.end())
+ if (j < coverageVector.GetSize()) {
+ if (coverageVector.GetValue(j) == 0 && targetNullWords.find(j) != targetNullWords.end()) {
+ j1 = j;
+ german = currF[j1-startIndex];
+ english = "_INS_";
+ generateOperations(startIndex, j1, 2 , coverageVector , english , german , targetNullWords , currF);
+ }
+ }
+
+}
+
+// Dumps the operation sequence, followed by the probability and the four
+// counters, to stderr.
+void osmHypothesis::print()
+{
+  for (size_t idx = 0; idx < operations.size(); ++idx)
+    cerr << operations[idx] << " ";
+
+  cerr << endl << endl;
+
+  cerr << "Operation Probability " << opProb << endl;
+  cerr << "Gap Count " << gapCount << endl;
+  cerr << "Open Gap Count " << openGapCount << endl;
+  cerr << "Gap Width " << gapWidth << endl;
+  cerr << "Deletion Count " << deletionCount << endl;
+
+  cerr << "_______________" << endl;
+}
+
+// Walks the gap map from the highest source position downwards, looking for
+// the open ("Unfilled") gap to jump back to when generating position j1.
+//
+//   gap : source position -> "Unfilled" / "Filled" (taken by value)
+//   j1  : source position about to be generated
+//   gp  : out; rank of the chosen gap among the open gaps, counted from the
+//         right-most one (1-based); 0 when no gap is chosen
+//
+// Returns j1 when an open gap sits exactly at j1, otherwise the closest open
+// gap to the left of j1, or -1 when there is none. An empty map yields -1
+// with gp == 0 (the previous do/while decremented end() of an empty map,
+// which is undefined behaviour).
+int osmHypothesis::closestGap(map <int,string> gap, int j1, int & gp)
+{
+  // Initial "best" distance: open gaps 1172 or more positions to the left of
+  // j1 are never selected.
+  int bestDistance = 1172;
+  int bestPosition = -1;
+  int openGapsSeen = 0;
+  gp = 0;
+
+  for (map <int,string>::reverse_iterator it = gap.rbegin(); it != gap.rend(); ++it) {
+    if (it->second != "Unfilled")
+      continue;
+
+    ++openGapsSeen;
+
+    if (it->first == j1) {
+      gp = openGapsSeen;
+      return j1;
+    }
+
+    // Only gaps strictly left of j1 qualify; for those the distance is positive.
+    const int distance = j1 - it->first;
+    if (it->first < j1 && distance < bestDistance) {
+      bestDistance = distance;
+      bestPosition = it->first;
+      gp = openGapsSeen;
+    }
+  }
+
+  return bestPosition;
+}
+
+
+
+// Counts the entries of the gap map whose value is "Unfilled".
+int osmHypothesis::getOpenGaps()
+{
+  int openGaps = 0;
+
+  for (map <int,string>::iterator entry = gap.begin(); entry != gap.end(); ++entry)
+    openGaps += (entry->second == "Unfilled") ? 1 : 0;
+
+  return openGaps;
+}
+
+// Emits "_DEL_<english>" and keeps emitting further _DEL_ operations for as
+// long as the next target index (skipping those in doneTargetIndexes) is
+// listed in sourceNullWords; the word for each is read from currE.
+void osmHypothesis::generateDeleteOperations(std::string english, int currTargetIndex, std::set <int> doneTargetIndexes)
+{
+  for (;;) {
+    operations.push_back("_DEL_" + english);
+
+    // Step to the next target index that has not been consumed already.
+    do {
+      ++currTargetIndex;
+    } while (doneTargetIndexes.count(currTargetIndex) != 0);
+
+    if (sourceNullWords.count(currTargetIndex) == 0)
+      return;
+
+    english = currE[currTargetIndex];
+  }
+}
+
+// Generates the operation sequence for the current phrase pair from the data
+// prepared by setPhrases() and constructCepts().
+//
+//   startIndex     : added to the phrase-local source indices of each cept
+//                    to obtain the position passed to generateOperations()
+//   coverageVector : coverage bitmap, updated by generateOperations()
+//
+// Words of a multi-word cept side are joined with "^_^".
+void osmHypothesis::computeOSMFeature(int startIndex , Bitmap & coverageVector)
+{
+  set <int> doneTargetIndexes;
+  string english;
+  string source;
+  int targetIndex = 0;
+
+  // Source words to be deleted in the start of this phrase ...
+  if (!targetNullWords.empty() && *targetNullWords.begin() == startIndex) {
+    const int j1 = startIndex;
+    source = currF[j1 - startIndex];
+    english = "_INS_";
+    generateOperations(startIndex, j1, 2 , coverageVector , english , source , targetNullWords , currF);
+  }
+
+  // First target word has to be deleted ...
+  if (sourceNullWords.find(targetIndex) != sourceNullWords.end()) {
+    english = currE[targetIndex];
+    generateDeleteOperations(english, targetIndex, doneTargetIndexes);
+  }
+
+  for (size_t i = 0; i < ceptsInPhrase.size(); i++) {
+    // Bind by reference: the sets are only read here, so copying both of them
+    // for every cept is unnecessary.
+    const set <int> &fSide = ceptsInPhrase[i].first;
+    const set <int> &eSide = ceptsInPhrase[i].second;
+
+    // Target side: concatenate the words. Indices that do not directly follow
+    // the running targetIndex are remembered as already consumed.
+    set <int>::const_iterator iter = eSide.begin();
+    targetIndex = *iter;
+    english = currE[*iter];
+
+    for (++iter; iter != eSide.end(); ++iter) {
+      if (*iter == targetIndex + 1)
+        targetIndex++;
+      else
+        doneTargetIndexes.insert(*iter);
+
+      english += "^_^";
+      english += currE[*iter];
+    }
+
+    // Source side: concatenate the words.
+    iter = fSide.begin();
+    source = currF[*iter];
+
+    for (++iter; iter != fSide.end(); ++iter) {
+      source += "^_^";
+      source += currF[*iter];
+    }
+
+    // First source word of the cept is generated with contFlag 0, every
+    // further one with contFlag 1.
+    iter = fSide.begin();
+    generateOperations(startIndex, *iter + startIndex, 0 , coverageVector , english , source , targetNullWords , currF);
+
+    for (++iter; iter != fSide.end(); ++iter) {
+      generateOperations(startIndex, *iter + startIndex, 1 , coverageVector , english , source , targetNullWords , currF);
+    }
+
+    // Check whether the next target word is unaligned ...
+    targetIndex++;
+
+    while (doneTargetIndexes.find(targetIndex) != doneTargetIndexes.end()) {
+      targetIndex++;
+    }
+
+    if (sourceNullWords.find(targetIndex) != sourceNullWords.end()) {
+      english = currE[targetIndex];
+      generateDeleteOperations(english, targetIndex, doneTargetIndexes);
+    }
+  }
+
+  //removeReorderingOperations();
+
+  //print();
+}
+
+// Grows eSide and fSide into one complete cept: every index in tS[e] for e in
+// eSide is added to fSide, every index in sT[f] for f in fSide is added to
+// eSide, and this repeats until eSide stops growing.
+//
+//   eSide : in/out; seeded by the caller
+//   fSide : in/out
+//   tS    : index -> aligned indices, looked up with eSide members
+//   sT    : index -> aligned indices, looked up with fSide members
+//
+// Lookups deliberately keep using operator[], as before, so a missing key is
+// still default-inserted into the caller's maps.
+void osmHypothesis::getMeCepts ( set <int> & eSide , set <int> & fSide , map <int , vector <int> > & tS , map <int , vector <int> > & sT)
+{
+  // size_t instead of int: no signed/unsigned comparison against size().
+  size_t sizeBefore;
+
+  // Loop instead of self-recursion; one pass equals one former call.
+  do {
+    sizeBefore = eSide.size();
+
+    for (set <int>::iterator e = eSide.begin(); e != eSide.end(); ++e) {
+      // Reference, not copy: the vector is only read.
+      const vector <int> &linked = tS[*e];
+      fSide.insert(linked.begin(), linked.end());
+    }
+
+    for (set <int>::iterator f = fSide.begin(); f != fSide.end(); ++f) {
+      const vector <int> &linked = sT[*f];
+      eSide.insert(linked.begin(), linked.end());
+    }
+  } while (eSide.size() > sizeBefore);
+}
+
+// Derives the cepts and the unaligned words of the current phrase pair from
+// its word alignment.
+//
+//   align              : flat list of (source, target) index pairs
+//   startIndex,
+//   endIndex           : inclusive range scanned for unaligned source words;
+//                        position i is looked up as i - startIndex
+//   targetPhraseLength : number of target positions scanned (0 .. length-1)
+//
+// Results are appended to the members: targetNullWords receives the unaligned
+// source positions, sourceNullWords the unaligned target positions, and
+// ceptsInPhrase one (source set, target set) pair per cept.
+void osmHypothesis::constructCepts(vector <int> & align , int startIndex , int endIndex, int targetPhraseLength)
+{
+  std::map <int , vector <int> > sT;
+  std::map <int , vector <int> > tS;
+  std::set <int> eSide;
+  std::set <int> fSide;
+  std::set <int> :: iterator iter;
+
+  // Stop at the last complete pair. With an odd number of entries the former
+  // loop read align[i+1] one element past the end.
+  for (size_t i = 0; i + 1 < align.size(); i += 2) {
+    const int src = align[i];
+    const int tgt = align[i + 1];
+    tS[tgt].push_back(src);
+    sT[src].push_back(tgt);
+  }
+
+  for (int i = startIndex; i <= endIndex; i++) { // What are unaligned source words in this phrase ...
+    if (sT.find(i - startIndex) == sT.end()) {
+      targetNullWords.insert(i);
+    }
+  }
+
+  for (int i = 0; i < targetPhraseLength; i++) { // What are unaligned target words in this phrase ...
+    if (tS.find(i) == tS.end()) {
+      sourceNullWords.insert(i);
+    }
+  }
+
+  // Peel off one cept per pass, seeded with the smallest remaining target index.
+  while (!tS.empty() && !sT.empty()) {
+    eSide.clear();
+    fSide.clear();
+    eSide.insert(tS.begin()->first);
+
+    getMeCepts(eSide, fSide, tS , sT);
+
+    // Erase by key: erase(find(...)) is undefined when the key is missing.
+    for (iter = eSide.begin(); iter != eSide.end(); iter++) {
+      tS.erase(*iter);
+    }
+
+    for (iter = fSide.begin(); iter != fSide.end(); iter++) {
+      sT.erase(*iter);
+    }
+
+    ceptsInPhrase.push_back(make_pair (fSide , eSide));
+  }
+
+  /*
+
+  cerr<<"Extracted Cepts "<<endl;
+  for (int i = 0; i < ceptsInPhrase.size(); i++)
+  {
+
+    fSide = ceptsInPhrase[i].first;
+    eSide = ceptsInPhrase[i].second;
+
+    for (iter = eSide.begin(); iter != eSide.end(); iter++)
+    {
+      cerr<<*iter<<" ";
+    }
+    cerr<<"<---> ";
+
+    for (iter = fSide.begin(); iter != fSide.end(); iter++)
+    {
+      cerr<<*iter<<" ";
+    }
+
+    cerr<<endl;
+  }
+  cerr<<endl;
+
+  cerr<<"Unaligned Target Words"<<endl;
+
+  for (iter = sourceNullWords.begin(); iter != sourceNullWords.end(); iter++)
+    cerr<<*iter<<"<--->"<<endl;
+
+  cerr<<"Unaligned Source Words"<<endl;
+
+  for (iter = targetNullWords.begin(); iter != targetNullWords.end(); iter++)
+    cerr<<*iter<<"<--->"<<endl;
+
+  */
+
+}
+
+// Replaces the contents of scores with the feature values of this hypothesis:
+// opProb alone when numFeatures is 1, otherwise opProb followed by gapWidth,
+// gapCount, openGapCount and deletionCount.
+void osmHypothesis::populateScores(vector <float> & scores , const int numFeatures)
+{
+  scores.clear();
+  scores.push_back(opProb);
+
+  if (numFeatures != 1) {
+    scores.push_back(gapWidth);
+    scores.push_back(gapCount);
+    scores.push_back(openGapCount);
+    scores.push_back(deletionCount);
+  }
+}
+
+
+} // namespace
+
diff --git a/contrib/moses2/FF/OSM/osmHyp.h b/contrib/moses2/FF/OSM/osmHyp.h
index e69de29bb..a30c84837 100644
--- a/contrib/moses2/FF/OSM/osmHyp.h
+++ b/contrib/moses2/FF/OSM/osmHyp.h
@@ -0,0 +1,107 @@
+#pragma once
+
+# include <set>
+# include <map>
+# include <string>
+# include <vector>
+#include "KenOSM.h"
+# include "../FFState.h"
+# include "../../legacy/Bitmap.h"
+
+namespace Moses2
+{
+
+// Feature-function state of the operation sequence model: two source
+// positions (j and E), the gap map and a KenLM state.
+class osmState : public FFState
+{
+public:
+  osmState(const lm::ngram::State & val);
+
+  virtual size_t hash() const;
+  virtual bool operator==(const FFState& other) const;
+
+  virtual std::string ToString() const {
+    return "osmState";
+  }
+
+  // Overwrites j, E and the gap map with the given values.
+  void saveState(int jVal, int eVal, std::map <int , std::string> & gapVal);
+
+  // Accessors; the gap map and the LM state are returned by value (copies).
+  int getJ() const {
+    return j;
+  }
+  int getE() const {
+    return E;
+  }
+  std::map <int , std::string> getGap() const {
+    return gap;
+  }
+  lm::ngram::State getLMState() const {
+    return lmState;
+  }
+
+  void print() const;
+  std::string getName() const;
+
+protected:
+  int j;
+  int E;
+  std::map <int , std::string> gap;
+  lm::ngram::State lmState;
+};
+
+// Working object that turns one phrase pair and its word alignment into an
+// operation sequence, scores it, and tracks the gap/deletion counters.
+class osmHypothesis
+{
+private:
+  std::vector <std::string> operations; // List of operations required to generate this hyp ...
+  std::map <int , std::string> gap; // Maintains gap history ...
+  int j; // Position after the last source word generated ...
+  int E; // Position after the right most source word so far generated ...
+  lm::ngram::State lmState; // KenLM's Model State ...
+
+  int gapCount; // Number of gaps inserted ...
+  int deletionCount;
+  int openGapCount;
+  int gapWidth;
+  double opProb;
+
+  std::vector <std::string> currE; // set from the second argument of setPhrases()
+  std::vector <std::string> currF; // set from the first argument of setPhrases()
+  std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase;
+  std::set <int> targetNullWords;
+  std::set <int> sourceNullWords;
+
+  int closestGap(std::map <int , std::string> gap, int j1, int & gp);
+  int firstOpenGap(std::vector <int> & coverageVector);
+  std::string intToString(int);
+  int getOpenGaps();
+  int isTranslationOperation(int j);
+  void removeReorderingOperations();
+
+  void getMeCepts ( std::set <int> & eSide , std::set <int> & fSide , std::map <int , std::vector <int> > & tS , std::map <int , std::vector <int> > & sT);
+
+public:
+  osmHypothesis();
+  ~osmHypothesis() {}
+
+  void generateOperations(int & startIndex, int j1 , int contFlag , Bitmap & coverageVector , std::string english , std::string german , std::set <int> & targetNullWords , std::vector <std::string> & currF);
+  void generateDeleteOperations(std::string english, int currTargetIndex, std::set <int> doneTargetIndexes);
+  void calculateOSMProb(OSMLM& ptrOp);
+  void computeOSMFeature(int startIndex , Bitmap & coverageVector);
+  void constructCepts(std::vector <int> & align , int startIndex , int endIndex, int targetPhraseLength);
+
+  // Stores copies of the two word lists: val1 becomes currF, val2 becomes currE.
+  void setPhrases(std::vector <std::string> & val1 , std::vector <std::string> & val2) {
+    currF = val1;
+    currE = val2;
+  }
+
+  void setState(const FFState* prev_state);
+  osmState * saveState();
+  void print();
+  void populateScores(std::vector <float> & scores , const int numFeatures);
+
+  // Replaces only the LM state; positions and gap map are left as they are.
+  void setState(const lm::ngram::State & val) {
+    lmState = val;
+  }
+};
+
+} // namespace
+
+
+
diff --git a/contrib/moses2/Jamfile b/contrib/moses2/Jamfile
index 78175db01..13c429d8d 100644
--- a/contrib/moses2/Jamfile
+++ b/contrib/moses2/Jamfile
@@ -46,6 +46,7 @@ alias deps : ../..//z ../..//boost_iostreams ../..//boost_filesystem ../../mose
FF/OSM/OpSequenceModel.cpp
FF/OSM/KenOSM.cpp
+ FF/OSM/osmHyp.cpp
# LM/LanguageModelDALM.cpp
LM/LanguageModel.cpp