Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Hieu Hoang <hieuhoang@gmail.com> 2016-10-03 14:04:01 +0300
committer Hieu Hoang <hieuhoang@gmail.com> 2016-10-03 14:04:01 +0300
commit 88e13e3b833c4221e7417ce29c973af8c867c82b (patch)
tree da211936f591bffa8e6f4ffad68a404a6b50804f
parent 968c72538f6976f6558f0799e7d0b1293b7e3d96 (diff)
parent fc0d9900ffe2bf6dfb5243c642d8686f450c2af9 (diff)
Merge branch 'master' of github.com:moses-smt/mosesdecoder
-rw-r--r--BUILD-INSTRUCTIONS.txt1
-rw-r--r--contrib/moses2/FF/FFState.cpp (renamed from contrib/moses2/legacy/FFState.cpp)0
-rw-r--r--contrib/moses2/FF/FFState.h (renamed from contrib/moses2/legacy/FFState.h)0
-rw-r--r--contrib/moses2/FF/FeatureRegistry.cpp5
-rw-r--r--contrib/moses2/FF/LexicalReordering/LRState.h2
-rw-r--r--contrib/moses2/FF/OSM/KenOSM.cpp33
-rw-r--r--contrib/moses2/FF/OSM/KenOSM.h53
-rw-r--r--contrib/moses2/FF/OSM/OpSequenceModel.cpp248
-rw-r--r--contrib/moses2/FF/OSM/OpSequenceModel.h57
-rw-r--r--contrib/moses2/FF/OSM/osmHyp.cpp601
-rw-r--r--contrib/moses2/FF/OSM/osmHyp.h111
-rw-r--r--contrib/moses2/FF/PointerState.cpp (renamed from contrib/moses2/legacy/PointerState.cpp)0
-rw-r--r--contrib/moses2/FF/PointerState.h (renamed from contrib/moses2/legacy/PointerState.h)0
-rw-r--r--contrib/moses2/FF/StatefulFeatureFunction.h2
-rw-r--r--contrib/moses2/HypothesisBase.h2
-rw-r--r--contrib/moses2/Jamfile5
-rw-r--r--contrib/moses2/LM/LanguageModel.cpp2
-rw-r--r--contrib/moses2/PhraseBased/Hypothesis.h4
-rw-r--r--contrib/moses2/PhraseBased/Sentence.cpp2
-rw-r--r--contrib/moses2/SCFG/Sentence.cpp2
-rw-r--r--contrib/moses2/TranslationModel/Transliteration.cpp229
-rw-r--r--contrib/moses2/TranslationModel/Transliteration.h91
-rw-r--r--misc/1-1-Extraction.cpp4
-rw-r--r--regression-testing/Jamfile2
-rw-r--r--scripts/ems/example/config.factored2
-rw-r--r--scripts/ems/example/config.hierarchical2
-rw-r--r--scripts/ems/example/config.syntax2
-rw-r--r--scripts/ems/example/config.toy2
-rw-r--r--scripts/ems/example/config.toy.bilinguallm2
-rwxr-xr-xscripts/ems/experiment.perl2
-rwxr-xr-xscripts/generic/binarize4moses2.perl4
31 files changed, 1455 insertions, 17 deletions
diff --git a/BUILD-INSTRUCTIONS.txt b/BUILD-INSTRUCTIONS.txt
index a41582bfa..7b9bc3a8a 100644
--- a/BUILD-INSTRUCTIONS.txt
+++ b/BUILD-INSTRUCTIONS.txt
@@ -7,4 +7,3 @@ into the source tree from elsewhere:
* "bjam-files" is taken from Boost.
* "util" and "lm" are taken from KenLM: https://github.com/kpu/kenlm
-
diff --git a/contrib/moses2/legacy/FFState.cpp b/contrib/moses2/FF/FFState.cpp
index e69de29bb..e69de29bb 100644
--- a/contrib/moses2/legacy/FFState.cpp
+++ b/contrib/moses2/FF/FFState.cpp
diff --git a/contrib/moses2/legacy/FFState.h b/contrib/moses2/FF/FFState.h
index 33ef5d1f6..33ef5d1f6 100644
--- a/contrib/moses2/legacy/FFState.h
+++ b/contrib/moses2/FF/FFState.h
diff --git a/contrib/moses2/FF/FeatureRegistry.cpp b/contrib/moses2/FF/FeatureRegistry.cpp
index 28aa4258d..b040eb8a1 100644
--- a/contrib/moses2/FF/FeatureRegistry.cpp
+++ b/contrib/moses2/FF/FeatureRegistry.cpp
@@ -3,6 +3,7 @@
#include "../TranslationModel/Memory/PhraseTableMemory.h"
#include "../TranslationModel/ProbingPT.h"
#include "../TranslationModel/UnknownWordPenalty.h"
+#include "../TranslationModel/Transliteration.h"
#include "../LM/KENLM.h"
#include "../LM/KENLMBatch.h"
@@ -13,6 +14,7 @@
#include "LexicalReordering/LexicalReordering.h"
#include "PhrasePenalty.h"
#include "WordPenalty.h"
+#include "OSM/OpSequenceModel.h"
#include "SkeletonStatefulFF.h"
#include "SkeletonStatelessFF.h"
@@ -51,6 +53,7 @@ FeatureRegistry::FeatureRegistry()
MOSES_FNAME2("PhraseDictionaryMemory", PhraseTableMemory);
MOSES_FNAME(ProbingPT);
+ MOSES_FNAME2("PhraseDictionaryTransliteration", Transliteration);
MOSES_FNAME(UnknownWordPenalty);
Add("KENLM", new KenFactory());
@@ -64,6 +67,8 @@ FeatureRegistry::FeatureRegistry()
MOSES_FNAME(LexicalReordering);
MOSES_FNAME(PhrasePenalty);
MOSES_FNAME(WordPenalty);
+ MOSES_FNAME(OpSequenceModel);
+
MOSES_FNAME(SkeletonStatefulFF);
MOSES_FNAME(SkeletonStatelessFF);
}
diff --git a/contrib/moses2/FF/LexicalReordering/LRState.h b/contrib/moses2/FF/LexicalReordering/LRState.h
index 846acb092..0e906d09a 100644
--- a/contrib/moses2/FF/LexicalReordering/LRState.h
+++ b/contrib/moses2/FF/LexicalReordering/LRState.h
@@ -1,5 +1,5 @@
#pragma once
-#include "../../legacy/FFState.h"
+#include "../FFState.h"
#include "LRModel.h"
namespace Moses2
diff --git a/contrib/moses2/FF/OSM/KenOSM.cpp b/contrib/moses2/FF/OSM/KenOSM.cpp
new file mode 100644
index 000000000..6b410fc9e
--- /dev/null
+++ b/contrib/moses2/FF/OSM/KenOSM.cpp
@@ -0,0 +1,33 @@
+#include "KenOSM.h"
+
+namespace Moses2
+{
+
+// Builds the operation-sequence LM stored in `file`.
+// When KenLM recognises the file as one of its binary formats, the wrapper is
+// instantiated for the detected model type; any other file is handed to the
+// probing model. A recognised binary whose type is not listed below raises
+// via UTIL_THROW2. The result is allocated with `new`.
+OSMLM* ConstructOSMLM(const char *file, util::LoadMethod load_method)
+{
+  lm::ngram::Config config;
+  config.load_method = load_method;
+
+  lm::ngram::ModelType model_type;
+  if (!lm::ngram::RecognizeBinary(file, model_type)) {
+    // Not a recognised binary: let the probing model read it.
+    return new KenOSM<lm::ngram::ProbingModel>(file, config);
+  }
+
+  switch(model_type) {
+  case lm::ngram::PROBING:
+    return new KenOSM<lm::ngram::ProbingModel>(file, config);
+  case lm::ngram::REST_PROBING:
+    return new KenOSM<lm::ngram::RestProbingModel>(file, config);
+  case lm::ngram::TRIE:
+    return new KenOSM<lm::ngram::TrieModel>(file, config);
+  case lm::ngram::QUANT_TRIE:
+    return new KenOSM<lm::ngram::QuantTrieModel>(file, config);
+  case lm::ngram::ARRAY_TRIE:
+    return new KenOSM<lm::ngram::ArrayTrieModel>(file, config);
+  case lm::ngram::QUANT_ARRAY_TRIE:
+    return new KenOSM<lm::ngram::QuantArrayTrieModel>(file, config);
+  default:
+    UTIL_THROW2("Unrecognized kenlm model type " << model_type);
+  }
+}
+
+} // namespace
diff --git a/contrib/moses2/FF/OSM/KenOSM.h b/contrib/moses2/FF/OSM/KenOSM.h
new file mode 100644
index 000000000..f1275232f
--- /dev/null
+++ b/contrib/moses2/FF/OSM/KenOSM.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include <string>
+#include "lm/model.hh"
+
+namespace Moses2
+{
+
+// Abstract interface to the KenLM model that scores OSM operations, so callers
+// do not have to know which concrete KenLM model type is behind it.
+class KenOSMBase
+{
+public:
+ virtual ~KenOSMBase() {}
+
+ // Scores one operation string given the incoming LM state (first argument)
+ // and writes the resulting state into the last argument.
+ virtual float Score(const lm::ngram::State&, StringPiece,
+ lm::ngram::State&) const = 0;
+
+ // State to start from at the beginning of a sentence.
+ virtual const lm::ngram::State &BeginSentenceState() const = 0;
+
+ // State that carries no context.
+ virtual const lm::ngram::State &NullContextState() const = 0;
+};
+
+// Concrete OSM scorer: owns a KenLM model of type KenModel and forwards every
+// call of the KenOSMBase interface to it.
+template <class KenModel>
+class KenOSM : public KenOSMBase
+{
+public:
+  // Loads the model from `file` with the given KenLM configuration.
+  KenOSM(const char *file, const lm::ngram::Config &config)
+    : m_kenlm(file, config)
+  {
+  }
+
+  // Looks `word` up in the model vocabulary and scores it after `in_state`;
+  // the follow-up state is written to `out_state`.
+  float Score(const lm::ngram::State &in_state,
+              StringPiece word,
+              lm::ngram::State &out_state) const
+  {
+    const lm::WordIndex wordId = m_kenlm.GetVocabulary().Index(word);
+    return m_kenlm.Score(in_state, wordId, out_state);
+  }
+
+  const lm::ngram::State &BeginSentenceState() const
+  {
+    return m_kenlm.BeginSentenceState();
+  }
+
+  const lm::ngram::State &NullContextState() const
+  {
+    return m_kenlm.NullContextState();
+  }
+
+private:
+  KenModel m_kenlm; // the wrapped KenLM model
+};
+
+typedef KenOSMBase OSMLM;
+
+OSMLM* ConstructOSMLM(const char *file, util::LoadMethod load_method);
+
+
+} // namespace
diff --git a/contrib/moses2/FF/OSM/OpSequenceModel.cpp b/contrib/moses2/FF/OSM/OpSequenceModel.cpp
new file mode 100644
index 000000000..572065813
--- /dev/null
+++ b/contrib/moses2/FF/OSM/OpSequenceModel.cpp
@@ -0,0 +1,248 @@
+#include <sstream>
+#include "OpSequenceModel.h"
+#include "osmHyp.h"
+#include "lm/state.hh"
+#include "../../PhraseBased/Manager.h"
+#include "../../PhraseBased/Hypothesis.h"
+#include "../../PhraseBased/TargetPhraseImpl.h"
+#include "../../PhraseBased/Sentence.h"
+#include "../../TranslationModel/UnknownWordPenalty.h"
+#include "../../System.h"
+
+using namespace std;
+
+namespace Moses2
+{
+
+////////////////////////////////////////////////////////////////////////////////////////
+
+// Sets the defaults (factor 0 on both sides, five features, util::READ
+// loading, no model yet) and then runs ReadParameters() so that values from
+// the configuration line can replace them (presumably through SetParameter —
+// confirm in the base class). The model itself is loaded later by Load().
+OpSequenceModel::OpSequenceModel(size_t startInd, const std::string &line) :
+  StatefulFeatureFunction(startInd, line),
+  OSM(NULL),        // previously left uninitialised until Load()
+  unkOpProb(0),     // previously left uninitialised until Load()
+  numFeatures(5),
+  sFactor(0),
+  tFactor(0),
+  load_method(util::READ)
+{
+  ReadParameters();
+}
+
+// Releases the language model created by ConstructOSMLM().
+OpSequenceModel::~OpSequenceModel()
+{
+  // OSM stays NULL when Load() never ran; deleting NULL is a no-op.
+  // KenOSMBase has a virtual destructor, so deleting through OSMLM* is safe.
+  delete OSM;
+}
+
+// Loads the operation-sequence language model from the configured "path"
+// (m_lmPath). `system` is not used here.
+void OpSequenceModel::Load(System &system)
+{
+ readLanguageModel(m_lmPath.c_str());
+}
+
+// Placement-constructs an empty osmState in memory taken from `pool`.
+// NOTE(review): osmState holds a std::map and no destructor call is visible
+// here — confirm how the pool releases such states.
+FFState* OpSequenceModel::BlankState(MemPool &pool, const System &sys) const
+{
+ return new (pool.Allocate<osmState>()) osmState();
+}
+
+// Initialises the state of the empty hypothesis: the LM part becomes the
+// model's begin-of-sentence state (osmState::setState also zeroes j and E).
+void OpSequenceModel::EmptyHypothesisState(FFState &state,
+    const ManagerBase &mgr, const InputType &input,
+    const Hypothesis &hypo) const
+{
+  osmState &osm = static_cast<osmState&>(state);
+  osm.setState(OSM->BeginSentenceState());
+}
+
+// Context-free estimate for a single phrase pair: the operation sequence of
+// the pair is generated against an empty coverage bitmap and a null LM
+// context, and its weighted score is added to `estimatedScore`.
+// `scores` is not modified by this function.
+void OpSequenceModel::EvaluateInIsolation(MemPool &pool,
+    const System &system, const Phrase<Moses2::Word> &source,
+    const TargetPhraseImpl &targetPhrase, Scores &scores,
+    SCORE &estimatedScore) const
+{
+  const size_t sourceSize = source.GetSize();
+  const size_t targetSize = targetPhrase.GetSize();
+
+  // Flattened word alignment: source index, target index, source index, ...
+  vector <int> alignments;
+  const AlignmentInfo &align = targetPhrase.GetAlignTerm();
+  for (AlignmentInfo::const_iterator pt = align.begin(); pt != align.end(); ++pt) {
+    alignments.push_back(pt->first);
+    alignments.push_back(pt->second);
+  }
+
+  // Phrases produced by the unknown-word penalty are represented by the
+  // special token, but only when factor 0 is used on both sides.
+  const bool isUnknown =
+    &targetPhrase.pt == system.featureFunctions.GetUnknownWordPenalty()
+    && sFactor == 0 && tFactor == 0;
+
+  vector <string> myTargetPhrase;
+  for (size_t pos = 0; pos < targetSize; ++pos) {
+    if (isUnknown) {
+      myTargetPhrase.push_back("_TRANS_SLF_");
+    } else {
+      myTargetPhrase.push_back(targetPhrase[pos][tFactor]->GetString().as_string());
+    }
+  }
+
+  vector <string> mySourcePhrase;
+  for (size_t pos = 0; pos < sourceSize; ++pos) {
+    mySourcePhrase.push_back(source[pos][sFactor]->GetString().as_string());
+  }
+
+  // Coverage local to the phrase, with nothing covered yet.
+  Bitmap myBitmap (pool, sourceSize);
+  myBitmap.Init(std::vector<bool>());
+
+  const int startIndex = 0;
+  const int lastIndex = static_cast<int>(sourceSize) - 1;
+
+  osmHypothesis obj;
+  obj.setState(OSM->NullContextState());
+  obj.setPhrases(mySourcePhrase , myTargetPhrase);
+  obj.constructCepts(alignments, startIndex, lastIndex, targetSize);
+  obj.computeOSMFeature(startIndex, myBitmap);
+  obj.calculateOSMProb(*OSM);
+
+  vector<float> scoresVec;
+  obj.populateScores(scoresVec, numFeatures);
+
+  estimatedScore += Scores::CalcWeightedScore(system, *this, scoresVec.data());
+}
+
+// SCFG overload: not supported by this feature; always throws.
+void OpSequenceModel::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
+ const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
+ SCORE &estimatedScore) const
+{
+ UTIL_THROW2("Not implemented");
+}
+
+// Scores the phrase applied by `hypo` in the context left by `prevState`:
+// the operations for the phrase are generated against the sentence coverage,
+// scored with the OSM language model, added to `scores`, and the resulting
+// OSM state is written to `state`.
+void OpSequenceModel::EvaluateWhenApplied(const ManagerBase &mgr,
+    const Hypothesis &hypo, const FFState &prevState, Scores &scores,
+    FFState &state) const
+{
+  const TargetPhrase<Moses2::Word> &target = hypo.GetTargetPhrase();
+  const Sentence &sourceSentence =
+    static_cast<const Sentence&>(hypo.GetManager().GetInput());
+
+  // Source span translated by this hypothesis; the end position is inclusive.
+  const Range &sourceRange = hypo.GetInputPath().range;
+  const int startIndex = sourceRange.GetStartPos();
+  const int endIndex = sourceRange.GetEndPos();
+
+  // Flattened word alignment: source index, target index, source index, ...
+  vector <int> alignments;
+  const AlignmentInfo &align = target.GetAlignTerm();
+  for (AlignmentInfo::const_iterator pt = align.begin(); pt != align.end(); ++pt) {
+    alignments.push_back(pt->first);
+    alignments.push_back(pt->second);
+  }
+
+  // Work on a copy of the coverage in which this phrase's own span is cleared
+  // again, and collect the source words of that span.
+  Bitmap myBitmap(hypo.GetBitmap());
+  vector <string> mySourcePhrase;
+  for (int pos = startIndex; pos <= endIndex; ++pos) {
+    myBitmap.SetValue(pos, 0);
+    mySourcePhrase.push_back(sourceSentence[pos][sFactor]->GetString().as_string());
+  }
+
+  // Phrases produced by the unknown-word penalty are represented by the
+  // special token, but only when factor 0 is used on both sides.
+  const bool isUnknown =
+    &target.pt == mgr.system.featureFunctions.GetUnknownWordPenalty()
+    && sFactor == 0 && tFactor == 0;
+
+  vector <string> myTargetPhrase;
+  for (size_t pos = 0; pos < target.GetSize(); ++pos) {
+    if (isUnknown) {
+      myTargetPhrase.push_back("_TRANS_SLF_");
+    } else {
+      myTargetPhrase.push_back(target[pos][tFactor]->GetString().as_string());
+    }
+  }
+
+  osmHypothesis obj;
+  obj.setState(&prevState);
+  obj.constructCepts(alignments, startIndex, endIndex, target.GetSize());
+  obj.setPhrases(mySourcePhrase , myTargetPhrase);
+  obj.computeOSMFeature(startIndex, myBitmap);
+  obj.calculateOSMProb(*OSM);
+
+  vector<float> scoresVec;
+  obj.populateScores(scoresVec, numFeatures);
+  scores.PlusEquals(mgr.system, *this, scoresVec);
+
+  obj.saveState(static_cast<osmState&>(state));
+}
+
+// SCFG overload: not supported by this feature; always throws.
+void OpSequenceModel::EvaluateWhenApplied(const SCFG::Manager &mgr,
+ const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
+ FFState &state) const
+{
+ UTIL_THROW2("Not implemented");
+}
+
+// Applies one key/value pair from the feature's configuration line.
+//   path             - file name of the operation-sequence LM
+//   support-features - "no" selects 1 feature, anything else 5
+//   input-factor     - source factor index
+//   output-factor    - target factor index
+//   load             - KenLM load method; an unknown value throws
+// Any other key is passed on to StatefulFeatureFunction::SetParameter.
+void OpSequenceModel::SetParameter(const std::string& key, const std::string& value)
+{
+  if (key == "path") {
+    m_lmPath = value;
+    return;
+  }
+
+  if (key == "support-features") {
+    numFeatures = (value == "no") ? 1 : 5;
+    return;
+  }
+
+  if (key == "input-factor") {
+    sFactor = Scan<int>(value);
+    return;
+  }
+
+  if (key == "output-factor") {
+    tFactor = Scan<int>(value);
+    return;
+  }
+
+  if (key != "load") {
+    StatefulFeatureFunction::SetParameter(key, value);
+    return;
+  }
+
+  // key == "load"
+  if (value == "lazy") {
+    load_method = util::LAZY;
+  } else if (value == "populate_or_lazy") {
+    load_method = util::POPULATE_OR_LAZY;
+  } else if (value == "populate_or_read" || value == "populate") {
+    load_method = util::POPULATE_OR_READ;
+  } else if (value == "read") {
+    load_method = util::READ;
+  } else if (value == "parallel_read") {
+    load_method = util::PARALLEL_READ;
+  } else {
+    UTIL_THROW2("Unknown KenLM load method " << value);
+  }
+}
+
+// Loads the operation-sequence LM from `lmFile` using the configured load
+// method, then caches in unkOpProb the score of the unknown-word operation
+// "_TRANS_SLF_" in a null context.
+void OpSequenceModel :: readLanguageModel(const char *lmFile)
+{
+  // Use the argument rather than silently re-reading m_lmPath. The only
+  // caller in this file (Load) passes m_lmPath.c_str(), so its behaviour is
+  // unchanged.
+  OSM = ConstructOSMLM(lmFile, load_method);
+
+  const string unkOp = "_TRANS_SLF_";
+  lm::ngram::State startState = OSM->NullContextState();
+  lm::ngram::State endState;
+  unkOpProb = OSM->Score(startState, unkOp, endState);
+}
+
+}
diff --git a/contrib/moses2/FF/OSM/OpSequenceModel.h b/contrib/moses2/FF/OSM/OpSequenceModel.h
new file mode 100644
index 000000000..d46cc82fb
--- /dev/null
+++ b/contrib/moses2/FF/OSM/OpSequenceModel.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#include "../StatefulFeatureFunction.h"
+#include "util/mmap.hh"
+#include "KenOSM.h"
+
+namespace Moses2
+{
+
+
+// Operation Sequence Model feature function for the phrase-based decoder.
+// It converts each applied phrase pair into a sequence of operation strings
+// and scores that sequence with a KenLM model; the SCFG overloads throw.
+// (The include guard above is needed because this header is included from
+// more than one file.)
+class OpSequenceModel : public StatefulFeatureFunction
+{
+public:
+  OSMLM* OSM; // Operation LM, assigned by readLanguageModel() ...
+  float unkOpProb; // Score of "_TRANS_SLF_" in a null context ...
+  int numFeatures; // Number of features used ...
+  int sFactor; // Source Factor ...
+  int tFactor; // Target Factor ...
+  util::LoadMethod load_method; // method to load model
+
+  OpSequenceModel(size_t startInd, const std::string &line);
+  virtual ~OpSequenceModel();
+
+  // Loads the language model named by the "path" parameter.
+  virtual void Load(System &system);
+
+  virtual FFState* BlankState(MemPool &pool, const System &sys) const;
+  virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
+                                    const InputType &input, const Hypothesis &hypo) const;
+
+  // Phrase-based estimate of a phrase pair outside any context.
+  virtual void
+  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<Moses2::Word> &source,
+                      const TargetPhraseImpl &targetPhrase, Scores &scores,
+                      SCORE &estimatedScore) const;
+
+  // SCFG variant: throws "Not implemented".
+  virtual void
+  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
+                      const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
+                      SCORE &estimatedScore) const;
+
+  // Phrase-based scoring of a hypothesis extension given the previous state.
+  virtual void EvaluateWhenApplied(const ManagerBase &mgr,
+                                   const Hypothesis &hypo, const FFState &prevState, Scores &scores,
+                                   FFState &state) const;
+
+  // SCFG variant: throws "Not implemented".
+  virtual void EvaluateWhenApplied(const SCFG::Manager &mgr,
+                                   const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
+                                   FFState &state) const;
+
+  // Handles path, support-features, input-factor, output-factor and load;
+  // other keys go to the base class.
+  void SetParameter(const std::string& key, const std::string& value);
+
+protected:
+  std::string m_lmPath; // value of the "path" parameter
+
+  void readLanguageModel(const char *);
+
+};
+
+}
+
+
diff --git a/contrib/moses2/FF/OSM/osmHyp.cpp b/contrib/moses2/FF/OSM/osmHyp.cpp
new file mode 100644
index 000000000..ede841a80
--- /dev/null
+++ b/contrib/moses2/FF/OSM/osmHyp.cpp
@@ -0,0 +1,601 @@
+#include "osmHyp.h"
+#include <sstream>
+
+using namespace std;
+using namespace lm::ngram;
+
+namespace Moses2
+{
+// Stores `val` as the LM state and resets both source positions (j, E) to 0.
+void osmState::setState(const lm::ngram::State & val)
+{
+  lmState = val;
+  j = 0;
+  E = 0;
+}
+
+// Stores the source positions and replaces the gap map with a copy of
+// `gapVal`. The LM state is left untouched.
+void osmState::saveState(int jVal, int eVal, map <int , string> & gapVal)
+{
+  j = jVal;
+  E = eVal;
+  gap = gapVal;   // assignment already discards the previous contents
+}
+
+// Hash over the same fields operator== compares: j, E, the gap map and the
+// length of the LM state.
+size_t osmState::hash() const
+{
+  size_t seed = j;
+  boost::hash_combine(seed, E);
+  boost::hash_combine(seed, gap);
+  boost::hash_combine(seed, lmState.length);
+  return seed;
+}
+
+// Two states are equal when j, E, the gap map and the LM state length match.
+// `otherBase` is assumed to be an osmState (unchecked static_cast).
+// NOTE(review): only the length of the LM state is compared, not its words —
+// confirm that this coarse equality is intended.
+bool osmState::operator==(const FFState& otherBase) const
+{
+  const osmState &rhs = static_cast<const osmState&>(otherBase);
+  return j == rhs.j
+         && E == rhs.E
+         && gap == rhs.gap
+         && lmState.length == rhs.lmState.length;
+}
+
+// Returns the fixed label "done".
+std::string osmState :: getName() const
+{
+
+ return "done";
+}
+
+//////////////////////////////////////////////////
+
+// Starts with all positions, counters and the operation probability at zero
+// and with no gaps or operations. The LM state is set later via setState().
+osmHypothesis :: osmHypothesis()
+  : j(0)
+  , E(0)
+  , gapCount(0)
+  , deletionCount(0)
+  , openGapCount(0)
+  , gapWidth(0)
+  , opProb(0)
+{
+}
+
+// Copies positions, gap map and LM state from the previous hypothesis'
+// state. A NULL pointer leaves this object unchanged. `prev_state` is
+// assumed to point to an osmState (unchecked static_cast).
+void osmHypothesis :: setState(const FFState* prev_state)
+{
+  if (prev_state == NULL) {
+    return;
+  }
+
+  const osmState *prev = static_cast <const osmState *> (prev_state);
+  j = prev->getJ();
+  E = prev->getE();
+  gap = prev->getGap();
+  lmState = prev->getLMState();
+}
+
+// Writes this hypothesis' LM state, positions and gap map into `state`.
+void osmHypothesis :: saveState(osmState &state)
+{
+ // Order matters: osmState::setState() resets j and E to 0, so the positions
+ // have to be stored afterwards.
+ state.setState(lmState);
+ state.saveState(j,E,gap);
+}
+
+// Returns 0 when operations[x] contains one of the reordering markers
+// (_JMP_BCK_, _JMP_FWD_, _CONT_CEPT_, _INS_GAP_) and 1 otherwise.
+// `x` must be a valid index into `operations`.
+int osmHypothesis :: isTranslationOperation(int x)
+{
+  static const char *const reorderingMarkers[] = {
+    "_JMP_BCK_", "_JMP_FWD_", "_CONT_CEPT_", "_INS_GAP_"
+  };
+  const size_t numMarkers = sizeof(reorderingMarkers) / sizeof(reorderingMarkers[0]);
+
+  const std::string &op = operations[x];
+  for (size_t i = 0; i < numMarkers; ++i) {
+    // find() reports "not found" as std::string::npos; comparing against the
+    // int -1 only worked through an implicit signed-to-unsigned conversion.
+    if (op.find(reorderingMarkers[i]) != std::string::npos)
+      return 0;
+  }
+
+  return 1;
+}
+
+// Resets the gap/deletion counters and keeps only the operations for which
+// isTranslationOperation() returns 1, preserving their order.
+void osmHypothesis :: removeReorderingOperations()
+{
+  gapCount = 0;
+  deletionCount = 0;
+  openGapCount = 0;
+  gapWidth = 0;
+
+  std::vector <std::string> kept;
+  const size_t total = operations.size();
+  for (size_t idx = 0; idx < total; ++idx) {
+    if (isTranslationOperation(static_cast<int>(idx)) == 1) {
+      kept.push_back(operations[idx]);
+    }
+  }
+
+  operations.swap(kept);
+}
+
+// Scores the generated operations in order with `ptrOp`, starting from the
+// current LM state. The sum of the scores is stored in opProb and the LM
+// state is advanced past the last operation.
+void osmHypothesis :: calculateOSMProb(OSMLM& ptrOp)
+{
+  State current = lmState;
+  State previous;
+
+  opProb = 0;
+  for (vector <string> :: const_iterator op = operations.begin(); op != operations.end(); ++op) {
+    previous = current;   // Score() needs separate in and out states
+    opProb += ptrOp.Score(previous, *op, current);
+  }
+
+  lmState = current;
+}
+
+
+// Returns the index of the first 0 entry in `coverageVector`, or -1 when
+// there is none.
+int osmHypothesis :: firstOpenGap(vector <int> & coverageVector)
+{
+  const int count = coverageVector.size();
+  int pos = 0;
+  while (pos < count && coverageVector[pos] != 0) {
+    ++pos;
+  }
+  return (pos < count) ? pos : -1;
+}
+
+// Formats `num` as a string by delegating to SPrint().
+string osmHypothesis :: intToString(int num)
+{
+ return SPrint(num);
+
+}
+
+// Appends to `operations` everything needed to generate the source word at
+// absolute position j1, updating the positions j and E, the gap map, the
+// coverage and the gap/deletion counters.
+//   contFlag == 0 : first word of a cept, emits _TRANS_<english>_TO_<german>
+//                   (or _TRANS_SLF_ when english is "_TRANS_SLF_")
+//   contFlag == 2 : unaligned source word, emits _INS_<german> and counts a deletion
+//   otherwise     : further word of a cept, emits _CONT_CEPT_
+// If the position after j1 is an uncovered member of targetNullWords, the
+// function recurses to emit its _INS_ operation as well.
+// currF is indexed relative to startIndex.
+void osmHypothesis :: generateOperations(int & startIndex , int j1 , int contFlag , Bitmap & coverageVector , string english , string german , set <int> & targetNullWords , vector <string> & currF)
+{
+
+ int gFlag = 0;
+ int gp = 0;
+ int ans;
+
+
+ // Target position lies to the right of the current position.
+ if ( j < j1) { // j1 is the index of the source word we are about to generate ...
+ //if(coverageVector[j]==0) // if source word at j is not generated yet ...
+ if(coverageVector.GetValue(j)==0) { // if source word at j is not generated yet ...
+ operations.push_back("_INS_GAP_");
+ gFlag++;
+ gap[j]="Unfilled";
+ }
+ if (j == E) {
+ j = j1;
+ } else {
+ operations.push_back("_JMP_FWD_");
+ j=E;
+ }
+ }
+
+ // Target position lies to the left: jump back to an open gap.
+ if (j1 < j) {
+ // if(j < E && coverageVector[j]==0)
+ if(j < E && coverageVector.GetValue(j)==0) {
+ operations.push_back("_INS_GAP_");
+ gFlag++;
+ gap[j]="Unfilled";
+ }
+
+ // gp receives the ordinal of the chosen gap, counted from the right.
+ j=closestGap(gap,j1,gp);
+ operations.push_back("_JMP_BCK_"+ intToString(gp));
+
+ //cout<<"I am j "<<j<<endl;
+ //cout<<"I am j1 "<<j1<<endl;
+
+ if(j==j1)
+ gap[j]="Filled";
+ }
+
+ // Still left of the target after the jumps above: open another gap.
+ if (j < j1) {
+ operations.push_back("_INS_GAP_");
+ gap[j] = "Unfilled";
+ gFlag++;
+ j=j1;
+ }
+
+ if(contFlag == 0) { // First words of the multi-word cept ...
+
+ if(english == "_TRANS_SLF_") { // Unknown word ...
+ operations.push_back("_TRANS_SLF_");
+ } else {
+ operations.push_back("_TRANS_" + english + "_TO_" + german);
+ }
+
+ //ans = firstOpenGap(coverageVector);
+ ans = coverageVector.GetFirstGapPos();
+
+ if (ans != -1)
+ gapWidth += j - ans;
+
+ } else if (contFlag == 2) {
+
+ operations.push_back("_INS_" + german);
+ ans = coverageVector.GetFirstGapPos();
+
+ if (ans != -1)
+ gapWidth += j - ans;
+ deletionCount++;
+ } else {
+ operations.push_back("_CONT_CEPT_");
+ }
+
+ // Mark the word as generated and move past it.
+ //coverageVector[j]=1;
+ coverageVector.SetValue(j,1);
+ j+=1;
+
+ if(E<j)
+ E=j;
+
+ // gapCount goes up by at most one per call, however many gaps were inserted.
+ if (gFlag > 0)
+ gapCount++;
+
+ openGapCount += getOpenGaps();
+
+ //if (coverageVector[j] == 0 && targetNullWords.find(j) != targetNullWords.end())
+ if (j < coverageVector.GetSize()) {
+ if (coverageVector.GetValue(j) == 0 && targetNullWords.find(j) != targetNullWords.end()) {
+ j1 = j;
+ german = currF[j1-startIndex];
+ english = "_INS_";
+ generateOperations(startIndex, j1, 2 , coverageVector , english , german , targetNullWords , currF);
+ }
+ }
+
+}
+
+// Debug dump to stderr: the operations on one line, followed by the
+// probability and the four counters.
+void osmHypothesis :: print()
+{
+  for (vector <string> :: const_iterator op = operations.begin(); op != operations.end(); ++op) {
+    cerr<<*op<<" ";
+  }
+
+  cerr<<endl<<endl;
+
+  cerr<<"Operation Probability "<<opProb<<endl;
+  cerr<<"Gap Count "<<gapCount<<endl;
+  cerr<<"Open Gap Count "<<openGapCount<<endl;
+  cerr<<"Gap Width "<<gapWidth<<endl;
+  cerr<<"Deletion Count "<<deletionCount<<endl;
+
+  cerr<<"_______________"<<endl;
+}
+
+// Chooses the gap to jump back to for source position j1.
+// The gap map is scanned from the highest key to the lowest, counting
+// "Unfilled" entries. If an unfilled gap sits exactly at j1, j1 is returned;
+// otherwise the nearest unfilled gap to the left of j1 is returned.
+// `gp` receives the ordinal (counted from the right, starting at 1) of the
+// chosen gap, or 0 when none qualifies, in which case -1 is returned.
+int osmHypothesis :: closestGap(map <int,string> gap, int j1, int & gp)
+{
+  int dist = 1172;   // gaps this far away or further are never selected
+  int value = -1;
+  int opGap = 0;
+  gp = 0;
+
+  // Reverse iteration is well-defined on an empty map; the previous
+  // do/while decremented end() unconditionally, which is undefined when
+  // there are no gaps. An empty map now simply yields -1 with gp == 0.
+  for (map <int,string> :: reverse_iterator iter = gap.rbegin(); iter != gap.rend(); ++iter) {
+    if (iter->second != "Unfilled")
+      continue;
+
+    opGap++;
+
+    if (iter->first == j1) {
+      gp = opGap;
+      return j1;
+    }
+
+    int temp = iter->first - j1;
+    if (temp < 0)
+      temp = -temp;
+
+    if (dist > temp && iter->first < j1) {
+      dist = temp;
+      value = iter->first;
+      gp = opGap;
+    }
+  }
+
+  return value;
+}
+
+
+
+// Counts the entries of the gap map that are marked "Unfilled".
+int osmHypothesis :: getOpenGaps()
+{
+  int open = 0;
+
+  map <int,string> :: iterator entry = gap.begin();
+  while (entry != gap.end()) {
+    if (entry->second == "Unfilled") {
+      ++open;
+    }
+    ++entry;
+  }
+
+  return open;
+}
+
+// Emits _DEL_<english> for the target word at currTargetIndex, then keeps
+// emitting _DEL_ operations for as long as the next target index not in
+// doneTargetIndexes is a member of sourceNullWords.
+void osmHypothesis :: generateDeleteOperations(std::string english, int currTargetIndex, std::set <int> doneTargetIndexes)
+{
+  for (;;) {
+    operations.push_back("_DEL_" + english);
+
+    // Advance to the next target index that has not been handled yet.
+    do {
+      ++currTargetIndex;
+    } while (doneTargetIndexes.count(currTargetIndex) != 0);
+
+    if (sourceNullWords.count(currTargetIndex) == 0) {
+      return;
+    }
+
+    english = currE[currTargetIndex];
+  }
+}
+
+// Generates the full operation sequence for the current phrase pair.
+// Requires setPhrases() and constructCepts() to have filled currE/currF,
+// ceptsInPhrase, targetNullWords and sourceNullWords.
+// startIndex is the absolute position of the first source word of the
+// phrase; coverageVector is updated as source words are generated.
+void osmHypothesis :: computeOSMFeature(int startIndex , Bitmap & coverageVector)
+{
+
+ set <int> doneTargetIndexes;
+ set <int> eSide;
+ set <int> fSide;
+ set <int> :: iterator iter;
+ string english;
+ string source;
+ int j1;
+ int targetIndex = 0;
+ doneTargetIndexes.clear();
+
+
+ if (targetNullWords.size() != 0) { // Source words to be deleted in the start of this phrase ...
+ iter = targetNullWords.begin();
+
+ if (*iter == startIndex) {
+
+ j1 = startIndex;
+ source = currF[j1-startIndex];
+ english = "_INS_";
+ generateOperations(startIndex, j1, 2 , coverageVector , english , source , targetNullWords , currF);
+ }
+ }
+
+ if (sourceNullWords.find(targetIndex) != sourceNullWords.end()) { // first word has to be deleted ...
+ english = currE[targetIndex];
+ generateDeleteOperations(english,targetIndex, doneTargetIndexes);
+ }
+
+
+ // One iteration per cept, in the order constructCepts() stored them.
+ for (size_t i = 0; i < ceptsInPhrase.size(); i++) {
+ source = "";
+ english = "";
+
+ fSide = ceptsInPhrase[i].first;
+ eSide = ceptsInPhrase[i].second;
+
+ // Join the target words of the cept with "^_^". Target indices that do
+ // not directly follow the previous one are remembered as already done.
+ iter = eSide.begin();
+ targetIndex = *iter;
+ english += currE[*iter];
+ iter++;
+
+ for (; iter != eSide.end(); iter++) {
+ if(*iter == targetIndex+1)
+ targetIndex++;
+ else
+ doneTargetIndexes.insert(*iter);
+
+ english += "^_^";
+ english += currE[*iter];
+ }
+
+ // Join the source words of the cept the same way.
+ iter = fSide.begin();
+ source += currF[*iter];
+ iter++;
+
+ for (; iter != fSide.end(); iter++) {
+ source += "^_^";
+ source += currF[*iter];
+ }
+
+ // The first source word of the cept carries the translation operation ...
+ iter = fSide.begin();
+ j1 = *iter + startIndex;
+ iter++;
+
+ generateOperations(startIndex, j1, 0 , coverageVector , english , source , targetNullWords , currF);
+
+
+ // ... and every further source word continues the cept.
+ for (; iter != fSide.end(); iter++) {
+ j1 = *iter + startIndex;
+ generateOperations(startIndex, j1, 1 , coverageVector , english , source , targetNullWords , currF);
+ }
+
+ targetIndex++; // Check whether the next target word is unaligned ...
+
+ while(doneTargetIndexes.find(targetIndex) != doneTargetIndexes.end()) {
+ targetIndex++;
+ }
+
+ if(sourceNullWords.find(targetIndex) != sourceNullWords.end()) {
+ english = currE[targetIndex];
+ generateDeleteOperations(english,targetIndex, doneTargetIndexes);
+ }
+ }
+
+ //removeReorderingOperations();
+
+ //print();
+
+}
+
+// Grows eSide (target indices) and fSide (source indices) to the closure of
+// everything connected through the alignment maps: tS maps a target index to
+// its source indices, sT a source index to its target indices. The expansion
+// is repeated until eSide stops growing.
+void osmHypothesis :: getMeCepts ( set <int> & eSide , set <int> & fSide , map <int , vector <int> > & tS , map <int , vector <int> > & sT)
+{
+  size_t sizeBefore;
+
+  do {
+    sizeBefore = eSide.size();
+
+    for (set <int> :: iterator e = eSide.begin(); e != eSide.end(); ++e) {
+      const vector <int> &sources = tS[*e];
+      fSide.insert(sources.begin(), sources.end());
+    }
+
+    for (set <int> :: iterator f = fSide.begin(); f != fSide.end(); ++f) {
+      const vector <int> &targets = sT[*f];
+      eSide.insert(targets.begin(), targets.end());
+    }
+  } while (eSide.size() > sizeBefore);
+}
+
+// Derives the alignment structure of the current phrase pair.
+//   align             - flattened alignment pairs: source index, target index, ...
+//                       (both relative to the phrase)
+//   startIndex        - absolute position of the first source word
+//   endIndex          - absolute position of the last source word (inclusive)
+//   targetPhraseLength- number of target words
+// Results:
+//   targetNullWords - absolute positions of source words with no alignment
+//   sourceNullWords - target indices with no alignment
+//   ceptsInPhrase   - one (source index set, target index set) pair per group
+//                     of mutually aligned words, ordered by smallest target index
+void osmHypothesis :: constructCepts(vector <int> & align , int startIndex , int endIndex, int targetPhraseLength)
+{
+  std::map <int , vector <int> > sT;   // source index -> aligned target indices
+  std::map <int , vector <int> > tS;   // target index -> aligned source indices
+
+  // `i + 1 <` keeps the read of align[i+1] in bounds even if the vector
+  // were to hold an odd number of entries.
+  for (size_t i = 0; i + 1 < align.size(); i += 2) {
+    const int src = align[i];
+    const int tgt = align[i+1];
+    tS[tgt].push_back(src);
+    sT[src].push_back(tgt);
+  }
+
+  for (int i = startIndex; i <= endIndex; i++) { // What are unaligned source words in this phrase ...
+    if (sT.find(i-startIndex) == sT.end()) {
+      targetNullWords.insert(i);
+    }
+  }
+
+  for (int i = 0; i < targetPhraseLength; i++) { // What are unaligned target words in this phrase ...
+    if (tS.find(i) == tS.end()) {
+      sourceNullWords.insert(i);
+    }
+  }
+
+  // Peel off one cept at a time, always starting from the smallest target
+  // index that is still unassigned.
+  while (!tS.empty() && !sT.empty()) {
+    std::set <int> eSide;
+    std::set <int> fSide;
+    eSide.insert(tS.begin()->first);
+
+    getMeCepts(eSide, fSide, tS , sT);
+
+    // Erase by key: unlike erase(find(key)) this is harmless when the key is
+    // absent.
+    for (std::set <int> :: iterator iter = eSide.begin(); iter != eSide.end(); ++iter) {
+      tS.erase(*iter);
+    }
+
+    for (std::set <int> :: iterator iter = fSide.begin(); iter != fSide.end(); ++iter) {
+      sT.erase(*iter);
+    }
+
+    ceptsInPhrase.push_back(make_pair (fSide , eSide));
+  }
+}
+
+// Replaces the contents of `scores` with the feature values. With
+// numFeatures == 1 only the operation probability is written; otherwise the
+// probability is followed by gap width, gap count, open gap count and
+// deletion count.
+void osmHypothesis :: populateScores(vector <float> & scores , const int numFeatures)
+{
+  const float values[] = {
+    static_cast<float>(opProb),
+    static_cast<float>(gapWidth),
+    static_cast<float>(gapCount),
+    static_cast<float>(openGapCount),
+    static_cast<float>(deletionCount)
+  };
+
+  const size_t wanted = (numFeatures == 1) ? 1 : 5;
+  scores.assign(values, values + wanted);
+}
+
+
+} // namespace
+
diff --git a/contrib/moses2/FF/OSM/osmHyp.h b/contrib/moses2/FF/OSM/osmHyp.h
new file mode 100644
index 000000000..c2893d366
--- /dev/null
+++ b/contrib/moses2/FF/OSM/osmHyp.h
@@ -0,0 +1,111 @@
+#pragma once
+
+# include <set>
+# include <map>
+# include <string>
+# include <vector>
+#include "KenOSM.h"
+# include "../FFState.h"
+# include "../../legacy/Bitmap.h"
+
+namespace Moses2
+{
+
+// Decoder state of the operation sequence model: the two source positions,
+// the gap history and the KenLM state.
+class osmState : public FFState
+{
+public:
+  // j and E start at 0 so that hash() and operator== never read
+  // indeterminate values from a state that has not been filled in yet.
+  osmState()
+    : j(0), E(0)
+  {}
+
+  // Stores the LM state and resets j and E to 0.
+  void setState(const lm::ngram::State & val);
+
+  virtual size_t hash() const;
+  virtual bool operator==(const FFState& other) const;
+
+  virtual std::string ToString() const
+  { return "osmState"; }
+
+  // Stores the positions and a copy of the gap map; the LM state is kept.
+  void saveState(int jVal, int eVal, std::map <int , std::string> & gapVal);
+  int getJ()const {
+    return j;
+  }
+  int getE()const {
+    return E;
+  }
+  // Returns a copy of the gap map.
+  std::map <int , std::string> getGap() const {
+    return gap;
+  }
+
+  // Returns a copy of the LM state.
+  lm::ngram::State getLMState() const {
+    return lmState;
+  }
+
+  // NOTE(review): declared here, but no definition appears in the
+  // accompanying osmHyp.cpp hunk — confirm it is defined or unused.
+  void print() const;
+  std::string getName() const;
+
+protected:
+  int j, E;                          // source positions, see osmHypothesis
+  std::map <int,std::string> gap;    // gap position -> "Unfilled" / "Filled"
+  lm::ngram::State lmState;
+};
+
+// Working object that turns one phrase pair into its OSM operation sequence,
+// scores it, and carries the model state from one hypothesis to the next.
+// Call order used by OpSequenceModel: setState, setPhrases/constructCepts,
+// computeOSMFeature, calculateOSMProb, populateScores, then saveState.
+class osmHypothesis
+{
+
+private:
+
+
+ std::vector <std::string> operations; // List of operations required to generate this hyp ...
+ std::map <int,std::string> gap; // Maintains gap history ...
+ int j; // Position after the last source word generated ...
+ int E; // Position after the right most source word so far generated ...
+ lm::ngram::State lmState; // KenLM's Model State ...
+
+ int gapCount; // Number of gaps inserted ...
+ int deletionCount; // Number of _INS_ operations emitted for unaligned source words ...
+ int openGapCount; // Open gaps, summed after every generated source word ...
+ int gapWidth; // Summed distance between j and the first uncovered position ...
+ double opProb; // Sum of the LM scores of the operations ...
+
+ std::vector <std::string> currE; // Target words of the current phrase ...
+ std::vector <std::string> currF; // Source words of the current phrase ...
+ std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase; // (source indices, target indices) per cept ...
+ std::set <int> targetNullWords; // Absolute positions of unaligned source words ...
+ std::set <int> sourceNullWords; // Indices of unaligned target words ...
+
+ int closestGap(std::map <int,std::string> gap,int j1, int & gp);
+ int firstOpenGap(std::vector <int> & coverageVector);
+ std::string intToString(int);
+ int getOpenGaps();
+ int isTranslationOperation(int j);
+ void removeReorderingOperations();
+
+ void getMeCepts ( std::set <int> & eSide , std::set <int> & fSide , std::map <int , std::vector <int> > & tS , std::map <int , std::vector <int> > & sT);
+
+public:
+
+ osmHypothesis();
+ ~osmHypothesis() {};
+ void generateOperations(int & startIndex, int j1 , int contFlag , Bitmap & coverageVector , std::string english , std::string german , std::set <int> & targetNullWords , std::vector <std::string> & currF);
+ void generateDeleteOperations(std::string english, int currTargetIndex, std::set <int> doneTargetIndexes);
+ void calculateOSMProb(OSMLM& ptrOp);
+ void computeOSMFeature(int startIndex , Bitmap & coverageVector);
+ void constructCepts(std::vector <int> & align , int startIndex , int endIndex, int targetPhraseLength);
+ // val1 holds the source words, val2 the target words.
+ void setPhrases(std::vector <std::string> & val1 , std::vector <std::string> & val2) {
+ currF = val1;
+ currE = val2;
+ }
+ // Copies positions, gaps and LM state from a previous osmState (NULL is ignored).
+ void setState(const FFState* prev_state);
+ void saveState(osmState &state);
+ void print();
+ void populateScores(std::vector <float> & scores , const int numFeatures);
+ // Sets only the LM state; positions and gaps are left as they are.
+ void setState(const lm::ngram::State & val) {
+ lmState = val;
+ }
+
+};
+
+} // namespace
+
+
+
diff --git a/contrib/moses2/legacy/PointerState.cpp b/contrib/moses2/FF/PointerState.cpp
index e69de29bb..e69de29bb 100644
--- a/contrib/moses2/legacy/PointerState.cpp
+++ b/contrib/moses2/FF/PointerState.cpp
diff --git a/contrib/moses2/legacy/PointerState.h b/contrib/moses2/FF/PointerState.h
index 41e6edf9f..41e6edf9f 100644
--- a/contrib/moses2/legacy/PointerState.h
+++ b/contrib/moses2/FF/PointerState.h
diff --git a/contrib/moses2/FF/StatefulFeatureFunction.h b/contrib/moses2/FF/StatefulFeatureFunction.h
index 70be3ad39..fffb1eea7 100644
--- a/contrib/moses2/FF/StatefulFeatureFunction.h
+++ b/contrib/moses2/FF/StatefulFeatureFunction.h
@@ -9,7 +9,7 @@
#define STATEFULFEATUREFUNCTION_H_
#include "FeatureFunction.h"
-#include "../legacy/FFState.h"
+#include "FFState.h"
#include "../MemPool.h"
namespace Moses2
diff --git a/contrib/moses2/HypothesisBase.h b/contrib/moses2/HypothesisBase.h
index 23f5c6474..6ef4d3891 100644
--- a/contrib/moses2/HypothesisBase.h
+++ b/contrib/moses2/HypothesisBase.h
@@ -8,7 +8,7 @@
#include <iostream>
#include <cstddef>
-#include "legacy/FFState.h"
+#include "FF/FFState.h"
#include "Scores.h"
namespace Moses2
diff --git a/contrib/moses2/Jamfile b/contrib/moses2/Jamfile
index 600dd0513..ed74865ee 100644
--- a/contrib/moses2/Jamfile
+++ b/contrib/moses2/Jamfile
@@ -44,6 +44,10 @@ alias deps : ../..//z ../..//boost_iostreams ../..//boost_filesystem ../../mose
FF/LexicalReordering/PhraseBasedReorderingState.cpp
FF/LexicalReordering/ReorderingStack.cpp
+ FF/OSM/OpSequenceModel.cpp
+ FF/OSM/KenOSM.cpp
+ FF/OSM/osmHyp.cpp
+
# LM/LanguageModelDALM.cpp
LM/LanguageModel.cpp
LM/KENLM.cpp
@@ -52,6 +56,7 @@ alias deps : ../..//z ../..//boost_iostreams ../..//boost_filesystem ../../mose
TranslationModel/PhraseTable.cpp
TranslationModel/ProbingPT.cpp
+ TranslationModel/Transliteration.cpp
TranslationModel/UnknownWordPenalty.cpp
TranslationModel/Memory/PhraseTableMemory.cpp
diff --git a/contrib/moses2/LM/LanguageModel.cpp b/contrib/moses2/LM/LanguageModel.cpp
index b27b84c77..8a6fe3b39 100644
--- a/contrib/moses2/LM/LanguageModel.cpp
+++ b/contrib/moses2/LM/LanguageModel.cpp
@@ -11,9 +11,9 @@
#include "../PhraseBased/Manager.h"
#include "../PhraseBased/Hypothesis.h"
#include "../PhraseBased/TargetPhraseImpl.h"
+#include "../FF/PointerState.h"
#include "../legacy/Util2.h"
#include "../legacy/InputFileStream.h"
-#include "../legacy/PointerState.h"
#include "../legacy/Bitmap.h"
#include "../legacy/Util2.h"
diff --git a/contrib/moses2/PhraseBased/Hypothesis.h b/contrib/moses2/PhraseBased/Hypothesis.h
index 3afb17df4..7859c1d14 100644
--- a/contrib/moses2/PhraseBased/Hypothesis.h
+++ b/contrib/moses2/PhraseBased/Hypothesis.h
@@ -8,14 +8,14 @@
#include <iostream>
#include <cstddef>
-#include "../legacy/FFState.h"
+#include "../FF/FFState.h"
#include "../legacy/Bitmap.h"
+#include "../legacy/Range.h"
#include "../Scores.h"
#include "../Phrase.h"
#include "../TargetPhrase.h"
#include "../InputPathBase.h"
#include "../HypothesisBase.h"
-#include "../legacy/Range.h"
namespace Moses2
{
diff --git a/contrib/moses2/PhraseBased/Sentence.cpp b/contrib/moses2/PhraseBased/Sentence.cpp
index 2021da7d7..d0c728530 100644
--- a/contrib/moses2/PhraseBased/Sentence.cpp
+++ b/contrib/moses2/PhraseBased/Sentence.cpp
@@ -51,7 +51,7 @@ Sentence *Sentence::CreateFromStringXML(MemPool &pool, FactorCollection &vocab,
string str2 = "<xml>" + str + "</xml>";
pugi::xml_parse_result result = doc.load(str2.c_str(),
- pugi::parse_default | pugi::parse_comments);
+ pugi::parse_cdata | pugi::parse_wconv_attribute | pugi::parse_eol | pugi::parse_comments);
pugi::xml_node topNode = doc.child("xml");
std::vector<std::string> toks;
diff --git a/contrib/moses2/SCFG/Sentence.cpp b/contrib/moses2/SCFG/Sentence.cpp
index b900e6fbe..5e69a7e23 100644
--- a/contrib/moses2/SCFG/Sentence.cpp
+++ b/contrib/moses2/SCFG/Sentence.cpp
@@ -48,7 +48,7 @@ Sentence *Sentence::CreateFromStringXML(MemPool &pool, FactorCollection &vocab,
string str2 = "<xml>" + str + "</xml>";
pugi::xml_parse_result result = doc.load(str2.c_str(),
- pugi::parse_default | pugi::parse_comments);
+ pugi::parse_cdata | pugi::parse_wconv_attribute | pugi::parse_eol | pugi::parse_comments);
pugi::xml_node topNode = doc.child("xml");
std::vector<std::string> toks;
diff --git a/contrib/moses2/TranslationModel/Transliteration.cpp b/contrib/moses2/TranslationModel/Transliteration.cpp
new file mode 100644
index 000000000..f92348ee9
--- /dev/null
+++ b/contrib/moses2/TranslationModel/Transliteration.cpp
@@ -0,0 +1,229 @@
+/*
+ * Transliteration.cpp
+ *
+ * Created on: 28 Oct 2015
+ * Author: hieu
+ */
+#include <boost/foreach.hpp>
+#include "Transliteration.h"
+#include "../System.h"
+#include "../Scores.h"
+#include "../InputType.h"
+#include "../PhraseBased/Manager.h"
+#include "../PhraseBased/TargetPhraseImpl.h"
+#include "../PhraseBased/InputPath.h"
+#include "../PhraseBased/TargetPhrases.h"
+#include "../PhraseBased/Sentence.h"
+#include "../SCFG/InputPath.h"
+#include "../SCFG/TargetPhraseImpl.h"
+#include "../SCFG/Manager.h"
+#include "../SCFG/Sentence.h"
+#include "../SCFG/ActiveChart.h"
+#include "util/tempfile.hh"
+#include "../legacy/Util2.h"
+
+using namespace std;
+
+namespace Moses2
+{
+
+// Constructs the transliteration phrase table.
+// startInd and line are passed unchanged to the PhraseTable base class.
+// Throws (via UTIL_THROW_IF2) when any of the five transliteration settings
+// is still empty after the parameters have been read.
+Transliteration::Transliteration(size_t startInd, const std::string &line) :
+ PhraseTable(startInd, line)
+{
+ // Presumably parses the feature line and feeds each key/value pair to
+ // SetParameter() - TODO confirm against the base class.
+ ReadParameters();
+ // All five members below are filled by SetParameter(); none is optional.
+ UTIL_THROW_IF2(m_mosesDir.empty() ||
+ m_scriptDir.empty() ||
+ m_externalDir.empty() ||
+ m_inputLang.empty() ||
+ m_outputLang.empty(), "Must specify all arguments");
+}
+
+// Destructor with an empty body: no explicit clean-up is performed here.
+Transliteration::~Transliteration()
+{
+ // Intentionally empty.
+}
+
+// Stores one configuration key/value pair.
+// The five transliteration-specific keys ("moses-dir", "script-dir",
+// "external-dir", "input-lang", "output-lang") are copied into the matching
+// member; any other key is forwarded to PhraseTable::SetParameter().
+void
+Transliteration::
+SetParameter(const std::string& key, const std::string& value)
+{
+  // Select the member that belongs to this key, if it is one of ours.
+  std::string *slot = NULL;
+  if (key == "moses-dir") {
+    slot = &m_mosesDir;
+  } else if (key == "script-dir") {
+    slot = &m_scriptDir;
+  } else if (key == "external-dir") {
+    slot = &m_externalDir;
+  } else if (key == "input-lang") {
+    slot = &m_inputLang;
+  } else if (key == "output-lang") {
+    slot = &m_outputLang;
+  }
+
+  if (slot == NULL) {
+    // Not a transliteration option: let the base class handle it.
+    PhraseTable::SetParameter(key, value);
+    return;
+  }
+  *slot = value;
+}
+
+// Phrase-based lookup over every input path in inputPaths.
+// For each path accepted by SatisfyBackoff(), the single-path Lookup()
+// overload is run with the manager's pool and the target phrases it hands
+// back are registered on that path for this table.
+void Transliteration::Lookup(const Manager &mgr,
+    InputPathsBase &inputPaths) const
+{
+  BOOST_FOREACH(InputPathBase *pathBase, inputPaths) {
+    // The container holds base-class pointers; the phrase-based decoder
+    // works with the derived InputPath.
+    InputPath *path = static_cast<InputPath*>(pathBase);
+
+    if (SatisfyBackoff(mgr, *path)) {
+      TargetPhrases *tps = Lookup(mgr, mgr.GetPool(), *path);
+      path->AddTargetPhrases(*this, tps);
+    }
+  }
+}
+
+// Transliterates the source phrase of one input path by running the external
+// prepare-transliteration-phrase-table.pl script and returns the resulting
+// target phrases.
+//
+//  mgr        supplies the system object used for the phrase's text form and
+//             for feature-function scoring
+//  pool       memory pool in which the returned TargetPhrases is
+//             placement-constructed
+//  inputPath  path whose subPhrase is transliterated
+//
+// Returns the newly built TargetPhrases. Registering it on the input path is
+// the caller's job: the multi-path Lookup() calls AddTargetPhrases() with the
+// returned pointer, so doing it here as well would register the list twice.
+// Throws (via UTIL_THROW_IF2) when the script exits with a non-zero status.
+TargetPhrases *Transliteration::Lookup(const Manager &mgr, MemPool &pool,
+    InputPath &inputPath) const
+{
+  const SubPhrase<Moses2::Word> &sourcePhrase = inputPath.subPhrase;
+
+  // TRANSLITERATE
+  // Input file and output directory handed to the script below.
+  const util::temp_file inFile;
+  const util::temp_dir outDir;
+
+  ofstream inStream(inFile.path().c_str());
+  inStream << sourcePhrase.Debug(mgr.system) << endl;
+  inStream.close();
+
+  // NOTE(review): the arguments are concatenated without any shell quoting,
+  // so configured paths containing spaces or shell metacharacters will break
+  // (or alter) this command line.
+  string cmd = m_scriptDir + "/Transliteration/prepare-transliteration-phrase-table.pl" +
+               " --transliteration-model-dir " + m_filePath +
+               " --moses-src-dir " + m_mosesDir +
+               " --external-bin-dir " + m_externalDir +
+               " --input-extension " + m_inputLang +
+               " --output-extension " + m_outputLang +
+               " --oov-file " + inFile.path() +
+               " --out-dir " + outDir.path();
+
+  int ret = system(cmd.c_str());
+  UTIL_THROW_IF2(ret != 0, "Transliteration script error");
+
+  TargetPhrases *tps = new (pool.Allocate<TargetPhrases>()) TargetPhrases(pool, 1);
+
+  vector<TargetPhraseImpl*> targetPhrases
+    = CreateTargetPhrases(mgr, pool, sourcePhrase, outDir.path());
+
+  vector<TargetPhraseImpl*>::const_iterator iter;
+  for (iter = targetPhrases.begin(); iter != targetPhrases.end(); ++iter) {
+    TargetPhraseImpl *tp = *iter;
+    tps->AddTargetPhrase(*tp);
+  }
+  mgr.system.featureFunctions.EvaluateAfterTablePruning(pool, *tps, sourcePhrase);
+
+  // The function is declared to return the phrases; falling off the end of a
+  // non-void function is undefined behaviour and left the caller with an
+  // indeterminate pointer.
+  return tps;
+}
+
+// Reads "<outDir>/out.txt" and builds one single-word target phrase per line.
+// Every line must split into exactly two tab-separated fields, the word and
+// its score; any other shape throws via UTIL_THROW_IF2. Each phrase is
+// placement-constructed in pool, receives the score for this table, and is
+// then scored by the other feature functions through EvaluateInIsolation().
+// Returns the phrases in file order; the vector is empty when nothing could
+// be read from the file.
+std::vector<TargetPhraseImpl*> Transliteration::CreateTargetPhrases(
+    const Manager &mgr,
+    MemPool &pool,
+    const SubPhrase<Moses2::Word> &sourcePhrase,
+    const std::string &outDir) const
+{
+  const string resultFile = outDir + "/out.txt";
+  ifstream reader(resultFile.c_str());
+
+  std::vector<TargetPhraseImpl*> phrases;
+  for (string row; getline(reader, row); ) {
+    vector<string> toks = Moses2::Tokenize(row, "\t");
+    UTIL_THROW_IF2(toks.size() != 2, "Error in transliteration output file. Expecting word\tscore");
+
+    // One-word phrase living in the caller's pool.
+    TargetPhraseImpl *tp =
+      new (pool.Allocate<TargetPhraseImpl>()) TargetPhraseImpl(pool, *this, mgr.system, 1);
+    (*tp)[0].CreateFromString(mgr.system.GetVocab(), mgr.system, toks[0]);
+
+    // Second column is the score contributed by this table.
+    const float value = Scan<float>(toks[1]);
+    tp->GetScores().PlusEquals(mgr.system, *this, value);
+
+    // score of all other ff when this rule is being loaded
+    mgr.system.featureFunctions.EvaluateInIsolation(pool, mgr.system, sourcePhrase, *tp);
+
+    phrases.push_back(tp);
+  }
+  reader.close();
+
+  return phrases;
+}
+
+
+// Not supported by this table: ignores every argument and always throws via
+// UTIL_THROW2 with the message "Not implemented".
+void Transliteration::EvaluateInIsolation(const System &system,
+ const Phrase<Moses2::Word> &source, const TargetPhraseImpl &targetPhrase, Scores &scores,
+ SCORE &estimatedScore) const
+{
+ UTIL_THROW2("Not implemented");
+}
+
+// SCFG ///////////////////////////////////////////////////////////////////////////////////////////
+// SCFG entry point that this table does not support: ignores every argument
+// and always throws via UTIL_THROW2 with the message "Not implemented".
+void Transliteration::InitActiveChart(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ SCFG::InputPath &path) const
+{
+ UTIL_THROW2("Not implemented");
+}
+
+// SCFG overload of Lookup(); not supported by this table. Ignores every
+// argument and always throws via UTIL_THROW2 with "Not implemented".
+void Transliteration::Lookup(MemPool &pool,
+ const SCFG::Manager &mgr,
+ size_t maxChartSpan,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &path) const
+{
+ UTIL_THROW2("Not implemented");
+}
+
+// SCFG entry point that this table does not support: ignores every argument
+// and always throws via UTIL_THROW2 with the message "Not implemented".
+void Transliteration::LookupUnary(MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &path) const
+{
+ UTIL_THROW2("Not implemented");
+}
+
+// SCFG helper that this table does not support: ignores every argument and
+// always throws via UTIL_THROW2 with the message "Not implemented".
+void Transliteration::LookupNT(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const Moses2::Range &subPhraseRange,
+ const SCFG::InputPath &prevPath,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &outPath) const
+{
+ UTIL_THROW2("Not implemented");
+}
+
+// SCFG helper that this table does not support: ignores every argument and
+// always throws via UTIL_THROW2 with the message "Not implemented".
+void Transliteration::LookupGivenWord(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::InputPath &prevPath,
+ const SCFG::Word &wordSought,
+ const Moses2::Hypotheses *hypos,
+ const Moses2::Range &subPhraseRange,
+ SCFG::InputPath &outPath) const
+{
+ UTIL_THROW2("Not implemented");
+}
+
+// SCFG helper that this table does not support: ignores every argument and
+// always throws via UTIL_THROW2 with the message "Not implemented".
+void Transliteration::LookupGivenNode(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::ActiveChartEntry &prevEntry,
+ const SCFG::Word &wordSought,
+ const Moses2::Hypotheses *hypos,
+ const Moses2::Range &subPhraseRange,
+ SCFG::InputPath &outPath) const
+{
+ UTIL_THROW2("Not implemented");
+}
+
+}
+
diff --git a/contrib/moses2/TranslationModel/Transliteration.h b/contrib/moses2/TranslationModel/Transliteration.h
new file mode 100644
index 000000000..15f262ac8
--- /dev/null
+++ b/contrib/moses2/TranslationModel/Transliteration.h
@@ -0,0 +1,91 @@
+/*
+ * Transliteration.h
+ *
+ * Created on: 28 Oct 2015
+ * Author: hieu
+ */
+
+#pragma once
+
+#include "PhraseTable.h"
+
+namespace Moses2
+{
+class Sentence;
+class InputPaths;
+class Range;
+
+// Phrase table whose entries are produced on demand by running the external
+// Transliteration/prepare-transliteration-phrase-table.pl script on a source
+// phrase. Only the phrase-based lookups are implemented; the SCFG entry
+// points and EvaluateInIsolation() throw "Not implemented".
+class Transliteration: public PhraseTable
+{
+public:
+ // Reads the parameters and throws if moses-dir, script-dir, external-dir,
+ // input-lang or output-lang is missing.
+ Transliteration(size_t startInd, const std::string &line);
+ virtual ~Transliteration();
+
+ // Phrase-based lookup for every path in inputPaths that passes
+ // SatisfyBackoff().
+ void Lookup(const Manager &mgr, InputPathsBase &inputPaths) const;
+ // Phrase-based lookup for a single path: runs the script and builds the
+ // target phrases in pool.
+ virtual TargetPhrases *Lookup(const Manager &mgr, MemPool &pool,
+ InputPath &inputPath) const;
+
+ // Always throws "Not implemented".
+ virtual void
+ EvaluateInIsolation(const System &system, const Phrase<Moses2::Word> &source,
+ const TargetPhraseImpl &targetPhrase, Scores &scores,
+ SCORE &estimatedScore) const;
+
+ // SCFG interface - every function below always throws "Not implemented".
+ virtual void InitActiveChart(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ SCFG::InputPath &path) const;
+
+ void Lookup(MemPool &pool,
+ const SCFG::Manager &mgr,
+ size_t maxChartSpan,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &path) const;
+
+ void LookupUnary(MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &path) const;
+
+protected:
+ // SCFG helpers - each always throws "Not implemented".
+ virtual void LookupNT(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const Moses2::Range &subPhraseRange,
+ const SCFG::InputPath &prevPath,
+ const SCFG::Stacks &stacks,
+ SCFG::InputPath &outPath) const;
+
+ virtual void LookupGivenWord(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::InputPath &prevPath,
+ const SCFG::Word &wordSought,
+ const Moses2::Hypotheses *hypos,
+ const Moses2::Range &subPhraseRange,
+ SCFG::InputPath &outPath) const;
+
+ virtual void LookupGivenNode(
+ MemPool &pool,
+ const SCFG::Manager &mgr,
+ const SCFG::ActiveChartEntry &prevEntry,
+ const SCFG::Word &wordSought,
+ const Moses2::Hypotheses *hypos,
+ const Moses2::Range &subPhraseRange,
+ SCFG::InputPath &outPath) const;
+
+ // Handles the keys moses-dir, script-dir, external-dir, input-lang and
+ // output-lang; everything else goes to PhraseTable::SetParameter().
+ void SetParameter(const std::string& key, const std::string& value);
+
+protected:
+ // Passed to the script as --transliteration-model-dir.
+ // NOTE(review): nothing in the visible Transliteration code assigns this
+ // member - confirm where it is set, otherwise the script receives an empty
+ // model directory.
+ std::string m_filePath;
+ // Script settings filled by SetParameter(); all five are mandatory.
+ std::string m_mosesDir, m_scriptDir, m_externalDir, m_inputLang, m_outputLang;
+
+ // Parses "<outDir>/out.txt" (word<TAB>score per line) into single-word
+ // target phrases allocated from pool.
+ std::vector<TargetPhraseImpl*> CreateTargetPhrases(
+ const Manager &mgr,
+ MemPool &pool,
+ const SubPhrase<Moses2::Word> &sourcePhrase,
+ const std::string &outDir) const;
+
+};
+
+}
+
diff --git a/misc/1-1-Extraction.cpp b/misc/1-1-Extraction.cpp
index cf3817abf..cea1f3cb7 100644
--- a/misc/1-1-Extraction.cpp
+++ b/misc/1-1-Extraction.cpp
@@ -216,7 +216,9 @@ int main(int argc, char * argv[])
getWords(f[i],currF);
getWords(a[i],currA);
- cerr<<"Processing "<<i<<endl;
+ if (i % 100000 == 0) {
+ cerr<<"Processing "<<i<<endl;
+ }
constructCepts(ceptsInPhrase, sourceNullWords , targetNullWords, currA , currE.size(), currF.size());
getOneToOne(ceptsInPhrase , currF , currE, one);
diff --git a/regression-testing/Jamfile b/regression-testing/Jamfile
index 68b9ebd39..17e399e43 100644
--- a/regression-testing/Jamfile
+++ b/regression-testing/Jamfile
@@ -37,9 +37,11 @@ if $(with-regtest) {
if $(skip-compact) {
reg_test phrase : [ glob $(test-dir)/phrase.* : $(test-dir)/*withDALM $(test-dir)/*compactptable ] : ../moses-cmd//moses : @reg_test_decode ;
reg_test chart : [ glob $(test-dir)/chart.* : $(test-dir)/*withDALM $(test-dir)/*compactptable ] : ../moses-cmd//moses : @reg_test_decode ;
+ reg_test moses2 : [ glob $(test-dir)/moses2.* : $(test-dir)/*withDALM $(test-dir)/*compactptable ] : ../moses-cmd//moses2 : @reg_test_decode ;
} else {
reg_test phrase : [ glob $(test-dir)/phrase.* : $(test-dir)/*withDALM ] : ../moses-cmd//moses : @reg_test_decode ;
reg_test chart : [ glob $(test-dir)/chart.* : $(test-dir)/*withDALM ] : ../moses-cmd//moses : @reg_test_decode ;
+ reg_test moses2 : [ glob $(test-dir)/moses2.* : $(test-dir)/*withDALM ] : ../contrib/moses2//moses2 : @reg_test_decode ;
}
if [ option.get "with-dalm" : : "yes" ] {
diff --git a/scripts/ems/example/config.factored b/scripts/ems/example/config.factored
index 7e1004db6..6344c9714 100644
--- a/scripts/ems/example/config.factored
+++ b/scripts/ems/example/config.factored
@@ -414,7 +414,7 @@ alignment-symmetrization-method = grow-diag-final-and
#
#operation-sequence-model = "yes"
#operation-sequence-model-order = 5
-#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40% -T $working-dir/model/tmp'"
+#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40%'"
#
# OR if you want to use with SRILM
#
diff --git a/scripts/ems/example/config.hierarchical b/scripts/ems/example/config.hierarchical
index 3d00ffd79..88c36c430 100644
--- a/scripts/ems/example/config.hierarchical
+++ b/scripts/ems/example/config.hierarchical
@@ -397,7 +397,7 @@ alignment-symmetrization-method = grow-diag-final-and
#
#operation-sequence-model = "yes"
#operation-sequence-model-order = 5
-#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40% -T $working-dir/model/tmp'"
+#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40%'"
#
# if OSM training should be skipped, point to OSM Model
#osm-model =
diff --git a/scripts/ems/example/config.syntax b/scripts/ems/example/config.syntax
index bdbd2b4e0..8b20df1e2 100644
--- a/scripts/ems/example/config.syntax
+++ b/scripts/ems/example/config.syntax
@@ -401,7 +401,7 @@ alignment-symmetrization-method = grow-diag-final-and
#
#operation-sequence-model = "yes"
#operation-sequence-model-order = 5
-#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40% -T $working-dir/model/tmp'"
+#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40%'"
#
# if OSM training should be skipped, point to OSM Model
#osm-model =
diff --git a/scripts/ems/example/config.toy b/scripts/ems/example/config.toy
index 6667a9744..748fd0cd0 100644
--- a/scripts/ems/example/config.toy
+++ b/scripts/ems/example/config.toy
@@ -378,7 +378,7 @@ alignment-symmetrization-method = grow-diag-final-and
#
#operation-sequence-model = "yes"
#operation-sequence-model-order = 5
-#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40% -T $working-dir/model/tmp'"
+#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40%'"
#
# OR if you want to use with SRILM
#
diff --git a/scripts/ems/example/config.toy.bilinguallm b/scripts/ems/example/config.toy.bilinguallm
index 9bf94613f..3e64947fc 100644
--- a/scripts/ems/example/config.toy.bilinguallm
+++ b/scripts/ems/example/config.toy.bilinguallm
@@ -394,7 +394,7 @@ alignment-symmetrization-method = grow-diag-final-and
#
#operation-sequence-model = "yes"
#operation-sequence-model-order = 5
-#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40% -T $working-dir/model/tmp'"
+#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40%'"
#
# OR if you want to use with SRILM
#
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index e52c82319..23e771e8b 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -2315,7 +2315,7 @@ sub define_training_build_transliteration_model {
my $sym_method = &check_and_get("TRAINING:alignment-symmetrization-method");
my $moses_src_dir = &check_and_get("GENERAL:moses-src-dir");
my $external_bin_dir = &check_and_get("GENERAL:external-bin-dir");
- my $srilm_dir = &check_and_get("TRAINING:srilm-dir");
+ my $srilm_dir = &check_backoff_and_get("TRAINING:srilm-dir");
my $decoder = &get("TRAINING:transliteration-decoder");
my $cmd = "$moses_script_dir/Transliteration/train-transliteration-module.pl";
diff --git a/scripts/generic/binarize4moses2.perl b/scripts/generic/binarize4moses2.perl
index 0865b9f66..5b9f08e50 100755
--- a/scripts/generic/binarize4moses2.perl
+++ b/scripts/generic/binarize4moses2.perl
@@ -12,12 +12,14 @@ my $mosesDir = "$RealBin/../..";
my $ptPath;
my $lexRoPath;
my $outPath;
+my $numScores = 4;
my $numLexScores;
my $pruneNum = 0;
GetOptions("phrase-table=s" => \$ptPath,
"lex-ro=s" => \$lexRoPath,
"output-dir=s" => \$outPath,
+ "num-scores=i" => \$numScores,    # a count: validate as integer like num-lex-scores/prune before it reaches the shell command
"num-lex-scores=i" => \$numLexScores,
"prune=i" => \$pruneNum
) or exit 1;
@@ -41,7 +43,7 @@ systemCheck($cmd);
$cmd = "$mosesDir/bin/addLexROtoPT $tempPath/pt.gz $tempPath/lex-ro.minlexr | gzip -c > $tempPath/pt.withLexRO.gz";
systemCheck($cmd);
-$cmd = "$mosesDir/bin/CreateProbingPT2 --num-lex-scores $numLexScores --log-prob --input-pt $tempPath/pt.withLexRO.gz --output-dir $outPath";
+$cmd = "$mosesDir/bin/CreateProbingPT2 --num-scores $numScores --num-lex-scores $numLexScores --log-prob --input-pt $tempPath/pt.withLexRO.gz --output-dir $outPath";
systemCheck($cmd);
exit(0);