diff options
-rw-r--r-- | mert/BleuDocScorer.cpp | 79 | ||||
-rw-r--r-- | mert/BleuDocScorer.h | 2 | ||||
-rw-r--r-- | mert/BleuScorer.h | 4 | ||||
-rw-r--r-- | misc/queryPhraseTableMin.cpp | 6 | ||||
-rw-r--r-- | moses/FF/OSM-Feature/OpSequenceModel.cpp | 246 | ||||
-rw-r--r-- | moses/FF/OSM-Feature/OpSequenceModel.h | 49 | ||||
-rw-r--r-- | moses/FF/OSM-Feature/osmHyp.cpp | 898 | ||||
-rw-r--r-- | moses/FF/OSM-Feature/osmHyp.h | 115 | ||||
-rw-r--r-- | moses/LM/Ken.cpp | 2 | ||||
-rw-r--r-- | moses/StaticData.cpp | 6 | ||||
-rw-r--r-- | moses/TranslationModel/CompactPT/PhraseDecoder.cpp | 9 | ||||
-rw-r--r-- | moses/TranslationModel/CompactPT/PhraseDecoder.h | 5 | ||||
-rw-r--r-- | moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp | 5 | ||||
-rw-r--r-- | moses/TranslationModel/CompactPT/PhraseTableCreator.cpp | 36 | ||||
-rw-r--r-- | moses/Word.h | 3 |
15 files changed, 715 insertions, 750 deletions
diff --git a/mert/BleuDocScorer.cpp b/mert/BleuDocScorer.cpp index 558757cef..b96a6bc48 100644 --- a/mert/BleuDocScorer.cpp +++ b/mert/BleuDocScorer.cpp @@ -31,11 +31,11 @@ const char REFLEN_CLOSEST[] = "closest"; namespace MosesTuning { - + BleuDocScorer::BleuDocScorer(const string& config) - : BleuScorer("BLEUDOC", config), - m_ref_length_type(CLOSEST) + : BleuScorer("BLEUDOC", config), + m_ref_length_type(CLOSEST) { const string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST); if (reflen == REFLEN_AVERAGE) { @@ -63,41 +63,40 @@ bool BleuDocScorer::OpenReferenceStream(istream* is, size_t file_id) if (line.find("<doc docid") != std::string::npos) { // new document doc_id++; - m_references.push_back(new ScopedVector<Reference>()); + m_references.push_back(new ScopedVector<Reference>()); sid = 0; - } - else if (line.find("<seg") != std::string::npos) { //new sentence + } else if (line.find("<seg") != std::string::npos) { //new sentence int start = line.find_first_of('>') + 1; std::string trans = line.substr(start, line.find_last_of('<')-start); trans = preprocessSentence(trans); if (file_id == 0) { - Reference* ref = new Reference; - m_references[doc_id]->push_back(ref); // Take ownership of the Reference object. + Reference* ref = new Reference; + m_references[doc_id]->push_back(ref); // Take ownership of the Reference object. } if (m_references[doc_id]->size() <= sid) { - return false; + return false; } NgramCounts counts; size_t length = CountNgrams(trans, counts, kBleuNgramOrder); - + //for any counts larger than those already there, merge them in for (NgramCounts::const_iterator ci = counts.begin(); ci != counts.end(); ++ci) { - const NgramCounts::Key& ngram = ci->first; - const NgramCounts::Value newcount = ci->second; - - NgramCounts::Value oldcount = 0; - m_references[doc_id]->get().at(sid)->get_counts()->Lookup(ngram, &oldcount); - if (newcount > oldcount) { - m_references[doc_id]->get().at(sid)->get_counts()->operator[](ngram) = newcount; - } + const NgramCounts::Key& ngram = ci->first; + const NgramCounts::Value newcount = ci->second; + + NgramCounts::Value oldcount = 0; + m_references[doc_id]->get().at(sid)->get_counts()->Lookup(ngram, &oldcount); + if (newcount > oldcount) { + m_references[doc_id]->get().at(sid)->get_counts()->operator[](ngram) = newcount; + } } //add in the length - m_references[doc_id]->get().at(sid)->push_back(length); + m_references[doc_id]->get().at(sid)->push_back(length); if (sid > 0 && sid % 100 == 0) { - TRACE_ERR("."); + TRACE_ERR("."); } ++sid; } @@ -127,14 +126,14 @@ void BleuDocScorer::prepareStats(size_t sid, const string& text, ScoreStats& ent //precision on each ngram type for (NgramCounts::const_iterator testcounts_it = testcounts.begin(); - testcounts_it != testcounts.end(); ++testcounts_it) { + testcounts_it != testcounts.end(); ++testcounts_it) { const NgramCounts::Value guess = testcounts_it->second; const size_t len = testcounts_it->first.size(); NgramCounts::Value correct = 0; - + NgramCounts::Value v = 0; if (m_references[sid]->get().at(i)->get_counts()->Lookup(testcounts_it->first, &v)) { - correct = min(v, guess); + correct = min(v, guess); } stats[len * 2 - 2] += correct; stats[len * 2 - 1] += guess; @@ -143,13 +142,13 @@ void BleuDocScorer::prepareStats(size_t sid, const string& text, ScoreStats& ent const int reference_len = CalcReferenceLength(sid, i, length); stats.push_back(reference_len); - //ADD stats to totStats - std::transform(stats.begin(), stats.end(), totStats.begin(), - totStats.begin(), std::plus<int>()); + //ADD stats to totStats + std::transform(stats.begin(), stats.end(), totStats.begin(), + totStats.begin(), std::plus<int>()); } - entry.set(totStats); + entry.set(totStats); } - + std::vector<std::string> BleuDocScorer::splitDoc(const std::string& text) { std::vector<std::string> res; @@ -188,18 +187,18 @@ statscore_t BleuDocScorer::calculateScore(const vector<int>& comps) const int BleuDocScorer::CalcReferenceLength(size_t doc_id, size_t sentence_id, size_t length) { switch (m_ref_length_type) { - case AVERAGE: - return m_references[doc_id]->get().at(sentence_id)->CalcAverage(); - break; - case CLOSEST: - return m_references[doc_id]->get().at(sentence_id)->CalcClosest(length); - break; - case SHORTEST: - return m_references[doc_id]->get().at(sentence_id)->CalcShortest(); - break; - default: - cerr << "unknown reference types." << endl; - exit(1); + case AVERAGE: + return m_references[doc_id]->get().at(sentence_id)->CalcAverage(); + break; + case CLOSEST: + return m_references[doc_id]->get().at(sentence_id)->CalcClosest(length); + break; + case SHORTEST: + return m_references[doc_id]->get().at(sentence_id)->CalcShortest(); + break; + default: + cerr << "unknown reference types." << endl; + exit(1); } } diff --git a/mert/BleuDocScorer.h b/mert/BleuDocScorer.h index 349745825..9677410f8 100644 --- a/mert/BleuDocScorer.h +++ b/mert/BleuDocScorer.h @@ -29,7 +29,7 @@ public: virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry); virtual statscore_t calculateScore(const std::vector<int>& comps) const; - int CalcReferenceLength(std::size_t doc_id, std::size_t sentence_id, std::size_t length); + int CalcReferenceLength(std::size_t doc_id, std::size_t sentence_id, std::size_t length); // NOTE: this function is used for unit testing. virtual bool OpenReferenceStream(std::istream* is, std::size_t file_id); diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index 92d7fb9d5..8be567574 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -67,7 +67,7 @@ public: // NOTE: this function is used for unit testing. virtual bool OpenReferenceStream(std::istream* is, std::size_t file_id); - //private: + //private: protected: ReferenceLengthType m_ref_length_type; @@ -76,7 +76,7 @@ protected: // constructor used by subclasses BleuScorer(const std::string& name, const std::string& config): StatisticsBasedScorer(name,config) {} - + // no copying allowed BleuScorer(const BleuScorer&); BleuScorer& operator=(const BleuScorer&); diff --git a/misc/queryPhraseTableMin.cpp b/misc/queryPhraseTableMin.cpp index 57747d8f2..0b4324020 100644 --- a/misc/queryPhraseTableMin.cpp +++ b/misc/queryPhraseTableMin.cpp @@ -51,12 +51,12 @@ int main(int argc, char **argv) const_cast<std::vector<std::string>&>(parameter->GetParam("factor-delimiter")).resize(1, "||dummy_string||"); const_cast<std::vector<std::string>&>(parameter->GetParam("input-factors")).resize(1, "0"); const_cast<std::vector<std::string>&>(parameter->GetParam("verbose")).resize(1, "0"); - const_cast<std::vector<std::string>&>(parameter->GetParam("weight-w")).resize(1, "0"); - const_cast<std::vector<std::string>&>(parameter->GetParam("weight-d")).resize(1, "0"); + //const_cast<std::vector<std::string>&>(parameter->GetParam("weight-w")).resize(1, "0"); + //const_cast<std::vector<std::string>&>(parameter->GetParam("weight-d")).resize(1, "0"); StaticData::InstanceNonConst().LoadData(parameter); - PhraseDictionaryCompact pdc("input-factor=0 output-factor=0 num-features=5 path=" + ttable); + PhraseDictionaryCompact pdc("PhraseDictionaryCompact input-factor=0 output-factor=0 num-features=5 path=" + ttable); pdc.Load(); std::string line; diff --git a/moses/FF/OSM-Feature/OpSequenceModel.cpp b/moses/FF/OSM-Feature/OpSequenceModel.cpp index 705763a35..fa8007156 100644 --- a/moses/FF/OSM-Feature/OpSequenceModel.cpp +++ b/moses/FF/OSM-Feature/OpSequenceModel.cpp @@ -11,7 +11,7 @@ namespace Moses { OpSequenceModel::OpSequenceModel(const std::string &line) -:StatefulFeatureFunction("OpSequenceModel", 5, line ) + :StatefulFeatureFunction("OpSequenceModel", 5, line ) { ReadParameters(); } @@ -19,29 +19,29 @@ OpSequenceModel::OpSequenceModel(const std::string &line) void OpSequenceModel :: readLanguageModel(const char *lmFile) { - string unkOp = "_TRANS_SLF_"; + string unkOp = "_TRANS_SLF_"; - - /* - // Code for SRILM + /* + + // Code for SRILM - vector <int> numbers; + vector <int> numbers; int nonWordFlag = 0; - - ptrOp = new Api; - ptrOp -> read_lm(lmFile,lmOrder); - numbers.push_back(ptrOp->getLMID(const_cast <char *> (unkOp.c_str()))); - unkOpProb = ptrOp->contextProbN(numbers,nonWordFlag); - - */ - - // Code to load KenLM - - OSM = new Model(m_lmPath.c_str()); - State startState = OSM->NullContextState(); - State endState; - unkOpProb = OSM->Score(startState,OSM->GetVocabulary().Index(unkOp),endState); + + ptrOp = new Api; + ptrOp -> read_lm(lmFile,lmOrder); + numbers.push_back(ptrOp->getLMID(const_cast <char *> (unkOp.c_str()))); + unkOpProb = ptrOp->contextProbN(numbers,nonWordFlag); + + */ + + // Code to load KenLM + + OSM = new Model(m_lmPath.c_str()); + State startState = OSM->NullContextState(); + State endState; + unkOpProb = OSM->Score(startState,OSM->GetVocabulary().Index(unkOp),endState); } @@ -85,58 +85,55 @@ void OpSequenceModel::Load() void OpSequenceModel:: Evaluate(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const { - osmHypothesis obj; - obj.setState(OSM->NullContextState()); - WordsBitmap myBitmap(source.GetSize()); - vector <string> mySourcePhrase; - vector <string> myTargetPhrase; - vector<float> scores(5); - vector <int> alignments; - int startIndex = 0; - int endIndex = source.GetSize(); - - const AlignmentInfo &align = targetPhrase.GetAlignTerm(); - AlignmentInfo::const_iterator iter; - - - for (iter = align.begin(); iter != align.end(); ++iter) - { - alignments.push_back(iter->first); - alignments.push_back(iter->second); - } - - for (int i = 0; i < targetPhrase.GetSize(); i++) - { - if (targetPhrase.GetWord(i).IsOOV()) - myTargetPhrase.push_back("_TRANS_SLF_"); - else - myTargetPhrase.push_back(targetPhrase.GetWord(i).GetFactor(0)->GetString().as_string()); - } - - for (int i = 0; i < source.GetSize(); i++) - { - mySourcePhrase.push_back(source.GetWord(i).GetFactor(0)->GetString().as_string()); - } - - obj.setPhrases(mySourcePhrase , myTargetPhrase); - obj.constructCepts(alignments,startIndex,endIndex-1,targetPhrase.GetSize()); - obj.computeOSMFeature(startIndex,myBitmap); - obj.calculateOSMProb(*OSM); - obj.populateScores(scores); - estimatedFutureScore.PlusEquals(this, scores); + osmHypothesis obj; + obj.setState(OSM->NullContextState()); + WordsBitmap myBitmap(source.GetSize()); + vector <string> mySourcePhrase; + vector <string> myTargetPhrase; + vector<float> scores(5); + vector <int> alignments; + int startIndex = 0; + int endIndex = source.GetSize(); + + const AlignmentInfo &align = targetPhrase.GetAlignTerm(); + AlignmentInfo::const_iterator iter; + + + for (iter = align.begin(); iter != align.end(); ++iter) { + alignments.push_back(iter->first); + alignments.push_back(iter->second); + } + + for (int i = 0; i < targetPhrase.GetSize(); i++) { + if (targetPhrase.GetWord(i).IsOOV()) + myTargetPhrase.push_back("_TRANS_SLF_"); + else + myTargetPhrase.push_back(targetPhrase.GetWord(i).GetFactor(0)->GetString().as_string()); + } + + for (int i = 0; i < source.GetSize(); i++) { + mySourcePhrase.push_back(source.GetWord(i).GetFactor(0)->GetString().as_string()); + } + + obj.setPhrases(mySourcePhrase , myTargetPhrase); + obj.constructCepts(alignments,startIndex,endIndex-1,targetPhrase.GetSize()); + obj.computeOSMFeature(startIndex,myBitmap); + obj.calculateOSMProb(*OSM); + obj.populateScores(scores); + estimatedFutureScore.PlusEquals(this, scores); } FFState* OpSequenceModel::Evaluate( - const Hypothesis& cur_hypo, - const FFState* prev_state, - ScoreComponentCollection* accumulator) const + const Hypothesis& cur_hypo, + const FFState* prev_state, + ScoreComponentCollection* accumulator) const { const TargetPhrase &target = cur_hypo.GetCurrTargetPhrase(); const WordsBitmap &bitmap = cur_hypo.GetWordsBitmap(); @@ -159,83 +156,81 @@ FFState* OpSequenceModel::Evaluate( //cerr << source <<endl; - // int a = sourceRange.GetStartPos(); - // cerr << source.GetWord(a); +// int a = sourceRange.GetStartPos(); +// cerr << source.GetWord(a); //cerr <<a<<endl; //const Sentence &sentence = static_cast<const Sentence&>(curr_hypo.GetManager().GetSource()); - const WordsRange & sourceRange = cur_hypo.GetCurrSourceWordsRange(); - int startIndex = sourceRange.GetStartPos(); - int endIndex = sourceRange.GetEndPos(); - const AlignmentInfo &align = cur_hypo.GetCurrTargetPhrase().GetAlignTerm(); - osmState * statePtr; + const WordsRange & sourceRange = cur_hypo.GetCurrSourceWordsRange(); + int startIndex = sourceRange.GetStartPos(); + int endIndex = sourceRange.GetEndPos(); + const AlignmentInfo &align = cur_hypo.GetCurrTargetPhrase().GetAlignTerm(); + osmState * statePtr; - vector <int> alignments; + vector <int> alignments; - AlignmentInfo::const_iterator iter; + AlignmentInfo::const_iterator iter; - for (iter = align.begin(); iter != align.end(); ++iter) { - //cerr << iter->first << "----" << iter->second << " "; - alignments.push_back(iter->first); - alignments.push_back(iter->second); - } + for (iter = align.begin(); iter != align.end(); ++iter) { + //cerr << iter->first << "----" << iter->second << " "; + alignments.push_back(iter->first); + alignments.push_back(iter->second); + } - //cerr<<bitmap<<endl; - //cerr<<startIndex<<" "<<endIndex<<endl; + //cerr<<bitmap<<endl; + //cerr<<startIndex<<" "<<endIndex<<endl; - for (int i = startIndex; i <= endIndex; i++) - { - myBitmap.SetValue(i,0); // resetting coverage of this phrase ... - mySourcePhrase.push_back(source.GetWord(i).GetFactor(0)->GetString().as_string()); - // cerr<<mySourcePhrase[i]<<endl; + for (int i = startIndex; i <= endIndex; i++) { + myBitmap.SetValue(i,0); // resetting coverage of this phrase ... + mySourcePhrase.push_back(source.GetWord(i).GetFactor(0)->GetString().as_string()); + // cerr<<mySourcePhrase[i]<<endl; } - for (int i = 0; i < target.GetSize(); i++) - { + for (int i = 0; i < target.GetSize(); i++) { - if (target.GetWord(i).IsOOV()) - myTargetPhrase.push_back("_TRANS_SLF_"); - else - myTargetPhrase.push_back(target.GetWord(i).GetFactor(0)->GetString().as_string()); + if (target.GetWord(i).IsOOV()) + myTargetPhrase.push_back("_TRANS_SLF_"); + else + myTargetPhrase.push_back(target.GetWord(i).GetFactor(0)->GetString().as_string()); } - + //cerr<<myBitmap<<endl; obj.setState(prev_state); obj.constructCepts(alignments,startIndex,endIndex,target.GetSize()); obj.setPhrases(mySourcePhrase , myTargetPhrase); - obj.computeOSMFeature(startIndex,myBitmap); + obj.computeOSMFeature(startIndex,myBitmap); obj.calculateOSMProb(*OSM); obj.populateScores(scores); -/* - if (bitmap.GetFirstGapPos() == NOT_FOUND) - { + /* + if (bitmap.GetFirstGapPos() == NOT_FOUND) + { - int xx; - cerr<<bitmap<<endl; - int a = bitmap.GetFirstGapPos(); - obj.print(); - cin>>xx; - } - */ + int xx; + cerr<<bitmap<<endl; + int a = bitmap.GetFirstGapPos(); + obj.print(); + cin>>xx; + } + */ -/* - vector<float> scores(5); - scores[0] = 0.343423f; - scores[1] = 1.343423f; - scores[2] = 2.343423f; - scores[3] = 3.343423f; - scores[4] = 4.343423f; - */ + /* + vector<float> scores(5); + scores[0] = 0.343423f; + scores[1] = 1.343423f; + scores[2] = 2.343423f; + scores[3] = 3.343423f; + scores[4] = 4.343423f; + */ accumulator->PlusEquals(this, scores); @@ -245,7 +240,7 @@ FFState* OpSequenceModel::Evaluate( //return statePtr; - // return NULL; +// return NULL; } FFState* OpSequenceModel::EvaluateChart( @@ -276,29 +271,28 @@ std::vector<float> OpSequenceModel::GetFutureScores(const Phrase &source, const ParallelPhrase pp(source, target); std::map<ParallelPhrase, Scores>::const_iterator iter; iter = m_futureCost.find(pp); - //iter = m_coll.find(pp); +//iter = m_coll.find(pp); if (iter == m_futureCost.end()) { vector<float> scores(5, 0); scores[0] = unkOpProb; return scores; - } - else { + } else { const vector<float> &scores = iter->second; - return scores; + return scores; } } void OpSequenceModel::SetParameter(const std::string& key, const std::string& value) { - if (key == "feature-path") { - m_featurePath = value; - } else if (key == "path") { - m_lmPath = value; - } else if (key == "order") { - lmOrder = Scan<int>(value); - } else { - StatefulFeatureFunction::SetParameter(key, value); - } + if (key == "feature-path") { + m_featurePath = value; + } else if (key == "path") { + m_lmPath = value; + } else if (key == "order") { + lmOrder = Scan<int>(value); + } else { + StatefulFeatureFunction::SetParameter(key, value); + } } } // namespace diff --git a/moses/FF/OSM-Feature/OpSequenceModel.h b/moses/FF/OSM-Feature/OpSequenceModel.h index 1e32bd6a1..fe9cef0bd 100644 --- a/moses/FF/OSM-Feature/OpSequenceModel.h +++ b/moses/FF/OSM-Feature/OpSequenceModel.h @@ -16,26 +16,26 @@ class OpSequenceModel : public StatefulFeatureFunction { public: - - lm::ngram::Model * OSM; - - int lmOrder; - float unkOpProb; - OpSequenceModel(const std::string &line); + lm::ngram::Model * OSM; - void readLanguageModel(const char *); - void Load(); + int lmOrder; + float unkOpProb; - FFState* Evaluate( - const Hypothesis& cur_hypo, - const FFState* prev_state, - ScoreComponentCollection* accumulator) const; + OpSequenceModel(const std::string &line); - void Evaluate(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const; + void readLanguageModel(const char *); + void Load(); + + FFState* Evaluate( + const Hypothesis& cur_hypo, + const FFState* prev_state, + ScoreComponentCollection* accumulator) const; + + void Evaluate(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const; virtual FFState* EvaluateChart( const ChartHypothesis& /* cur_hypo */, @@ -49,17 +49,18 @@ public: std::vector<float> GetFutureScores(const Phrase &source, const Phrase &target) const; void SetParameter(const std::string& key, const std::string& value); - bool IsUseable(const FactorMask &mask) const - { return true; } + bool IsUseable(const FactorMask &mask) const { + return true; + } protected: - typedef std::pair<Phrase, Phrase> ParallelPhrase; - typedef std::vector<float> Scores; - std::map<ParallelPhrase, Scores> m_futureCost; + typedef std::pair<Phrase, Phrase> ParallelPhrase; + typedef std::vector<float> Scores; + std::map<ParallelPhrase, Scores> m_futureCost; - std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase; - std::set <int> targetNullWords; - std::string m_featurePath, m_lmPath; + std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase; + std::set <int> targetNullWords; + std::string m_featurePath, m_lmPath; diff --git a/moses/FF/OSM-Feature/osmHyp.cpp b/moses/FF/OSM-Feature/osmHyp.cpp index 555bbb00b..5ef80211c 100644 --- a/moses/FF/OSM-Feature/osmHyp.cpp +++ b/moses/FF/OSM-Feature/osmHyp.cpp @@ -1,4 +1,4 @@ - #include "osmHyp.h" +#include "osmHyp.h" #include <sstream> using namespace std; @@ -7,19 +7,19 @@ using namespace lm::ngram; namespace Moses { osmState::osmState(const State & val) -:j(0) -,E(0) + :j(0) + ,E(0) { lmState = val; - + } void osmState::saveState(int jVal, int eVal, map <int , string> & gapVal) { - gap.clear(); - gap = gapVal; - j = jVal; - E = eVal; + gap.clear(); + gap = gapVal; + j = jVal; + E = eVal; } int osmState::Compare(const FFState& otherBase) const @@ -33,7 +33,7 @@ int osmState::Compare(const FFState& otherBase) const return (gap < other.gap) ? -1 : +1; if (lmState.length < other.lmState.length) return -1; - + if (lmState.length > other.lmState.length) return 1; return 0; @@ -43,606 +43,552 @@ int osmState::Compare(const FFState& otherBase) const std::string osmState :: getName() const { - return "done"; + return "done"; } ////////////////////////////////////////////////// osmHypothesis :: osmHypothesis() { - opProb = 0; - gapWidth = 0; - gapCount = 0; - openGapCount = 0; - deletionCount = 0; - gapCount = 0; - j = 0; - E = 0; - gap.clear(); + opProb = 0; + gapWidth = 0; + gapCount = 0; + openGapCount = 0; + deletionCount = 0; + gapCount = 0; + j = 0; + E = 0; + gap.clear(); } void osmHypothesis :: setState(const FFState* prev_state) { - if(prev_state != NULL) - { + if(prev_state != NULL) { - j = static_cast <const osmState *> (prev_state)->getJ(); - E = static_cast <const osmState *> (prev_state)->getE(); - gap = static_cast <const osmState *> (prev_state)->getGap(); - lmState = static_cast <const osmState *> (prev_state)->getLMState(); - } + j = static_cast <const osmState *> (prev_state)->getJ(); + E = static_cast <const osmState *> (prev_state)->getE(); + gap = static_cast <const osmState *> (prev_state)->getGap(); + lmState = static_cast <const osmState *> (prev_state)->getLMState(); + } } osmState * osmHypothesis :: saveState() { - osmState * statePtr = new osmState(lmState); - statePtr->saveState(j,E,gap); - return statePtr; + osmState * statePtr = new osmState(lmState); + statePtr->saveState(j,E,gap); + return statePtr; } int osmHypothesis :: isTranslationOperation(int x) { - if (operations[x].find("_JMP_BCK_") != -1) - return 0; - - if (operations[x].find("_JMP_FWD_") != -1) - return 0; - - if (operations[x].find("_CONT_CEPT_") != -1) - return 0; - - if (operations[x].find("_INS_GAP_") != -1) - return 0; - - return 1; - + if (operations[x].find("_JMP_BCK_") != -1) + return 0; + + if (operations[x].find("_JMP_FWD_") != -1) + return 0; + + if (operations[x].find("_CONT_CEPT_") != -1) + return 0; + + if (operations[x].find("_INS_GAP_") != -1) + return 0; + + return 1; + } void osmHypothesis :: removeReorderingOperations() { - gapCount = 0; - deletionCount = 0; - openGapCount = 0; - gapWidth = 0; - //cout<<"I came here"<<endl; - - std::vector <std::string> tupleSequence; - - for (int x = 0; x < operations.size(); x++) - { - // cout<<operations[x]<<endl; - - if(isTranslationOperation(x) == 1) - { - tupleSequence.push_back(operations[x]); - } - - } - - operations.clear(); - operations = tupleSequence; + gapCount = 0; + deletionCount = 0; + openGapCount = 0; + gapWidth = 0; + //cout<<"I came here"<<endl; + + std::vector <std::string> tupleSequence; + + for (int x = 0; x < operations.size(); x++) { + // cout<<operations[x]<<endl; + + if(isTranslationOperation(x) == 1) { + tupleSequence.push_back(operations[x]); + } + + } + + operations.clear(); + operations = tupleSequence; } void osmHypothesis :: calculateOSMProb(Model & ptrOp) { - - opProb = 0; - State currState = lmState; - State temp; - for (int i = 0; i<operations.size(); i++) - { - temp = currState; - opProb += ptrOp.Score(temp,ptrOp.GetVocabulary().Index(operations[i]),currState); - } + opProb = 0; + State currState = lmState; + State temp; + + for (int i = 0; i<operations.size(); i++) { + temp = currState; + opProb += ptrOp.Score(temp,ptrOp.GetVocabulary().Index(operations[i]),currState); + } - lmState = currState; + lmState = currState; - //print(); + //print(); } int osmHypothesis :: firstOpenGap(vector <int> & coverageVector) { - - int firstOG =-1; - - for(int nd = 0; nd < coverageVector.size(); nd++) - { - if(coverageVector[nd]==0) - { - firstOG = nd; - return firstOG; - } - } - - return firstOG; + + int firstOG =-1; + + for(int nd = 0; nd < coverageVector.size(); nd++) { + if(coverageVector[nd]==0) { + firstOG = nd; + return firstOG; + } + } + + return firstOG; } string osmHypothesis :: intToString(int num) { - - std::ostringstream stm; - stm<<num; - return stm.str(); + std::ostringstream stm; + stm<<num; + + return stm.str(); } void osmHypothesis :: generateOperations(int & startIndex , int j1 , int contFlag , WordsBitmap & coverageVector , string english , string german , set <int> & targetNullWords , vector <string> & currF) { - - int gFlag = 0; - int gp = 0; - int ans; - - - if ( j < j1) // j1 is the index of the source word we are about to generate ... - { - //if(coverageVector[j]==0) // if source word at j is not generated yet ... - if(coverageVector.GetValue(j)==0) // if source word at j is not generated yet ... - { - operations.push_back("_INS_GAP_"); - gFlag++; - gap[j]="Unfilled"; - } - if (j == E) - { - j = j1; - } - else - { - operations.push_back("_JMP_FWD_"); - j=E; - } - } - - if (j1 < j) - { - // if(j < E && coverageVector[j]==0) - if(j < E && coverageVector.GetValue(j)==0) - { - operations.push_back("_INS_GAP_"); - gFlag++; - gap[j]="Unfilled"; - } - - j=closestGap(gap,j1,gp); - operations.push_back("_JMP_BCK_"+ intToString(gp)); - - //cout<<"I am j "<<j<<endl; - //cout<<"I am j1 "<<j1<<endl; - - if(j==j1) - gap[j]="Filled"; - } - - if (j < j1) - { - operations.push_back("_INS_GAP_"); - gap[j] = "Unfilled"; - gFlag++; - j=j1; - } - - if(contFlag == 0) // First words of the multi-word cept ... - { - - if(english == "_TRANS_SLF_") // Unknown word ... - { - operations.push_back("_TRANS_SLF_"); - } - else - { - operations.push_back("_TRANS_" + english + "_TO_" + german); - } - - //ans = firstOpenGap(coverageVector); - ans = coverageVector.GetFirstGapPos(); - - if (ans != -1) - gapWidth += j - ans; - - } - else if (contFlag == 2) - { - - operations.push_back("_INS_" + german); - ans = coverageVector.GetFirstGapPos(); - - if (ans != -1) - gapWidth += j - ans; - deletionCount++; - } - else - { - operations.push_back("_CONT_CEPT_"); - } - - //coverageVector[j]=1; - coverageVector.SetValue(j,1); - j+=1; - - if(E<j) - E=j; - - if (gFlag > 0) - gapCount++; - - openGapCount += getOpenGaps(); - - //if (coverageVector[j] == 0 && targetNullWords.find(j) != targetNullWords.end()) - if (coverageVector.GetValue(j) == 0 && targetNullWords.find(j) != targetNullWords.end()) - { - j1 = j; - german = currF[j1-startIndex]; - english = "_INS_"; - generateOperations(startIndex, j1, 2 , coverageVector , english , german , targetNullWords , currF); - } + + int gFlag = 0; + int gp = 0; + int ans; + + + if ( j < j1) { // j1 is the index of the source word we are about to generate ... + //if(coverageVector[j]==0) // if source word at j is not generated yet ... + if(coverageVector.GetValue(j)==0) { // if source word at j is not generated yet ... + operations.push_back("_INS_GAP_"); + gFlag++; + gap[j]="Unfilled"; + } + if (j == E) { + j = j1; + } else { + operations.push_back("_JMP_FWD_"); + j=E; + } + } + + if (j1 < j) { + // if(j < E && coverageVector[j]==0) + if(j < E && coverageVector.GetValue(j)==0) { + operations.push_back("_INS_GAP_"); + gFlag++; + gap[j]="Unfilled"; + } + + j=closestGap(gap,j1,gp); + operations.push_back("_JMP_BCK_"+ intToString(gp)); + + //cout<<"I am j "<<j<<endl; + //cout<<"I am j1 "<<j1<<endl; + + if(j==j1) + gap[j]="Filled"; + } + + if (j < j1) { + operations.push_back("_INS_GAP_"); + gap[j] = "Unfilled"; + gFlag++; + j=j1; + } + + if(contFlag == 0) { // First words of the multi-word cept ... + + if(english == "_TRANS_SLF_") { // Unknown word ... + operations.push_back("_TRANS_SLF_"); + } else { + operations.push_back("_TRANS_" + english + "_TO_" + german); + } + + //ans = firstOpenGap(coverageVector); + ans = coverageVector.GetFirstGapPos(); + + if (ans != -1) + gapWidth += j - ans; + + } else if (contFlag == 2) { + + operations.push_back("_INS_" + german); + ans = coverageVector.GetFirstGapPos(); + + if (ans != -1) + gapWidth += j - ans; + deletionCount++; + } else { + operations.push_back("_CONT_CEPT_"); + } + + //coverageVector[j]=1; + coverageVector.SetValue(j,1); + j+=1; + + if(E<j) + E=j; + + if (gFlag > 0) + gapCount++; + + openGapCount += getOpenGaps(); + + //if (coverageVector[j] == 0 && targetNullWords.find(j) != targetNullWords.end()) + if (coverageVector.GetValue(j) == 0 && targetNullWords.find(j) != targetNullWords.end()) { + j1 = j; + german = currF[j1-startIndex]; + english = "_INS_"; + generateOperations(startIndex, j1, 2 , coverageVector , english , german , targetNullWords , currF); + } } void osmHypothesis :: print() { - for (int i = 0; i< operations.size(); i++) - { - cerr<<operations[i]<<" "; + for (int i = 0; i< operations.size(); i++) { + cerr<<operations[i]<<" "; + + } - } + cerr<<endl<<endl; - cerr<<endl<<endl; - - cerr<<"Operation Probability "<<opProb<<endl; - cerr<<"Gap Count "<<gapCount<<endl; - cerr<<"Open Gap Count "<<openGapCount<<endl; - cerr<<"Gap Width "<<gapWidth<<endl; - cerr<<"Deletion Count "<<deletionCount<<endl; + cerr<<"Operation Probability "<<opProb<<endl; + cerr<<"Gap Count "<<gapCount<<endl; + cerr<<"Open Gap Count "<<openGapCount<<endl; + cerr<<"Gap Width "<<gapWidth<<endl; + cerr<<"Deletion Count "<<deletionCount<<endl; - cerr<<"_______________"<<endl; + cerr<<"_______________"<<endl; } int osmHypothesis :: closestGap(map <int,string> gap, int j1, int & gp) { - int dist=1172; - int value=-1; - int temp=0; - gp=0; - int opGap=0; - - map <int,string> :: iterator iter; - - iter=gap.end(); - - do - { - iter--; - //cout<<"Trapped "<<iter->first<<endl; - - if(iter->first==j1 && iter->second== "Unfilled") - { - opGap++; - gp = opGap; - return j1; - - } - - if(iter->second =="Unfilled") - { - opGap++; - temp = iter->first - j1; - - if(temp<0) - temp=temp * -1; - - if(dist>temp && iter->first < j1) - { - dist=temp; - value=iter->first; - gp=opGap; - } - } - - - } - while(iter!=gap.begin()); - - return value; + int dist=1172; + int value=-1; + int temp=0; + gp=0; + int opGap=0; + + map <int,string> :: iterator iter; + + iter=gap.end(); + + do { + iter--; + //cout<<"Trapped "<<iter->first<<endl; + + if(iter->first==j1 && iter->second== "Unfilled") { + opGap++; + gp = opGap; + return j1; + + } + + if(iter->second =="Unfilled") { + opGap++; + temp = iter->first - j1; + + if(temp<0) + temp=temp * -1; + + if(dist>temp && iter->first < j1) { + dist=temp; + value=iter->first; + gp=opGap; + } + } + + + } while(iter!=gap.begin()); + + return value; } int osmHypothesis :: getOpenGaps() { - map <int,string> :: iterator iter; + map <int,string> :: iterator iter; - int nd = 0; - for (iter = gap.begin(); iter!=gap.end(); iter++) - { - if(iter->second == "Unfilled") - nd++; - } + int nd = 0; + for (iter = gap.begin(); iter!=gap.end(); iter++) { + if(iter->second == "Unfilled") + nd++; + } - return nd; + return nd; } void osmHypothesis :: generateDeleteOperations(std::string english, int currTargetIndex, std::set <int> doneTargetIndexes) { - operations.push_back("_DEL_" + english); - currTargetIndex++; + operations.push_back("_DEL_" + english); + currTargetIndex++; - while(doneTargetIndexes.find(currTargetIndex) != doneTargetIndexes.end()) - { - currTargetIndex++; - } + while(doneTargetIndexes.find(currTargetIndex) != doneTargetIndexes.end()) { + currTargetIndex++; + } - if (sourceNullWords.find(currTargetIndex) != sourceNullWords.end()) - { - english = currE[currTargetIndex]; - generateDeleteOperations(english,currTargetIndex,doneTargetIndexes); - } + if (sourceNullWords.find(currTargetIndex) != sourceNullWords.end()) { + english = currE[currTargetIndex]; + generateDeleteOperations(english,currTargetIndex,doneTargetIndexes); + } } void osmHypothesis :: computeOSMFeature(int startIndex , WordsBitmap & coverageVector) { - set <int> doneTargetIndexes; - set <int> eSide; - set <int> fSide; - set <int> :: iterator iter; - string english; - string source; - int j1; - int start = 0; - int targetIndex = 0; - doneTargetIndexes.clear(); - - - if (targetNullWords.size() != 0) // Source words to be deleted in the start of this phrase ... - { - iter = targetNullWords.begin(); - - if (*iter == startIndex) - { - - j1 = startIndex; - source = currF[j1-startIndex]; - english = "_INS_"; - generateOperations(startIndex, j1, 2 , coverageVector , english , source , targetNullWords , currF); - } - } - - if (sourceNullWords.find(targetIndex) != sourceNullWords.end()) // first word has to be deleted ... - { - english = currE[targetIndex]; - generateDeleteOperations(english,targetIndex, doneTargetIndexes); - } - - - for (int i = 0; i < ceptsInPhrase.size(); i++) - { - source = ""; - english = ""; - - fSide = ceptsInPhrase[i].first; - eSide = ceptsInPhrase[i].second; - - iter = eSide.begin(); - targetIndex = *iter; - english += currE[*iter]; - iter++; - - for (; iter != eSide.end(); iter++) - { - if(*iter == targetIndex+1) - targetIndex++; - else - doneTargetIndexes.insert(*iter); - - english += "^_^"; - english += currE[*iter]; - } - - iter = fSide.begin(); - source += currF[*iter]; - iter++; - - for (; iter != fSide.end(); iter++) - { - source += "^_^"; - source += currF[*iter]; - } - - iter = fSide.begin(); - j1 = *iter + startIndex; - iter++; - - generateOperations(startIndex, j1, 0 , coverageVector , english , source , targetNullWords , currF); - - - for (; iter != fSide.end(); iter++) - { - j1 = *iter + startIndex; - generateOperations(startIndex, j1, 1 , coverageVector , english , source , targetNullWords , currF); - } - - targetIndex++; // Check whether the next target word is unaligned ... - - while(doneTargetIndexes.find(targetIndex) != doneTargetIndexes.end()) - { - targetIndex++; - } - - if(sourceNullWords.find(targetIndex) != sourceNullWords.end()) - { - english = currE[targetIndex]; - generateDeleteOperations(english,targetIndex, doneTargetIndexes); - } - } - - //removeReorderingOperations(); - - //print(); + set <int> doneTargetIndexes; + set <int> eSide; + set <int> fSide; + set <int> :: iterator iter; + string english; + string source; + int j1; + int start = 0; + int targetIndex = 0; + doneTargetIndexes.clear(); + + + if (targetNullWords.size() != 0) { // Source words to be deleted in the start of this phrase ... + iter = targetNullWords.begin(); + + if (*iter == startIndex) { + + j1 = startIndex; + source = currF[j1-startIndex]; + english = "_INS_"; + generateOperations(startIndex, j1, 2 , coverageVector , english , source , targetNullWords , currF); + } + } + + if (sourceNullWords.find(targetIndex) != sourceNullWords.end()) { // first word has to be deleted ... + english = currE[targetIndex]; + generateDeleteOperations(english,targetIndex, doneTargetIndexes); + } + + + for (int i = 0; i < ceptsInPhrase.size(); i++) { + source = ""; + english = ""; + + fSide = ceptsInPhrase[i].first; + eSide = ceptsInPhrase[i].second; + + iter = eSide.begin(); + targetIndex = *iter; + english += currE[*iter]; + iter++; + + for (; iter != eSide.end(); iter++) { + if(*iter == targetIndex+1) + targetIndex++; + else + doneTargetIndexes.insert(*iter); + + english += "^_^"; + english += currE[*iter]; + } + + iter = fSide.begin(); + source += currF[*iter]; + iter++; + + for (; iter != fSide.end(); iter++) { + source += "^_^"; + source += currF[*iter]; + } + + iter = fSide.begin(); + j1 = *iter + startIndex; + iter++; + + generateOperations(startIndex, j1, 0 , coverageVector , english , source , targetNullWords , currF); + + + for (; iter != fSide.end(); iter++) { + j1 = *iter + startIndex; + generateOperations(startIndex, j1, 1 , coverageVector , english , source , targetNullWords , currF); + } + + targetIndex++; // Check whether the next target word is unaligned ... + + while(doneTargetIndexes.find(targetIndex) != doneTargetIndexes.end()) { + targetIndex++; + } + + if(sourceNullWords.find(targetIndex) != sourceNullWords.end()) { + english = currE[targetIndex]; + generateDeleteOperations(english,targetIndex, doneTargetIndexes); + } + } + + //removeReorderingOperations(); + + //print(); } void osmHypothesis :: getMeCepts ( set <int> & eSide , set <int> & fSide , map <int , vector <int> > & tS , map <int , vector <int> > & sT) { - set <int> :: iterator iter; + set <int> :: iterator iter; - int sz = eSide.size(); - vector <int> t; + int sz = eSide.size(); + vector <int> t; - for (iter = eSide.begin(); iter != eSide.end(); iter++) - { - t = tS[*iter]; + for (iter = eSide.begin(); iter != eSide.end(); iter++) { + t = tS[*iter]; - for (int i = 0; i < t.size(); i++) - { - fSide.insert(t[i]); - } + for (int i = 0; i < t.size(); i++) { + fSide.insert(t[i]); + } - } + } - for (iter = fSide.begin(); iter != fSide.end(); iter++) - { + for (iter = fSide.begin(); iter != fSide.end(); iter++) { - t = sT[*iter]; + t = sT[*iter]; - for (int i = 0 ; i<t.size(); i++) - { - eSide.insert(t[i]); - } + for (int i = 0 ; i<t.size(); i++) { + eSide.insert(t[i]); + } - } + } - if (eSide.size () > sz) - { - getMeCepts(eSide,fSide,tS,sT); - } + if (eSide.size () > sz) { + getMeCepts(eSide,fSide,tS,sT); + } } void osmHypothesis :: constructCepts(vector <int> & align , int startIndex , int endIndex, int targetPhraseLength) { - std::map <int , vector <int> > sT; - std::map <int , vector <int> > tS; - std::set <int> eSide; - std::set <int> fSide; - std::set <int> :: iterator iter; - std :: map <int , vector <int> > :: iterator iter2; - std :: pair < set <int> , set <int> > cept; - int src; - int tgt; + std::map <int , vector <int> > sT; + std::map <int , vector <int> > tS; + std::set <int> eSide; + std::set <int> fSide; + std::set <int> :: iterator iter; + std :: map <int , vector <int> > :: iterator iter2; + std :: pair < set <int> , set <int> > cept; + int src; + int tgt; - for (int i = 0; i < align.size(); i+=2) - { - src = align[i]; - tgt = align[i+1]; - tS[tgt].push_back(src); - sT[src].push_back(tgt); - } + for (int i = 0; i < align.size(); i+=2) { + src = align[i]; + tgt = align[i+1]; + tS[tgt].push_back(src); + sT[src].push_back(tgt); + } - for (int i = startIndex; i<= endIndex; i++) // What are unaligned source words in this phrase ... - { - if (sT.find(i-startIndex) == sT.end()) - { - targetNullWords.insert(i); - } - } + for (int i = startIndex; i<= endIndex; i++) { // What are unaligned source words in this phrase ... + if (sT.find(i-startIndex) == sT.end()) { + targetNullWords.insert(i); + } + } - for (int i = 0; i < targetPhraseLength; i++) // What are unaligned target words in this phrase ... - { - if (tS.find(i) == tS.end()) - { - sourceNullWords.insert(i); - } - } + for (int i = 0; i < targetPhraseLength; i++) { // What are unaligned target words in this phrase ... + if (tS.find(i) == tS.end()) { + sourceNullWords.insert(i); + } + } - while (tS.size() != 0 && sT.size() != 0) - { + while (tS.size() != 0 && sT.size() != 0) { - iter2 = tS.begin(); + iter2 = tS.begin(); - eSide.clear(); - fSide.clear(); - eSide.insert (iter2->first); + eSide.clear(); + fSide.clear(); + eSide.insert (iter2->first); - getMeCepts(eSide, fSide, tS , sT); + getMeCepts(eSide, fSide, tS , sT); - for (iter = eSide.begin(); iter != eSide.end(); iter++) - { - iter2 = tS.find(*iter); - tS.erase(iter2); - } + for (iter = eSide.begin(); iter != eSide.end(); iter++) { + iter2 = tS.find(*iter); + tS.erase(iter2); + } - for (iter = fSide.begin(); iter != fSide.end(); iter++) - { - iter2 = sT.find(*iter); - sT.erase(iter2); - } + for (iter = fSide.begin(); iter != fSide.end(); iter++) { + iter2 = sT.find(*iter); + sT.erase(iter2); + } - cept = make_pair (fSide , eSide); - ceptsInPhrase.push_back(cept); - } + cept = make_pair (fSide , eSide); + ceptsInPhrase.push_back(cept); + } -/* + /* - cerr<<"Extracted Cepts "<<endl; - for (int i = 0; i < ceptsInPhrase.size(); i++) - { + cerr<<"Extracted Cepts "<<endl; + for (int i = 0; i < ceptsInPhrase.size(); i++) + { - fSide = ceptsInPhrase[i].first; - eSide = ceptsInPhrase[i].second; + fSide = ceptsInPhrase[i].first; + eSide = ceptsInPhrase[i].second; - for (iter = eSide.begin(); iter != eSide.end(); iter++) - { - cerr<<*iter<<" "; - } - cerr<<"<---> "; + for (iter = eSide.begin(); iter != eSide.end(); iter++) + { + cerr<<*iter<<" "; + } + cerr<<"<---> "; - for (iter = fSide.begin(); iter != fSide.end(); iter++) - { - cerr<<*iter<<" "; - } + for (iter = fSide.begin(); iter != fSide.end(); iter++) + { + cerr<<*iter<<" "; + } - cerr<<endl; - } - cerr<<endl; + cerr<<endl; + } + cerr<<endl; - cerr<<"Unaligned Target Words"<<endl; + cerr<<"Unaligned Target Words"<<endl; - for (iter = sourceNullWords.begin(); iter != sourceNullWords.end(); iter++) - cerr<<*iter<<"<--->"<<endl; + for (iter = sourceNullWords.begin(); iter != sourceNullWords.end(); iter++) + cerr<<*iter<<"<--->"<<endl; - cerr<<"Unaligned Source Words"<<endl; + cerr<<"Unaligned Source Words"<<endl; - for (iter = targetNullWords.begin(); iter != targetNullWords.end(); iter++) - cerr<<*iter<<"<--->"<<endl; + for (iter = targetNullWords.begin(); iter != targetNullWords.end(); iter++) + cerr<<*iter<<"<--->"<<endl; -*/ + */ } void osmHypothesis :: populateScores(vector <float> & scores) { - scores.clear(); - scores.push_back(opProb); - scores.push_back(gapWidth); - scores.push_back(gapCount); - scores.push_back(openGapCount); - scores.push_back(deletionCount); + scores.clear(); + scores.push_back(opProb); + scores.push_back(gapWidth); + scores.push_back(gapCount); + scores.push_back(openGapCount); + scores.push_back(deletionCount); } diff --git a/moses/FF/OSM-Feature/osmHyp.h b/moses/FF/OSM-Feature/osmHyp.h index ab8051176..368cd8e19 100644 --- a/moses/FF/OSM-Feature/osmHyp.h +++ b/moses/FF/OSM-Feature/osmHyp.h @@ -17,15 +17,23 @@ public: osmState(const lm::ngram::State & val); int Compare(const FFState& other) const; void saveState(int jVal, int eVal, std::map <int , std::string> & gapVal); - int getJ()const {return j;} - int getE()const {return E;} - std::map <int , std::string> getGap() const { return gap;} - - lm::ngram::State getLMState() const {return lmState;} + int getJ()const { + return j; + } + int getE()const { + return E; + } + std::map <int , std::string> getGap() const { + return gap; + } + + lm::ngram::State getLMState() const { + return lmState; + } void print() const; std::string getName() const; - + protected: int j, E; std::map <int,std::string> gap; @@ -35,51 +43,56 @@ protected: class osmHypothesis { - private: - - - std::vector <std::string> operations; // List of operations required to generated this hyp ... - std::map <int,std::string> gap; // Maintains gap history ... - int j; // Position after the last source word generated ... - int E; // Position after the right most source word so far generated ... - lm::ngram::State lmState; // KenLM's Model State ... - - int gapCount; // Number of gaps inserted ... - int deletionCount; - int openGapCount; - int gapWidth; - double opProb; - - std::vector <std::string> currE; - std::vector <std::string> currF; - std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase; - std::set <int> targetNullWords; - std::set <int> sourceNullWords; - - int closestGap(std::map <int,std::string> gap,int j1, int & gp); - int firstOpenGap(std::vector <int> & coverageVector); - std::string intToString(int); - int getOpenGaps(); - int isTranslationOperation(int j); - void removeReorderingOperations(); - - void getMeCepts ( std::set <int> & eSide , std::set <int> & fSide , std::map <int , std::vector <int> > & tS , std::map <int , std::vector <int> > & sT); - - public: - - osmHypothesis(); - ~osmHypothesis(){}; - void generateOperations(int & startIndex, int j1 , int contFlag , WordsBitmap & coverageVector , std::string english , std::string german , std::set <int> & targetNullWords , std::vector <std::string> & currF); - void generateDeleteOperations(std::string english, int currTargetIndex, std::set <int> doneTargetIndexes); - void calculateOSMProb(lm::ngram::Model & ptrOp); - void computeOSMFeature(int startIndex , WordsBitmap & coverageVector); - void constructCepts(std::vector <int> & align , int startIndex , int endIndex, int targetPhraseLength); - void setPhrases(std::vector <std::string> & val1 , std::vector <std::string> & val2){currF = val1; currE = val2;} - void setState(const FFState* prev_state); - osmState * saveState(); - void print(); - void populateScores(std::vector <float> & scores); - void setState(const lm::ngram::State & val){lmState = val;} +private: + + + std::vector <std::string> operations; // List of operations required to generated this hyp ... + std::map <int,std::string> gap; // Maintains gap history ... + int j; // Position after the last source word generated ... + int E; // Position after the right most source word so far generated ... + lm::ngram::State lmState; // KenLM's Model State ... + + int gapCount; // Number of gaps inserted ... + int deletionCount; + int openGapCount; + int gapWidth; + double opProb; + + std::vector <std::string> currE; + std::vector <std::string> currF; + std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase; + std::set <int> targetNullWords; + std::set <int> sourceNullWords; + + int closestGap(std::map <int,std::string> gap,int j1, int & gp); + int firstOpenGap(std::vector <int> & coverageVector); + std::string intToString(int); + int getOpenGaps(); + int isTranslationOperation(int j); + void removeReorderingOperations(); + + void getMeCepts ( std::set <int> & eSide , std::set <int> & fSide , std::map <int , std::vector <int> > & tS , std::map <int , std::vector <int> > & sT); + +public: + + osmHypothesis(); + ~osmHypothesis() {}; + void generateOperations(int & startIndex, int j1 , int contFlag , WordsBitmap & coverageVector , std::string english , std::string german , std::set <int> & targetNullWords , std::vector <std::string> & currF); + void generateDeleteOperations(std::string english, int currTargetIndex, std::set <int> doneTargetIndexes); + void calculateOSMProb(lm::ngram::Model & ptrOp); + void computeOSMFeature(int startIndex , WordsBitmap & coverageVector); + void constructCepts(std::vector <int> & align , int startIndex , int endIndex, int targetPhraseLength); + void setPhrases(std::vector <std::string> & val1 , std::vector <std::string> & val2) { + currF = val1; + currE = val2; + } + void setState(const FFState* prev_state); + osmState * saveState(); + void print(); + void populateScores(std::vector <float> & scores); + void setState(const lm::ngram::State & val) { + lmState = val; + } }; diff --git a/moses/LM/Ken.cpp b/moses/LM/Ken.cpp index df757386a..edfbc7f75 100644 --- a/moses/LM/Ken.cpp +++ b/moses/LM/Ken.cpp @@ -383,7 +383,7 @@ LanguageModel *ConstructKenLM(const std::string &description, const std::string try { lm::ngram::ModelType model_type; if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) { - + switch(model_type) { case lm::ngram::PROBING: return new LanguageModelKen<lm::ngram::ProbingModel>(description, line, file, factorType, lazy); diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index e0a4683de..af52b5cbf 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -694,9 +694,9 @@ bool StaticData::LoadData(Parameter *parameter) vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); SetWeights(model, weights); } else if (feature == "OpSequenceModel") { - OpSequenceModel* model = new OpSequenceModel(line); - vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); - SetWeights(model, weights); + OpSequenceModel* model = new OpSequenceModel(line); + vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); + SetWeights(model, weights); } else if (feature == "PhrasePenalty") { PhrasePenalty* model = new PhrasePenalty(line); vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); diff --git a/moses/TranslationModel/CompactPT/PhraseDecoder.cpp b/moses/TranslationModel/CompactPT/PhraseDecoder.cpp index 085a7337c..c0767dad9 100644 --- a/moses/TranslationModel/CompactPT/PhraseDecoder.cpp +++ b/moses/TranslationModel/CompactPT/PhraseDecoder.cpp @@ -190,7 +190,7 @@ std::string PhraseDecoder::MakeSourceKey(std::string &source) return source + m_separator; } -TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &sourcePhrase, bool topLevel) +TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &sourcePhrase, bool topLevel, bool eval) { // Not using TargetPhraseCollection avoiding "new" operator @@ -234,7 +234,7 @@ TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase & // Decompress and decode target phrase collection TargetPhraseVectorPtr decodedPhraseColl = - DecodeCollection(tpv, encodedBitStream, sourcePhrase, topLevel); + DecodeCollection(tpv, encodedBitStream, sourcePhrase, topLevel, eval); return decodedPhraseColl; } else @@ -243,7 +243,7 @@ TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase & TargetPhraseVectorPtr PhraseDecoder::DecodeCollection( TargetPhraseVectorPtr tpv, BitWrapper<> &encodedBitStream, - const Phrase &sourcePhrase, bool topLevel) + const Phrase &sourcePhrase, bool topLevel, bool eval) { bool extending = tpv->size(); @@ -397,7 +397,8 @@ TargetPhraseVectorPtr PhraseDecoder::DecodeCollection( if(scores.size() == m_numScoreComponent) { targetPhrase->GetScoreBreakdown().Assign(&m_phraseDictionary, scores); - targetPhrase->Evaluate(sourcePhrase); + if(eval) + targetPhrase->Evaluate(sourcePhrase); if(m_containsAlignmentInfo) state = Alignment; diff --git a/moses/TranslationModel/CompactPT/PhraseDecoder.h b/moses/TranslationModel/CompactPT/PhraseDecoder.h index 85e9334da..413918314 100644 --- a/moses/TranslationModel/CompactPT/PhraseDecoder.h +++ b/moses/TranslationModel/CompactPT/PhraseDecoder.h @@ -131,12 +131,13 @@ public: size_t Load(std::FILE* in); TargetPhraseVectorPtr CreateTargetPhraseCollection(const Phrase &sourcePhrase, - bool topLevel = false); + bool topLevel = false, bool eval = true); TargetPhraseVectorPtr DecodeCollection(TargetPhraseVectorPtr tpv, BitWrapper<> &encodedBitStream, const Phrase &sourcePhrase, - bool topLevel); + bool topLevel, + bool eval); void PruneCache(); }; diff --git a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp index dce6a6228..8d0f9ff2f 100644 --- a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp +++ b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp @@ -117,7 +117,7 @@ PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) c // Retrieve target phrase collection from phrase table TargetPhraseVectorPtr decodedPhraseColl - = m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true); + = m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true, true); if(decodedPhraseColl != NULL && decodedPhraseColl->size()) { TargetPhraseVectorPtr tpv(new TargetPhraseVector(*decodedPhraseColl)); @@ -130,7 +130,6 @@ PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) c std::nth_element(tpv->begin(), nth, tpv->end(), CompareTargetPhrase()); for(TargetPhraseVector::iterator it = tpv->begin(); it != nth; it++) { TargetPhrase *tp = new TargetPhrase(*it); - cerr << *tp << endl; phraseColl->Add(tp); } @@ -152,7 +151,7 @@ PhraseDictionaryCompact::GetTargetPhraseCollectionRaw(const Phrase &sourcePhrase return TargetPhraseVectorPtr(); // Retrieve target phrase collection from phrase table - return m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true); + return m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true, false); } PhraseDictionaryCompact::~PhraseDictionaryCompact() diff --git a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp index fc3b056c6..f2192ee36 100644 --- a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp +++ b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp @@ -38,7 +38,7 @@ bool operator<(const PackedItem &pi1, const PackedItem &pi2) } std::string PhraseTableCreator::m_phraseStopSymbol = "__SPECIAL_STOP_SYMBOL__"; -std::string PhraseTableCreator::m_separator = " ||| "; +std::string PhraseTableCreator::m_separator = "|||"; PhraseTableCreator::PhraseTableCreator(std::string inPath, std::string outPath, @@ -332,12 +332,12 @@ void PhraseTableCreator::CreateRankHash() inline std::string PhraseTableCreator::MakeSourceKey(std::string &source) { - return source + m_separator; + return source + " " + m_separator + " "; } inline std::string PhraseTableCreator::MakeSourceTargetKey(std::string &source, std::string &target) { - return source + m_separator + target + m_separator; + return source + " " + m_separator + " " + target + " " + m_separator + " "; } void PhraseTableCreator::EncodeTargetPhrases() @@ -1034,17 +1034,24 @@ void RankingTask::operator()() for(size_t i = 0; i < lines.size(); i++) { std::vector<std::string> tokens; Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator); - - if(tokens.size() < 3) { + + for(std::vector<std::string>::iterator it = tokens.begin(); it != tokens.end(); it++) + *it = Moses::Trim(*it); + + if(tokens.size() < 4) { std::cerr << "Error: It seems the following line has a wrong format:" << std::endl; std::cerr << "Line " << i << ": " << lines[i] << std::endl; abort(); } - if(tokens.size() == 3 && m_creator.m_warnMe) { - std::cerr << "Warning: It seems the following line contains no alignment information, " << std::endl; - std::cerr << "but you are using PREnc encoding which makes use of alignment data. " << std::endl; - std::cerr << "Better use -encoding None or disable this warning with -no-warnings ." << std::endl; + + if(tokens[3].size() <= 1 && m_creator.m_coding != PhraseTableCreator::None) { + std::cerr << "Error: It seems the following line contains no alignment information, " << std::endl; + std::cerr << "but you are using "; + std::cerr << (m_creator.m_coding == PhraseTableCreator::PREnc ? "PREnc" : "REnc"); + std::cerr << " encoding which makes use of alignment data. " << std::endl; + std::cerr << "Use -encoding None" << std::endl; std::cerr << "Line " << i << ": " << lines[i] << std::endl; + abort(); } std::vector<float> scores = Tokenize<float>(tokens[2]); @@ -1125,18 +1132,23 @@ void EncodingTask::operator()() std::vector<std::string> tokens; Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator); + for(std::vector<std::string>::iterator it = tokens.begin(); it != tokens.end(); it++) + *it = Moses::Trim(*it); + if(tokens.size() < 3) { std::cerr << "Error: It seems the following line has a wrong format:" << std::endl; std::cerr << "Line " << i << ": " << lines[i] << std::endl; abort(); } - if(tokens.size() == 3 && m_creator.m_coding != PhraseTableCreator::None && m_creator.m_warnMe) { - std::cerr << "Warning: It seems the following line contains no alignment information, " << std::endl; + + if(tokens[3].size() <= 1 && m_creator.m_coding != PhraseTableCreator::None) { + std::cerr << "Error: It seems the following line contains no alignment information, " << std::endl; std::cerr << "but you are using "; std::cerr << (m_creator.m_coding == PhraseTableCreator::PREnc ? "PREnc" : "REnc"); std::cerr << " encoding which makes use of alignment data. " << std::endl; - std::cerr << "Better use -encoding None or disable this warning with -no-warnings." << std::endl; + std::cerr << "Use -encoding None" << std::endl; std::cerr << "Line " << i << ": " << lines[i] << std::endl; + abort(); } size_t ownRank = 0; diff --git a/moses/Word.h b/moses/Word.h index 6a91f1a76..1b07a0345 100644 --- a/moses/Word.h +++ b/moses/Word.h @@ -59,8 +59,7 @@ public: /** deep copy */ Word(const Word ©) :m_isNonTerminal(copy.m_isNonTerminal) - ,m_isOOV(copy.m_isOOV) - { + ,m_isOOV(copy.m_isOOV) { std::memcpy(m_factorArray, copy.m_factorArray, sizeof(FactorArray)); } |