Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/moses
diff options
context:
space:
mode:
author eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230> 2006-07-18 22:13:45 +0400
committer eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230> 2006-07-18 22:13:45 +0400
commit 5c5d971895c177be55cf6b43acd3c827915f2bb4 (patch)
tree 7a8813b76e5921c381b64a2c8feabe2f72e5878d /moses
parent 7beabbd9ed39a52fcfe1d6dbd0567cf1ab1b4768 (diff)
added unknown-word handling code (currently commented out) and source-word deletion (also currently commented out)
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@176 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'moses')
-rwxr-xr-x moses/src/Arc.cpp 2
-rwxr-xr-x moses/src/Arc.h 2
-rw-r--r-- moses/src/DeletionHypothesis.cpp 69
-rw-r--r-- moses/src/DeletionHypothesis.h 58
-rwxr-xr-x moses/src/Hypothesis.cpp 224
-rwxr-xr-x moses/src/Hypothesis.h 91
-rwxr-xr-x moses/src/LanguageModel.cpp 4
-rwxr-xr-x moses/src/LanguageModel.h 3
-rwxr-xr-x moses/src/LatticeEdge.cpp 7
-rwxr-xr-x moses/src/LatticeEdge.h 29
-rw-r--r-- moses/src/Makefile.am 19
-rwxr-xr-x moses/src/Manager.cpp 28
-rwxr-xr-x moses/src/Parameter.cpp 2
-rwxr-xr-x moses/src/PhraseDictionary.cpp 28
-rwxr-xr-x moses/src/StaticData.cpp 20
-rwxr-xr-x moses/src/StaticData.h 60
-rw-r--r-- moses/src/TargetPhrase.cpp 30
-rw-r--r-- moses/src/TargetPhrase.h 66
-rwxr-xr-x moses/src/TranslationOption.h 30
-rwxr-xr-x moses/src/TypeDef.h 24
-rw-r--r-- moses/src/UnknownWordHandler.cpp 71
-rw-r--r-- moses/src/UnknownWordHandler.h 52
-rw-r--r-- moses/src/WordDeletionTable.cpp 54
-rw-r--r-- moses/src/WordDeletionTable.h 55
-rw-r--r-- moses/src/WordInsertionTable.h 0
25 files changed, 760 insertions, 268 deletions
diff --git a/moses/src/Arc.cpp b/moses/src/Arc.cpp
index fbc7c0d0c..2b43d90b2 100755
--- a/moses/src/Arc.cpp
+++ b/moses/src/Arc.cpp
@@ -42,7 +42,7 @@ std::ostream& operator<<(std::ostream& out, const Arc& arc)
{
out << *prevHypo;
}
- out << arc.GetPhrase();
+ out << arc.GetTargetPhrase();
// score
out << " [" << arc.GetScore( static_cast<ScoreType::ScoreType>(0));
diff --git a/moses/src/Arc.h b/moses/src/Arc.h
index 69c4c18b2..9c4762274 100755
--- a/moses/src/Arc.h
+++ b/moses/src/Arc.h
@@ -33,7 +33,7 @@ public:
Arc(const Arc &arc); // not implemented
- Arc( const float score[NUM_SCORES]
+ Arc( const float score[]
, const ScoreComponentCollection &transScoreComponent
, const ScoreColl &lmScoreComponent
, const ScoreColl &generationScoreColl
diff --git a/moses/src/DeletionHypothesis.cpp b/moses/src/DeletionHypothesis.cpp
new file mode 100644
index 000000000..88bcacb99
--- /dev/null
+++ b/moses/src/DeletionHypothesis.cpp
@@ -0,0 +1,69 @@
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "DeletionHypothesis.h"
+
+
+
+/***
+ * Overwrite m_score[ScoreType::DeletedWords] with wordDeletionTable's deletion cost for the words sourceWordsRange selects from sourceSentence. NOTE(review): same body as Hypothesis::CalcDeletionScore
+ */
+void DeletionHypothesis::CalcDeletionScore(const Sentence& sourceSentence, const WordsRange& sourceWordsRange, const WordDeletionTable& wordDeletionTable)
+{
+ m_score[ScoreType::DeletedWords] = wordDeletionTable.GetDeletionCost(sourceSentence.GetSubString(sourceWordsRange));
+}
+
+/***
+ * Set m_score[ScoreType::Total] to the sum of the component scores; only Distortion and WordPenalty are scaled by their StaticData weights here.
+ * NOTE(review): DeletedWords is added unweighted and LexicalReordering is left out of the sum -- confirm both are intended
+ */
+void DeletionHypothesis::SumIndividualScores(const StaticData& staticData)
+{
+ m_score[ScoreType::Total] = m_score[ScoreType::PhraseTrans]
+ + m_score[ScoreType::Generation]
+ + m_score[ScoreType::LanguageModelScore]
+ + m_score[ScoreType::Distortion] * staticData.GetWeightDistortion()
+ + m_score[ScoreType::WordPenalty] * staticData.GetWeightWordPenalty()
+ + m_score[ScoreType::DeletedWords]
+ + m_score[ScoreType::FutureScoreEnum];
+}
+
+void DeletionHypothesis::CalcScore(const StaticData& staticData, const SquareMatrix &futureScore, const Sentence &source)
+{
+ // DISTORTION COST (first of the per-component steps; the total is summed at the end)
+ CalcDistortionScore();
+
+ // LANGUAGE MODEL COST, using the Initial and Other language-model lists held by staticData
+ CalcLMScore(staticData.GetLanguageModel(Initial), staticData.GetLanguageModel(Other));
+
+ // WORD PENALTY: the negated GetSize(); its weight is applied later, in SumIndividualScores
+ m_score[ScoreType::WordPenalty] = - (float) GetSize();
+
+ // FUTURE COST
+ CalcFutureScore(futureScore);
+
+ //cost for deleting source words -- the call is commented out for now, so source is unused in this function
+// CalcDeletionScore(source, GetCurrSourceWordsRange(), staticData.GetWordDeletionTable());
+
+ //LEXICAL REORDERING COST
+ CalcLexicalReorderingScore();
+
+ // TOTAL COST: written to m_score[ScoreType::Total] by SumIndividualScores
+ SumIndividualScores(staticData);
+}
diff --git a/moses/src/DeletionHypothesis.h b/moses/src/DeletionHypothesis.h
new file mode 100644
index 000000000..f6df1f64b
--- /dev/null
+++ b/moses/src/DeletionHypothesis.h
@@ -0,0 +1,58 @@
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+
+#include "Sentence.h"
+#include "WordsBitmap.h"
+#include "WordsRange.h"
+#include "WordDeletionTable.h"
+#include "StaticData.h"
+#include "Hypothesis.h"
+#include "TranslationOption.h"
+
+/***
+ * Hypothesis subclass describing an extension that translates a source phrase to the empty phrase
+ * (ie dropping the source words). Constructors are protected; instances are meant to come from the Hypothesis factory functions
+ */
+class DeletionHypothesis : public Hypothesis
+{
+ friend class Hypothesis; //gives the factory functions in Hypothesis access to the protected constructors
+ 
+ protected:
+ 
+ DeletionHypothesis(const WordsBitmap &initialCoverage) : Hypothesis(Phrase(), initialCoverage) {}
+ DeletionHypothesis(const Hypothesis &prevHypo, const TranslationOption &transOpt) : Hypothesis(prevHypo, transOpt) {}
+ virtual ~DeletionHypothesis() {}
+ 
+ /***
+ * store the deletion cost of the given source-word range (looked up in wordDeletionTable) in m_score[ScoreType::DeletedWords]
+ */
+ void CalcDeletionScore(const Sentence& sourceSentence, const WordsRange& sourceWordsRange, const WordDeletionTable& wordDeletionTable);
+ 
+ /***
+ * Set the total-score field (m_score[ScoreType::Total]) from the various individual score parts
+ * (not necessarily using all of them); weights are read from staticData
+ */
+ virtual void SumIndividualScores(const StaticData& staticData);
+ 
+ public:
+ 
+ virtual void CalcScore(const StaticData& staticData, const SquareMatrix &futureScore, const Sentence &source);
+};
diff --git a/moses/src/Hypothesis.cpp b/moses/src/Hypothesis.cpp
index 77a715a1d..d70800d9b 100755
--- a/moses/src/Hypothesis.cpp
+++ b/moses/src/Hypothesis.cpp
@@ -28,6 +28,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "Util.h"
#include "Arc.h"
#include "SquareMatrix.h"
+#include "StaticData.h"
+//#include "DeletionHypothesis.h"
//TODO: add this include in when it compiles
//#include "LexicalReordering.h"
@@ -41,6 +43,7 @@ Hypothesis::Hypothesis(const Phrase &phrase, const WordsBitmap &initialCoverage)
, m_sourceCompleted(initialCoverage)
, m_currSourceWordsRange(NOT_FOUND, NOT_FOUND)
, m_currTargetWordsRange(NOT_FOUND, NOT_FOUND)
+ , m_wordDeleted(false)
, m_id(s_numNodes++)
{ // used for initial seeding of trans process
// initialize scores
@@ -53,9 +56,10 @@ Hypothesis::Hypothesis(const Hypothesis &copy)
, m_sourceCompleted (copy.m_sourceCompleted )
, m_currSourceWordsRange (copy.m_currSourceWordsRange)
, m_currTargetWordsRange (copy.m_currTargetWordsRange)
+ , m_wordDeleted(false)
, m_id(s_numNodes++)
{
- m_phrase.AddWords( copy.m_phrase );
+ m_targetPhrase.AddWords( copy.m_targetPhrase );
// initialize scores
SetScore(copy.GetScore());
@@ -67,12 +71,16 @@ Hypothesis::Hypothesis(const Hypothesis &copy)
#endif
}
+/***
+ * continue prevHypo by appending the phrases in transOpt
+ */
Hypothesis::Hypothesis(const Hypothesis &prevHypo, const TranslationOption &transOpt)
: LatticeEdge (Output, &prevHypo)
, m_sourceCompleted (prevHypo.m_sourceCompleted )
, m_currSourceWordsRange (prevHypo.m_currSourceWordsRange)
, m_currTargetWordsRange ( prevHypo.m_currTargetWordsRange.GetEndPos() + 1
,prevHypo.m_currTargetWordsRange.GetEndPos() + transOpt.GetPhrase().GetSize())
+ , m_wordDeleted(false)
, m_id(s_numNodes++)
{
const Phrase &possPhrase = transOpt.GetPhrase();
@@ -81,24 +89,24 @@ Hypothesis::Hypothesis(const Hypothesis &prevHypo, const TranslationOption &tran
m_sourceCompleted.SetValue(wordsRange.GetStartPos(), wordsRange.GetEndPos(), true);
// add new words from poss trans
//m_phrase.AddWords(prev.m_phrase);
- m_phrase.AddWords(possPhrase);
+ m_targetPhrase.AddWords(possPhrase);
// scores
SetScore(prevHypo.GetScore());
m_score[ScoreType::PhraseTrans] += transOpt.GetTranslationScore();
m_score[ScoreType::FutureScoreEnum] += transOpt.GetFutureScore();
m_score[ScoreType::LanguageModelScore] += transOpt.GetNgramScore();
+// m_wordDeleted = transOpt.IsDeletionOption();
#ifdef N_BEST
// language model score (ngram)
m_lmScoreComponent = prevHypo.GetLMScoreComponent();
- const list< pair<size_t, float> > &nGramComponent = transOpt.GetTrigramComponent();
+ const std::vector< std::pair<size_t, float> > &nGramComponent = transOpt.GetTrigramComponent();
- list< pair<size_t, float> >::const_iterator iter;
- for (iter = nGramComponent.begin() ; iter != nGramComponent.end() ; ++iter)
+ for(unsigned int i = 0; i < nGramComponent.size(); i++)
{
- size_t lmId = (*iter).first;
- float score = (*iter).second;
+ size_t lmId = nGramComponent[i].first;
+ float score = nGramComponent[i].second;
m_lmScoreComponent[lmId] += score;
}
@@ -132,16 +140,40 @@ Hypothesis::~Hypothesis()
#endif
}
-Hypothesis *Hypothesis::CreateNext(const TranslationOption &transOpt) const
+/***
+ * return the subclass of Hypothesis most appropriate to the given translation option
+ */
+Hypothesis* Hypothesis::CreateNext(const TranslationOption &transOpt) const
{
- Hypothesis *clone = new Hypothesis(*this, transOpt);
- return clone;
+ return Create(*this, transOpt);
}
+/***
+ * Factory: allocate (with new) the hypothesis that extends prevHypo by transOpt. Currently always a plain Hypothesis -- the DeletionHypothesis branch below is commented out
+ */
+Hypothesis* Hypothesis::Create(const Hypothesis &prevHypo, const TranslationOption &transOpt)
+{
+ /*if(s_wordDeletionEnabled && transOpt.GetPhrase().GetSize() == 0) return new DeletionHypothesis(prevHypo, transOpt);
+ else*/ return new Hypothesis(prevHypo, transOpt);
+}
+/***
+ * Factory: allocate (with new) a hypothesis from targetPhrase and initialCoverage. Currently always a plain Hypothesis -- the DeletionHypothesis branch below is commented out
+ */
+Hypothesis* Hypothesis::Create(const Phrase& targetPhrase, const WordsBitmap &initialCoverage)
+{
+ /*if(s_wordDeletionEnabled && targetPhrase.GetSize() == 0) return new DeletionHypothesis(initialCoverage);
+ else*/ return new Hypothesis(targetPhrase, initialCoverage);
+}
-
-Hypothesis *Hypothesis::MergeNext(const TranslationOption &transOpt) const
+/***
+ * if any factors aren't set in our target phrase but are present in transOpt, copy them over
+ * (unless the factors that we do have fail to match the corresponding ones in transOpt,
+ * in which case presumably there's a programmer's error)
+ *
+ * return NULL if we aren't compatible with the given option
+ */
+Hypothesis* Hypothesis::MergeNext(const TranslationOption &transOpt) const
{
// check each word is compatible and merge 1-by-1
const Phrase &possPhrase = transOpt.GetPhrase();
@@ -151,7 +183,7 @@ Hypothesis *Hypothesis::MergeNext(const TranslationOption &transOpt) const
}
// ok, merge
- Hypothesis *clone = new Hypothesis(*this);
+ Hypothesis* clone = new Hypothesis(*this);
int currWord = 0;
size_t len = GetSize();
@@ -276,10 +308,6 @@ void Hypothesis::CalcLexicalReorderingScore()
// LatticeEdge.getPrevHypo()); //Previous Hypothesis
}
-
-
-
-
/**
* Calculates the overall language model score by combining the scores
* of language models generated for each of the factors. Because the factors
@@ -289,7 +317,6 @@ void Hypothesis::CalcLexicalReorderingScore()
* /param lmListInitial todo - describe this parameter
* /param lmListEnd todo - describe this parameter
*/
-
void Hypothesis::CalcLMScore(const LMList &lmListInitial, const LMList &lmListEnd)
{
const size_t startPos = m_currTargetWordsRange.GetStartPos();
@@ -316,8 +343,6 @@ void Hypothesis::CalcLMScore(const LMList &lmListInitial, const LMList &lmListEn
}
lmScore = languageModel.GetValue(contextFactor);
//cout<<"context factor: "<<languageModel.GetValue(contextFactor)<<endl;
-
-
// main loop
for (size_t currPos = startPos + 1 ; currPos <= m_currTargetWordsRange.GetEndPos() ; currPos++)
@@ -361,53 +386,56 @@ void Hypothesis::CalcLMScore(const LMList &lmListInitial, const LMList &lmListEn
size_t nGramOrder = languageModel.GetNGramOrder();
float lmScore;
- // 1st n-gram
- vector<const Factor*> contextFactor(nGramOrder);
- size_t index = 0;
- for (int currPos = (int) startPos - (int) nGramOrder + 1 ; currPos <= (int) startPos ; currPos++)
- {
- if (currPos >= 0)
- contextFactor[index++] = GetFactor(currPos, factorType);
- else
- contextFactor[index++] = languageModel.GetSentenceStart();
- }
- lmScore = languageModel.GetValue(contextFactor);
- //cout<<"context factor: "<<languageModel.GetValue(contextFactor)<<endl;
-
-
- // main loop
- size_t endPos = std::min(startPos + nGramOrder - 2
- , m_currTargetWordsRange.GetEndPos());
- for (size_t currPos = startPos + 1 ; currPos <= endPos ; currPos++)
+ if(m_currTargetWordsRange.GetWordsCount() > 0) //non-empty target phrase
{
- // shift all args down 1 place
- for (size_t i = 0 ; i < nGramOrder - 1 ; i++)
- contextFactor[i] = contextFactor[i + 1];
-
- // add last factor
- contextFactor.back() = GetFactor(currPos, factorType);
-
- lmScore += languageModel.GetValue(contextFactor);
+ // 1st n-gram
+ vector<const Factor*> contextFactor(nGramOrder);
+ size_t index = 0;
+ for (int currPos = (int) startPos - (int) nGramOrder + 1 ; currPos <= (int) startPos ; currPos++)
+ {
+ if (currPos >= 0)
+ contextFactor[index++] = GetFactor(currPos, factorType);
+ else
+ contextFactor[index++] = languageModel.GetSentenceStart();
+ }
+ lmScore = languageModel.GetValue(contextFactor);
//cout<<"context factor: "<<languageModel.GetValue(contextFactor)<<endl;
-
- }
- // end of sentence
- if (m_sourceCompleted.IsComplete())
- {
- const size_t size = GetSize();
- contextFactor.back() = languageModel.GetSentenceEnd();
+ // main loop
+ size_t endPos = std::min(startPos + nGramOrder - 2
+ , m_currTargetWordsRange.GetEndPos());
+ for (size_t currPos = startPos + 1 ; currPos <= endPos ; currPos++)
+ {
+ // shift all args down 1 place
+ for (size_t i = 0 ; i < nGramOrder - 1 ; i++)
+ contextFactor[i] = contextFactor[i + 1];
+
+ // add last factor
+ contextFactor.back() = GetFactor(currPos, factorType);
+
+ lmScore += languageModel.GetValue(contextFactor);
+ //cout<<"context factor: "<<languageModel.GetValue(contextFactor)<<endl;
+ }
- for (size_t i = 0 ; i < nGramOrder - 1 ; i ++)
+ // end of sentence
+ if (m_sourceCompleted.IsComplete())
{
- int currPos = size - nGramOrder + i + 1;
- if (currPos < 0)
- contextFactor[i] = languageModel.GetSentenceStart();
- else
- contextFactor[i] = GetFactor((size_t)currPos, factorType);
+ const size_t size = GetSize();
+ contextFactor.back() = languageModel.GetSentenceEnd();
+
+ for (size_t i = 0 ; i < nGramOrder - 1 ; i ++)
+ {
+ int currPos = size - nGramOrder + i + 1;
+ if (currPos < 0)
+ contextFactor[i] = languageModel.GetSentenceStart();
+ else
+ contextFactor[i] = GetFactor((size_t)currPos, factorType);
+ }
+ lmScore += languageModel.GetValue(contextFactor);
}
- lmScore += languageModel.GetValue(contextFactor);
}
+ else lmScore = 0; //the score associated with dropping source words is not part of the language model
+
m_score[ScoreType::LanguageModelScore] += lmScore * languageModel.GetWeight();
#ifdef N_BEST
size_t lmId = languageModel.GetId();
@@ -416,14 +444,8 @@ void Hypothesis::CalcLMScore(const LMList &lmListInitial, const LMList &lmListEn
}
}
-void Hypothesis::CalcScore(const LMList &lmListInitial
- , const LMList &lmListEnd
- , float weightDistortion
- , float weightWordPenalty
- , const SquareMatrix &futureScore
- , const Sentence &source)
+void Hypothesis::CalcDistortionScore()
{
- // DISTORTION COST
const WordsRange &prevRange = m_prevHypo->GetCurrSourceWordsRange()
, &currRange = GetCurrSourceWordsRange();
@@ -436,28 +458,52 @@ void Hypothesis::CalcScore(const LMList &lmListInitial
// distortions scores of all previous partial translations
m_score[ScoreType::Distortion] -= (float) currRange.CalcDistortion(prevRange) ;
}
+}
+
+/***
+ * Overwrite m_score[ScoreType::DeletedWords] with wordDeletionTable's deletion cost for the words sourceWordsRange selects from sourceSentence (assigned, not accumulated)
+ */
+void Hypothesis::CalcDeletionScore(const Sentence& sourceSentence, const WordsRange& sourceWordsRange, const WordDeletionTable& wordDeletionTable)
+{
+ m_score[ScoreType::DeletedWords] =
+ wordDeletionTable.GetDeletionCost(sourceSentence.GetSubString(sourceWordsRange));
+}
+
+
+/***
+ * calculate the logarithm of our total translation score (sum up components)
+ */
+void Hypothesis::CalcScore(const StaticData& staticData, const SquareMatrix &futureScore, const Sentence &source)
+{
+ // DISTORTION COST
+ CalcDistortionScore();
// LANGUAGE MODEL COST
- CalcLMScore(lmListInitial, lmListEnd);
+ CalcLMScore(staticData.GetLanguageModel(Initial), staticData.GetLanguageModel(Other));
// WORD PENALTY
m_score[ScoreType::WordPenalty] = - (float) GetSize();
// FUTURE COST
CalcFutureScore(futureScore);
+
+ //LEXICAL REORDERING COST
+ CalcLexicalReorderingScore();
-
- //LEXICAL REORDERING COST
- CalcLexicalReorderingScore();
+ //cost for deleting source words
+ if (m_wordDeleted)
+ {
+ CalcDeletionScore(source, GetCurrSourceWordsRange(), staticData.GetWordDeletionTable());
+ }
// TOTAL COST
m_score[ScoreType::Total] = m_score[ScoreType::PhraseTrans]
+ m_score[ScoreType::Generation]
+ m_score[ScoreType::LanguageModelScore]
- + m_score[ScoreType::Distortion] * weightDistortion
- + m_score[ScoreType::WordPenalty] * weightWordPenalty
- + m_score[ScoreType::FutureScoreEnum]
- + m_score[ScoreType::LexicalReordering];
+ + m_score[ScoreType::Distortion] * staticData.GetWeightDistortion()
+ + m_score[ScoreType::WordPenalty] * staticData.GetWeightWordPenalty()
+ + m_score[ScoreType::DeletedWords] * staticData.GetWordDeletionWeight()
+ + m_score[ScoreType::FutureScoreEnum];
}
void Hypothesis::CalcFutureScore(const SquareMatrix &futureScore)
@@ -498,7 +544,7 @@ const Hypothesis* Hypothesis::GetPrevHypo()const{
}
/**
- * prints hypothesis information for pharaoh style logging
+ * print hypothesis information for pharaoh-style logging
*/
void Hypothesis::PrintHypothesis(const Sentence &source, float weightDistortion, float weightWordPenalty) const{
int start = m_prevHypo->m_currSourceWordsRange.GetEndPos() -1;
@@ -519,10 +565,10 @@ void Hypothesis::PrintHypothesis(const Sentence &source, float weightDistortion,
cout<<" )"<<endl;
cout<<"\tbase score "<<m_prevHypo->m_score[ScoreType::Total]<<endl;
cout<<"\tcovering "<<m_currSourceWordsRange.GetStartPos()<<"-"<<m_currSourceWordsRange.GetEndPos()<<": "<< source.GetSubString(m_currSourceWordsRange) <<endl;
- cout<<"\ttranslated as: "<<m_phrase<<" => translation cost "<<m_score[ScoreType::PhraseTrans]<<endl;
+ cout<<"\ttranslated as: "<<m_targetPhrase<<" => translation cost "<<m_score[ScoreType::PhraseTrans]<<endl;
cout<<"\tdistance: "<<GetCurrSourceWordsRange().CalcDistortion(m_prevHypo->GetCurrSourceWordsRange()) << " => distortion cost "<<(m_score[ScoreType::Distortion]*weightDistortion)<<endl;
cout<<"\tlanguage model cost "<<m_score[ScoreType::LanguageModelScore]<<endl;
- cout<<"\tword penalty "<<(m_score[ScoreType::WordPenalty]*weightWordPenalty)<<endl;
+ cout<<"\tword penalty "<<(m_score[ScoreType::WordPenalty]*weightWordPenalty)<< "\tdeletion cost "<<m_score[ScoreType::DeletedWords] << endl;
cout<<"\tscore "<<m_score[ScoreType::Total] - m_score[ScoreType::FutureScoreEnum]<<" + future cost "<<m_score[ScoreType::FutureScoreEnum]<<" = "<<m_score[ScoreType::Total]<<endl;
//PrintLMScores();
}
@@ -533,23 +579,19 @@ void Hypothesis::PrintHypothesis(const Sentence &source, float weightDistortion,
ostream& operator<<(ostream& out, const Hypothesis& hypothesis)
{
hypothesis.ToStream(out);
-
// words bitmap
-
out << "[" << hypothesis.m_sourceCompleted << "] ";
-
- out << " [" << hypothesis.GetScore( static_cast<ScoreType::ScoreType>(0));
- for (size_t i = 1 ; i < NUM_SCORES ; i++)
- {
- out << "," << hypothesis.GetScore( static_cast<ScoreType::ScoreType>(i));
- }
- out << "]";
+ // scores
+ out << " [" << hypothesis.GetScore( static_cast<ScoreType::ScoreType>(0));
+ for (size_t i = 1 ; i < NUM_SCORES ; i++)
+ {
+ out << "," << hypothesis.GetScore( static_cast<ScoreType::ScoreType>(i));
+ }
+ out << "]";
#ifdef N_BEST
- out << " " << hypothesis.GetScoreComponent();
- out << " " << hypothesis.GetGenerationScoreComponent();
+ out << " " << hypothesis.GetScoreComponent();
+ out << " " << hypothesis.GetGenerationScoreComponent();
#endif
-
-
return out;
}
diff --git a/moses/src/Hypothesis.h b/moses/src/Hypothesis.h
index b7082e9df..21dcaa777 100755
--- a/moses/src/Hypothesis.h
+++ b/moses/src/Hypothesis.h
@@ -36,7 +36,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "ScoreComponentCollection.h"
class SquareMatrix;
+class StaticData;
class TranslationOption;
+class Sentence;
+class WordsRange;
+class WordDeletionTable;
class Hypothesis : public LatticeEdge
{
@@ -47,65 +51,82 @@ protected:
// of those in dictionary
WordsBitmap m_sourceCompleted;
WordsRange m_currSourceWordsRange, m_currTargetWordsRange;
+ bool m_wordDeleted;
#ifdef N_BEST
std::list<Arc*> m_arcList; //all arcs that end at the same lattice point as we do
#endif
/***
+ * Used for initializing translation process
+ */
+ Hypothesis(const Phrase &phrase, const WordsBitmap &initialCoverage);
+ // create next
+ Hypothesis(const Hypothesis &prevHypo, const TranslationOption &transOpt);
+
+ /***
* \return whether none of the factors clash
+ * \param phrase TODO ???
*/
bool IsCompatible(const Phrase &phrase) const;
void CalcFutureScore(const SquareMatrix &futureScore);
//void CalcFutureScore(float futureScore[256][256]);
void CalcLMScore(const LMList &lmListInitial, const LMList &lmListEnd);
+ void CalcDistortionScore();
//TODO: add appropriate arguments to score calculator
- void CalcLexicalReorderingScore();
+ void CalcLexicalReorderingScore();
+ void CalcDeletionScore(const Sentence& sourceSentence, const WordsRange& sourceWordsRange, const WordDeletionTable& wordDeletionTable);
public:
- static int s_numNodes;
- int m_id;
-
+ static int s_numNodes; //TODO what is this?
+ int m_id;
+
/***
* Deep copy
*/
- Hypothesis(const Hypothesis &copy);
+ Hypothesis(const Hypothesis &copy);
+
+ /***
+ * return the subclass of Hypothesis most appropriate to the given translation option
+ */
+ static Hypothesis* Create(const Hypothesis &prevHypo, const TranslationOption &transOpt);
+ /***
+ * return the subclass of Hypothesis most appropriate to the given target phrase
+ */
+ static Hypothesis* Create(const Phrase& targetPhrase, const WordsBitmap &initialCoverage);
- // used to create clone
- Hypothesis(const Phrase &phrase, const WordsBitmap &initialCoverage);
- // used for initial seeding of trans process
- Hypothesis(const Hypothesis &prevHypo, const TranslationOption &transOpt);
- // create next
~Hypothesis();
- inline Hypothesis *Clone() const
- {
- return new Hypothesis(*this);
- }
-
- Hypothesis *CreateNext(const TranslationOption &transOpt) const;
- Hypothesis *MergeNext(const TranslationOption &transOpt) const;
-
- int GetId()const;
- void PrintHypothesis( const Sentence &source, float weightDistortion, float weightWordPenalty) const;
+ /***
+ * return the subclass of Hypothesis most appropriate to the given translation option
+ */
+ Hypothesis* CreateNext(const TranslationOption &transOpt) const;
+ /***
+ * if any factors aren't set in our target phrase but are present in transOpt, copy them over
+ * (unless the factors that we do have fail to match the corresponding ones in transOpt,
+ * in which case presumably there's a programmer's error)
+ *
+ * return NULL if we aren't compatible with the given option
+ */
+ Hypothesis* MergeNext(const TranslationOption &transOpt) const;
+
+ virtual void PrintHypothesis( const Sentence &source, float weightDistortion, float weightWordPenalty) const;
// void PrintLMScores(const LMList &lmListInitial, const LMList &lmListEnd) const;
inline const WordsRange &GetCurrSourceWordsRange() const
{
return m_currSourceWordsRange;
}
- inline size_t GetCurrTargetLength() const
+
+ // subsequent translation should only translate this sub-phrase
+ virtual size_t GetCurrTargetLength() const
{
return m_currTargetWordsRange.GetWordsCount();
}
- // subsequent translation should only translate this sub-phrase
- void CalcScore(const LMList &lmListInitial
- , const LMList &lmListEnd
- , float weightDistortion
- , float weightWordPenalty
- , const SquareMatrix &futureScore
- , const Sentence &source) ;
+ virtual void CalcScore(const StaticData& staticData, const SquareMatrix &futureScore, const Sentence &source);
+
+ int GetId() const;
const Hypothesis* GetPrevHypo() const;
@@ -116,34 +137,34 @@ public:
}
inline const Phrase &GetPhrase() const
{
- return m_phrase;
+ return m_targetPhrase;
}
// curr
inline FactorArray &GetCurrFactorArray(size_t pos)
{
- return m_phrase.GetFactorArray(pos);
+ return m_targetPhrase.GetFactorArray(pos);
}
inline const FactorArray &GetCurrFactorArray(size_t pos) const
{
- return m_phrase.GetFactorArray(pos);
+ return m_targetPhrase.GetFactorArray(pos);
}
inline const Factor *GetCurrFactor(size_t pos, FactorType factorType) const
{
- return m_phrase.GetFactor(pos, factorType);
+ return m_targetPhrase.GetFactor(pos, factorType);
}
// recursive
inline const FactorArray &GetFactorArray(size_t pos) const
{
if (pos < m_currTargetWordsRange.GetStartPos())
return m_prevHypo->GetFactorArray(pos);
- return m_phrase.GetFactorArray(pos - m_currTargetWordsRange.GetStartPos());
+ return m_targetPhrase.GetFactorArray(pos - m_currTargetWordsRange.GetStartPos());
}
- inline const Factor *GetFactor(size_t pos, FactorType factorType) const
+ inline const Factor* GetFactor(size_t pos, FactorType factorType) const
{
if (pos < m_currTargetWordsRange.GetStartPos())
return m_prevHypo->GetFactor(pos, factorType);
- return m_phrase.GetFactor(pos - m_currTargetWordsRange.GetStartPos(), factorType);
+ return m_targetPhrase.GetFactor(pos - m_currTargetWordsRange.GetStartPos(), factorType);
}
/***
diff --git a/moses/src/LanguageModel.cpp b/moses/src/LanguageModel.cpp
index d2f0e7647..4f393a877 100755
--- a/moses/src/LanguageModel.cpp
+++ b/moses/src/LanguageModel.cpp
@@ -40,6 +40,9 @@ const LmId LanguageModel::UNKNOWN_LM_ID(0);
LanguageModel::LanguageModel() {}
LanguageModel::~LanguageModel() {}
+/***
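+ * NOTE(review): the signature below takes phrase, fullScore and ngramScore but no ngramComponent, so this sentence looks stale: ngramComponent should be an invalid pointer iff n-best ranking is turned off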
+ */
void LanguageModel::CalcScore(const Phrase &phrase
, float &fullScore
, float &ngramScore) const
@@ -79,4 +82,3 @@ void LanguageModel::CalcScore(const Phrase &phrase
}
fullScore += ngramScore;
}
-
diff --git a/moses/src/LanguageModel.h b/moses/src/LanguageModel.h
index 39eabef45..5db15c977 100755
--- a/moses/src/LanguageModel.h
+++ b/moses/src/LanguageModel.h
@@ -63,6 +63,9 @@ public:
{
return m_sentenceEnd;
}
+ /***
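+ * NOTE(review): the declaration below takes phrase, fullScore and ngramScore but no ngramComponent, so this sentence looks stale: ngramComponent should be an invalid pointer iff n-best ranking is turned off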
+ */
void CalcScore(const Phrase &phrase
, float &fullScore
, float &ngramScore) const;
diff --git a/moses/src/LatticeEdge.cpp b/moses/src/LatticeEdge.cpp
index 9484ca3b6..b2f796b5e 100755
--- a/moses/src/LatticeEdge.cpp
+++ b/moses/src/LatticeEdge.cpp
@@ -19,6 +19,8 @@ License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
+#include <cstring> // memset
+
#include "LatticeEdge.h"
#include "LanguageModel.h"
@@ -30,10 +32,7 @@ LatticeEdge::~LatticeEdge()
void LatticeEdge::ResetScore()
{
- for (size_t i = 0 ; i < NUM_SCORES ; i++)
- {
- m_score[i] = 0;
- }
+ std::memset(m_score, 0, sizeof(float) * NUM_SCORES);
}
#ifdef N_BEST
diff --git a/moses/src/LatticeEdge.h b/moses/src/LatticeEdge.h
index 31026031b..d254d6f00 100755
--- a/moses/src/LatticeEdge.h
+++ b/moses/src/LatticeEdge.h
@@ -21,6 +21,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#pragma once
+#include <cstring> //memcpy()
#include <iostream>
#include <list>
#include "TypeDef.h"
@@ -43,8 +44,8 @@ protected:
// scores
float m_score[NUM_SCORES];
- const Hypothesis *m_prevHypo;
- Phrase m_phrase; //target phrase being created at the current decoding step
+ const Hypothesis* m_prevHypo;
+ Phrase m_targetPhrase; //target phrase being created at the current decoding step
#ifdef N_BEST
ScoreComponentCollection m_transScoreComponent;
@@ -54,14 +55,14 @@ protected:
public:
LatticeEdge(const LatticeEdge &copy); // not implemented
- LatticeEdge(const float score[NUM_SCORES]
+ LatticeEdge(const float score[]
, const ScoreComponentCollection &transScoreComponent
, const ScoreColl &lmScoreComponent
, const ScoreColl &generationScoreComponent
, const Phrase &phrase
, const Hypothesis *prevHypo)
:m_prevHypo(prevHypo)
- ,m_phrase(phrase)
+ ,m_targetPhrase(phrase)
#ifdef N_BEST
,m_transScoreComponent(transScoreComponent)
,m_generationScoreComponent(generationScoreComponent)
@@ -72,25 +73,25 @@ public:
}
LatticeEdge(FactorDirection direction, const Hypothesis *prevHypo)
:m_prevHypo(prevHypo)
- ,m_phrase(direction)
+ ,m_targetPhrase(direction)
{}
virtual ~LatticeEdge();
- inline const Phrase &GetPhrase() const
+ inline const Phrase &GetTargetPhrase() const
{
- return m_phrase;
+ return m_targetPhrase;
}
inline void SetFactor(size_t pos, FactorType factorType, const Factor *factor)
{ // pos starts from current phrase, not from beginning of 1st phrase
- m_phrase.SetFactor(pos, factorType, factor);
+ m_targetPhrase.SetFactor(pos, factorType, factor);
}
- inline void SetScore(const float score[NUM_SCORES])
+ /***
+ * score should be of length NUM_SCORES
+ */
+ inline void SetScore(const float score[])
{
- for (size_t currScore = 0 ; currScore < NUM_SCORES ; currScore++)
- {
- m_score[currScore] = score[currScore];
- }
+ std::memcpy(m_score, score, NUM_SCORES * sizeof(float));
}
void ResetScore();
@@ -130,7 +131,7 @@ public:
inline std::ostream& operator<<(std::ostream& out, const LatticeEdge& edge)
{
- out << edge.GetPhrase();
+ out << edge.GetTargetPhrase();
return out;
}
diff --git a/moses/src/Makefile.am b/moses/src/Makefile.am
index 68380260d..acb560f65 100644
--- a/moses/src/Makefile.am
+++ b/moses/src/Makefile.am
@@ -2,18 +2,19 @@ lib_LIBRARIES = libmoses.a
libmoses_a_SOURCES = \
Arc.cpp \
ConfusionNet.cpp \
- FactorCollection.cpp \
+ DeletionHypothesis.cpp \
Factor.cpp \
+ FactorCollection.cpp \
FactorTypeSet.cpp \
GenerationDictionary.cpp \
+ Hypothesis.cpp \
HypothesisCollection.cpp \
HypothesisCollectionIntermediate.cpp \
- Hypothesis.cpp \
InputFileStream.cpp \
LanguageModel.cpp \
LatticeEdge.cpp \
LatticePath.cpp \
- LexicalReordering.cpp \
+ LexicalReordering.cpp \
Manager.cpp \
md5.cpp \
Parameter.cpp \
@@ -26,11 +27,13 @@ libmoses_a_SOURCES = \
TargetPhrase.cpp \
TranslationOption.cpp \
TranslationOptionCollection.cpp \
- UserMessage.cpp \
- Util.cpp \
- WeightOptimization.cpp \
- Word.cpp \
- WordsBitmap.cpp \
+ UnknownWordHandler.cpp \
+ UserMessage.cpp \
+ Util.cpp \
+ WeightOptimization.cpp \
+ Word.cpp \
+ WordDeletionTable.cpp \
+ WordsBitmap.cpp \
WordsRange.cpp
if INTERNAL_LM
diff --git a/moses/src/Manager.cpp b/moses/src/Manager.cpp
index a72337140..827e4b806 100755
--- a/moses/src/Manager.cpp
+++ b/moses/src/Manager.cpp
@@ -77,7 +77,7 @@ void Manager::ProcessSentence()
// seed hypothesis
{
- Hypothesis *hypo = new Hypothesis(m_source, m_possibleTranslations.GetInitialCoverage());
+ Hypothesis *hypo = Hypothesis::Create(m_source, m_possibleTranslations.GetInitialCoverage());
TRACE_ERR(m_possibleTranslations.GetInitialCoverage().GetWordsCount() << endl);
#ifdef N_BEST
LMList allLM = m_staticData.GetAllLM();
@@ -176,11 +176,7 @@ void Manager::ProcessOneHypothesis(const list < DecodeStep > &decodeStepList, co
{
Hypothesis *hypo = *iterHypo;
- hypo->CalcScore(m_staticData.GetLanguageModel(Initial)
- , m_staticData.GetLanguageModel(Other)
- , m_staticData.GetWeightDistortion()
- , m_staticData.GetWeightWordPenalty()
- , m_possibleTranslations.GetFutureScore(), m_source);
+ hypo->CalcScore(m_staticData, m_possibleTranslations.GetFutureScore(), m_source);
if(m_staticData.GetVerboseLevel() > 2)
{
hypo->PrintHypothesis(m_source, m_staticData.GetWeightDistortion(), m_staticData.GetWeightWordPenalty());
@@ -207,7 +203,6 @@ void Manager::ProcessOneHypothesis(const list < DecodeStep > &decodeStepList, co
++iterHypo;
}
}
-
}
void Manager::ProcessInitialTranslation(const Hypothesis &hypothesis, const DecodeStep &decodeStep, HypothesisCollectionIntermediate &outputHypoColl)
@@ -222,7 +217,7 @@ void Manager::ProcessInitialTranslation(const Hypothesis &hypothesis, const Deco
if ( !transOpt.Overlap(hypothesis))
{
- Hypothesis *newHypo = hypothesis.CreateNext(transOpt);
+ Hypothesis* newHypo = hypothesis.CreateNext(transOpt);
outputHypoColl.AddNoPrune( newHypo );
}
}
@@ -276,9 +271,6 @@ void Manager::ProcessInitialTranslation(const Hypothesis &hypothesis, const Deco
}
}
}
-
-
-
}
void Manager::ProcessTranslation(const Hypothesis &hypothesis, const DecodeStep &decodeStep, HypothesisCollectionIntermediate &outputHypoColl)
@@ -398,8 +390,8 @@ void Manager::CreateTranslationOptions(const Phrase &phrase, PhraseDictionary &p
/*
* changed to have an extendable unknown-word translation module -- EVH
*/
- //std::list<TranslationOption> unknownWordTranslations = m_staticData.GetUnknownWordHandler().GetPossibleTranslations(wordsRange, sourcePhrase, m_staticData, phraseDictionary);
- //m_possibleTranslations.insert(m_possibleTranslations.end(), unknownWordTranslations.begin(), unknownWordTranslations.end());
+// boost::shared_ptr<std::list<TranslationOption> > unknownWordTranslations = m_staticData.GetUnknownWordHandler()->GetPossibleTranslations(wordsRange, sourcePhrase, m_staticData, phraseDictionary);
+// m_possibleTranslations.insert(m_possibleTranslations.end(), unknownWordTranslations->begin(), unknownWordTranslations->end());
}
}
}
@@ -443,12 +435,10 @@ void Manager::CreateTranslationOptions(const Phrase &phrase, PhraseDictionary &p
//print information about future cost table when verbose option is set
-
if(m_staticData.GetVerboseLevel() > 2)
- {
- cout<<"future cost from "<<start<<" to "<<end<<" is "<<score[length]<<endl;
- }
-
+ {
+ cout<<"future cost from "<<start<<" to "<<end<<" is "<<score[length]<<endl;
+ }
}
}
}
@@ -542,7 +532,7 @@ void Manager::ProcessGeneration(const Hypothesis &hypothesis
}
// merge with existing hypothesis
- Hypothesis *mergeHypo = hypothesis.Clone();
+ Hypothesis *mergeHypo = new Hypothesis(hypothesis);
mergeHypo->MergeFactors(mergeWords, generationDictionary, generationScore, weight);
outputHypoColl.AddNoPrune(mergeHypo);
diff --git a/moses/src/Parameter.cpp b/moses/src/Parameter.cpp
index b5b856579..f12ed72cc 100755
--- a/moses/src/Parameter.cpp
+++ b/moses/src/Parameter.cpp
@@ -48,6 +48,7 @@ Parameter::Parameter()
AddParam("weight-l");
AddParam("weight-t");
AddParam("weight-w");
+ AddParam("weight-e"); //source word deletion overall weight
AddParam("weight-generation");
AddParam("mapping");
AddParam("n-best-list");
@@ -224,6 +225,7 @@ bool Parameter::LoadParam(int argc, char* argv[])
OverwriteParam("-lm", "weight-l", argc, argv);
OverwriteParam("-tm", "weight-t", argc, argv);
OverwriteParam("-w", "weight-w", argc, argv);
+ OverwriteParam("-e", "weight-e", argc, argv);
OverwriteParam("-g", "weight-generation", argc, argv);
OverwriteParam("-n-best-list", "n-best-list", argc, argv);
OverwriteParam("-s", "stack", argc, argv);
diff --git a/moses/src/PhraseDictionary.cpp b/moses/src/PhraseDictionary.cpp
index d3e2e1840..2afcce5b7 100755
--- a/moses/src/PhraseDictionary.cpp
+++ b/moses/src/PhraseDictionary.cpp
@@ -66,22 +66,32 @@ void PhraseDictionary::Load(const std::vector<FactorType> &input
string line, prevSourcePhrase = "";
bool addPhrase = !filter;
size_t count = 0;
+ size_t line_num = 0;
while(getline(inFile, line))
{
- vector<string> token = TokenizeMultiCharSeparator( line , "|||" );
-
+ ++line_num;
+ vector<string> tokens = TokenizeMultiCharSeparator( line , "|||" );
+ if (tokens.size() != 3)
+ {
+ TRACE_ERR("Syntax error at " << filePath << ":" << line_num);
+ abort(); // TODO- error handling
+ }
+ if (tokens[1].find_first_not_of(" \t", 0) == string::npos) {
+ TRACE_ERR(filePath << ":" << line_num << ": phrase contains empty target, skipping\n");
+ continue;
+ }
if (!filter)
{
- if (token[0] != prevSourcePhrase)
- phraseVector = Phrase::Parse(token[0]);
+ if (tokens[0] != prevSourcePhrase)
+ phraseVector = Phrase::Parse(tokens[0]);
}
- else if (token[0] == prevSourcePhrase)
+ else if (tokens[0] == prevSourcePhrase)
{ // same source phrase as prev line.
}
else
{
- phraseVector = Phrase::Parse(token[0]);
- prevSourcePhrase = token[0];
+ phraseVector = Phrase::Parse(tokens[0]);
+ prevSourcePhrase = tokens[0];
if (Contains(phraseVector, inputPhraseList, input))
addPhrase = true;
@@ -91,7 +101,7 @@ void PhraseDictionary::Load(const std::vector<FactorType> &input
if (addPhrase)
{
- vector<float> scoreVector = Tokenize<float>(token[2]);
+ vector<float> scoreVector = Tokenize<float>(tokens[2]);
assert(scoreVector.size() == m_noScoreComponent);
// source
@@ -99,7 +109,7 @@ void PhraseDictionary::Load(const std::vector<FactorType> &input
sourcePhrase.CreateFromString( input, phraseVector, factorCollection);
//target
TargetPhrase targetPhrase(Output, this);
- targetPhrase.CreateFromString( output, token[1], factorCollection);
+ targetPhrase.CreateFromString( output, tokens[1], factorCollection);
// component score, for n-best output
targetPhrase.SetScore(scoreVector, weight, languageModels, weightWP);
diff --git a/moses/src/StaticData.cpp b/moses/src/StaticData.cpp
index f0edc1213..91e02f4c5 100755
--- a/moses/src/StaticData.cpp
+++ b/moses/src/StaticData.cpp
@@ -91,9 +91,17 @@ bool StaticData::LoadParameters(int argc, char* argv[])
//input-factors
const vector<string> &inputFactorVector = m_parameter.GetParam("input-factors");
for(size_t i=0; i<inputFactorVector.size(); i++)
- {
+ {
m_inputFactorOrder.push_back(Scan<FactorType>(inputFactorVector[i]));
}
+
+  // Source word deletion: enabled exactly when a deletion table ("dtable-file") is configured.
+  // Always give the weight a defined value so GetWordDeletionWeight() never reads an unset member.
+  m_wordDeletionWeight = 0;
+  m_wordDeletionEnabled = (m_parameter.GetParam("dtable-file").size() > 0);
+  if (m_wordDeletionEnabled)
+  {
+    // weight-e is mandatory once deletion is on; indexing an empty parameter vector
+    // would be undefined behaviour, so fail parameter loading with a clear message instead.
+    const vector<string> &deletionWeight = m_parameter.GetParam("weight-e");
+    if (deletionWeight.empty())
+    {
+      std::cerr << "dtable-file was given but weight-e (-e) is missing; cannot enable word deletion." << std::endl;
+      return false;
+    }
+    m_wordDeletionWeight = Scan<float>(deletionWeight[0]);
+    if (GetVerboseLevel() > 0) { std::cerr << "Word deletion enabled." << std::endl; }
+  }
// load Lexical Reordering model
// check to see if the lexical reordering parameter exists
@@ -281,6 +289,7 @@ bool StaticData::LoadParameters(int argc, char* argv[])
: TransformScore(DEFAULT_BEAM_THRESHOLD);
// Unknown Word Processing -- wade
+ //TODO replace this w/general word dropping -- EVH
if (m_parameter.GetParam("drop-unknown").size() == 1)
{ m_dropUnknown = Scan<size_t>( m_parameter.GetParam("drop-unknown")[0]); }
else
@@ -413,6 +422,7 @@ void StaticData::LoadPhraseTables(bool filter
for(size_t currDict = 0 ; currDict < translationVector.size(); currDict++)
{
vector<string> token = Tokenize(translationVector[currDict]);
+ //characteristics of the phrase table
vector<FactorType> input = Tokenize<FactorType>(token[0], ",")
,output = Tokenize<FactorType>(token[1], ",");
string filePath= token[3];
@@ -467,7 +477,13 @@ void StaticData::LoadPhraseTables(bool filter
timer.check("Finished loading PhraseTable");
}
}
-
+/*
+ //load word deletion table
+ if(m_parameter.GetParam("dtable-file").size() > 0)
+ {
+ m_wordDeletionTable.Load(m_parameter.GetParam("dtable-file")[0], *this);
+ }
+*/
timer.check("Finished loading phrase tables");
}
diff --git a/moses/src/StaticData.h b/moses/src/StaticData.h
index 4e2a32ec1..0ab55b46a 100755
--- a/moses/src/StaticData.h
+++ b/moses/src/StaticData.h
@@ -23,6 +23,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <list>
#include <vector>
+#include <boost/shared_ptr.hpp>
#include "TypeDef.h"
#include "PhraseDictionary.h"
#include "GenerationDictionary.h"
@@ -33,6 +34,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "InputOutput.h"
#include "DecodeStep.h"
//#include "UnknownWordHandler.h"
+#include "WordDeletionTable.h"
class StaticData
{
@@ -40,16 +42,17 @@ protected:
FactorCollection m_factorCollection;
std::vector<PhraseDictionary*> m_phraseDictionary;
std::vector<GenerationDictionary*> m_generationDictionary;
+ WordDeletionTable m_wordDeletionTable;
std::list < DecodeStep > m_decodeStepList;
- Parameter m_parameter;
- std::vector<FactorType> m_inputFactorOrder;
- std::vector<LMList> m_languageModel;
+ Parameter m_parameter;
+ std::vector<FactorType> m_inputFactorOrder;
+// boost::shared_ptr<UnknownWordHandler> m_unknownWordHandler; //defaults to NULL; pointer allows polymorphism
+ std::vector<LMList> m_languageModel;
LexicalReordering *m_lexReorder;
-// UnknownWordHandler m_unknownWordHandler; //defaults to NULL; pointer allows polymorphism
// Initial = 0 = can be used when creating poss trans
// Other = 1 = used to calculate LM score once all steps have been processed
float m_beamThreshold
- ,m_weightDistortion, m_weightWordPenalty;
+ ,m_weightDistortion, m_weightWordPenalty, m_wordDeletionWeight;
// PhraseTrans, Generation & LanguageModelScore has multiple weights.
int m_maxDistortion;
// do it differently from old pharaoh
@@ -61,16 +64,28 @@ protected:
std::vector<std::string> m_mySQLParam;
InputOutput *m_inputOutput;
bool m_fLMsLoaded;
- int m_dropUnknown;
-
+ /***
+ * false = treat unknown words as proper nouns, and translate them as themselves;
+ * true = drop (ignore) them
+ */
+ bool m_dropUnknown;
+ bool m_wordDeletionEnabled;
+
size_t m_verboseLevel;
public:
StaticData();
~StaticData();
+ /***
+ * also initialize the Parameter object
+ */
bool LoadParameters(int argc, char* argv[]);
+ /***
+ * load not only the main phrase table but also any auxiliary tables that depend on which features are being used
+ * (eg word-deletion, word-insertion tables)
+ */
void LoadPhraseTables(bool filter
, const std::string &inputFileHash
, const std::list< Phrase > &inputPhraseList);
@@ -79,7 +94,7 @@ public:
LoadPhraseTables(false, "", std::list< Phrase >());
}
void LoadMapping();
-/* void SetUnknownWordHandler(UnknownWordHandler &unknownWordHandler)
+/* void SetUnknownWordHandler(boost::shared_ptr<UnknownWordHandler> unknownWordHandler)
{
m_unknownWordHandler = unknownWordHandler;
}
@@ -103,17 +118,17 @@ public:
{
return m_decodeStepList;
}
-
- inline int GetDropUnknown() const
- {
- return m_dropUnknown;
- }
+
+ inline bool GetDropUnknown() const
+ {
+ return m_dropUnknown;
+ }
/*
- UnknownWordHandler &GetUnknownWordHandler()
+ boost::shared_ptr<UnknownWordHandler> GetUnknownWordHandler()
{
return m_unknownWordHandler;
}
-*/
+*/
FactorCollection &GetFactorCollection()
{
return m_factorCollection;
@@ -131,6 +146,14 @@ public:
{
return m_weightWordPenalty;
}
+ float GetWordDeletionWeight() const // overall source-word-deletion weight ("weight-e"); LoadParameters assigns it only when a dtable-file is configured
+ {
+ return m_wordDeletionWeight;
+ }
+ bool LittleChrisAsksWhetherWordDeletionIsEnabledAndWeAnswerHim() const // i.e. "is word deletion enabled?": true iff LoadParameters found a dtable-file parameter
+ {
+ return m_wordDeletionEnabled;
+ }
size_t GetMaxHypoStackSize() const
{
return m_maxHypoStackSize;
@@ -168,6 +191,13 @@ public:
{
return m_cachePath;
}
+ /***
+ * only call this if word deletion is enabled; NOTE(review): the Load() call in LoadPhraseTables looks commented out in this revision, so the table may be empty -- confirm
+ */
+ const WordDeletionTable& GetWordDeletionTable() const
+ {
+ return m_wordDeletionTable;
+ }
size_t GetVerboseLevel() const
{
diff --git a/moses/src/TargetPhrase.cpp b/moses/src/TargetPhrase.cpp
index 7a5e22476..bdd54c827 100644
--- a/moses/src/TargetPhrase.cpp
+++ b/moses/src/TargetPhrase.cpp
@@ -34,11 +34,12 @@ TargetPhrase::TargetPhrase(FactorDirection direction, const PhraseDictionary *ph
{
}
+// used when creating translations of unknown words:
// TODO the two versions of SetScore have two problems:
// 1) they are badly named- computePhraseScores would probably be better
// 2) they duplicate way too much code between them
void TargetPhrase::SetScore(const LMList &languageModels, float weightWP)
-{ // used when creating translations of unknown words:
+{
m_transScore = m_ngramScore = 0;
m_fullScore = weightWP;
@@ -54,16 +55,14 @@ void TargetPhrase::SetScore(const LMList &languageModels, float weightWP)
float fullScore, nGramScore;
- #ifdef N_BEST
- (*lmIter)->CalcScore(*this, fullScore, nGramScore);
- size_t lmId = (*lmIter)->GetId();
- pair<size_t, float> store(lmId, nGramScore);
- m_ngramComponent.push_back(store);
- #else
- // this is really, really ugly (a reference to an object at NULL
- // is asking for trouble). TODO
- (*lmIter)->CalcScore(*this, fullScore, nGramScore);
- #endif
+#ifdef N_BEST
+ (*lmIter)->CalcScore(*this, fullScore, nGramScore);
+ size_t lmId = (*lmIter)->GetId();
+ pair<size_t, float> store(lmId, nGramScore);
+ m_ngramComponent.push_back(store);
+#else
+ (*lmIter)->CalcScore(*this, fullScore, nGramScore);
+#endif
m_fullScore += fullScore * weightLM;
m_ngramScore += nGramScore * weightLM;
@@ -104,19 +103,16 @@ void TargetPhrase::SetScore(const vector<float> &scoreVector, const vector<float
float fullScore, nGramScore;
#ifdef N_BEST
lm.CalcScore(*this, fullScore, nGramScore);
- size_t lmId = lm.GetId();
- pair<size_t, float> store(lmId, nGramScore);
- m_ngramComponent.push_back(store);
+ size_t lmId = lm.GetId();
+ pair<size_t, float> store(lmId, nGramScore);
+ m_ngramComponent.push_back(store);
#else
- // this is really, really ugly (a reference to an object at NULL
- // is asking for trouble). TODO
lm.CalcScore(*this, fullScore, nGramScore);
#endif
// total LM score so far
totalNgramScore += nGramScore * weightLM;
totalFullScore += fullScore * weightLM;
-
}
}
m_ngramScore = totalNgramScore;
diff --git a/moses/src/TargetPhrase.h b/moses/src/TargetPhrase.h
index 2c403c890..c9a19d9fb 100644
--- a/moses/src/TargetPhrase.h
+++ b/moses/src/TargetPhrase.h
@@ -29,30 +29,18 @@ class PhraseDictionary;
class TargetPhrase: public Phrase
{
- friend std::ostream& operator<<(std::ostream&, const TargetPhrase&);
+ friend std::ostream& operator<<(std::ostream&, const TargetPhrase&);
protected:
float m_transScore, m_ngramScore, m_fullScore;
#ifdef N_BEST
ScoreComponent m_scoreComponent;
- std::list< std::pair<size_t, float> > m_lmScoreComponent;
- std::list< std::pair<size_t, float> > m_ngramComponent;
+ std::vector< std::pair<size_t, float> > m_lmScoreComponent;
+ std::vector< std::pair<size_t, float> > m_ngramComponent;
#endif
public:
TargetPhrase(FactorDirection direction, const PhraseDictionary *phraseDictionary);
-
- /***
- * Deep copy
- *
- TargetPhrase(const TargetPhrase& phrase)
- : Phrase(phrase.GetDirection()), m_transScore(phrase.m_transScore), m_ngramScore(phrase.m_ngramScore), m_fullScore(phrase.m_fullScore)
-#ifdef N_BEST
- , m_scoreComponent(phrase.m_scoreComponent), m_lmScoreComponent(phrase.m_lmScoreComponent), m_ngramComponent(phrase.m_ngramComponent)
-#endif
- {
- }
- */
void SetScore(const std::vector<float> &scoreVector, const std::vector<float> &weightT,
const LMList &languageModels, float weightWP);
@@ -61,33 +49,39 @@ public:
void ResetScore();
void SetWeights(const std::vector<float> &weightT);
- inline float GetTranslationScore() const
- {
- return m_transScore;
- }
- //TODO is this really the best name?
- inline float GetFutureScore() const
- {
- return m_fullScore;
- }
- inline float GetNgramScore() const
- {
- return m_ngramScore;
- }
+ inline float GetTranslationScore() const
+ {
+ return m_transScore;
+ }
+ /***
+ * return the estimated score resulting from our being added to a sentence
+ * (it's an estimate because we don't have full n-gram info for the language model
+ * without using the (unknown) full sentence)
+ *
+ * TODO is this really the best name?
+ */
+ inline float GetFutureScore() const
+ {
+ return m_fullScore;
+ }
+ inline float GetNgramScore() const
+ {
+ return m_ngramScore;
+ }
#ifdef N_BEST
inline const ScoreComponent &GetScoreComponents() const
{
return m_scoreComponent;
}
- inline const std::list< std::pair<size_t, float> > &GetLMScoreComponent() const
- {
- return m_lmScoreComponent;
- }
- inline const std::list< std::pair<size_t, float> > &GetNgramComponent() const
- {
- return m_ngramComponent;
- }
+ inline const std::vector< std::pair<size_t, float> > &GetLMScoreComponent() const
+ {
+ return m_lmScoreComponent;
+ }
+ inline const std::vector< std::pair<size_t, float> > &GetNgramComponent() const
+ {
+ return m_ngramComponent;
+ }
#endif
};
diff --git a/moses/src/TranslationOption.h b/moses/src/TranslationOption.h
index 03cfde5a4..57b675e65 100755
--- a/moses/src/TranslationOption.h
+++ b/moses/src/TranslationOption.h
@@ -30,7 +30,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/***
* Specify source and target words for a possible translation. m_targetPhrase points to a phrase-table entry.
- * The source word range is zero-indexed, so it can't refer to an empty range.
+ * The source word range is zero-indexed, so it can't refer to an empty range. The target phrase may be empty.
*/
class TranslationOption
{
@@ -48,26 +48,48 @@ public:
TranslationOption(const WordsRange &wordsRange, const TargetPhrase &targetPhrase);
bool Overlap(const Hypothesis &hypothesis) const;
+ /***
+ * return start index of source phrase
+ */
inline size_t GetStartPos() const
{
return m_sourceWordsRange.GetStartPos();
}
+ /***
+ * return end index of source phrase
+ */
inline size_t GetEndPos() const
{
return m_sourceWordsRange.GetEndPos();
}
+ /***
+ * return length of source phrase
+ */
inline size_t GetSize() const
{
return m_sourceWordsRange.GetEndPos() - m_sourceWordsRange.GetStartPos() + 1;
}
+ /***
+ * return source words range
+ */
inline const WordsRange &GetWordsRange() const
{
return m_sourceWordsRange;
}
- inline const Phrase &GetPhrase() const
+ /***
+ * return target phrase
+ */
+ inline const Phrase& GetPhrase() const
{
return m_targetPhrase;
}
+ /***
+ * returns true if the target phrase is empty (GetSize() == 0), i.e. the source phrase translates into nothing
+ */
+ inline bool IsDeletionOption() const
+ {
+ return m_targetPhrase.GetSize() == 0;
+ }
inline float GetTranslationScore() const
{
return m_targetPhrase.GetTranslationScore();
@@ -86,11 +108,11 @@ public:
{
return m_transScoreComponent;
}
- inline const std::list< std::pair<size_t, float> > &GetLMScoreComponent() const
+ inline const std::vector< std::pair<size_t, float> > &GetLMScoreComponent() const
{
return m_targetPhrase.GetLMScoreComponent();
}
- inline const std::list< std::pair<size_t, float> > &GetTrigramComponent() const
+ inline const std::vector< std::pair<size_t, float> > &GetTrigramComponent() const
{
return m_targetPhrase.GetNgramComponent();
}
diff --git a/moses/src/TypeDef.h b/moses/src/TypeDef.h
index ff7107109..4c62356fa 100755
--- a/moses/src/TypeDef.h
+++ b/moses/src/TypeDef.h
@@ -105,25 +105,27 @@ enum DecodeType
namespace ScoreType {
enum ScoreType
{
- PhraseTrans
- ,Generation
- ,LanguageModelScore
- ,Distortion
- ,WordPenalty
- ,FutureScoreEnum
- ,LexicalReordering
- ,Total
+ PhraseTrans = 0,
+ Generation,
+ LanguageModelScore,
+ Distortion,
+ WordPenalty,
+ DeletedWords, //source words dropped from translation
+ InsertedWords, //words inserted into target phrase independently of phrase translation
+ FutureScoreEnum,
+ LexicalReordering,
+ Total
};
-};
+}
// count of above
-const size_t NUM_SCORES = 8;
+const size_t NUM_SCORES = 10;
namespace LexReorderType
{
enum LexReorderType
{
- Monotone
+ Monotone //TODO what the jiggers do these symbols mean?
,Msd
,Forward
,Backward
diff --git a/moses/src/UnknownWordHandler.cpp b/moses/src/UnknownWordHandler.cpp
new file mode 100644
index 000000000..45fa03624
--- /dev/null
+++ b/moses/src/UnknownWordHandler.cpp
@@ -0,0 +1,71 @@
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "StaticData.h"
+#include "TranslationOption.h"
+#include "UnknownWordHandler.h"
+
+/***
+ * Default implementation: assume the unknown word is a proper noun and make it its own translation.
+ *
+ * Only the first word of sourcePhrase is read. For every output factor the phrase dictionary uses,
+ * the source factor's string is copied to the target word; the POS factor, and any factor the source
+ * word does not carry, is set to UNKNOWN_FACTOR instead. The resulting target phrase is scored with
+ * all language models plus the word penalty and added to phraseDictionary, so the same source phrase
+ * is no longer unknown the next time it is seen.
+ *
+ * \param sourceWordsRange source positions the returned options cover
+ * \param sourcePhrase     the untranslatable source words (only word 0 is used)
+ * \param staticData       supplies the factor collection, language models and word-penalty weight
+ * \param phraseDictionary phrase table that receives the new entry (modified)
+ * \return list of translation options for sourcePhrase; empty if sourcePhrase has no words or the
+ *         dictionary lookup after insertion finds nothing
+ */
+boost::shared_ptr<std::list<TranslationOption> > UnknownWordHandler::GetPossibleTranslations(
+  const WordsRange& sourceWordsRange, const Phrase& sourcePhrase, StaticData& staticData, PhraseDictionary& phraseDictionary) const
+{
+  boost::shared_ptr<std::list<TranslationOption> > transOpts(new std::list<TranslationOption>);
+
+  // nothing to copy from an empty phrase; reading word 0 below would be out of range
+  if (sourcePhrase.GetSize() == 0)
+    return transOpts;
+
+  TargetPhrase targetPhrase(Output, &phraseDictionary);
+  FactorArray &targetWord = targetPhrase.AddWord();
+  const FactorArray &sourceWord = sourcePhrase.GetFactorArray(0);
+  FactorCollection &factorCollection = staticData.GetFactorCollection();
+
+  //start processing source phrase: here, just copy factors to target
+  const FactorTypeSet &targetFactors = phraseDictionary.GetFactorsUsed(Output);
+  for (unsigned int currFactor = 0 ; currFactor < NUM_FACTORS ; currFactor++)
+  {
+    if (!targetFactors.Contains(currFactor))
+      continue;
+
+    const FactorType factorType = static_cast<FactorType>(currFactor);
+    const Factor *sourceFactor = sourceWord[factorType];
+    // POS of an unknown word is unknown by definition; a missing source factor has no string to copy
+    if (factorType == POS || !sourceFactor)
+      targetWord[factorType] = factorCollection.AddFactor(Output, factorType, UNKNOWN_FACTOR);
+    else
+      targetWord[factorType] = factorCollection.AddFactor(Output, factorType, sourceFactor->GetString());
+  }
+
+  const LMList &languageModels = staticData.GetAllLM();
+  targetPhrase.SetScore(languageModels, staticData.GetWeightWordPenalty());
+
+  /*
+   * add possible translations to the phrase table
+   * (so that if we hit this source phrase again, we won't reprocess it because it won't still be unknown)
+   */
+  phraseDictionary.AddEquivPhrase(sourcePhrase, targetPhrase);
+
+  //turn phrase-table entries into TranslationOption objects
+  const TargetPhraseCollection *phraseColl = phraseDictionary.FindEquivPhrase(sourcePhrase);
+  if (phraseColl)
+  {
+    for (TargetPhraseCollection::const_iterator i = phraseColl->begin(); i != phraseColl->end(); ++i)
+      transOpts->push_back(TranslationOption(sourceWordsRange, *i));
+  }
+  return transOpts;
+}
diff --git a/moses/src/UnknownWordHandler.h b/moses/src/UnknownWordHandler.h
new file mode 100644
index 000000000..23ce78dc5
--- /dev/null
+++ b/moses/src/UnknownWordHandler.h
@@ -0,0 +1,52 @@
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+
+#include <list>
+#include <boost/shared_ptr.hpp>
+#include "TargetPhrase.h"
+#include "WordsRange.h"
+#include "PhraseDictionary.h"
+
+class StaticData;
+class TranslationOption;
+
+/***
+ * Strategy object that proposes translations for source-language words the phrase table can't help us with.
+ * The default implementation (UnknownWordHandler.cpp) treats an unknown word as a proper name and copies it to the target side;
+ * it's meant to be inherited, with subclasses overriding GetPossibleTranslations(). Per the original author, the handler used is set in main().
+ */
+class UnknownWordHandler
+{
+ public:
+
+ UnknownWordHandler() {}
+ virtual ~UnknownWordHandler() {}
+
+ /***
+ * \param sourceWordsRange Source positions of the consecutive words we can't translate via the phrase table; stored in each returned option
+ * \param sourcePhrase The source words to be translated (the default implementation reads only the first word)
+ * \param staticData Supplies the factor collection, language models and word-penalty weight used to build and score the target phrase
+ * \param phraseDictionary A modifiable phrase table; the default implementation adds the new source/target pair to it
+ * \return A shared list of possible translations for the given source phrase
+ */
+ virtual boost::shared_ptr<std::list<TranslationOption> > GetPossibleTranslations(
+ const WordsRange& sourceWordsRange, const Phrase& sourcePhrase, StaticData& staticData, PhraseDictionary& phraseDictionary) const;
+};
diff --git a/moses/src/WordDeletionTable.cpp b/moses/src/WordDeletionTable.cpp
new file mode 100644
index 000000000..201652d6f
--- /dev/null
+++ b/moses/src/WordDeletionTable.cpp
@@ -0,0 +1,54 @@
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+using std::ifstream;
+#include <string>
+#include <vector>
+using std::vector;
+#include "TypeDef.h"
+#include "StaticData.h"
+#include "WordDeletionTable.h"
+using std::string;
+
+/***
+ * Fill the table from a text file whose non-empty lines have the form
+ *     PHRASE ||| DELETION_COST
+ * PHRASE is parsed with the decoder's input factor order; DELETION_COST is parsed as a float.
+ * A later line for the same phrase overwrites the earlier cost.
+ *
+ * The process exits with status -1 if the file cannot be opened or a non-empty line has fewer
+ * than two "|||"-separated fields. All diagnostics go to stderr, not stdout.
+ *
+ * \param filename   path of the deletion table
+ * \param staticData supplies the input factor order, factor collection and verbosity level
+ */
+void WordDeletionTable::Load(const string& filename, StaticData& staticData)
+{
+  std::cerr << "in WordDeletionTable::Load()" << std::endl;
+  ifstream infile(filename.c_str());
+  if(!infile)
+  {
+    std::cerr << "WordDeletionTable::Load(): can't open '" << filename << "' for read; exiting" << std::endl;
+    exit(-1);
+  }
+
+  // loop-invariant lookups hoisted out of the per-line loop
+  const std::vector<FactorType>& input = staticData.GetInputFactorOrder();
+  const bool traceEntries = staticData.GetVerboseLevel() > 2;
+
+  //each line is of format PHRASE ||| DELETION_COST
+  string line;
+  size_t lineNum = 0;
+  while(getline(infile, line, '\n'))
+  {
+    ++lineNum;
+    if(line.empty())
+      continue; // tolerate blank lines (e.g. a trailing newline at end of file)
+
+    vector<string> token = TokenizeMultiCharSeparator(line, "|||");
+    if(token.size() < 2)
+    {
+      // reading token[1] below would be out of range
+      std::cerr << "WordDeletionTable::Load(): syntax error at " << filename << ":" << lineNum
+                << " (expected PHRASE ||| DELETION_COST); exiting" << std::endl;
+      exit(-1);
+    }
+
+    //parse phrase
+    Phrase sourcePhrase(Input);
+    sourcePhrase.CreateFromString(input, token[0], staticData.GetFactorCollection());
+    //parse cost
+    const COST_TYPE cost = Scan<float>(token[1]);
+    m_deletionCosts[sourcePhrase] = cost;
+    if(traceEntries)
+      std::cerr << "dtable entry: " << sourcePhrase << " -> " << cost << std::endl;
+  }
+  // infile is closed by its destructor
+}
diff --git a/moses/src/WordDeletionTable.h b/moses/src/WordDeletionTable.h
new file mode 100644
index 000000000..8fedf913c
--- /dev/null
+++ b/moses/src/WordDeletionTable.h
@@ -0,0 +1,55 @@
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <stdexcept>
+#include "Phrase.h"
+
+class StaticData;
+
+/***
+ * Maps source-language phrases to the cost of dropping them from the translation.
+ * Filled by Load(); queried with GetDeletionCost().
+ */
+class WordDeletionTable
+{
+  typedef float COST_TYPE;
+
+ protected:
+
+  std::map<Phrase, COST_TYPE> m_deletionCosts; //map each source-language phrase to the cost of deleting it
+
+ public:
+
+  /***
+   * Read "PHRASE ||| DELETION_COST" lines from the given file into the table.
+   * should only be called once for a given instance
+   */
+  void Load(const std::string& filename, StaticData& staticData);
+
+  /***
+   * Look up the cost of deleting sourcePhrase.
+   * No dynamic exception specification is used, because that syntax is ill-formed from C++17 on.
+   *
+   * \param sourcePhrase phrase to look up; must match a key stored by Load()
+   * \return the stored deletion cost
+   * \throw std::invalid_argument if the given phrase isn't in our table
+   */
+  COST_TYPE GetDeletionCost(const Phrase& sourcePhrase) const
+  {
+    // single lookup; no tracing here, since this header does not include <iostream>
+    // and printing to stdout on every query would be costly and noisy
+    const std::map<Phrase, COST_TYPE>::const_iterator entry = m_deletionCosts.find(sourcePhrase);
+    if(entry == m_deletionCosts.end())
+      throw std::invalid_argument("WordDeletionTable::GetDeletionCost()");
+    return entry->second;
+  }
+};
diff --git a/moses/src/WordInsertionTable.h b/moses/src/WordInsertionTable.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/moses/src/WordInsertionTable.h