Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/moses
diff options
context:
space:
mode:
author zens <zens@1f5c12ca-751b-0410-a591-d2e778427230> 2006-07-19 04:16:29 +0400
committer zens <zens@1f5c12ca-751b-0410-a591-d2e778427230> 2006-07-19 04:16:29 +0400
commit 648bd1dfcdc337f15ddf88823515e0cbedf62bd4 (patch)
tree 893e3a23ce02ef0dd1f356498fd2bb04feaa2014 /moses
parent 5449e11bb91459dc84bb07ff26ad7143b47df420 (diff)
- prepared confusion net input
  -> new base class InputType is used throughout the decoder instead of Sentence;
     Sentence and ConfusionNet derive from this class
  -> Manager etc. do not know if the input is a sentence or a confusion net (but could check if REALLY needed)
- two separate classes derived from TranslationOptionCollection: one for text input and one for confusion net input
- score computation in PhraseDictionaryTree.cpp and some optimizations

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@183 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'moses')
-rw-r--r-- moses/src/ConfusionNet.cpp 16
-rw-r--r-- moses/src/ConfusionNet.h 16
-rw-r--r-- moses/src/Dictionary.h 3
-rwxr-xr-x moses/src/Hypothesis.cpp 11
-rwxr-xr-x moses/src/Hypothesis.h 10
-rw-r--r-- moses/src/Makefile.am 15
-rwxr-xr-x moses/src/Manager.cpp 29
-rwxr-xr-x moses/src/Manager.h 13
-rwxr-xr-x moses/src/PhraseDictionary.h 3
-rw-r--r-- moses/src/PhraseDictionaryTree.cpp 303
-rw-r--r-- moses/src/PhraseDictionaryTree.h 72
-rwxr-xr-x moses/src/Sentence.h 48
-rw-r--r-- moses/src/TranslationOptionCollection.cpp 170
-rwxr-xr-x moses/src/TranslationOptionCollection.h 17
-rw-r--r-- moses/src/TranslationOptionCollectionConfusionNet.cpp 25
-rw-r--r-- moses/src/TranslationOptionCollectionConfusionNet.h 23
-rw-r--r-- moses/src/TranslationOptionCollectionText.cpp 178
-rw-r--r-- moses/src/TranslationOptionCollectionText.h 22
-rwxr-xr-x moses/src/WordsRange.h 6
19 files changed, 598 insertions, 382 deletions
diff --git a/moses/src/ConfusionNet.cpp b/moses/src/ConfusionNet.cpp
index f3a9c8a66..1d8f734cc 100644
--- a/moses/src/ConfusionNet.cpp
+++ b/moses/src/ConfusionNet.cpp
@@ -5,7 +5,7 @@
#include "FactorCollection.h"
#include "Util.h"
-ConfusionNet::ConfusionNet(FactorCollection* p) : m_factorCollection(p) {}
+ConfusionNet::ConfusionNet(FactorCollection* p) : InputType(),m_factorCollection(p) {}
void ConfusionNet::SetFactorCollection(FactorCollection *p)
{
@@ -31,7 +31,7 @@ void ConfusionNet::String2Word(const std::string& s,Word& w,const std::vector<Fa
bool ConfusionNet::ReadFormat0(std::istream& in,const std::vector<FactorType>& factorOrder) {
assert(m_factorCollection);
- clear();
+ Clear();
std::string line;
while(getline(in,line)) {
std::istringstream is(line);
@@ -55,7 +55,7 @@ bool ConfusionNet::ReadFormat0(std::istream& in,const std::vector<FactorType>& f
}
bool ConfusionNet::ReadFormat1(std::istream& in,const std::vector<FactorType>& factorOrder) {
assert(m_factorCollection);
- clear();
+ Clear();
std::string line;
if(!getline(in,line)) return 0;
size_t s;
@@ -74,7 +74,6 @@ bool ConfusionNet::ReadFormat1(std::istream& in,const std::vector<FactorType>& f
std::cerr<<"WARN: neg costs: "<<data[i][j].second<<" -> set to 0\n";
data[i][j].second=0.0;}
String2Word(word,data[i][j].first,factorOrder);
- // data[i][j].first.SetFactor(Surface,m_factorCollection->AddFactor(Input,Surface,word));
} else return 0;
}
return !data.empty();
@@ -90,3 +89,12 @@ void ConfusionNet::Print(std::ostream& out) const {
}
out<<"\n\n";
}
+
+Phrase ConfusionNet::GetSubString(const WordsRange&) const {
+ std::cerr<<"ERROR: call to ConfusionNet::GetSubString\n";
+ abort();
+ return Phrase();}
+const Factor* ConfusionNet::GetFactor(size_t pos, FactorType factorType) const {
+ std::cerr<<"ERROR: call to ConfusionNet::GetFactor\n";
+ abort();
+ return 0;}
diff --git a/moses/src/ConfusionNet.h b/moses/src/ConfusionNet.h
index ec3ff0f65..7665892ff 100644
--- a/moses/src/ConfusionNet.h
+++ b/moses/src/ConfusionNet.h
@@ -4,9 +4,11 @@
#include <vector>
#include <iostream>
#include "Word.h"
+#include "Input.h"
+
class FactorCollection;
-class ConfusionNet {
+class ConfusionNet : public InputType {
public:
typedef std::vector<std::pair<Word,float> > Column;
@@ -21,13 +23,19 @@ class ConfusionNet {
const Column& GetColumn(size_t i) const {assert(i<data.size());return data[i];}
const Column& operator[](size_t i) const {return GetColumn(i);}
- bool empty() const {return data.empty();}
- size_t size() const {return data.size();}
- void clear() {data.clear();}
+ bool Empty() const {return data.empty();}
+ size_t GetSize() const {return data.size();}
+ void Clear() {data.clear();}
bool Read(std::istream&,const std::vector<FactorType>& factorOrder,int format=0);
void Print(std::ostream&) const;
+
+
+ Phrase GetSubString(const WordsRange&) const;
+ const Factor* GetFactor(size_t pos, FactorType factorType) const;
+
+
private:
bool ReadFormat0(std::istream&,const std::vector<FactorType>& factorOrder);
bool ReadFormat1(std::istream&,const std::vector<FactorType>& factorOrder);
diff --git a/moses/src/Dictionary.h b/moses/src/Dictionary.h
index 1b77359cb..a41f6d349 100644
--- a/moses/src/Dictionary.h
+++ b/moses/src/Dictionary.h
@@ -22,8 +22,9 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#pragma once
#include <vector>
+#include "FactorTypeSet.h"
-class FactorTypeSet;
+//class FactorTypeSet;
class Dictionary
{
diff --git a/moses/src/Hypothesis.cpp b/moses/src/Hypothesis.cpp
index fd4327d92..66cb6ad14 100755
--- a/moses/src/Hypothesis.cpp
+++ b/moses/src/Hypothesis.cpp
@@ -29,6 +29,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "Arc.h"
#include "SquareMatrix.h"
#include "StaticData.h"
+#include "Input.h"
//TODO: add this include in when it compiles
//#include "LexicalReordering.h"
@@ -37,7 +38,7 @@ using namespace std;
int Hypothesis::s_numNodes = 0;
-Hypothesis::Hypothesis(const Phrase &phrase, const WordsBitmap &initialCoverage)
+Hypothesis::Hypothesis(const WordsBitmap &initialCoverage)
: LatticeEdge(Output, NULL)
, m_sourceCompleted(initialCoverage)
, m_currSourceWordsRange(NOT_FOUND, NOT_FOUND)
@@ -159,10 +160,10 @@ Hypothesis* Hypothesis::Create(const Hypothesis &prevHypo, const TranslationOpti
/***
* return the subclass of Hypothesis most appropriate to the given target phrase
*/
-Hypothesis* Hypothesis::Create(const Phrase& targetPhrase, const WordsBitmap &initialCoverage)
+Hypothesis* Hypothesis::Create(const WordsBitmap &initialCoverage)
{
/*if(s_wordDeletionEnabled && targetPhrase.GetSize() == 0) return new DeletionHypothesis(initialCoverage);
- else*/ return new Hypothesis(targetPhrase, initialCoverage);
+ else*/ return new Hypothesis(initialCoverage);
}
/***
@@ -462,7 +463,7 @@ void Hypothesis::CalcDistortionScore()
/***
* calculate the logarithm of our total translation score (sum up components)
*/
-void Hypothesis::CalcScore(const StaticData& staticData, const SquareMatrix &futureScore, const Sentence &source)
+void Hypothesis::CalcScore(const StaticData& staticData, const SquareMatrix &futureScore)
{
// DISTORTION COST
CalcDistortionScore();
@@ -528,7 +529,7 @@ const Hypothesis* Hypothesis::GetPrevHypo()const{
/**
* print hypothesis information for pharaoh-style logging
*/
-void Hypothesis::PrintHypothesis(const Sentence &source, float weightDistortion, float weightWordPenalty) const{
+void Hypothesis::PrintHypothesis(const InputType &source, float weightDistortion, float weightWordPenalty) const{
int start = m_prevHypo->m_currSourceWordsRange.GetEndPos() -1;
int end = m_prevHypo->m_currSourceWordsRange.GetEndPos();
cout<<"creating hypothesis "<< m_id <<" from "<< m_prevHypo->m_id<<" ( ... ";
diff --git a/moses/src/Hypothesis.h b/moses/src/Hypothesis.h
index 21dcaa777..1b943a26d 100755
--- a/moses/src/Hypothesis.h
+++ b/moses/src/Hypothesis.h
@@ -38,7 +38,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
class SquareMatrix;
class StaticData;
class TranslationOption;
-class Sentence;
+class InputType;
class WordsRange;
class WordDeletionTable;
@@ -59,7 +59,7 @@ protected:
/***
* Used for initializing translation process
*/
- Hypothesis(const Phrase &phrase, const WordsBitmap &initialCoverage);
+ Hypothesis(const WordsBitmap &initialCoverage);
// create next
Hypothesis(const Hypothesis &prevHypo, const TranslationOption &transOpt);
@@ -94,7 +94,7 @@ public:
/***
* return the subclass of Hypothesis most appropriate to the given target phrase
*/
- static Hypothesis* Create(const Phrase& targetPhrase, const WordsBitmap &initialCoverage);
+ static Hypothesis* Create(const WordsBitmap &initialCoverage);
~Hypothesis();
@@ -111,7 +111,7 @@ public:
*/
Hypothesis* MergeNext(const TranslationOption &transOpt) const;
- virtual void PrintHypothesis( const Sentence &source, float weightDistortion, float weightWordPenalty) const;
+ virtual void PrintHypothesis( const InputType &source, float weightDistortion, float weightWordPenalty) const;
// void PrintLMScores(const LMList &lmListInitial, const LMList &lmListEnd) const;
inline const WordsRange &GetCurrSourceWordsRange() const
{
@@ -124,7 +124,7 @@ public:
return m_currTargetWordsRange.GetWordsCount();
}
- virtual void CalcScore(const StaticData& staticData, const SquareMatrix &futureScore, const Sentence &source);
+ virtual void CalcScore(const StaticData& staticData, const SquareMatrix &futureScore);
int GetId() const;
diff --git a/moses/src/Makefile.am b/moses/src/Makefile.am
index 94f4e189c..c8a319b54 100644
--- a/moses/src/Makefile.am
+++ b/moses/src/Makefile.am
@@ -2,6 +2,7 @@ lib_LIBRARIES = libmoses.a
libmoses_a_SOURCES = \
Arc.cpp \
ConfusionNet.cpp \
+ CreateTargetPhraseCollection.cpp \
Factor.cpp \
FactorCollection.cpp \
FactorTypeSet.cpp \
@@ -9,6 +10,7 @@ libmoses_a_SOURCES = \
Hypothesis.cpp \
HypothesisCollection.cpp \
HypothesisCollectionIntermediate.cpp \
+ Input.cpp \
InputFileStream.cpp \
LanguageModel.cpp \
LatticeEdge.cpp \
@@ -26,12 +28,13 @@ libmoses_a_SOURCES = \
TargetPhrase.cpp \
TranslationOption.cpp \
TranslationOptionCollection.cpp \
- UnknownWordHandler.cpp \
- UserMessage.cpp \
- Util.cpp \
- WeightOptimization.cpp \
- Word.cpp \
- WordsBitmap.cpp \
+ TranslationOptionCollectionText.cpp \
+ TranslationOptionCollectionConfusionNet.cpp \
+ UserMessage.cpp \
+ Util.cpp \
+ WeightOptimization.cpp \
+ Word.cpp \
+ WordsBitmap.cpp \
WordsRange.cpp
if INTERNAL_LM
diff --git a/moses/src/Manager.cpp b/moses/src/Manager.cpp
index dc0e55307..907969362 100755
--- a/moses/src/Manager.cpp
+++ b/moses/src/Manager.cpp
@@ -32,11 +32,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace std;
-Manager::Manager(const Sentence &sentence, StaticData &staticData)
-:m_source(sentence)
-,m_hypoStack(sentence.GetSize() + 1)
+Manager::Manager(InputType const& source,
+ TranslationOptionCollection& toc,
+ StaticData &staticData)
+:m_source(source)
+,m_hypoStack(source.GetSize() + 1)
,m_staticData(staticData)
-,m_possibleTranslations(sentence)
+,m_possibleTranslations(toc) //dynamic_cast<Sentence const&>(source))
{
std::vector < HypothesisCollection >::iterator iterStack;
for (iterStack = m_hypoStack.begin() ; iterStack != m_hypoStack.end() ; ++iterStack)
@@ -77,13 +79,13 @@ void Manager::ProcessSentence()
// seed hypothesis
{
- Hypothesis *hypo = Hypothesis::Create(m_source, m_possibleTranslations.GetInitialCoverage());
- TRACE_ERR(m_possibleTranslations.GetInitialCoverage().GetWordsCount() << endl);
+ Hypothesis *hypo = Hypothesis::Create(m_possibleTranslations.GetInitialCoverage());
+ TRACE_ERR(m_possibleTranslations.GetInitialCoverage().GetWordsCount() << endl);
#ifdef N_BEST
- LMList allLM = m_staticData.GetAllLM();
- hypo->ResizeComponentScore(allLM, decodeStepList);
+ LMList allLM = m_staticData.GetAllLM();
+ hypo->ResizeComponentScore(allLM, decodeStepList);
#endif
- m_hypoStack[m_possibleTranslations.GetInitialCoverage().GetWordsCount()].AddPrune(hypo);
+ m_hypoStack[m_possibleTranslations.GetInitialCoverage().GetWordsCount()].AddPrune(hypo);
}
// go thru each stack
@@ -176,7 +178,7 @@ void Manager::ProcessOneHypothesis(const list < DecodeStep > &decodeStepList, co
{
Hypothesis *hypo = *iterHypo;
- hypo->CalcScore(m_staticData, m_possibleTranslations.GetFutureScore(), m_source);
+ hypo->CalcScore(m_staticData, m_possibleTranslations.GetFutureScore());
if(m_staticData.GetVerboseLevel() > 2)
{
hypo->PrintHypothesis(m_source, m_staticData.GetWeightDistortion(), m_staticData.GetWeightWordPenalty());
@@ -295,9 +297,8 @@ void Manager::ProcessTranslation(const Hypothesis &hypothesis, const DecodeStep
// actual implementation
const WordsRange &sourceWordsRange = hypothesis.GetCurrSourceWordsRange();
- const Phrase sourcePhrase = m_source.GetSubString(sourceWordsRange);
const PhraseDictionary &phraseDictionary = decodeStep.GetPhraseDictionary();
- const TargetPhraseCollection *phraseColl = phraseDictionary.FindEquivPhrase(sourcePhrase);
+ const TargetPhraseCollection *phraseColl = CreateTargetPhraseCollection(&phraseDictionary,&m_source,sourceWordsRange);
if (phraseColl != NULL)
{
@@ -332,7 +333,7 @@ void Manager::ProcessTranslation(const Hypothesis &hypothesis, const DecodeStep
if (targetFactor == NULL)
{
- const Factor *sourceFactor = sourcePhrase.GetFactor(0, factorType)
+ const Factor *sourceFactor = m_source.GetFactor(sourceWordsRange.GetStartPos(), factorType)
,*unkownfactor;
switch (factorType)
{
@@ -354,6 +355,7 @@ void Manager::ProcessTranslation(const Hypothesis &hypothesis, const DecodeStep
}
+#if 0
/***
* Add to m_possibleTranslations all possible translations the phrase table gives us for
* the given phrase
@@ -460,6 +462,7 @@ void Manager::CreateTranslationOptions(const Phrase &phrase, PhraseDictionary &p
}
}
}
+#endif
// helpers
typedef pair<Word, float> WordPair;
diff --git a/moses/src/Manager.h b/moses/src/Manager.h
index da540974d..b49c22604 100755
--- a/moses/src/Manager.h
+++ b/moses/src/Manager.h
@@ -23,13 +23,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <vector>
#include <list>
-#include "Sentence.h"
+#include "Input.h"
#include "Hypothesis.h"
#include "StaticData.h"
#include "TranslationOption.h"
#include "HypothesisCollection.h"
#include "HypothesisCollectionIntermediate.h"
-#include "TranslationOptionCollection.h"
+#include "TranslationOptionCollectionText.h"
#include "LatticePathList.h"
#include "SquareMatrix.h"
#include "WordsBitmap.h"
@@ -41,11 +41,12 @@ class Manager
{
protected:
// data
- Sentence m_source;
+ InputType const& m_source;
+
std::vector < HypothesisCollection > m_hypoStack;
// no of elements = no of words in source + 1
StaticData &m_staticData;
- TranslationOptionCollection m_possibleTranslations;
+ TranslationOptionCollection& m_possibleTranslations;
// functions
void ProcessOneStack(const std::list < DecodeStep > &decodeStepList
@@ -61,13 +62,15 @@ protected:
void ProcessGeneration(const Hypothesis &hypothesis
, const DecodeStep &decodeStep
, HypothesisCollectionIntermediate &outputHypoColl);
+#if 0
void CreateTranslationOptions(const Phrase &phrase
, PhraseDictionary &phraseDictionary
, const LMList &lmListInitial);
+#endif
void OutputHypoStack(int stack = -1);
void OutputHypoStackSize();
public:
- Manager(const Sentence &sentence, StaticData &staticData);
+ Manager(InputType const& source, TranslationOptionCollection&, StaticData &staticData);
~Manager();
void ProcessSentence();
diff --git a/moses/src/PhraseDictionary.h b/moses/src/PhraseDictionary.h
index d899fd2e3..d8c374279 100755
--- a/moses/src/PhraseDictionary.h
+++ b/moses/src/PhraseDictionary.h
@@ -29,8 +29,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "Phrase.h"
#include "TargetPhrase.h"
#include "Dictionary.h"
-
-typedef std::list <TargetPhrase> TargetPhraseCollection;
+#include "CreateTargetPhraseCollection.h"
class StaticData;
diff --git a/moses/src/PhraseDictionaryTree.cpp b/moses/src/PhraseDictionaryTree.cpp
index 00cea5399..c45b8b76d 100644
--- a/moses/src/PhraseDictionaryTree.cpp
+++ b/moses/src/PhraseDictionaryTree.cpp
@@ -47,12 +47,11 @@ public:
std::pair<typename M::iterator,bool> p
=m.insert(std::make_pair(k,data.size()));
if(p.second) data.push_back(k);
- assert(p.first->second>=0);
assert(static_cast<size_t>(p.first->second)<data.size());
return p.first->second;
}
const Key& symbol(LabelId i) const {
- assert(i>=0);assert(static_cast<size_t>(i)<data.size());
+ assert(static_cast<size_t>(i)<data.size());
return data[i];}
typedef typename V::const_iterator const_iterator;
@@ -145,7 +144,7 @@ struct PDTimp {
ObjectPool<PPimp> pPool;
PDTimp() : os(0),ot(0),m_factorCollection(0) {PTF::setDefault(InvalidOffT);}
- ~PDTimp() {if(os) fClose(os);if(ot) fClose(ot);}
+ ~PDTimp() {if(os) fClose(os);if(ot) fClose(ot);FreeMemory();}
void FreeMemory()
{
@@ -153,21 +152,15 @@ struct PDTimp {
pPool.reset();
}
- int Read(const std::string& fn) ;
-
- off_t FindOffT(const IPhrase& f) const
- {
- if(f.empty()) return InvalidOffT;
- if(f[0]>=data.size()) return InvalidOffT;
- if(data[f[0]]) return data[f[0]]->find(f); else return InvalidOffT;
- }
+ int Read(const std::string& fn);
void GetTargetCandidates(const IPhrase& f,TgtCands& tgtCands)
{
- off_t tCandOffset=FindOffT(f);
- // std::cerr<<"offset of tgtcand: "<<tCandOffset<<" "<<InvalidOffT<<" for phrase '"<<f<<"'\n";
- if(tCandOffset==InvalidOffT) return;
- fSeek(ot,tCandOffset);
+ if(f.empty()) return;
+ if(f[0]>=data.size()) return;
+ if(!data[f[0]]) return;
+ assert(data[f[0]]->find(f)!=InvalidOffT);
+ fSeek(ot,data[f[0]]->find(f));
tgtCands.readBin(ot);
}
@@ -183,6 +176,8 @@ struct PDTimp {
tgtCands.readBin(ot);
}
void PrintTgtCand(const TgtCands& tcands,std::ostream& out) const;
+
+ // convert target candidates from internal data structure to the external one
void ConvertTgtCand(const TgtCands& tcands,std::vector<FactorTgtCand>& rv,FactorType oft) const
{
for(TgtCands::const_iterator i=tcands.begin();i!=tcands.end();++i)
@@ -259,12 +254,12 @@ int PDTimp::Read(const std::string& fn)
void PDTimp::PrintTgtCand(const TgtCands& tcand,std::ostream& out) const
{
- for(size_t i=0;i<tcand.size();++i)
+ for(size_t i=0;i<tcand.size();++i)
{
out<<i<<" -- "<<tcand[i].GetScores()<<" -- ";
const IPhrase& iphr=tcand[i].GetPhrase();
for(size_t j=0;j<iphr.size();++j)
- out<<tv.symbol(iphr[j])<<" ";
+ out<<tv.symbol(iphr[j])<<" ";
out<<'\n';
}
}
@@ -455,7 +450,7 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
}
FILE *oi=fOpen(ofi.c_str(),"wb");
- size_t vob=fWriteVector(oi,vo);
+ fWriteVector(oi,vo);
fClose(oi);
imp->sv.Write(ofsv);
@@ -508,33 +503,36 @@ PhraseDictionaryTree::GetTargetCandidates(PrefixPtr p,
//
////////////////////////////////////////////////////////////
+
+#include <numeric>
#include "Word.h"
#include "Phrase.h"
#include "ConfusionNet.h"
+#include "WordsRange.h"
// Generates all tuples from n indexes with ranges 0 to card[j]-1, respectively..
// Input: number of indexes and ranges: ranges[0] ... ranges[num_idx-1]
// Output: number of tuples and monodimensional array of tuples.
// Reference: mixed-radix generation algorithm (D. E. Knuth, TAOCP v. 4.2)
-size_t GenerateTuples(int num_idx,int* ranges,int *&tuples)
+size_t GenerateTuples(unsigned num_idx,unsigned* ranges,unsigned *&tuples)
{
- int* single_tuple=(int *) new int[num_idx+1];
- int num_tuples=1;
+ unsigned* single_tuple= new unsigned[num_idx+1];
+ unsigned num_tuples=1;
- for (int k=0;k<num_idx;++k)
+ for (unsigned k=0;k<num_idx;++k)
{
num_tuples *= ranges[k];
single_tuple[k]=0;
}
- tuples=new int[num_idx * num_tuples];
+ tuples=new unsigned[num_idx * num_tuples];
// we need this additional element for the last iteration
single_tuple[num_idx]=0;
- int j=0;
- for (int n=0;n<num_tuples;++n){
- memcpy((void *)((tuples + n * num_idx)),(void *)single_tuple,num_idx * sizeof(int));
+ unsigned j=0;
+ for (unsigned n=0;n<num_tuples;++n){
+ memcpy((void *)((tuples + n * num_idx)),(void *)single_tuple,num_idx * sizeof(unsigned));
j=0;
while (single_tuple[j]==ranges[j]-1){single_tuple[j]=0; ++j;}
++single_tuple[j];
@@ -546,7 +544,6 @@ size_t GenerateTuples(int num_idx,int* ranges,int *&tuples)
typedef PhraseDictionaryTree::PrefixPtr PPtr;
typedef std::vector<PPtr> vPPtr;
-typedef std::pair<size_t,size_t> Range;
typedef std::vector<std::vector<Factor const*> > mPhrase;
std::ostream& operator<<(std::ostream& out,const mPhrase& p) {
@@ -562,14 +559,14 @@ std::ostream& operator<<(std::ostream& out,const mPhrase& p) {
struct State {
vPPtr ptrs;
- Range range;
+ WordsRange range;
float score;
State() : range(0,0),score(0.0) {}
State(size_t b,size_t e,const vPPtr& v,float sc=0.0) : ptrs(v),range(b,e),score(sc) {}
- size_t begin() const {return range.first;}
- size_t end() const {return range.second;}
+ size_t begin() const {return range.GetStartPos();}
+ size_t end() const {return range.GetEndPos();}
float GetScore() const {return score;}
};
@@ -580,129 +577,199 @@ std::ostream& operator<<(std::ostream& out,const State& s) {
return out;
}
+typedef std::map<mPhrase,float> E2Costs;
-void GenerateCandidates(const ConfusionNet& src,
- std::vector<PhraseDictionaryTree const*>& pdicts) {
- vPPtr root(pdicts.size());
- std::vector<FactorType> inF(pdicts.size()),outF(pdicts.size());
- for(size_t i=0;i<pdicts.size();++i)
- {
- root[i]=pdicts[i]->GetRoot();
- inF[i]=pdicts[i]->GetInputFactorType();
- outF[i]=pdicts[i]->GetOutputFactorType();
+struct GCData {
+ const std::vector<PhraseDictionaryTree const*>& pdicts;
+ const std::vector<std::vector<float> >& weights;
+ std::vector<FactorType> inF,outF;
+ size_t distinctOutputFactors;
+ vPPtr root;
+ size_t totalTuples,distinctTuples;
+
+
+ GCData(const std::vector<PhraseDictionaryTree const*>& a,
+ const std::vector<std::vector<float> >& b)
+ : pdicts(a),weights(b),totalTuples(0),distinctTuples(0) {
+
+ assert(pdicts.size()==weights.size());
+ std::set<FactorType> distinctOutFset;
+ inF.resize(pdicts.size());
+ outF.resize(pdicts.size());
+ root.resize(pdicts.size());
+ for(size_t i=0;i<pdicts.size();++i)
+ {
+ root[i]=pdicts[i]->GetRoot();
+ inF[i]=pdicts[i]->GetInputFactorType();
+ outF[i]=pdicts[i]->GetOutputFactorType();
+ distinctOutFset.insert(pdicts[i]->GetOutputFactorType());
+ }
+ distinctOutputFactors=distinctOutFset.size();
+ }
+
+ FactorType OutFT(size_t i) const {return outF[i];}
+ FactorType InFT(size_t i) const {return inF[i];}
+ size_t DistinctOutFactors() const {return distinctOutputFactors;}
+
+ const vPPtr& GetRoot() const {return root;}
+
+};
+
+typedef std::vector<Factor const*> vFactor;
+typedef std::vector<std::pair<float,vFactor> > TgtCandList;
+
+typedef std::vector<TgtCandList> OutputFactor2TgtCandList;
+typedef std::vector<OutputFactor2TgtCandList*> Len2Cands;
+
+void GeneratePerFactorTgtList(size_t factorType,PPtr pptr,GCData& data,Len2Cands& len2cands)
+{
+ std::vector<FactorTgtCand> cands;
+ data.pdicts[factorType]->GetTargetCandidates(pptr,cands);
+
+ for(std::vector<FactorTgtCand>::const_iterator cand=cands.begin();cand!=cands.end();++cand) {
+ assert(data.weights[factorType].size()==cand->second.size());
+ float costs=std::inner_product(data.weights[factorType].begin(),
+ data.weights[factorType].end(),
+ cand->second.begin(),
+ 0.0);
+
+ size_t len=cand->first.size();
+ if(len>=len2cands.size()) len2cands.resize(len+1,0);
+ if(!len2cands[len]) len2cands[len]=new OutputFactor2TgtCandList(data.DistinctOutFactors());
+ OutputFactor2TgtCandList &outf2tcandlist=*len2cands[len];
+
+ outf2tcandlist[data.OutFT(factorType)].push_back(std::make_pair(costs,cand->first));
+ }
+}
+
+void GenerateTupleTgtCands(OutputFactor2TgtCandList& tCand,E2Costs& e2costs,GCData& data)
+{
+ // check if candidates are non-empty
+ bool gotCands=1;
+ for(size_t j=0;gotCands && j<tCand.size();++j)
+ gotCands &= !tCand[j].empty();
+
+ if(gotCands) {
+ // enumerate tuples
+ assert(data.DistinctOutFactors()==tCand.size());
+ std::vector<unsigned> radix(data.DistinctOutFactors());
+ for(size_t i=0;i<tCand.size();++i) radix[i]=tCand[i].size();
+
+ unsigned *tuples=0;
+ size_t numTuples=GenerateTuples(radix.size(),&radix[0],tuples);
+
+ data.totalTuples+=numTuples;
+
+ for(size_t i=0;i<numTuples;++i)
+ {
+ mPhrase e(radix.size());float costs=0.0;
+ for(size_t j=0;j<radix.size();++j)
+ {
+ assert(tuples[radix.size()*i+j]<tCand[j].size());
+ std::pair<float,vFactor> const& mycand=tCand[j][tuples[radix.size()*i+j]];
+ e[j]=mycand.second;
+ costs+=mycand.first;
+ }
+#ifdef DEBUG
+ bool mismatch=0;
+ for(size_t j=1;!mismatch && j<e.size();++j)
+ if(e[j].size()!=e[j-1].size()) mismatch=1;
+ assert(mismatch==0);
+#endif
+ std::pair<E2Costs::iterator,bool> p=e2costs.insert(std::make_pair(e,costs));
+ if(p.second) ++data.distinctTuples;
+ else {
+ // entry known, take min of costs, alternative: sum probs
+ if(costs<p.first->second) p.first->second=costs;
+ }
+ }
+ delete [] tuples;
}
+}
+
+void GenerateCandidates_(E2Costs& e2costs,const vPPtr& nextP,GCData& data)
+{
+ Len2Cands len2cands;
+ // generate candidates for each element of nextP:
+ for(size_t factorType=0;factorType<nextP.size();++factorType)
+ if(nextP[factorType])
+ GeneratePerFactorTgtList(factorType,nextP[factorType],data,len2cands);
+
+ // for each length: enumerate tuples, compute score, and insert in e2costs
+ for(size_t len=0;len<len2cands.size();++len) if(len2cands[len])
+ GenerateTupleTgtCands(*len2cands[len],e2costs,data);
+}
+
+void GenerateCandidates(const ConfusionNet& src,
+ const std::vector<PhraseDictionaryTree const*>& pdicts,
+ const std::vector<std::vector<float> >& weights) {
+ GCData data(pdicts,weights);
std::vector<State> stack;
- for(size_t i=0;i<src.size();++i) stack.push_back(State(i,i,root));
-
- size_t totalTuples=0,distinctTuples=0,lengthMismatch=0;
+ for(size_t i=0;i<src.GetSize();++i) stack.push_back(State(i,i,data.GetRoot()));
- std::map<Range,std::set<mPhrase> > cov2E;
+ std::map<WordsRange,E2Costs> cov2E;
- std::cerr<<"start while loop. initial stack size: "<<stack.size()<<"\n";
+ // std::cerr<<"start while loop. initial stack size: "<<stack.size()<<"\n";
while(!stack.empty())
{
State curr(stack.back());
stack.pop_back();
- std::cerr<<"processing state "<<curr<<" stack size: "<<stack.size()<<"\n";
+ //std::cerr<<"processing state "<<curr<<" stack size: "<<stack.size()<<"\n";
- assert(curr.end()<src.size());
+ assert(curr.end()<src.GetSize());
const ConfusionNet::Column &currCol=src[curr.end()];
- for(size_t i=0;i<currCol.size();++i)
+ for(size_t colidx=0;colidx<currCol.size();++colidx)
{
- const Word& w=currCol[i].first;
+ const Word& w=currCol[colidx].first;
vPPtr nextP(curr.ptrs);
for(size_t j=0;j<nextP.size();++j)
- nextP[j]=pdicts[j]->Extend(nextP[j],w.GetFactor(inF[j])->GetString());
+ nextP[j]=pdicts[j]->Extend(nextP[j],
+ w.GetFactor(data.InFT(j))->GetString());
bool valid=1;
- for(size_t j=0;valid && j<nextP.size();++j)
- if(!nextP[j]) valid=0;
- // valid &= (nextP[j] ? 1 : 0);
-
+ for(size_t j=0;j<nextP.size();++j) if(!nextP[j]) {valid=0;break;}
+
if(valid)
{
- if(curr.end()+1<src.size())
+ if(curr.end()+1<src.GetSize())
stack.push_back(State(curr.begin(),curr.end()+1,nextP,
- curr.GetScore()+currCol[i].second));
-
-
- std::vector<std::vector<FactorTgtCand>* > tCand;
+ curr.GetScore()+currCol[colidx].second));
- // generate candidates for each element of nextP:
- for(size_t j=0;j<nextP.size();++j) if(nextP[j])
- {
- if(outF[j]>=tCand.size()) tCand.resize(outF[j]+1,0);
- if(!tCand[outF[j]]) tCand[outF[j]]=new std::vector<FactorTgtCand>;
- pdicts[j]->GetTargetCandidates(nextP[j],*(tCand[outF[j]]));
- }
-
- // check if candidates are non-empty
- bool gotCands=1;
- for(size_t j=0;gotCands && j<tCand.size();++j)
- gotCands &= tCand[j] && !tCand[j]->empty();
-
- if(gotCands) {
- // enumerate tuples
-
-
- std::vector<int> radix(tCand.size());
- for(size_t i=0;i<tCand.size();++i) radix[i]=tCand[i]->size();
-
- int *tuples;
- size_t numTuples=GenerateTuples(radix.size(),&radix[0],tuples);
-
- totalTuples+=numTuples;
-
- for(size_t i=0;i<numTuples;++i)
- {
- mPhrase e(radix.size());
- for(size_t j=0;j<radix.size();++j)
- {
- assert(tCand[j]); // should be superfluous, but ...
- assert(tuples[radix.size()*i+j]<tCand[j]->size());
- e[j]=(*tCand[j])[tuples[radix.size()*i+j]].first;
- }
-
- bool mismatch=0;
- for(size_t j=1;!mismatch && j<e.size();++j)
- if(e[j].size()!=e[j-1].size()) mismatch=1;
-
- if(mismatch) ++lengthMismatch;
- else if(cov2E[Range(curr.begin(),curr.end()+1)].insert(e).second) ++distinctTuples;
- }
-
-
- delete [] tuples;
- }
-
+ E2Costs &e2costs=cov2E[WordsRange(curr.begin(),curr.end()+1)];
+ GenerateCandidates_(e2costs,nextP,data);
}
}
+ // check if there are translations of one-word phrases ...
//if(curr.begin()==curr.end() && tCand.empty()) {}
- }
- std::cerr<<"tuple stats: total: "<<totalTuples
- <<" distinct: "<<distinctTuples<<" ("<<(distinctTuples/(0.01*totalTuples))
- <<"%) lengthMismatch: "<<lengthMismatch<<" ("<<(lengthMismatch/(0.01*totalTuples))<<"%)\n";
+ } // end while(!stack.empty())
+
+ // print statistics for debugging purposes
+ std::cerr<<"tuple stats: total: "<<data.totalTuples
+ <<" distinct: "<<data.distinctTuples<<" ("
+ <<(data.distinctTuples/(0.01*data.totalTuples))
+ <<"%)\n";
std::cerr<<"per coverage set:\n";
- for(std::map<Range,std::set<mPhrase> >::const_iterator i=cov2E.begin();i!=cov2E.end();++i) {
- std::cerr<<i->first.first<<","<<i->first.second<<" -- distinct cands: "<<i->second.size()<<"\n";
+ for(std::map<WordsRange,E2Costs>::const_iterator i=cov2E.begin();
+ i!=cov2E.end();++i) {
+ std::cerr<<i->first<<" -- distinct cands: "
+ <<i->second.size()<<"\n";
}
std::cerr<<"\n\n";
std::cerr<<"full list:\n";
- for(std::map<Range,std::set<mPhrase> >::const_iterator i=cov2E.begin();i!=cov2E.end();++i) {
- std::cerr<<i->first.first<<","<<i->first.second<<" -- distinct cands: "<<i->second.size()<<"\n";
- for(std::set<mPhrase>::const_iterator j=i->second.begin();j!=i->second.end();++j)
- std::cerr<<*j<<"\n";
+ for(std::map<WordsRange,E2Costs>::const_iterator i=cov2E.begin();
+ i!=cov2E.end();++i) {
+ std::cerr<<i->first<<" -- distinct cands: "
+ <<i->second.size()<<"\n";
+ for(E2Costs::const_iterator j=i->second.begin();j!=i->second.end();++j)
+ std::cerr<<j->first<<" -- "<<j->second<<"\n";
}
-
-
-
}
diff --git a/moses/src/PhraseDictionaryTree.h b/moses/src/PhraseDictionaryTree.h
index fe85eec2c..b2728af7b 100644
--- a/moses/src/PhraseDictionaryTree.h
+++ b/moses/src/PhraseDictionaryTree.h
@@ -22,15 +22,6 @@ class PhraseDictionaryTree : public Dictionary {
PDTimp *imp; //implementation
FactorType m_inFactorType,m_outFactorType;
public:
-
- class PrefixPtr {
- PPimp* imp;
- friend class PDTimp;
- public:
- PrefixPtr(PPimp* x=0) : imp(x) {}
- operator bool() const;
- };
-
PhraseDictionaryTree(size_t noScoreComponent,
FactorCollection* factorCollection=0,
FactorType inputFactorType=Surface,
@@ -38,44 +29,73 @@ public:
virtual ~PhraseDictionaryTree();
- DecodeType GetDecodeType() const
- {
- return Translate;
- }
- size_t GetSize() const
- {
- return 0;
- }
-
+ DecodeType GetDecodeType() const {return Translate;}
+ size_t GetSize() const {return 0;}
FactorType GetInputFactorType() const {return m_inFactorType;}
FactorType GetOutputFactorType() const {return m_outFactorType;}
// convert from ascii phrase table format
- int Create(std::istream& In,const std::string& OutFileNamePrefix);
- int Read(const std::string& FileNamePrefix);
+ // note: only creates table, does not keep it in memory
+ // -> use Read(outFileNamePrefix);
+ int Create(std::istream& in,const std::string& outFileNamePrefix);
+
+ int Read(const std::string& fileNamePrefix);
- // free memory used by the prefix tree
+ // free memory used by the prefix tree etc.
void FreeMemory() const;
- // access with full src phrase
+
+ /**************************************
+ * access with full source phrase *
+ **************************************/
+ // get the target candidates for a given factor sequence/phrase
void GetTargetCandidates(const std::vector<const Factor*>& src,
std::vector<FactorTgtCand>& rv) const;
+
+ // print target candidates for a given phrase, mainly for debugging
void PrintTargetCandidates(const std::vector<std::string>& src,
std::ostream& out) const;
- // access to prefix tree
+
+
+ /*****************************
+ * access to prefix tree *
+ *****************************/
+
+ // 'pointer' into prefix tree
+ // the only permitted direct operation is a check for NULL,
+ // e.g. PrefixPtr p; if(p) ...
+ // other usage only through PhraseDictionaryTree-functions below
+
+ class PrefixPtr {
+ PPimp* imp; // opaque handle to the implementation object; 0 means "no node"
+ friend class PDTimp; // PDTimp is the only class allowed to touch imp directly
+ public:
+ PrefixPtr(PPimp* x=0) : imp(x) {} // wraps x; default-constructs to a null pointer
+ operator bool() const; // validity check used as `if(p)`; declared here, defined elsewhere
+ };
+
+ // return pointer to root node
PrefixPtr GetRoot() const;
- PrefixPtr Extend(PrefixPtr,const std::string&) const;
+ // extend pointer with a word/Factorstring and return the resulting successor
+ // pointer. If there is no such successor node, the result will evaluate to
+ // false. Requirement: the input pointer p evaluates to true.
+ PrefixPtr Extend(PrefixPtr p,const std::string& s) const;
+ // get the target candidates for a given prefix pointer
+ // requirement: the pointer has to evaluate to true
void GetTargetCandidates(PrefixPtr p,
std::vector<FactorTgtCand>& rv) const;
- void PrintTargetCandidates(PrefixPtr p,std::ostream& out) const;
+ // print target candidates for a given prefix pointer to a stream, mainly
+ // for debugging
+ void PrintTargetCandidates(PrefixPtr p,std::ostream& out) const;
};
void GenerateCandidates(const ConfusionNet& src,
- std::vector<PhraseDictionaryTree const*>& pdicts) ;
+ const std::vector<PhraseDictionaryTree const*>& pdicts,
+ const std::vector<std::vector<float> >& weights) ;
#endif /*PHRASEDICTIONARYTREE_H_*/
diff --git a/moses/src/Sentence.h b/moses/src/Sentence.h
index 18d4916b0..f0a68af20 100755
--- a/moses/src/Sentence.h
+++ b/moses/src/Sentence.h
@@ -25,32 +25,38 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <string>
#include "Word.h"
#include "Phrase.h"
+#include "Input.h"
+
+class WordsRange; // forward declaration of the range type taken by GetSubString
/***
* a Sentence is a Phrase with an ID
*/
-class Sentence : public Phrase
+class Sentence : public Phrase, public InputType
{
-protected:
- long m_translationId;
-
+ protected:
Sentence()
- {
- }
-public:
- Sentence(FactorDirection direction)
- :Phrase(direction)
- {
- }
-
- // for db stuff
- long GetTranslationId()
- {
- return m_translationId;
- }
- void SetTranslationId(long translationId)
- { // for db stuff;
- m_translationId = translationId;
- }
+ {
+ }
+ public:
+ // Builds an empty sentence for the given direction.
+ // The initializers are listed in base-class declaration order (Phrase, then
+ // InputType): C++ constructs bases in declaration order no matter how the
+ // list is written, so this keeps the list truthful and avoids -Wreorder.
+ Sentence(FactorDirection direction) : Phrase(direction), InputType()
+ {
+ }
+
+ Phrase GetSubString(const WordsRange& r) const // forwards to Phrase; NOTE(review): presumably also implements an InputType virtual - confirm in Input.h
+ {
+ return Phrase::GetSubString(r);
+ }
+
+ const Factor* GetFactor(size_t pos, FactorType factorType) const // forwards to Phrase::GetFactor with the same arguments
+ {
+ return Phrase::GetFactor(pos,factorType);
+ }
+
+ size_t GetSize() const // forwards to Phrase::GetSize; the explicit qualification picks the Phrase version
+ {
+ return Phrase::GetSize();
+ }
+
};
diff --git a/moses/src/TranslationOptionCollection.cpp b/moses/src/TranslationOptionCollection.cpp
index 47890a390..37ab55b38 100644
--- a/moses/src/TranslationOptionCollection.cpp
+++ b/moses/src/TranslationOptionCollection.cpp
@@ -18,7 +18,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "TranslationOptionCollection.h"
-#include "Sentence.h"
+//#include "Sentence.h"
#include "DecodeStep.h"
#include "LanguageModel.h"
#include "PhraseDictionary.h"
@@ -26,170 +26,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace std;
-TranslationOptionCollection::TranslationOptionCollection(const Sentence &inputSentence)
- : m_inputSentence(inputSentence)
- ,m_futureScore(inputSentence.GetSize())
- ,m_initialCoverage(inputSentence.GetSize())
- {
- }
-
-void TranslationOptionCollection::CreateTranslationOptions(
- const list < DecodeStep > &decodeStepList
- , const LMList &languageModels
- , const LMList &allLM
- , FactorCollection &factorCollection
- , float weightWordPenalty
- , bool dropUnknown
- , size_t verboseLevel)
+TranslationOptionCollection::TranslationOptionCollection(size_t srcSize) // srcSize is handed unchanged to both members below
+ : m_futureScore(srcSize),m_initialCoverage(srcSize) // future-score matrix and initial coverage bitmap are both built from srcSize
{
- // loop over all substrings of the source sentence, look them up
- // in the phraseDictionary (which is the- possibly filtered-- phrase
- // table loaded on initialization), generate TranslationOption objects
- // for all phrases
- //
- // possible optimization- don't consider phrases longer than the longest
- // phrase in the PhraseDictionary?
-
- PhraseDictionary &phraseDictionary = decodeStepList.front().GetPhraseDictionary();
- for (size_t startPos = 0 ; startPos < m_inputSentence.GetSize() ; startPos++)
- {
- // reuse phrase, add next word on
- Phrase sourcePhrase( m_inputSentence.GetDirection());
-
- for (size_t endPos = startPos ; endPos < m_inputSentence.GetSize() ; endPos++)
- {
- const WordsRange wordsRange(startPos, endPos);
-
- FactorArray &newWord = sourcePhrase.AddWord();
- Word::Copy(newWord, m_inputSentence.GetFactorArray(endPos));
-
- const TargetPhraseCollection *phraseColl = phraseDictionary.FindEquivPhrase(sourcePhrase);
- if (phraseColl != NULL)
- {
- if (verboseLevel >= 3) {
- cout << "[" << sourcePhrase << "; " << startPos << "-" << endPos << "]\n";
- }
- TargetPhraseCollection::const_iterator iterTargetPhrase;
- for (iterTargetPhrase = phraseColl->begin() ; iterTargetPhrase != phraseColl->end() ; ++iterTargetPhrase)
- {
- const TargetPhrase &targetPhrase = *iterTargetPhrase;
-
- const WordsRange wordsRange(startPos, endPos);
- TranslationOption transOpt(wordsRange
- , targetPhrase);
-
- push_back(transOpt);
- if (verboseLevel >= 3) {
- cout << "\t" << transOpt << "\n";
- }
- }
- if (verboseLevel >= 3) { cout << endl; }
- }
- else if (sourcePhrase.GetSize() == 1)
- {
- // unknown word, add to target, and add as poss trans
- // float weightWP = m_staticData.GetWeightWordPenalty();
- const FactorTypeSet &targetFactors = phraseDictionary.GetFactorsUsed(Output);
- int isDigit = 0;
- if (dropUnknown)
- {
- const Factor *f = sourcePhrase.GetFactor(0, static_cast<FactorType>(0)); // surface @ 0
- std::string s = f->ToString();
- isDigit = s.find_first_of("0123456789");
- if (isDigit == string::npos) isDigit = 0;
- else isDigit = 1;
- // modify the starting bitmap
- }
- if (!dropUnknown || isDigit)
- {
- // add to dictionary
- TargetPhrase targetPhraseOrig(Output, &phraseDictionary);
- FactorArray &targetWord = targetPhraseOrig.AddWord();
-
- const FactorArray &sourceWord = sourcePhrase.GetFactorArray(0);
-
- for (unsigned int currFactor = 0 ; currFactor < NUM_FACTORS ; currFactor++)
- {
- if (targetFactors.Contains(currFactor))
- {
- FactorType factorType = static_cast<FactorType>(currFactor);
-
- const Factor *factor = sourceWord[factorType]
- ,*unkownfactor;
- switch (factorType)
- {
- case POS:
- unkownfactor = factorCollection.AddFactor(Output, factorType, UNKNOWN_FACTOR);
- targetWord[factorType] = unkownfactor;
- break;
- default:
- unkownfactor = factorCollection.AddFactor(Output, factorType, factor->GetString());
- targetWord[factorType] = unkownfactor;
- break;
- }
- }
- }
-
- targetPhraseOrig.SetScore(allLM, weightWordPenalty);
-
- phraseDictionary.AddEquivPhrase(sourcePhrase, targetPhraseOrig);
- const TargetPhraseCollection *phraseColl = phraseDictionary.FindEquivPhrase(sourcePhrase);
- const TargetPhrase &targetPhrase = *phraseColl->begin();
-
- TranslationOption transOpt(wordsRange, targetPhrase);
-
- push_back(transOpt);
- }
- else // drop source word
- { m_initialCoverage.SetValue(startPos, startPos,1); }
- }
- }
- }
-
- // create future score matrix
- // for each span in the source phrase (denoted by start and end)
- for(size_t startPos = 0; startPos < m_inputSentence.GetSize() ; startPos++)
- {
- for(size_t endPos = startPos; endPos < m_inputSentence.GetSize() ; endPos++)
- {
- size_t length = endPos - startPos + 1;
- vector< float > score(length + 1);
- score[0] = 0;
- for(size_t currLength = 1 ; currLength <= length ; currLength++)
- // initalize their future cost to -infinity
- {
- score[currLength] = - numeric_limits<float>::infinity();
- }
-
- for(size_t currLength = 0 ; currLength < length ; currLength++)
- {
- // iterate over possible translations of this source subphrase and
- // keep track of the highest cost option
- TranslationOptionCollection::const_iterator iterTransOpt;
- for(iterTransOpt = begin() ; iterTransOpt != end() ; ++iterTransOpt)
- {
- const TranslationOption &transOpt = *iterTransOpt;
- size_t index = currLength + transOpt.GetSize();
-
- if (transOpt.GetStartPos() == currLength + startPos
- && transOpt.GetEndPos() <= endPos
- && transOpt.GetFutureScore() + score[currLength] > score[index])
- {
- score[index] = transOpt.GetFutureScore() + score[currLength];
- }
- }
- }
- // record the highest cost option in the future cost table.
- m_futureScore.SetScore(startPos, endPos, score[length]);
-
- //print information about future cost table when verbose option is set
-
- if(verboseLevel > 0)
- {
- cout<<"future cost from "<<startPos<<" to "<<endPos<<" is "<<score[length]<<endl;
- }
- }
- }
-
}
+TranslationOptionCollection::~TranslationOptionCollection() {} // out-of-line definition of the destructor; the body is empty
+
diff --git a/moses/src/TranslationOptionCollection.h b/moses/src/TranslationOptionCollection.h
index 57183bde5..37709018b 100755
--- a/moses/src/TranslationOptionCollection.h
+++ b/moses/src/TranslationOptionCollection.h
@@ -27,28 +27,33 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "SquareMatrix.h"
#include "WordsBitmap.h"
-class Sentence;
class DecodeStep;
class LanguageModel;
class FactorCollection;
class TranslationOptionCollection : public std::list< TranslationOption >
{
+ TranslationOptionCollection(const TranslationOptionCollection&); // no copy constructor
protected:
- const Sentence &m_inputSentence;
SquareMatrix m_futureScore;
WordsBitmap m_initialCoverage;
+
+ TranslationOptionCollection(size_t srcSize);
public:
- TranslationOptionCollection(const Sentence &inputSentence);
-
- void CreateTranslationOptions(const std::list < DecodeStep > &decodeStepList
+ virtual ~TranslationOptionCollection();
+
+ virtual void CreateTranslationOptions(const std::list < DecodeStep > &decodeStepList
, const LMList &languageModels
, const LMList &allLM
, FactorCollection &factorCollection
, float weightWordPenalty
, bool dropUnknown
- , size_t verboseLevel);
+ , size_t verboseLevel) =0;
+ // get length/size of source input
+ virtual size_t GetSourceSize() const=0;
+
+
inline const SquareMatrix &GetFutureScore()
{
return m_futureScore;
diff --git a/moses/src/TranslationOptionCollectionConfusionNet.cpp b/moses/src/TranslationOptionCollectionConfusionNet.cpp
new file mode 100644
index 000000000..e7854cd6a
--- /dev/null
+++ b/moses/src/TranslationOptionCollectionConfusionNet.cpp
@@ -0,0 +1,25 @@
+// $Id$
+#include "TranslationOptionCollectionConfusionNet.h"
+#include "ConfusionNet.h"
+#include "DecodeStep.h"
+#include "LanguageModel.h"
+#include "PhraseDictionary.h"
+#include "FactorCollection.h"
+
+TranslationOptionCollectionConfusionNet::TranslationOptionCollectionConfusionNet(const ConfusionNet &input) // input is bound by reference, not copied
+ : TranslationOptionCollection(input.GetSize()),m_inputCN(input) {} // base class is constructed with the network's size
+
+size_t TranslationOptionCollectionConfusionNet::GetSourceSize() const // source length = size of the referenced confusion net
+{
+ return m_inputCN.GetSize();
+}
+void TranslationOptionCollectionConfusionNet::
+CreateTranslationOptions(const std::list < DecodeStep > &decodeStepList,
+ const LMList &languageModels,
+ const LMList &allLM,
+ FactorCollection &factorCollection,
+ float weightWordPenalty,
+ bool dropUnknown,
+ size_t verboseLevel)
+{ // empty body: no parameter is read and no translation options are added; NOTE(review): looks like a placeholder - confirm
+}
diff --git a/moses/src/TranslationOptionCollectionConfusionNet.h b/moses/src/TranslationOptionCollectionConfusionNet.h
new file mode 100644
index 000000000..938fba121
--- /dev/null
+++ b/moses/src/TranslationOptionCollectionConfusionNet.h
@@ -0,0 +1,23 @@
+// $Id$
+#ifndef TRANSLATIONOPTIONCOLLECTIONCONFUSIONNET_H_
+#define TRANSLATIONOPTIONCOLLECTIONCONFUSIONNET_H_
+#include "TranslationOptionCollection.h"
+
+class ConfusionNet;
+
+// Translation option collection for confusion-network input.
+// Only a reference to the network is stored, so the ConfusionNet handed to
+// the constructor has to outlive this object.
+class TranslationOptionCollectionConfusionNet : public TranslationOptionCollection {
+ private:
+  const ConfusionNet &m_inputCN;
+
+ public:
+  TranslationOptionCollectionConfusionNet(const ConfusionNet &input);
+
+  // implements TranslationOptionCollection::CreateTranslationOptions
+  virtual void CreateTranslationOptions(const std::list < DecodeStep > &decodeStepList,
+                                        const LMList &languageModels,
+                                        const LMList &allLM,
+                                        FactorCollection &factorCollection,
+                                        float weightWordPenalty,
+                                        bool dropUnknown,
+                                        size_t verboseLevel);
+
+  // implements TranslationOptionCollection::GetSourceSize
+  virtual size_t GetSourceSize() const;
+};
+#endif
diff --git a/moses/src/TranslationOptionCollectionText.cpp b/moses/src/TranslationOptionCollectionText.cpp
new file mode 100644
index 000000000..ec971458e
--- /dev/null
+++ b/moses/src/TranslationOptionCollectionText.cpp
@@ -0,0 +1,178 @@
+// $Id$
+#include "TranslationOptionCollectionText.h"
+#include "Sentence.h"
+#include "DecodeStep.h"
+#include "LanguageModel.h"
+#include "PhraseDictionary.h"
+#include "FactorCollection.h"
+
+
+TranslationOptionCollectionText::TranslationOptionCollectionText(Sentence const &inputSentence) // inputSentence is bound by reference, not copied
+ : TranslationOptionCollection(inputSentence.GetSize()),m_inputSentence(inputSentence) {} // base class is constructed with the sentence length
+
+size_t TranslationOptionCollectionText::GetSourceSize() const // source length = size of the referenced sentence
+{
+ return m_inputSentence.GetSize();
+}
+
+
+// Builds the translation options for a plain-text input sentence and then
+// fills the future-score table from them.
+//
+// Phase 1: every contiguous span of the source sentence is looked up in the
+// phrase dictionary of the first decode step, and each target phrase found
+// becomes a TranslationOption. A single word without an entry is either
+// passed through as its own translation (and added to the dictionary) or,
+// when dropUnknown is set and the word contains no digit, flagged in
+// m_initialCoverage so that it is dropped.
+// Phase 2: for each span, the best achievable sum of option future scores
+// is computed and stored in m_futureScore.
+//
+// decodeStepList    only the first step's phrase dictionary is consulted
+// languageModels    not used by this implementation
+// allLM             used to score pass-through phrases for unknown words
+// factorCollection  receives the factors created for unknown words
+// weightWordPenalty used to score pass-through phrases for unknown words
+// dropUnknown       drop unknown words unless they contain a digit
+// verboseLevel      >0 prints the future-cost table, >=3 also prints options
+void TranslationOptionCollectionText::
+CreateTranslationOptions(const std::list < DecodeStep > &decodeStepList,
+                         const LMList &languageModels,
+                         const LMList &allLM,
+                         FactorCollection &factorCollection,
+                         float weightWordPenalty,
+                         bool dropUnknown,
+                         size_t verboseLevel)
+{
+  // possible optimization: don't consider phrases longer than the longest
+  // phrase in the PhraseDictionary?
+
+  PhraseDictionary &phraseDictionary = decodeStepList.front().GetPhraseDictionary();
+  const size_t sourceSize = m_inputSentence.GetSize();
+
+  for (size_t startPos = 0 ; startPos < sourceSize ; startPos++)
+  {
+    // the phrase is reused and grows by one word per endPos iteration
+    Phrase sourcePhrase( m_inputSentence.GetDirection());
+
+    for (size_t endPos = startPos ; endPos < sourceSize ; endPos++)
+    {
+      const WordsRange wordsRange(startPos, endPos);
+
+      FactorArray &newWord = sourcePhrase.AddWord();
+      Word::Copy(newWord, m_inputSentence.GetFactorArray(endPos));
+
+      const TargetPhraseCollection *phraseColl = phraseDictionary.FindEquivPhrase(sourcePhrase);
+      if (phraseColl != NULL)
+      {
+        if (verboseLevel >= 3) {
+          std::cout << "[" << sourcePhrase << "; " << startPos << "-" << endPos << "]\n";
+        }
+        TargetPhraseCollection::const_iterator iterTargetPhrase;
+        for (iterTargetPhrase = phraseColl->begin() ; iterTargetPhrase != phraseColl->end() ; ++iterTargetPhrase)
+        {
+          const TargetPhrase &targetPhrase = *iterTargetPhrase;
+
+          // wordsRange from the enclosing loop already describes this span
+          TranslationOption transOpt(wordsRange
+                                     , targetPhrase);
+
+          push_back(transOpt);
+          if (verboseLevel >= 3) {
+            std::cout << "\t" << transOpt << "\n";
+          }
+        }
+        if (verboseLevel >= 3) { std::cout << std::endl; }
+      }
+      else if (sourcePhrase.GetSize() == 1)
+      {
+        // unknown word, add to target, and add as poss trans
+        const FactorTypeSet &targetFactors = phraseDictionary.GetFactorsUsed(Output);
+
+        // When unknown words are dropped, a word containing at least one
+        // digit is still kept. The search result is compared with npos
+        // directly so that no size_t -> int narrowing takes place.
+        bool containsDigit = false;
+        if (dropUnknown)
+        {
+          const Factor *f = sourcePhrase.GetFactor(0, static_cast<FactorType>(0)); // surface @ 0
+          const std::string s = f->ToString();
+          containsDigit = (s.find_first_of("0123456789") != std::string::npos);
+        }
+
+        if (!dropUnknown || containsDigit)
+        {
+          // add to dictionary
+          TargetPhrase targetPhraseOrig(Output, &phraseDictionary);
+          FactorArray &targetWord = targetPhraseOrig.AddWord();
+
+          const FactorArray &sourceWord = sourcePhrase.GetFactorArray(0);
+
+          for (unsigned int currFactor = 0 ; currFactor < NUM_FACTORS ; currFactor++)
+          {
+            if (targetFactors.Contains(currFactor))
+            {
+              FactorType factorType = static_cast<FactorType>(currFactor);
+
+              const Factor *factor = sourceWord[factorType];
+              const Factor *unknownFactor;
+              switch (factorType)
+              {
+              case POS:
+                // the POS factor gets the generic unknown marker
+                unknownFactor = factorCollection.AddFactor(Output, factorType, UNKNOWN_FACTOR);
+                targetWord[factorType] = unknownFactor;
+                break;
+              default:
+                // every other factor copies the source string
+                unknownFactor = factorCollection.AddFactor(Output, factorType, factor->GetString());
+                targetWord[factorType] = unknownFactor;
+                break;
+              }
+            }
+          }
+
+          targetPhraseOrig.SetScore(allLM, weightWordPenalty);
+
+          phraseDictionary.AddEquivPhrase(sourcePhrase, targetPhraseOrig);
+          // look the phrase up again after the insert
+          // NOTE(review): assumes this lookup now succeeds (no NULL check) - confirm
+          const TargetPhraseCollection *addedColl = phraseDictionary.FindEquivPhrase(sourcePhrase);
+          const TargetPhrase &targetPhrase = *addedColl->begin();
+
+          TranslationOption transOpt(wordsRange, targetPhrase);
+
+          push_back(transOpt);
+        }
+        else // drop source word
+        { m_initialCoverage.SetValue(startPos, startPos,1); }
+      }
+    }
+  }
+
+  // create future score matrix
+  // for each span in the source phrase (denoted by start and end)
+  for(size_t startPos = 0; startPos < sourceSize ; startPos++)
+  {
+    for(size_t endPos = startPos; endPos < sourceSize ; endPos++)
+    {
+      const size_t length = endPos - startPos + 1;
+
+      // score[k]: best score found so far for the first k words of the span;
+      // every entry except score[0] is initialized to -infinity
+      std::vector< float > score(length + 1, - std::numeric_limits<float>::infinity());
+      score[0] = 0;
+
+      for(size_t currLength = 0 ; currLength < length ; currLength++)
+      {
+        // iterate over the options that start right after the first
+        // currLength words and keep the best-scoring continuation
+        TranslationOptionCollection::const_iterator iterTransOpt;
+        for(iterTransOpt = begin() ; iterTransOpt != end() ; ++iterTransOpt)
+        {
+          const TranslationOption &transOpt = *iterTransOpt;
+          size_t index = currLength + transOpt.GetSize();
+
+          // score[index] is only read once the option is known to end
+          // inside the span (short-circuit evaluation)
+          if (transOpt.GetStartPos() == currLength + startPos
+              && transOpt.GetEndPos() <= endPos
+              && transOpt.GetFutureScore() + score[currLength] > score[index])
+          {
+            score[index] = transOpt.GetFutureScore() + score[currLength];
+          }
+        }
+      }
+      // record the best score for the whole span in the future cost table.
+      m_futureScore.SetScore(startPos, endPos, score[length]);
+
+      //print information about future cost table when verbose option is set
+
+      if(verboseLevel > 0)
+      {
+        std::cout<<"future cost from "<<startPos<<" to "<<endPos<<" is "<<score[length]<<std::endl;
+      }
+    }
+  }
+
+}
+
diff --git a/moses/src/TranslationOptionCollectionText.h b/moses/src/TranslationOptionCollectionText.h
new file mode 100644
index 000000000..2b085722e
--- /dev/null
+++ b/moses/src/TranslationOptionCollectionText.h
@@ -0,0 +1,22 @@
+// $Id$
+#ifndef TRANSLATIONOPTIONCOLLECTIONTEXT_H_
+#define TRANSLATIONOPTIONCOLLECTIONTEXT_H_
+#include "TranslationOptionCollection.h"
+#include "Sentence.h"
+
+// Translation option collection for plain-text (Sentence) input.
+// Only a reference to the sentence is stored, so the Sentence handed to the
+// constructor has to outlive this object.
+class TranslationOptionCollectionText : public TranslationOptionCollection {
+ private:
+  const Sentence &m_inputSentence;
+
+ public:
+  TranslationOptionCollectionText(const Sentence &inputSentence);
+
+  // implements TranslationOptionCollection::CreateTranslationOptions
+  virtual void CreateTranslationOptions(const std::list < DecodeStep > &decodeStepList,
+                                        const LMList &languageModels,
+                                        const LMList &allLM,
+                                        FactorCollection &factorCollection,
+                                        float weightWordPenalty,
+                                        bool dropUnknown,
+                                        size_t verboseLevel);
+
+  // implements TranslationOptionCollection::GetSourceSize
+  virtual size_t GetSourceSize() const;
+};
+#endif
diff --git a/moses/src/WordsRange.h b/moses/src/WordsRange.h
index f06cf5633..aa3b4c281 100755
--- a/moses/src/WordsRange.h
+++ b/moses/src/WordsRange.h
@@ -53,6 +53,10 @@ public:
{
return (m_startPos == NOT_FOUND) ? 0 : m_endPos - m_startPos + 1;
}
-
+ // Orders ranges by start position; equal starts are ordered by end
+ // position. Gives WordsRange the ordering needed for use as a std::map key.
+ inline bool operator<(const WordsRange& x) const
+ {
+   if (m_startPos != x.m_startPos)
+     return m_startPos < x.m_startPos;
+   return m_endPos < x.m_endPos;
+ }
};