diff options
author | hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230> | 2006-10-05 13:45:49 +0400 |
---|---|---|
committer | hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230> | 2006-10-05 13:45:49 +0400 |
commit | efd3ada355fe31a24b0f59d7c7431071df606e69 (patch) | |
tree | b3526a057542db99966ebe91de79e9e7d1a6eb23 /moses | |
parent | e5231db5477d226c34db68a15d79b74fcde306f5 (diff) |
added comments
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@857 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'moses')
-rw-r--r-- | moses/src/DecodeStep.h | 2 | ||||
-rw-r--r-- | moses/src/DecodeStepGeneration.cpp | 2 | ||||
-rw-r--r-- | moses/src/DecodeStepGeneration.h | 2 | ||||
-rw-r--r-- | moses/src/DecodeStepTranslation.cpp | 4 | ||||
-rw-r--r-- | moses/src/DecodeStepTranslation.h | 2 | ||||
-rw-r--r-- | moses/src/TranslationOptionCollection.cpp | 163 | ||||
-rwxr-xr-x | moses/src/TranslationOptionCollection.h | 26 |
7 files changed, 118 insertions, 83 deletions
diff --git a/moses/src/DecodeStep.h b/moses/src/DecodeStep.h index d31e3cb54..b6dae8c1b 100644 --- a/moses/src/DecodeStep.h +++ b/moses/src/DecodeStep.h @@ -102,6 +102,6 @@ public: , PartialTranslOptColl &outputPartialTranslOptColl , FactorCollection &factorCollection , TranslationOptionCollection *toc - , bool observeTableLimit) const = 0; + , bool adhereTableLimit) const = 0; }; diff --git a/moses/src/DecodeStepGeneration.cpp b/moses/src/DecodeStepGeneration.cpp index 998e6dc98..0677424a2 100644 --- a/moses/src/DecodeStepGeneration.cpp +++ b/moses/src/DecodeStepGeneration.cpp @@ -79,7 +79,7 @@ void DecodeStepGeneration::Process(const TranslationOption &inputPartialTranslOp , PartialTranslOptColl &outputPartialTranslOptColl , FactorCollection &factorCollection , TranslationOptionCollection *toc - , bool observeTableLimit) const + , bool adhereTableLimit) const { //TRACE_ERR(inputPartialTranslOpt << endl); if (inputPartialTranslOpt.GetTargetPhrase().GetSize() == 0) diff --git a/moses/src/DecodeStepGeneration.h b/moses/src/DecodeStepGeneration.h index 66adc5f8f..f4ab90761 100644 --- a/moses/src/DecodeStepGeneration.h +++ b/moses/src/DecodeStepGeneration.h @@ -40,7 +40,7 @@ public: , PartialTranslOptColl &outputPartialTranslOptColl , FactorCollection &factorCollection , TranslationOptionCollection *toc - , bool observeTableLimit) const; + , bool adhereTableLimit) const; private: TranslationOption *MergeGeneration(const TranslationOption& oldTO, Phrase &mergePhrase diff --git a/moses/src/DecodeStepTranslation.cpp b/moses/src/DecodeStepTranslation.cpp index b0517dde0..1095f0024 100644 --- a/moses/src/DecodeStepTranslation.cpp +++ b/moses/src/DecodeStepTranslation.cpp @@ -53,7 +53,7 @@ void DecodeStepTranslation::Process(const TranslationOption &inputPartialTranslO , PartialTranslOptColl &outputPartialTranslOptColl , FactorCollection &factorCollection , TranslationOptionCollection *toc - , bool observeTableLimit) const + , bool adhereTableLimit) const { 
//TRACE_ERR(inputPartialTranslOpt << endl); if (inputPartialTranslOpt.GetTargetPhrase().GetSize() == 0) @@ -75,7 +75,7 @@ void DecodeStepTranslation::Process(const TranslationOption &inputPartialTranslO if (phraseColl != NULL) { TargetPhraseCollection::const_iterator iterTargetPhrase, iterEnd; - iterEnd = (!observeTableLimit || tableLimit == 0 || phraseColl->GetSize() < tableLimit) ? phraseColl->end() : phraseColl->begin() + tableLimit; + iterEnd = (!adhereTableLimit || tableLimit == 0 || phraseColl->GetSize() < tableLimit) ? phraseColl->end() : phraseColl->begin() + tableLimit; for (iterTargetPhrase = phraseColl->begin(); iterTargetPhrase != iterEnd; ++iterTargetPhrase) { diff --git a/moses/src/DecodeStepTranslation.h b/moses/src/DecodeStepTranslation.h index 86c1e250b..218892ddc 100644 --- a/moses/src/DecodeStepTranslation.h +++ b/moses/src/DecodeStepTranslation.h @@ -39,7 +39,7 @@ public: , PartialTranslOptColl &outputPartialTranslOptColl , FactorCollection &factorCollection , TranslationOptionCollection *toc - , bool observeTableLimit) const; + , bool adhereTableLimit) const; private: TranslationOption *MergeTranslation(const TranslationOption& oldTO, const TargetPhrase &targetPhrase) const; }; diff --git a/moses/src/TranslationOptionCollection.cpp b/moses/src/TranslationOptionCollection.cpp index 8a1bd9508..80edc0afd 100644 --- a/moses/src/TranslationOptionCollection.cpp +++ b/moses/src/TranslationOptionCollection.cpp @@ -33,7 +33,9 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA using namespace std; -/** constructor; since translation options are indexed by coverage span, the corresponding data structure is initialized here */ +/** constructor; since translation options are indexed by coverage span, the corresponding data structure is initialized here + * This fn should be called by inherited classes +*/ TranslationOptionCollection::TranslationOptionCollection(InputType const& src, size_t maxNoTransOptPerCoverage) : 
m_source(src) ,m_futureScore(src.GetSize()) @@ -108,6 +110,22 @@ void TranslationOptionCollection::Prune() << "Total translation options pruned: " << totalPruned << std::endl); } +/** Force a creation of a translation option where there are none for a particular source position. +* i.e. where a source word has not been translated, create a translation option by +* 1. not observing the table limits on phrase/generation tables +* 2. using the handler ProcessUnknownWord() +* Call this function once translation option collection has been filled with translation options +* +* The sequence of function calls for unknown words is complicated by the fact that it must handle different input types. +* The call stack is +* Base::ProcessUnknownWord() +* Inherited::ProcessUnknownWord() +* Base::ProcessOneUnknownWord() +* +* \param decodeStepList list of decoding steps +* \param factorCollection input sentence with all factors +*/ + void TranslationOptionCollection::ProcessUnknownWord(const std::list < DecodeStep* > &decodeStepList, FactorCollection &factorCollection) { size_t size = m_source.GetSize(); @@ -147,11 +165,75 @@ void TranslationOptionCollection::ProcessUnknownWord(const std::list < DecodeSte } } -/** compute the future score matrix used in search */ -void TranslationOptionCollection::CalcFutureScore() +/** special handling of ONE unknown word. Either temporarily add the word to the translation table, + * or drop the translation. + * This function should be called by the ProcessOneUnknownWord() in the inherited class + * At the moment, this unknown word handler is a bit of a hack, it copies over each factor from source + * to target word, or uses the 'UNK' factor. + * Ideally, this function should be in a class which can be expanded upon, for example, + * to create a morphologically aware handler. 
+ * + * \param sourceWord the unknown word + * \param sourcePos + * \param factorCollection input sentence with all factors + */ +void TranslationOptionCollection::ProcessOneUnknownWord(const Word &sourceWord, + size_t sourcePos + , FactorCollection &factorCollection) { - // create future score matrix in a dynamic programming fashion + // unknown word, add as trans opt + + size_t isDigit = 0; + if (StaticData::Instance()->GetDropUnknown()) + { + const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface + std::string s = f->ToString(); + isDigit = s.find_first_of("0123456789"); + if (isDigit == string::npos) + isDigit = 0; + else + isDigit = 1; + // modify the starting bitmap + } + + TranslationOption *transOpt; + if (! StaticData::Instance()->GetDropUnknown() || isDigit) + { + // add to dictionary + TargetPhrase targetPhrase(Output); + Word &targetWord = targetPhrase.AddWord(); + + for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++) + { + FactorType factorType = static_cast<FactorType>(currFactor); + + const Factor *sourceFactor = sourceWord[currFactor]; + if (sourceFactor == NULL) + targetWord[factorType] = factorCollection.AddFactor(Output, factorType, UNKNOWN_FACTOR); + else + targetWord[factorType] = factorCollection.AddFactor(Output, factorType, sourceFactor->GetString()); + } + + targetPhrase.SetScore(); + + transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos), targetPhrase, 0); + } + else + { // drop source word. create blank trans opt + const TargetPhrase targetPhrase(Output); + transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos), targetPhrase, 0); + } + transOpt->CalcScore(); + Add(transOpt); +} + +/** compute future score matrix in a dynamic programming fashion. + * This matrix is used in search. 
+ * Call this function once translation option collection has been filled with translation options +*/ +void TranslationOptionCollection::CalcFutureScore() +{ // setup the matrix (ignore lower triangle, set upper triangle to -inf size_t size = m_source.GetSize(); // the width of the matrix @@ -260,19 +342,20 @@ void TranslationOptionCollection::CreateTranslationOptions(const list < DecodeSt CalcFutureScore(); } -/** subroutine for CreateTranslationOptions: collect translation options - * that exactly cover a specific input span +/** collect translation options that exactly cover a specific input span. + * Called by CreateTranslationOptions() and ProcessUnknownWord() * \param decodeStepList list of decoding steps * \param factorCollection input sentence with all factors * \param startPos first position in input sentence * \param lastPos last position in input sentence + * \param adhereTableLimit whether phrase & generation table limits are adhered to */ void TranslationOptionCollection::CreateTranslationOptionsForRange( const list < DecodeStep* > &decodeStepList , FactorCollection &factorCollection , size_t startPos , size_t endPos - , bool observeTableLimit) + , bool adhereTableLimit) { // partial trans opt stored in here PartialTranslOptColl* oldPtoc = new PartialTranslOptColl; @@ -283,7 +366,7 @@ void TranslationOptionCollection::CreateTranslationOptionsForRange( ProcessInitialTranslation(decodeStep, factorCollection , *oldPtoc - , startPos, endPos, observeTableLimit ); + , startPos, endPos, adhereTableLimit ); // do rest of decode steps size_t totalEarlyPruned = 0; @@ -304,7 +387,7 @@ void TranslationOptionCollection::CreateTranslationOptionsForRange( , *newPtoc , factorCollection , this - , observeTableLimit); + , adhereTableLimit); } // last but 1 partial trans not required anymore totalEarlyPruned += newPtoc->GetPrunedCount(); @@ -330,68 +413,16 @@ void TranslationOptionCollection::CreateTranslationOptionsForRange( // cerr << "Early translation options 
pruned: " << totalEarlyPruned << endl; } - -/** special handling of unknown words: add special translation (or drop) */ -void TranslationOptionCollection::ProcessOneUnknownWord(const Word &sourceWord, - size_t sourcePos - , FactorCollection &factorCollection) -{ - // unknown word, add as trans opt - - size_t isDigit = 0; - if (StaticData::Instance()->GetDropUnknown()) - { - const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface - std::string s = f->ToString(); - isDigit = s.find_first_of("0123456789"); - if (isDigit == string::npos) - isDigit = 0; - else - isDigit = 1; - // modify the starting bitmap - } - - TranslationOption *transOpt; - if (! StaticData::Instance()->GetDropUnknown() || isDigit) - { - // add to dictionary - TargetPhrase targetPhrase(Output); - Word &targetWord = targetPhrase.AddWord(); - - for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++) - { - FactorType factorType = static_cast<FactorType>(currFactor); - - const Factor *sourceFactor = sourceWord[currFactor]; - if (sourceFactor == NULL) - targetWord[factorType] = factorCollection.AddFactor(Output, factorType, UNKNOWN_FACTOR); - else - targetWord[factorType] = factorCollection.AddFactor(Output, factorType, sourceFactor->GetString()); - } - - targetPhrase.SetScore(); - - transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos), targetPhrase, 0); - } - else - { // drop source word. 
create blank trans opt - const TargetPhrase targetPhrase(Output); - transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos), targetPhrase, 0); - } - - transOpt->CalcScore(); - Add(transOpt); -} - - -/** initialize list of partial translation options by applying the first translation step */ +/** initialize list of partial translation options by applying the first translation step + * Ideally, this function should be in DecodeStepTranslation class + */ void TranslationOptionCollection::ProcessInitialTranslation( const DecodeStep &decodeStep , FactorCollection &factorCollection , PartialTranslOptColl &outputPartialTranslOptColl , size_t startPos , size_t endPos - , bool observeTableLimit) + , bool adhereTableLimit) { const PhraseDictionaryBase &phraseDictionary = decodeStep.GetPhraseDictionary(); const size_t tableLimit = phraseDictionary.GetTableLimit(); @@ -403,7 +434,7 @@ void TranslationOptionCollection::ProcessInitialTranslation( VERBOSE(3,"[" << m_source.GetSubString(wordsRange) << "; " << startPos << "-" << endPos << "]\n"); TargetPhraseCollection::const_iterator iterTargetPhrase, iterEnd; - iterEnd = (!observeTableLimit || tableLimit == 0 || phraseColl->GetSize() < tableLimit) ? phraseColl->end() : phraseColl->begin() + tableLimit; + iterEnd = (!adhereTableLimit || tableLimit == 0 || phraseColl->GetSize() < tableLimit) ? phraseColl->end() : phraseColl->begin() + tableLimit; for (iterTargetPhrase = phraseColl->begin() ; iterTargetPhrase != iterEnd ; ++iterTargetPhrase) { diff --git a/moses/src/TranslationOptionCollection.h b/moses/src/TranslationOptionCollection.h index 141e8d6cf..b71e7094e 100755 --- a/moses/src/TranslationOptionCollection.h +++ b/moses/src/TranslationOptionCollection.h @@ -40,7 +40,7 @@ class Word; typedef std::vector<const TranslationOption*> TranslationOptionList; -/** Contains all phrase translations applicable to current sentence. 
+/** Contains all phrase translations applicable to current input type (a sentence or confusion network). * A key insight into efficient decoding is that various input * conditions (lattices, factored input, normal text, xml markup) * all lead to the same decoding algorithm: hypotheses are expanded @@ -48,7 +48,11 @@ typedef std::vector<const TranslationOption*> TranslationOptionList; * * The precomputation of a collection of instances of such TranslationOption * depends on the input condition, but they all are presented to - * decoding algorithm in the same form, using this class. **/ + * decoding algorithm in the same form, using this class. + * + * This class cannot, and should not be instantiated directly. Instantiate 1 of the inherited + * classes instead, for a particular input type + **/ class TranslationOptionCollection { @@ -56,9 +60,9 @@ class TranslationOptionCollection TranslationOptionCollection(const TranslationOptionCollection&); /*< no copy constructor */ protected: std::vector< std::vector< TranslationOptionList > > m_collection; /*< contains translation options */ - InputType const &m_source; - SquareMatrix m_futureScore; /*< matrix of future costs for parts of the sentence */ - const size_t m_maxNoTransOptPerCoverage; /*< maximum number of translation options per input span (phrase) */ + InputType const &m_source; /*< reference to the input */ + SquareMatrix m_futureScore; /*< matrix of future costs for contiguous parts (span) of the input */ + const size_t m_maxNoTransOptPerCoverage; /*< maximum number of translation options per input span (phrase???) 
*/ FactorCollection *m_factorCollection; TranslationOptionCollection(InputType const& src, size_t maxNoTransOptPerCoverage); @@ -68,7 +72,7 @@ protected: virtual void ProcessInitialTranslation(const DecodeStep &decodeStep , FactorCollection &factorCollection , PartialTranslOptColl &outputPartialTranslOptColl - , size_t startPos, size_t endPos, bool observeTableLimit ); + , size_t startPos, size_t endPos, bool adhereTableLimit ); void ProcessUnknownWord(const std::list < DecodeStep* > &decodeStepList, FactorCollection &factorCollection); virtual void ProcessOneUnknownWord(const Word &sourceWord @@ -86,7 +90,10 @@ protected: return m_collection[startPos][endPos - startPos]; } void Add(const TranslationOption *translationOption); - + + virtual void ProcessUnknownWord(size_t sourcePos + , FactorCollection &factorCollection)=0; + public: virtual ~TranslationOptionCollection(); const InputType& GetSource() const { return m_source; } @@ -94,9 +101,6 @@ public: // get length/size of source input size_t GetSize() const; - virtual void ProcessUnknownWord(size_t sourcePos - , FactorCollection &factorCollection)=0; - virtual void CreateTranslationOptions(const std::list < DecodeStep* > &decodeStepList , FactorCollection &factorCollection); @@ -104,7 +108,7 @@ public: , FactorCollection &factorCollection , size_t startPosition , size_t endPosition - , bool observeTableLimit); + , bool adhereTableLimit); /** returns future cost matrix for sentence */ inline virtual const SquareMatrix &GetFutureScore() const |