diff options
Diffstat (limited to 'moses/src/TranslationOptionCollection.cpp')
-rw-r--r-- | moses/src/TranslationOptionCollection.cpp | 583 |
1 files changed, 0 insertions, 583 deletions
diff --git a/moses/src/TranslationOptionCollection.cpp b/moses/src/TranslationOptionCollection.cpp deleted file mode 100644 index 5a4522847..000000000 --- a/moses/src/TranslationOptionCollection.cpp +++ /dev/null @@ -1,583 +0,0 @@ -// $Id$ - -/*********************************************************************** -Moses - factored phrase-based language decoder -Copyright (C) 2006 University of Edinburgh - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#include "TranslationOptionCollection.h" -#include "Sentence.h" -#include "DecodeStep.h" -#include "LanguageModel.h" -#include "PhraseDictionary.h" -#include "FactorCollection.h" -#include "Input.h" -#include "Util.h" - -#include "StaticData.h" - -using namespace std; - -TranslationOptionCollection::TranslationOptionCollection(InputType const& src, size_t maxNoTransOptPerCoverage) - : m_source(src) - ,m_futureScore(src.GetSize()) - ,m_unknownWordPos(src.GetSize()) - ,m_maxNoTransOptPerCoverage(maxNoTransOptPerCoverage) -{ - // create 2-d vector - size_t size = src.GetSize(); - for (size_t startPos = 0 ; startPos < size ; ++startPos) - { - m_collection.push_back( vector< TranslationOptionList >() ); - for (size_t endPos = startPos ; endPos < size ; ++endPos) - { - m_collection[startPos].push_back( TranslationOptionList() ); - } - } -} - -TranslationOptionCollection::~TranslationOptionCollection() -{ - // delete all trans opt - size_t size = m_source.GetSize(); - for (size_t startPos = 0 ; startPos < size ; ++startPos) - { - for (size_t endPos = startPos ; endPos < size ; ++endPos) - { - RemoveAllInColl<TranslationOptionList::iterator>(GetTranslationOptionList(startPos, endPos)); - } - } -} - -// helper -bool CompareTranslationOption(const TranslationOption *a, const TranslationOption *b) -{ - return a->GetTotalScore() > b->GetTotalScore(); -} - -void TranslationOptionCollection::Prune() -{ - if (m_maxNoTransOptPerCoverage == 0) - return; - - size_t total = 0; - size_t totalPruned = 0; - size_t size = m_source.GetSize(); - for (size_t startPos = 0 ; startPos < size ; ++startPos) - { - for (size_t endPos = startPos ; endPos < size ; ++endPos) - { - TranslationOptionList &fullList = GetTranslationOptionList(startPos, endPos); - total += fullList.size(); - if (fullList.size() <= m_maxNoTransOptPerCoverage) - continue; - - // sort in vector - nth_element(fullList.begin(), fullList.begin() + m_maxNoTransOptPerCoverage, fullList.end(), CompareTranslationOption); - - totalPruned += fullList.size() - m_maxNoTransOptPerCoverage; - - // delete the rest - for (size_t i = m_maxNoTransOptPerCoverage ; i < fullList.size() ; ++i) - { - delete fullList[i]; - } - fullList.resize(m_maxNoTransOptPerCoverage); - } - } - if (StaticData::Instance()->GetVerboseLevel() >= 1) - { - std::cerr << " Total translation options: " << total << std::endl; - std::cerr << "Total translation options pruned: " << totalPruned << std::endl; - } -} - -void TranslationOptionCollection::CalcFutureScore(size_t verboseLevel) -{ - // create future score matrix in a dynamic programming fashion - - // setup the matrix (ignore lower triangle, set upper triangle to -inf - size_t size = m_source.GetSize(); // the width of the matrix - - for(size_t row=0; row<size; row++) { - for(size_t col=row; col<size; col++) { - m_futureScore.SetScore(row, col, -numeric_limits<float>::infinity()); - } - } - - // walk all the translation options and record the cheapest option for each span - for (size_t startPos = 0 ; startPos < m_source.GetSize() ; ++startPos) - { - for (size_t endPos = startPos ; endPos < m_source.GetSize() ; ++endPos) - { - TranslationOptionList &transOptList = GetTranslationOptionList(startPos, endPos); - - TranslationOptionList::const_iterator iterTransOpt; - for(iterTransOpt = transOptList.begin() ; iterTransOpt != transOptList.end() ; ++iterTransOpt) - { - const TranslationOption &transOpt = **iterTransOpt; - float score = transOpt.GetFutureScore(); - if (score > m_futureScore.GetScore(startPos, endPos)) - m_futureScore.SetScore(startPos, endPos, score); - } - } - } - - // now fill all the cells in the strictly upper triangle - // there is no way to modify the diagonal now, in the case - // where no translation option covers a single-word span, - // we leave the +inf in the matrix - // like in chart parsing we want each cell to contain the highest score - // of the full-span trOpt or the sum of scores of joining two smaller spans - - for(size_t colstart = 1; colstart < size ; colstart++) { - for(size_t diagshift = 0; diagshift < size-colstart ; diagshift++) { - size_t startPos = diagshift; - size_t endPos = colstart+diagshift; - for(size_t joinAt = startPos; joinAt < endPos ; joinAt++) { - float joinedScore = m_futureScore.GetScore(startPos, joinAt) - + m_futureScore.GetScore(joinAt+1, endPos); - /* // uncomment to see the cell filling scheme - cerr << "[" <<startPos<<","<<endPos<<"] <-? ["<<startPos<<","<<joinAt<<"]+["<<joinAt+1<<","<<endPos - << "] (colstart: "<<colstart<<", diagshift: "<<diagshift<<")"<<endl; - */ - if (joinedScore > m_futureScore.GetScore(startPos, endPos)) - m_futureScore.SetScore(startPos, endPos, joinedScore); - } - } - } - - if(verboseLevel > 2) - { - int total = 0; - for(size_t row=0; row<size; row++) - { - for(size_t col=row; col<size; col++) - { - int count = GetTranslationOptionList(row, col).size(); - TRACE_ERR("translation options spanning from " - << row <<" to "<< col <<" is " - << count <<endl); - total += count; - } - } - cout << "translation options generated in total: "<< total << endl; - - for(size_t row=0; row<size; row++) - for(size_t col=row; col<size; col++) - cout<<"future cost from "<< row <<" to "<< col <<" is "<< m_futureScore.GetScore(row, col) <<endl; - } -} - - -// helpers -typedef pair<Word, float> WordPair; -typedef list< WordPair > WordList; -// 1st = word -// 2nd = score -typedef list< WordPair >::const_iterator WordListIterator; - -inline void IncrementIterators(vector< WordListIterator > &wordListIterVector - , const vector< WordList > &wordListVector) -{ - for (size_t currPos = 0 ; currPos < wordListVector.size() ; currPos++) - { - WordListIterator &iter = wordListIterVector[currPos]; - iter++; - if (iter != wordListVector[currPos].end()) - { // eg. 4 -> 5 - return; - } - else - { // eg 9 -> 10 - iter = wordListVector[currPos].begin(); - } - } -} - -void TranslationOptionCollection::ProcessGeneration( - const TranslationOption &inputPartialTranslOpt - , const DecodeStep &decodeStep - , PartialTranslOptColl &outputPartialTranslOptColl - , int dropUnknown - , FactorCollection &factorCollection - , float weightWordPenalty) -{ - //TRACE_ERR(inputPartialTranslOpt << endl); - if (inputPartialTranslOpt.GetTargetPhrase().GetSize() == 0) - { // word deletion - - TranslationOption *newTransOpt = new TranslationOption(inputPartialTranslOpt); - outputPartialTranslOptColl.Add(newTransOpt); - - return; - } - - // normal generation step - const GenerationDictionary &generationDictionary = decodeStep.GetGenerationDictionary(); - const WordsRange &sourceWordsRange = inputPartialTranslOpt.GetSourceWordsRange(); - const float weight = generationDictionary.GetWeight(); - - const Phrase &targetPhrase = inputPartialTranslOpt.GetTargetPhrase(); - size_t targetLength = targetPhrase.GetSize(); - - // generation list for each word in hypothesis - vector< WordList > wordListVector(targetLength); - - // create generation list - int wordListVectorPos = 0; - for (size_t currPos = 0 ; currPos < targetLength ; currPos++) - { - WordList &wordList = wordListVector[wordListVectorPos]; - const FactorArray &factorArray = targetPhrase.GetFactorArray(currPos); - - const OutputWordCollection *wordColl = generationDictionary.FindWord(factorArray); - - if (wordColl == NULL) - { // word not found in generation dictionary - ProcessUnknownWord(sourceWordsRange.GetStartPos(), dropUnknown, factorCollection, weightWordPenalty); - return; - } - else - { - OutputWordCollection::const_iterator iterWordColl; - for (iterWordColl = wordColl->begin() ; iterWordColl != wordColl->end(); ++iterWordColl) - { - const Word &outputWord = (*iterWordColl).first; - float score = (*iterWordColl).second; - wordList.push_back(WordPair(outputWord, score)); - } - - wordListVectorPos++; - } - } - - // use generation list (wordList) - // set up iterators - size_t numIteration = 1; - vector< WordListIterator > wordListIterVector(targetLength); - vector< const Word* > mergeWords(targetLength); - for (size_t currPos = 0 ; currPos < targetLength ; currPos++) - { - wordListIterVector[currPos] = wordListVector[currPos].begin(); - numIteration *= wordListVector[currPos].size(); - } - - // go thru each possible factor for each word & create hypothesis - for (size_t currIter = 0 ; currIter < numIteration ; currIter++) - { - float generationScore = 0; // total score for this string of words - - // create vector of words with new factors for last phrase - for (size_t currPos = 0 ; currPos < targetLength ; currPos++) - { - const WordPair &wordPair = *wordListIterVector[currPos]; - mergeWords[currPos] = &(wordPair.first); - generationScore += wordPair.second; - } - - // merge with existing trans opt - Phrase genPhrase(Output, mergeWords); - TranslationOption *newTransOpt = inputPartialTranslOpt.MergeGeneration(genPhrase, &generationDictionary, generationScore, weight); - if (newTransOpt != NULL) - { - outputPartialTranslOptColl.Add(newTransOpt); - } - - // increment iterators - IncrementIterators(wordListIterVector, wordListVector); - } -} - - -void TranslationOptionCollection::ProcessTranslation( - const TranslationOption &inputPartialTranslOpt - , const DecodeStep &decodeStep - , PartialTranslOptColl &outputPartialTranslOptColl - , int dropUnknown - , FactorCollection &factorCollection - , float weightWordPenalty) -{ - //TRACE_ERR(inputPartialTranslOpt << endl); - if (inputPartialTranslOpt.GetTargetPhrase().GetSize() == 0) - { // word deletion - - outputPartialTranslOptColl.Add(new TranslationOption(inputPartialTranslOpt)); - - return; - } - - // normal trans step - const WordsRange &sourceWordsRange = inputPartialTranslOpt.GetSourceWordsRange(); - const PhraseDictionaryBase &phraseDictionary = decodeStep.GetPhraseDictionary(); - const TargetPhraseCollection *phraseColl = phraseDictionary.GetTargetPhraseCollection(m_source,sourceWordsRange); - - if (phraseColl != NULL) - { - TargetPhraseCollection::const_iterator iterTargetPhrase; - - for (iterTargetPhrase = phraseColl->begin(); iterTargetPhrase != phraseColl->end(); ++iterTargetPhrase) - { - const TargetPhrase& targetPhrase = *iterTargetPhrase; - - TranslationOption *newTransOpt = inputPartialTranslOpt.MergeTranslation(targetPhrase); - if (newTransOpt != NULL) - { - outputPartialTranslOptColl.Add( newTransOpt ); - } - } - } - else if (sourceWordsRange.GetWordsCount() == 1) - { // unknown handler - ProcessUnknownWord(sourceWordsRange.GetStartPos(), dropUnknown, factorCollection, weightWordPenalty); - } -} - - -/*** - * Add to m_possibleTranslations all possible translations the phrase table gives us for - * the given phrase - * - * \param phrase The source phrase to translate - * \param phraseDictionary The phrase table - * \param lmListInitial A list of language models - */ -void TranslationOptionCollection::CreateTranslationOptions( - const list < DecodeStep > &decodeStepList - , const LMList &allLM - , FactorCollection &factorCollection - , float weightWordPenalty - , bool dropUnknown - , size_t verboseLevel) -{ - m_allLM = &allLM; - // partial trans opt stored in here - vector < PartialTranslOptColl* > outputPartialTranslOptCollVec( decodeStepList.size() ); - outputPartialTranslOptCollVec[0] = new PartialTranslOptColl(); - - // initial translation step - list < DecodeStep >::const_iterator iterStep = decodeStepList.begin(); - const DecodeStep &decodeStep = *iterStep; - - ProcessInitialTranslation(decodeStep, factorCollection - , weightWordPenalty, dropUnknown - , verboseLevel, *outputPartialTranslOptCollVec[0]); - - // do rest of decode steps - - int indexStep = 0; - for (++iterStep ; iterStep != decodeStepList.end() ; ++iterStep) - { - const DecodeStep &decodeStep = *iterStep; - - outputPartialTranslOptCollVec[indexStep + 1] = new PartialTranslOptColl(); - PartialTranslOptColl &inputPartialTranslOptColl = *outputPartialTranslOptCollVec[indexStep] - ,&outputPartialTranslOptColl = *outputPartialTranslOptCollVec[indexStep + 1]; - - // is it translation or generation - switch (decodeStep.GetDecodeType()) - { - case Translate: - { - // go thru each intermediate trans opt just created - PartialTranslOptColl::const_iterator iterPartialTranslOpt; - for (iterPartialTranslOpt = inputPartialTranslOptColl.begin() ; iterPartialTranslOpt != inputPartialTranslOptColl.end() ; ++iterPartialTranslOpt) - { - const TranslationOption &inputPartialTranslOpt = **iterPartialTranslOpt; - ProcessTranslation(inputPartialTranslOpt - , decodeStep - , outputPartialTranslOptColl - , dropUnknown - , factorCollection - , weightWordPenalty); - } - break; - } - case Generate: - { - // go thru each hypothesis just created - PartialTranslOptColl::const_iterator iterPartialTranslOpt; - for (iterPartialTranslOpt = inputPartialTranslOptColl.begin() ; iterPartialTranslOpt != inputPartialTranslOptColl.end() ; ++iterPartialTranslOpt) - { - const TranslationOption &inputPartialTranslOpt = **iterPartialTranslOpt; - ProcessGeneration(inputPartialTranslOpt - , decodeStep - , outputPartialTranslOptColl - , dropUnknown - , factorCollection - , weightWordPenalty); - } - break; - } - case InsertNullFertilityWord: - { // TODO ask chris or evan what should be done - assert(false); - break; - } - } - // last but 1 partial trans not required anymore - delete outputPartialTranslOptCollVec[indexStep]; - indexStep++; - } // for (++iterStep - - // add to real trans opt list - PartialTranslOptColl &lastPartialTranslOptColl = *outputPartialTranslOptCollVec[decodeStepList.size() - 1]; - PartialTranslOptColl::iterator iterColl; - for (iterColl = lastPartialTranslOptColl.begin() ; iterColl != lastPartialTranslOptColl.end() ; iterColl++) - { - TranslationOption *transOpt = *iterColl; - transOpt->CalcScore(allLM, weightWordPenalty); - Add(transOpt); - } - - lastPartialTranslOptColl.DetachAll(); - delete outputPartialTranslOptCollVec[decodeStepList.size() - 1]; - - // Prune - Prune(); - - // future score - CalcFutureScore(verboseLevel); -} - - - -void TranslationOptionCollection::ProcessOneUnknownWord(const FactorArray &sourceWord, - size_t sourcePos - , int dropUnknown - , FactorCollection &factorCollection - , float weightWordPenalty) -{ - // unknown word, add as trans opt - - size_t isDigit = 0; - if (dropUnknown) - { - const Factor *f = sourceWord[0]; // ??? hack. shouldn't know which factor is surface - std::string s = f->ToString(); - isDigit = s.find_first_of("0123456789"); - if (isDigit == string::npos) - isDigit = 0; - else - isDigit = 1; - // modify the starting bitmap - } - - TranslationOption *transOpt; - if (!dropUnknown || isDigit) - { - // add to dictionary - TargetPhrase targetPhrase(Output); - FactorArray &targetWord = targetPhrase.AddWord(); - - for (unsigned int currFactor = 0 ; currFactor < NUM_FACTORS ; currFactor++) - { - FactorType factorType = static_cast<FactorType>(currFactor); - - const Factor *sourceFactor = sourceWord[currFactor]; - if (sourceFactor == NULL) - targetWord[factorType] = factorCollection.AddFactor(Output, factorType, UNKNOWN_FACTOR); - else - targetWord[factorType] = factorCollection.AddFactor(Output, factorType, sourceFactor->GetString()); - } - - targetPhrase.SetScore(weightWordPenalty); - - transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos), targetPhrase, 0); - } - else - { // drop source word. create blank trans opt - const TargetPhrase targetPhrase(Output); - transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos), targetPhrase, 0); - } - - transOpt->CalcScore(*m_allLM, weightWordPenalty); - Add(transOpt); - - m_unknownWordPos.SetValue(sourcePos, true); -} - - - -void TranslationOptionCollection::ProcessInitialTranslation( - const DecodeStep &decodeStep - , FactorCollection &factorCollection - , float weightWordPenalty - , int dropUnknown - , size_t verboseLevel - , PartialTranslOptColl &outputPartialTranslOptColl) -{ - // loop over all substrings of the source sentence, look them up - // in the phraseDictionary (which is the- possibly filtered-- phrase - // table loaded on initialization), generate TranslationOption objects - // for all phrases - // - // possible optimization- don't consider phrases longer than the longest - // phrase in the PhraseDictionary? - - const PhraseDictionaryBase &phraseDictionary = decodeStep.GetPhraseDictionary(); - for (size_t startPos = 0 ; startPos < m_source.GetSize() ; startPos++) - { - if (m_unknownWordPos.GetValue(startPos)) - { // unknown word but already processed. skip - continue; - } - - for (size_t endPos = startPos ; endPos < m_source.GetSize() ; endPos++) - { - const WordsRange wordsRange(startPos, endPos); - const TargetPhraseCollection *phraseColl = phraseDictionary.GetTargetPhraseCollection(m_source,wordsRange); - if (phraseColl != NULL) - { - if (verboseLevel >= 3) - { - cout << "[" << m_source.GetSubString(wordsRange) << "; " << startPos << "-" << endPos << "]\n"; - } - - TargetPhraseCollection::const_iterator iterTargetPhrase; - for (iterTargetPhrase = phraseColl->begin() ; iterTargetPhrase != phraseColl->end() ; ++iterTargetPhrase) - { - const TargetPhrase &targetPhrase = *iterTargetPhrase; - outputPartialTranslOptColl.push_back ( new TranslationOption(wordsRange, targetPhrase) ); - - if (verboseLevel >= 3) - { - cout << "\t" << targetPhrase << "\n"; - } - } - if (verboseLevel >= 3) - { - cout << endl; - } - } - else if (wordsRange.GetWordsCount() == 1) - { - ProcessUnknownWord(startPos, dropUnknown, factorCollection, weightWordPenalty); - continue; - } - } - } -} - -void TranslationOptionCollection::Add(const TranslationOption *translationOption) -{ - const WordsRange &coverage = translationOption->GetSourceWordsRange(); - m_collection[coverage.GetStartPos()][coverage.GetEndPos() - coverage.GetStartPos()].push_back(translationOption); -} - -TO_STRING_BODY(TranslationOptionCollection); - |