Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary refs log tree commit diff
diff options
context:
space:
mode:
Diffstat (limited to 'moses/src/TranslationOptionCollection.cpp')
-rw-r--r-- moses/src/TranslationOptionCollection.cpp 583
1 files changed, 0 insertions, 583 deletions
diff --git a/moses/src/TranslationOptionCollection.cpp b/moses/src/TranslationOptionCollection.cpp
deleted file mode 100644
index 5a4522847..000000000
--- a/moses/src/TranslationOptionCollection.cpp
+++ /dev/null
@@ -1,583 +0,0 @@
-// $Id$
-
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
-#include "TranslationOptionCollection.h"
-#include "Sentence.h"
-#include "DecodeStep.h"
-#include "LanguageModel.h"
-#include "PhraseDictionary.h"
-#include "FactorCollection.h"
-#include "Input.h"
-#include "Util.h"
-
-#include "StaticData.h"
-
-using namespace std;
-
-// Set up an empty collection for the sentence `src`.
-// maxNoTransOptPerCoverage is the per-span limit later applied by Prune().
-TranslationOptionCollection::TranslationOptionCollection(InputType const& src, size_t maxNoTransOptPerCoverage)
-  : m_source(src)
-  , m_futureScore(src.GetSize())
-  , m_unknownWordPos(src.GetSize())
-  , m_maxNoTransOptPerCoverage(maxNoTransOptPerCoverage)
-{
-  // Triangular 2-d layout: row startPos holds one (initially empty) list for
-  // every span that begins at startPos, i.e. sentenceLength - startPos lists.
-  const size_t sentenceLength = src.GetSize();
-  for (size_t startPos = 0 ; startPos < sentenceLength ; ++startPos)
-  {
-    m_collection.push_back( vector< TranslationOptionList >(sentenceLength - startPos) );
-  }
-}
-
-// Delete all translation options, span by span.
-TranslationOptionCollection::~TranslationOptionCollection()
-{
-  const size_t sentenceLength = m_source.GetSize();
-  for (size_t first = 0 ; first < sentenceLength ; ++first)
-  {
-    for (size_t last = first ; last < sentenceLength ; ++last)
-    {
-      TranslationOptionList &spanOptions = GetTranslationOptionList(first, last);
-      RemoveAllInColl<TranslationOptionList::iterator>(spanOptions);
-    }
-  }
-}
-
-// Ordering predicate used by Prune(): true when *a has a strictly higher
-// total score than *b, so higher-scoring options are ordered first.
-bool CompareTranslationOption(const TranslationOption *a, const TranslationOption *b)
-{
-  return b->GetTotalScore() < a->GetTotalScore();
-}
-
-// For every source span keep at most m_maxNoTransOptPerCoverage translation
-// options (the highest-scoring ones) and delete the others.
-// A limit of 0 switches pruning off entirely.
-void TranslationOptionCollection::Prune()
-{
-  const size_t limit = m_maxNoTransOptPerCoverage;
-  if (limit == 0)
-    return;
-
-  size_t numOptions = 0;
-  size_t numDiscarded = 0;
-  const size_t sentenceLength = m_source.GetSize();
-  for (size_t first = 0 ; first < sentenceLength ; ++first)
-  {
-    for (size_t last = first ; last < sentenceLength ; ++last)
-    {
-      TranslationOptionList &spanOptions = GetTranslationOptionList(first, last);
-      const size_t numInSpan = spanOptions.size();
-      numOptions += numInSpan;
-
-      if (numInSpan > limit)
-      {
-        // partition only: the best `limit` options end up in front, unsorted
-        nth_element(spanOptions.begin(), spanOptions.begin() + limit, spanOptions.end(), CompareTranslationOption);
-
-        numDiscarded += numInSpan - limit;
-
-        // free the options behind the cut, then drop their slots
-        TranslationOptionList::iterator firstDiscarded = spanOptions.begin() + limit;
-        for (TranslationOptionList::iterator iter = firstDiscarded ; iter != spanOptions.end() ; ++iter)
-        {
-          delete *iter;
-        }
-        spanOptions.erase(firstDiscarded, spanOptions.end());
-      }
-    }
-  }
-
-  if (StaticData::Instance()->GetVerboseLevel() >= 1)
-  {
-    std::cerr << " Total translation options: " << numOptions << std::endl;
-    std::cerr << "Total translation options pruned: " << numDiscarded << std::endl;
-  }
-}
-
-// Fill m_futureScore so that cell (startPos, endPos) holds the best score
-// found for that source span: either the highest GetFutureScore() of an
-// option covering the span exactly, or the best sum of two adjacent sub-spans.
-// When verboseLevel > 2 the per-span option counts and the resulting future
-// costs are printed.
-void TranslationOptionCollection::CalcFutureScore(size_t verboseLevel)
-{
- // create future score matrix in a dynamic programming fashion
-
- // setup the matrix (ignore lower triangle, set upper triangle to -inf)
- size_t size = m_source.GetSize(); // the width of the matrix
-
- for(size_t row=0; row<size; row++) {
- for(size_t col=row; col<size; col++) {
- m_futureScore.SetScore(row, col, -numeric_limits<float>::infinity());
- }
- }
-
- // walk all the translation options and record the best (highest) future
- // score available for each span
- for (size_t startPos = 0 ; startPos < m_source.GetSize() ; ++startPos)
- {
- for (size_t endPos = startPos ; endPos < m_source.GetSize() ; ++endPos)
- {
- TranslationOptionList &transOptList = GetTranslationOptionList(startPos, endPos);
-
- TranslationOptionList::const_iterator iterTransOpt;
- for(iterTransOpt = transOptList.begin() ; iterTransOpt != transOptList.end() ; ++iterTransOpt)
- {
- const TranslationOption &transOpt = **iterTransOpt;
- float score = transOpt.GetFutureScore();
- if (score > m_futureScore.GetScore(startPos, endPos))
- m_futureScore.SetScore(startPos, endPos, score);
- }
- }
- }
-
- // now fill all the cells in the strictly upper triangle
- // there is no way to modify the diagonal now, in the case
- // where no translation option covers a single-word span,
- // we leave the -inf in the matrix
- // like in chart parsing we want each cell to contain the highest score
- // of the full-span trOpt or the sum of scores of joining two smaller spans
-
- // colstart is the span width minus one; cells are visited diagonal by
- // diagonal, so both sub-spans of a join are final before they are read
- for(size_t colstart = 1; colstart < size ; colstart++) {
- for(size_t diagshift = 0; diagshift < size-colstart ; diagshift++) {
- size_t startPos = diagshift;
- size_t endPos = colstart+diagshift;
- for(size_t joinAt = startPos; joinAt < endPos ; joinAt++) {
- float joinedScore = m_futureScore.GetScore(startPos, joinAt)
- + m_futureScore.GetScore(joinAt+1, endPos);
- /* // uncomment to see the cell filling scheme
- cerr << "[" <<startPos<<","<<endPos<<"] <-? ["<<startPos<<","<<joinAt<<"]+["<<joinAt+1<<","<<endPos
- << "] (colstart: "<<colstart<<", diagshift: "<<diagshift<<")"<<endl;
- */
- if (joinedScore > m_futureScore.GetScore(startPos, endPos))
- m_futureScore.SetScore(startPos, endPos, joinedScore);
- }
- }
- }
-
- // diagnostics only: per-span counts go through TRACE_ERR, the grand total
- // and the future-cost table go to cout
- if(verboseLevel > 2)
- {
- int total = 0;
- for(size_t row=0; row<size; row++)
- {
- for(size_t col=row; col<size; col++)
- {
- int count = GetTranslationOptionList(row, col).size();
- TRACE_ERR("translation options spanning from "
- << row <<" to "<< col <<" is "
- << count <<endl);
- total += count;
- }
- }
- cout << "translation options generated in total: "<< total << endl;
-
- for(size_t row=0; row<size; row++)
- for(size_t col=row; col<size; col++)
- cout<<"future cost from "<< row <<" to "<< col <<" is "<< m_futureScore.GetScore(row, col) <<endl;
- }
-}
-
-
-// helpers for ProcessGeneration
-// WordPair: first = generated word, second = its generation score
-typedef pair<Word, float> WordPair;
-typedef list< WordPair > WordList;
-typedef WordList::const_iterator WordListIterator;
-
-// Advance a vector of list iterators like an odometer: bump the iterator at
-// position 0; whenever an iterator runs off the end of its list it is reset
-// to the beginning and the bump carries over to the next position.
-inline void IncrementIterators(vector< WordListIterator > &wordListIterVector
-                              , const vector< WordList > &wordListVector)
-{
-  for (size_t digit = 0 ; digit < wordListVector.size() ; ++digit)
-  {
-    WordListIterator &cursor = wordListIterVector[digit];
-    ++cursor;
-    if (cursor != wordListVector[digit].end())
-      return; // eg. 4 -> 5, no carry needed
-
-    // eg. 9 -> 10: wrap this position and carry into the next one
-    cursor = wordListVector[digit].begin();
-  }
-}
-
-// Apply one generation step to a partial translation option.
-// Every target word is looked up in the step's generation dictionary; one new
-// partial option is produced (via MergeGeneration) for each combination of
-// generated words and added to outputPartialTranslOptColl.
-// If any target word is missing from the dictionary, the source position is
-// handed to ProcessUnknownWord() and nothing is added for this input.
-void TranslationOptionCollection::ProcessGeneration(
-      const TranslationOption &inputPartialTranslOpt
-    , const DecodeStep &decodeStep
-    , PartialTranslOptColl &outputPartialTranslOptColl
-    , int dropUnknown
-    , FactorCollection &factorCollection
-    , float weightWordPenalty)
-{
-  const Phrase &targetPhrase = inputPartialTranslOpt.GetTargetPhrase();
-  const size_t targetLength = targetPhrase.GetSize();
-
-  if (targetLength == 0)
-  { // word deletion: nothing to generate, pass a copy through
-    outputPartialTranslOptColl.Add(new TranslationOption(inputPartialTranslOpt));
-    return;
-  }
-
-  // normal generation step
-  const GenerationDictionary &generationDictionary = decodeStep.GetGenerationDictionary();
-  const WordsRange &sourceWordsRange = inputPartialTranslOpt.GetSourceWordsRange();
-  const float weight = generationDictionary.GetWeight();
-
-  // candidate (word, score) list for each target position
-  vector< WordList > wordListVector(targetLength);
-  for (size_t pos = 0 ; pos < targetLength ; ++pos)
-  {
-    const FactorArray &factorArray = targetPhrase.GetFactorArray(pos);
-    const OutputWordCollection *wordColl = generationDictionary.FindWord(factorArray);
-
-    if (wordColl == NULL)
-    { // word not found in generation dictionary
-      ProcessUnknownWord(sourceWordsRange.GetStartPos(), dropUnknown, factorCollection, weightWordPenalty);
-      return;
-    }
-
-    WordList &candidates = wordListVector[pos];
-    for (OutputWordCollection::const_iterator iterWordColl = wordColl->begin()
-         ; iterWordColl != wordColl->end() ; ++iterWordColl)
-    {
-      const Word &outputWord = iterWordColl->first;
-      const float score = iterWordColl->second;
-      candidates.push_back(WordPair(outputWord, score));
-    }
-  }
-
-  // one iterator per position; the number of combinations is the product
-  // of the candidate list sizes
-  vector< WordListIterator > wordListIterVector(targetLength);
-  size_t numCombinations = 1;
-  for (size_t pos = 0 ; pos < targetLength ; ++pos)
-  {
-    wordListIterVector[pos] = wordListVector[pos].begin();
-    numCombinations *= wordListVector[pos].size();
-  }
-
-  // enumerate every combination and merge it with the input option
-  vector< const Word* > mergeWords(targetLength);
-  for (size_t remaining = numCombinations ; remaining > 0 ; --remaining)
-  {
-    float generationScore = 0; // total score for this string of words
-    for (size_t pos = 0 ; pos < targetLength ; ++pos)
-    {
-      const WordPair &choice = *wordListIterVector[pos];
-      mergeWords[pos] = &(choice.first);
-      generationScore += choice.second;
-    }
-
-    Phrase genPhrase(Output, mergeWords);
-    TranslationOption *merged = inputPartialTranslOpt.MergeGeneration(genPhrase, &generationDictionary, generationScore, weight);
-    if (merged != NULL)
-    {
-      outputPartialTranslOptColl.Add(merged);
-    }
-
-    // step to the next combination
-    IncrementIterators(wordListIterVector, wordListVector);
-  }
-}
-
-
-// Apply one translation step to a partial translation option.
-// Each target phrase the step's phrase dictionary offers for the option's
-// source range is merged with the input (MergeTranslation); successful merges
-// are added to outputPartialTranslOptColl. If the dictionary has nothing for
-// a single-word range, the word is handed to ProcessUnknownWord().
-void TranslationOptionCollection::ProcessTranslation(
-      const TranslationOption &inputPartialTranslOpt
-    , const DecodeStep &decodeStep
-    , PartialTranslOptColl &outputPartialTranslOptColl
-    , int dropUnknown
-    , FactorCollection &factorCollection
-    , float weightWordPenalty)
-{
-  if (inputPartialTranslOpt.GetTargetPhrase().GetSize() == 0)
-  { // word deletion: pass a copy through unchanged
-    TranslationOption *copied = new TranslationOption(inputPartialTranslOpt);
-    outputPartialTranslOptColl.Add(copied);
-    return;
-  }
-
-  // normal trans step
-  const WordsRange &sourceWordsRange = inputPartialTranslOpt.GetSourceWordsRange();
-  const PhraseDictionaryBase &phraseDictionary = decodeStep.GetPhraseDictionary();
-  const TargetPhraseCollection *phraseColl = phraseDictionary.GetTargetPhraseCollection(m_source,sourceWordsRange);
-
-  if (phraseColl == NULL)
-  {
-    if (sourceWordsRange.GetWordsCount() == 1)
-    { // unknown handler
-      ProcessUnknownWord(sourceWordsRange.GetStartPos(), dropUnknown, factorCollection, weightWordPenalty);
-    }
-    return;
-  }
-
-  for (TargetPhraseCollection::const_iterator iterTargetPhrase = phraseColl->begin()
-       ; iterTargetPhrase != phraseColl->end() ; ++iterTargetPhrase)
-  {
-    const TargetPhrase &targetPhrase = *iterTargetPhrase;
-    TranslationOption *merged = inputPartialTranslOpt.MergeTranslation(targetPhrase);
-    if (merged != NULL)
-    {
-      outputPartialTranslOptColl.Add(merged);
-    }
-  }
-}
-
-
-/***
- * Build the translation options for the source sentence by running the decode
- * steps in order, then prune them and compute the future score matrix.
- *
- * The first decode step is used as the initial translation step; every later
- * step (translation or generation) expands the partial options produced by the
- * step before it. The options left after the last step are scored and added
- * to this collection.
- *
- * \param decodeStepList      Decode steps to run, in order. If empty, no options
- *                            are generated; Prune() and CalcFutureScore() still run.
- * \param allLM               Language models used for scoring; the pointer is
- *                            kept in m_allLM, so allLM must outlive its use there.
- * \param factorCollection    Passed through to the per-step processing.
- * \param weightWordPenalty   Word penalty weight passed to the scoring calls.
- * \param dropUnknown         Passed through to the unknown-word handling.
- * \param verboseLevel        Passed to ProcessInitialTranslation() and CalcFutureScore().
- */
-void TranslationOptionCollection::CreateTranslationOptions(
-      const list < DecodeStep > &decodeStepList
-    , const LMList &allLM
-    , FactorCollection &factorCollection
-    , float weightWordPenalty
-    , bool dropUnknown
-    , size_t verboseLevel)
-{
-  m_allLM = &allLM;
-
-  if (decodeStepList.empty())
-  {
-    // Nothing to run. Without this guard the code below would dereference
-    // begin() of an empty list and compute size() - 1 on an unsigned zero.
-    Prune();
-    CalcFutureScore(verboseLevel);
-    return;
-  }
-
-  // Only two partial collections are ever alive at once: the output of the
-  // previous step (currentColl) and the one being filled (nextColl).
-  list < DecodeStep >::const_iterator iterStep = decodeStepList.begin();
-  PartialTranslOptColl *currentColl = new PartialTranslOptColl();
-
-  // initial translation step
-  ProcessInitialTranslation(*iterStep, factorCollection
-                            , weightWordPenalty, dropUnknown
-                            , verboseLevel, *currentColl);
-
-  // do rest of decode steps
-  for (++iterStep ; iterStep != decodeStepList.end() ; ++iterStep)
-  {
-    const DecodeStep &decodeStep = *iterStep;
-    PartialTranslOptColl *nextColl = new PartialTranslOptColl();
-
-    // is it translation or generation
-    switch (decodeStep.GetDecodeType())
-    {
-    case Translate:
-      {
-        // go thru each intermediate trans opt just created
-        PartialTranslOptColl::const_iterator iterPartialTranslOpt;
-        for (iterPartialTranslOpt = currentColl->begin() ; iterPartialTranslOpt != currentColl->end() ; ++iterPartialTranslOpt)
-        {
-          const TranslationOption &inputPartialTranslOpt = **iterPartialTranslOpt;
-          ProcessTranslation(inputPartialTranslOpt
-                             , decodeStep
-                             , *nextColl
-                             , dropUnknown
-                             , factorCollection
-                             , weightWordPenalty);
-        }
-        break;
-      }
-    case Generate:
-      {
-        // go thru each hypothesis just created
-        PartialTranslOptColl::const_iterator iterPartialTranslOpt;
-        for (iterPartialTranslOpt = currentColl->begin() ; iterPartialTranslOpt != currentColl->end() ; ++iterPartialTranslOpt)
-        {
-          const TranslationOption &inputPartialTranslOpt = **iterPartialTranslOpt;
-          ProcessGeneration(inputPartialTranslOpt
-                            , decodeStep
-                            , *nextColl
-                            , dropUnknown
-                            , factorCollection
-                            , weightWordPenalty);
-        }
-        break;
-      }
-    case InsertNullFertilityWord:
-      { // TODO ask chris or evan what should be done
-        assert(false);
-        break;
-      }
-    }
-
-    // the previous step's partial options are not required anymore
-    delete currentColl;
-    currentColl = nextColl;
-  } // for (++iterStep
-
-  // add to real trans opt list
-  PartialTranslOptColl::iterator iterColl;
-  for (iterColl = currentColl->begin() ; iterColl != currentColl->end() ; ++iterColl)
-  {
-    TranslationOption *transOpt = *iterColl;
-    transOpt->CalcScore(allLM, weightWordPenalty);
-    Add(transOpt);
-  }
-
-  // NOTE(review): DetachAll() presumably releases the pointers without deleting
-  // them, so the options just handed to Add() survive the delete below -- confirm.
-  currentColl->DetachAll();
-  delete currentColl;
-
-  // Prune
-  Prune();
-
-  // future score
-  CalcFutureScore(verboseLevel);
-}
-
-
-
-// Create and add the translation option for a source word at sourcePos that
-// is handled as unknown, and mark that position in m_unknownWordPos.
-//
-// With dropUnknown off, the word is copied to the target side factor by
-// factor (missing factors become UNKNOWN_FACTOR). With dropUnknown on, the
-// word is copied only if its factor 0 string contains a digit; otherwise an
-// option with an empty target phrase is created, i.e. the word is dropped.
-void TranslationOptionCollection::ProcessOneUnknownWord(const FactorArray &sourceWord,
-      size_t sourcePos
-    , int dropUnknown
-    , FactorCollection &factorCollection
-    , float weightWordPenalty)
-{
-  bool keepWord = true;
-  if (dropUnknown)
-  {
-    const Factor *surface = sourceWord[0]; // ??? hack. shouldn't know which factor is surface
-    keepWord = false;
-    // The copy loop below treats every factor as possibly NULL, so factor 0
-    // is checked here as well before it is dereferenced.
-    if (surface != NULL)
-    {
-      std::string s = surface->ToString();
-      keepWord = (s.find_first_of("0123456789") != string::npos);
-    }
-  }
-
-  // stays empty when the source word is dropped
-  TargetPhrase targetPhrase(Output);
-  if (keepWord)
-  {
-    // copy the source word over, factor by factor
-    FactorArray &targetWord = targetPhrase.AddWord();
-    for (unsigned int currFactor = 0 ; currFactor < NUM_FACTORS ; currFactor++)
-    {
-      FactorType factorType = static_cast<FactorType>(currFactor);
-
-      const Factor *sourceFactor = sourceWord[currFactor];
-      if (sourceFactor == NULL)
-        targetWord[factorType] = factorCollection.AddFactor(Output, factorType, UNKNOWN_FACTOR);
-      else
-        targetWord[factorType] = factorCollection.AddFactor(Output, factorType, sourceFactor->GetString());
-    }
-
-    targetPhrase.SetScore(weightWordPenalty);
-  }
-
-  TranslationOption *transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos), targetPhrase, 0);
-  transOpt->CalcScore(*m_allLM, weightWordPenalty);
-  Add(transOpt);
-
-  m_unknownWordPos.SetValue(sourcePos, true);
-}
-
-
-
-// First decode step: look up every contiguous span of the source sentence in
-// the step's phrase dictionary and create one TranslationOption per target
-// phrase found, appending it to outputPartialTranslOptColl.
-// Start positions already flagged in m_unknownWordPos are skipped; a
-// single-word span with no dictionary entry goes to ProcessUnknownWord().
-// With verboseLevel >= 3 each span and its target phrases are printed to cout.
-//
-// possible optimization- don't consider phrases longer than the longest
-// phrase in the PhraseDictionary?
-void TranslationOptionCollection::ProcessInitialTranslation(
-      const DecodeStep &decodeStep
-    , FactorCollection &factorCollection
-    , float weightWordPenalty
-    , int dropUnknown
-    , size_t verboseLevel
-    , PartialTranslOptColl &outputPartialTranslOptColl)
-{
-  const bool verbose = (verboseLevel >= 3);
-  const PhraseDictionaryBase &phraseDictionary = decodeStep.GetPhraseDictionary();
-  const size_t sentenceLength = m_source.GetSize();
-
-  for (size_t startPos = 0 ; startPos < sentenceLength ; ++startPos)
-  {
-    if (m_unknownWordPos.GetValue(startPos))
-      continue; // unknown word but already processed. skip
-
-    for (size_t endPos = startPos ; endPos < sentenceLength ; ++endPos)
-    {
-      const WordsRange wordsRange(startPos, endPos);
-      const TargetPhraseCollection *phraseColl = phraseDictionary.GetTargetPhraseCollection(m_source,wordsRange);
-
-      if (phraseColl == NULL)
-      {
-        if (wordsRange.GetWordsCount() == 1)
-        {
-          ProcessUnknownWord(startPos, dropUnknown, factorCollection, weightWordPenalty);
-        }
-        continue;
-      }
-
-      if (verbose)
-      {
-        cout << "[" << m_source.GetSubString(wordsRange) << "; " << startPos << "-" << endPos << "]\n";
-      }
-
-      for (TargetPhraseCollection::const_iterator iterTargetPhrase = phraseColl->begin()
-           ; iterTargetPhrase != phraseColl->end() ; ++iterTargetPhrase)
-      {
-        const TargetPhrase &targetPhrase = *iterTargetPhrase;
-        outputPartialTranslOptColl.push_back ( new TranslationOption(wordsRange, targetPhrase) );
-
-        if (verbose)
-        {
-          cout << "\t" << targetPhrase << "\n";
-        }
-      }
-
-      if (verbose)
-      {
-        cout << endl;
-      }
-    }
-  }
-}
-
-// File a translation option under the source span it covers: the row is the
-// span's start position, the column is the offset of its end from that start.
-void TranslationOptionCollection::Add(const TranslationOption *translationOption)
-{
-  const WordsRange &coverage = translationOption->GetSourceWordsRange();
-  const size_t startPos = coverage.GetStartPos();
-  const size_t endOffset = coverage.GetEndPos() - startPos;
-  m_collection[startPos][endOffset].push_back(translationOption);
-}
-
-// NOTE(review): TO_STRING_BODY is a macro defined outside this file; presumably
-// it expands to the string-conversion member for this class -- confirm against
-// the macro's definition.
-TO_STRING_BODY(TranslationOptionCollection);
-