github.com/moses-smt/mosesdecoder.git
Diffstat (limited to 'moses/src/StaticData.cpp')
-rwxr-xr-x  moses/src/StaticData.cpp  620
1 file changed, 0 insertions(+), 620 deletions(-)
diff --git a/moses/src/StaticData.cpp b/moses/src/StaticData.cpp
deleted file mode 100755
index 6529c3155..000000000
--- a/moses/src/StaticData.cpp
+++ /dev/null
@@ -1,620 +0,0 @@
-// $Id$
-
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
-#include <string>
-#include <cassert>
-
-#include "PhraseDictionary.h"
-#include "GenerationDictionary.h"
-#include "DummyScoreProducers.h"
-#include "StaticData.h"
-#include "Util.h"
-#include "FactorCollection.h"
-#include "HypothesisCollection.h"
-#include "Timer.h"
-#include "boost/filesystem/operations.hpp" // boost::filesystem::exists
-#include "boost/algorithm/string/case_conv.hpp" //boost::algorithm::to_lower
-#include "LanguageModel.h"
-#include "LanguageModelFactory.h"
-#include "LexicalReordering.h"
-#include "SentenceStats.h"
-
-#ifndef WIN32
-#include "PhraseDictionaryTreeAdaptor.h"
-#endif
-
-using namespace std;
-
-extern Timer timer;
-
-StaticData* StaticData::s_instance(0);
-
-StaticData::StaticData()
-:m_languageModel(2)
-,m_lexReorder(NULL)
-,m_inputOutput(NULL)
-,m_fLMsLoaded(false)
-,m_inputType(0)
-,m_numInputScores(0)
-,m_distortionScoreProducer(0)
-,m_wpProducer(0)
-{
- s_instance = this;
-}
-
-bool StaticData::LoadParameters(int argc, char* argv[])
-{
- if (!m_parameter.LoadParam(argc, argv))
- return false;
-
- // input type has to be specified BEFORE loading the phrase tables!
- if(m_parameter.GetParam("inputtype").size())
- m_inputType=Scan<int>(m_parameter.GetParam("inputtype")[0]);
- TRACE_ERR("input type is: "<<m_inputType<<" (0==default: text input, else confusion net format)\n");
-
- // mysql
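- // exactly six mysql parameters select IOMethodMySQL (see GetIOMethod below)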
- m_mySQLParam = m_parameter.GetParam("mysql");
-
- if (m_parameter.GetParam("cache-path").size() == 1)
- m_cachePath = m_parameter.GetParam("cache-path")[0];
- else
- m_cachePath = GetTempFolder();
-
- // n-best
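- // e.g. an "[n-best-list]" section with the two entries "nbest.txt" and "100"
- // (file name illustrative) writes a 100-best list to nbest.txt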
- if (m_parameter.GetParam("n-best-list").size() == 2)
- {
- m_nBestFilePath = m_parameter.GetParam("n-best-list")[0];
- m_nBestSize = Scan<size_t>( m_parameter.GetParam("n-best-list")[1] );
- }
- else
- {
- m_nBestSize = 0;
- }
-
- // verbose level
- if (m_parameter.GetParam("verbose").size() == 1)
- {
- m_verboseLevel = Scan<size_t>( m_parameter.GetParam("verbose")[0]);
- }
- else
- {
- m_verboseLevel = 0;
- }
-
-
- // printing source phrase spans
- if (m_parameter.GetParam("report-source-span").size() > 0)
- m_reportSourceSpan = Scan<bool>(m_parameter.GetParam("report-source-span")[0]);
- else
- m_reportSourceSpan = false;
-
-
- // print all factors of output translations
- if (m_parameter.GetParam("report-all-factors").size() > 0)
- m_reportAllFactors = Scan<bool>(m_parameter.GetParam("report-all-factors")[0]);
- else
- m_reportAllFactors = false;
-
- //distortion weights
- std::vector<float> distortionWeights = Scan<float>(m_parameter.GetParam("weight-d"));
-
- //input-factors
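- // e.g. entries "0" and "1" yield m_inputFactorOrder = {0, 1}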
- const vector<string> &inputFactorVector = m_parameter.GetParam("input-factors");
- for(size_t i=0; i<inputFactorVector.size(); i++)
- {
- m_inputFactorOrder.push_back(Scan<FactorType>(inputFactorVector[i]));
- }
- if(m_inputFactorOrder.empty())
- {
- std::cerr<<"ERROR: no input factor specified in config file"
- " (param input-factors) -> abort!\n";
- abort();
- }
-
- //source word deletion
- if(m_parameter.GetParam("phrase-drop-allowed").size() > 0)
- {
- m_wordDeletionEnabled = Scan<bool>(m_parameter.GetParam("phrase-drop-allowed")[0]);
- }
- else
- {
- m_wordDeletionEnabled = false;
- }
- // load Lexical Reordering model
- // check to see if the lexical reordering parameter exists
- //TODO: doesn't work for bidirectional: yet.
- const vector<string> &lrFileVector =
- m_parameter.GetParam("distortion-file");
-
- if (lrFileVector.size() > 0)
- {
- //get the weights for the lex reorderer
- TRACE_ERR("weights-lex")
- for(int i=1; i<distortionWeights.size(); i++)
- {
- m_lexWeights.push_back(distortionWeights[i]);
- TRACE_ERR(distortionWeights[i] << "\t");
- }
- TRACE_ERR(endl);
- assert(m_lexWeights.size()>0);
-
- // if there is a lexical reordering model, then parse the
- // parameters associated with it, and create a new Lexical
- // Reordering object (which will load the probability table)
- const vector<string> &lrTypeVector =
- m_parameter.GetParam("distortion");
- // if type values have been set in the .ini file, then use them;
- // first initialize to the defaults (msd, bidirectional, fe).
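- // e.g. "[distortion]" entries "msd", "bidirectional" and "fe" would
- // reproduce these defaults explicitly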
- int orientation = DistortionOrientationType::Msd,
- direction = LexReorderType::Bidirectional,
- condition = LexReorderType::Fe;
- if (lrTypeVector.size() > 0)
- {
- // loop through type vector and set the orientation,
- // direction, and condition to override the defaults
- int size = lrTypeVector.size();
- string val;
- //if multiple parameters of the same type (direction, orientation, condition)
- //are seen, default behavior is to set the type to the last seen
- for (int i=0; i<size; i++)
- {
- val = lrTypeVector[i];
- boost::algorithm::to_lower(val);
- //orientation
- if(val == "monotone")
- orientation = DistortionOrientationType::Monotone;
- else if(val == "msd")
- orientation = DistortionOrientationType::Msd;
- //direction
- else if(val == "forward")
- direction = LexReorderType::Forward;
- else if(val == "backward")
- direction = LexReorderType::Backward;
- else if(val == "bidirectional")
- direction = LexReorderType::Bidirectional;
- //condition
- else if(val == "f")
- condition = LexReorderType::F;
- else if(val == "fe")
- condition = LexReorderType::Fe;
- }
- }
- else // inform the user that the defaults are being employed
- {
- //cout << "Lexical reordering is using defaults: Msd, Bidirectional, Fe Parameters" << endl;
- }
-
- // for now, assume there is just one lexical reordering model
- timer.check("Starting to load lexical reorder table...");
- m_lexReorder = new LexicalReordering(lrFileVector[0], orientation, direction, condition, m_lexWeights);
- timer.check("Finished loading lexical reorder table.");
- }
- if (m_parameter.GetParam("lmodel-file").size() > 0)
- {
- // weights
- vector<float> weightAll = Scan<float>(m_parameter.GetParam("weight-l"));
-
- TRACE_ERR("weight-l: ");
- for (size_t i = 0 ; i < weightAll.size() ; i++)
- {
- TRACE_ERR(weightAll[i] << "\t");
- m_allWeights.push_back(weightAll[i]);
- }
- TRACE_ERR(endl);
-
-
- size_t nGramMaxOrder = 0;
- timer.check("Start loading LanguageModels");
- const vector<string> &lmVector = m_parameter.GetParam("lmodel-file");
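- // e.g. the entry "0 0 3 /path/to/trigram.lm" (path illustrative) requests
- // LM type 0, surface factors and a trigram model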
-
- for(size_t i=0; i<lmVector.size(); i++)
- {
- vector<string> token = Tokenize(lmVector[i]);
- if (token.size() != 4 )
- {
- TRACE_ERR("Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filename'");
- return false;
- }
- // type = whether or not to use in future cost calcs
- // (DEPRECATED, asked hieu)
- LMListType type = static_cast<LMListType>(Scan<int>(token[0]));
- // factorType = (see TypeDef.h)
- // 0 = Surface, 1 = POS, 2 = Stem, 3 = Morphology, etc
- FactorType factorType = Scan<FactorType>(token[1]);
- // nGramOrder = 2 = bigram, 3 = trigram, etc
- size_t nGramOrder = Scan<size_t>(token[2]);
- // keep track of the largest n-gram length
- // (used by CompareHypothesisCollection)
- if (nGramOrder > nGramMaxOrder) // remove
- nGramMaxOrder = nGramOrder; // remove
- string &languageModelFile = token[3];
- if ((size_t)factorType >= m_maxNgramOrderForFactor.size()) {
- m_maxNgramOrderForFactor.resize((size_t)factorType+1, 0);
- }
- if (nGramOrder > m_maxNgramOrderForFactor[(size_t)factorType]) {
- m_maxNgramOrderForFactor[(size_t)factorType] = nGramOrder;
- }
- timer.check(("Start loading LanguageModel " + languageModelFile).c_str());
- LanguageModel *lm = LanguageModelFactory::createLanguageModel();
-
- // error handling here?
- lm->Load(i, languageModelFile, m_factorCollection, factorType, weightAll[i], nGramOrder);
- timer.check(("Finished loading LanguageModel " + languageModelFile).c_str());
- m_languageModel[type].push_back(lm);
-
- HypothesisRecombinationOrderer::SetMaxNGramOrder(factorType, nGramMaxOrder);
- }
- }
- // flag indicating that language models were loaded,
- // since phrase table loading requires their presence
- m_fLMsLoaded = true;
- timer.check("Finished loading LanguageModels");
-
- // generation tables
- if (m_parameter.GetParam("generation-file").size() > 0)
- {
- const vector<string> &generationVector = m_parameter.GetParam("generation-file");
- const vector<float> &weight = Scan<float>(m_parameter.GetParam("weight-generation"));
-
- TRACE_ERR("weight-generation: ");
- for (size_t i = 0 ; i < weight.size() ; i++)
- {
- TRACE_ERR(weight[i] << "\t");
- }
- TRACE_ERR(endl);
-
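- // each generation-file entry is "input-factors output-factors filepath",
- // e.g. "0 1 /path/to/generation.table" (path illustrative)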
- for(size_t currDict = 0 ; currDict < generationVector.size(); currDict++)
- {
- vector<string> token = Tokenize(generationVector[currDict]);
- vector<FactorType> input = Tokenize<FactorType>(token[0], ",")
- ,output = Tokenize<FactorType>(token[1], ",");
- string filePath= token[2];
-
- m_allWeights.push_back(weight[currDict]);
- TRACE_ERR(filePath << endl);
- m_generationDictionary.push_back(new GenerationDictionary());
- m_generationDictionary.back()->Load(input
- , output
- , m_factorCollection
- , filePath
- , weight[currDict]
- , Output); // always target, for now
- }
- }
-
- timer.check("Finished loading generation tables");
-
- // score weights
- m_weightDistortion = distortionWeights[0];
- m_weightWordPenalty = Scan<float>( m_parameter.GetParam("weight-w")[0] );
-
- TRACE_ERR("weight-d: " << m_weightDistortion << endl);
- m_distortionScoreProducer = new DistortionScoreProducer;
- m_allWeights.push_back(m_weightDistortion);
-
- TRACE_ERR("weight-w: " << m_weightWordPenalty << endl);
- m_wpProducer = new WordPenaltyProducer;
- m_allWeights.push_back(m_weightWordPenalty);
-
- // misc
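- // e.g. a "[stack]" entry of "200" overrides DEFAULT_MAX_HYPOSTACK_SIZE;
- // an absent "[distortion-limit]" defaults to -1 (no limit)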
- m_maxHypoStackSize = (m_parameter.GetParam("stack").size() > 0)
- ? Scan<size_t>(m_parameter.GetParam("stack")[0]) : DEFAULT_MAX_HYPOSTACK_SIZE;
- m_maxDistortion = (m_parameter.GetParam("distortion-limit").size() > 0) ?
- Scan<int>(m_parameter.GetParam("distortion-limit")[0])
- : -1;
- m_beamThreshold = (m_parameter.GetParam("beam-threshold").size() > 0) ?
- TransformScore(Scan<float>(m_parameter.GetParam("beam-threshold")[0]))
- : TransformScore(DEFAULT_BEAM_THRESHOLD);
-
- m_maxNoTransOptPerCoverage = (m_parameter.GetParam("max-trans-opt-per-coverage").size() > 0)
- ? Scan<size_t>(m_parameter.GetParam("max-trans-opt-per-coverage")[0]) : DEFAULT_MAX_TRANS_OPT_SIZE;
- TRACE_ERR("max translation options per coverage span: "<<m_maxNoTransOptPerCoverage<<"\n");
-
- // Unknown Word Processing -- wade
- //TODO replace this w/general word dropping -- EVH
- if (m_parameter.GetParam("drop-unknown").size() == 1)
- { m_dropUnknown = Scan<bool>( m_parameter.GetParam("drop-unknown")[0]); }
- else
- { m_dropUnknown = 0; }
-
- TRACE_ERR("m_dropUnknown: " << m_dropUnknown << endl);
-
-#if 0
- // weight for the posteriors for the confusion network
- if(m_parameter.GetParam("weight-i").size())
- {
- m_weightInput=Scan<float>(m_parameter.GetParam("weight-i")[0]);
- if(m_parameter.GetParam("weight-i").size()>1)
- m_weightRealSourceWords=Scan<float>(m_parameter.GetParam("weight-i")[1]);
- }
- if(m_inputType)
- TRACE_ERR("input weight is "<<m_weightInput<<" realWords: "<<m_weightRealSourceWords<<"\n");
-#endif
-
- return true;
-
-}
-
-StaticData::~StaticData()
-{
- delete m_inputOutput;
- for (size_t i = 0 ; i < m_phraseDictionary.size() ; i++)
- {
- delete m_phraseDictionary[i];
- }
- for (size_t i = 0 ; i < m_generationDictionary.size() ; i++)
- {
- delete m_generationDictionary[i];
- }
-
- LMList &lmList = m_languageModel[0];
- LMList::const_iterator iterLM;
- for (iterLM = lmList.begin() ; iterLM != lmList.end() ; ++iterLM)
- {
- delete *iterLM;
- }
- lmList = m_languageModel[1];
- for (iterLM = lmList.begin() ; iterLM != lmList.end() ; ++iterLM)
- {
- delete *iterLM;
- }
-
- // small score producers
- delete m_distortionScoreProducer;
- delete m_wpProducer;
-}
-
-IOMethod StaticData::GetIOMethod()
-{
- if (m_mySQLParam.size() == 6)
- return IOMethodMySQL;
- else if (m_parameter.GetParam("input-file").size() == 1)
- return IOMethodFile;
- else
- return IOMethodCommandLine;
-}
-
-void StaticData::SetWeightTransModel(const vector<float> &weight)
-{
- size_t currWeight = 0;
- for(vector<PhraseDictionaryBase*>::iterator iter = m_phraseDictionary.begin();
- iter != m_phraseDictionary.end(); ++iter)
- {
- PhraseDictionaryBase *phraseDict = *iter;
- const size_t noScoreComponent = phraseDict->GetNumScoreComponents();
- // weights for this particular dictionary
- vector<float> dictWeight(noScoreComponent);
- for (size_t i = 0 ; i < noScoreComponent ; i++)
- {
- dictWeight[i] = weight[currWeight++];
- }
- phraseDict->SetWeightTransModel(dictWeight);
- }
-}
-
-void StaticData::SetWeightLM(const std::vector<float> &weight)
-{
- assert(weight.size() == m_languageModel[Initial].size() + m_languageModel[Other].size());
-
- size_t currIndex = 0;
- LMList::iterator iter;
- for (iter = m_languageModel[Initial].begin() ; iter != m_languageModel[Initial].end() ; ++iter)
- {
- LanguageModel *languageModel = *iter;
- languageModel->SetWeight(weight[currIndex++]);
- }
- for (iter = m_languageModel[Other].begin() ; iter != m_languageModel[Other].end() ; ++iter)
- {
- LanguageModel *languageModel = *iter;
- languageModel->SetWeight(weight[currIndex++]);
- }
-}
-
-void StaticData::SetWeightGeneration(const std::vector<float> &weight)
-{
- assert(weight.size() == GetGenerationDictionarySize());
-
- size_t currWeight = 0;
- vector<GenerationDictionary*>::iterator iter;
- for(iter = m_generationDictionary.begin() ; iter != m_generationDictionary.end(); ++iter)
- {
- GenerationDictionary *dict = *iter;
- dict->SetWeight(weight[currWeight++]);
- }
-}
-
-const LMList StaticData::GetAllLM() const
-{
- LMList allLM;
- std::copy(m_languageModel[Initial].begin(), m_languageModel[Initial].end()
- , std::inserter(allLM, allLM.end()));
- std::copy(m_languageModel[Other].begin(), m_languageModel[Other].end()
- , std::inserter(allLM, allLM.end()));
-
- return allLM;
-}
-
-void StaticData::LoadPhraseTables(bool filter
- , const string &inputFileHash
- , const list< Phrase > &inputPhraseList)
-{
- // language models must be loaded prior to loading phrase tables
- assert(m_fLMsLoaded);
- // load phrase translation tables
- if (m_parameter.GetParam("ttable-file").size() > 0)
- {
- // weights
- vector<float> weightAll = Scan<float>(m_parameter.GetParam("weight-t"));
-
- TRACE_ERR("weight-t: ");
- for (size_t i = 0 ; i < weightAll.size() ; i++)
- {
- TRACE_ERR(weightAll[i] << "\t");
- }
- TRACE_ERR(endl);
-
- const vector<string> &translationVector = m_parameter.GetParam("ttable-file");
- vector<size_t> maxTargetPhrase = Scan<size_t>(m_parameter.GetParam("ttable-limit"));
- cerr<<"ttable-limits: ";copy(maxTargetPhrase.begin(),maxTargetPhrase.end(),ostream_iterator<size_t>(cerr," "));cerr<<"\n";
-
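- // each ttable-file entry is "input-factors output-factors num-scores filepath",
- // e.g. "0 0 5 /path/to/phrase-table" (path illustrative)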
- size_t index = 0;
- size_t totalPrevNoScoreComponent = 0;
- for(size_t currDict = 0 ; currDict < translationVector.size(); currDict++)
- {
- vector<string> token = Tokenize(translationVector[currDict]);
- //characteristics of the phrase table
- vector<FactorType> input = Tokenize<FactorType>(token[0], ",")
- ,output = Tokenize<FactorType>(token[1], ",");
- string filePath= token[3];
- size_t noScoreComponent = Scan<size_t>(token[2]);
- // weights for this phrase dictionary
- vector<float> weight(noScoreComponent);
- for (size_t currScore = 0 ; currScore < noScoreComponent ; currScore++)
- weight[currScore] = weightAll[totalPrevNoScoreComponent + currScore];
-
- if(weight.size()!=noScoreComponent)
- {
- std::cerr<<"ERROR: your phrase table has "<<noScoreComponent<<" scores, but you specified "<<weight.size()<<" weights!\n";
- abort();
- }
-
- if(currDict==0 && m_inputType)
- {
- m_numInputScores=m_parameter.GetParam("weight-i").size();
- for(unsigned k=0;k<m_numInputScores;++k)
- weight.push_back(Scan<float>(m_parameter.GetParam("weight-i")[k]));
-
- noScoreComponent+=m_numInputScores;
- }
-
- assert(noScoreComponent==weight.size());
-
- std::copy(weight.begin(),weight.end(),std::back_inserter(m_allWeights));
-
- totalPrevNoScoreComponent += noScoreComponent;
- string phraseTableHash = GetMD5Hash(filePath);
- string hashFilePath = GetCachePath()
- + PROJECT_NAME + "--"
- + token[0] + "--"
- + inputFileHash + "--"
- + phraseTableHash + ".txt";
-
- timer.check("Start loading PhraseTable");
- using namespace boost::filesystem;
- if (!exists(path(filePath+".binphr.idx", native)))
- {
- bool filterPhrase;
- if (filter)
- {
- boost::filesystem::path tempFile(hashFilePath, boost::filesystem::native);
- if (boost::filesystem::exists(tempFile))
- { // load filtered file instead
- filterPhrase = false;
- filePath = hashFilePath;
- }
- else
- { // load original file & create hash file
- filterPhrase = true;
- }
- }
- else
- { // load original file
- filterPhrase = false;
- }
- TRACE_ERR(filePath << endl);
-
-
- TRACE_ERR("using standard phrase tables");
- PhraseDictionary *pd=new PhraseDictionary(noScoreComponent);
- pd->Load(input
- , output
- , m_factorCollection
- , filePath
- , hashFilePath
- , weight
- , maxTargetPhrase[index]
- , filterPhrase
- , inputPhraseList
- , this->GetLanguageModel(Initial)
- , this->GetWeightWordPenalty()
- , *this);
- m_phraseDictionary.push_back(pd);
- }
- else
- {
- #ifdef WIN32
- TRACE_ERR("binary phrase tables not available under Windows\n");
- assert(false);
- #else
- TRACE_ERR("using binary phrase tables for idx "<<currDict<<"\n");
- PhraseDictionaryTreeAdaptor *pd=new PhraseDictionaryTreeAdaptor(noScoreComponent,(currDict==0 ? m_numInputScores : 0));
- pd->Create(input,output,m_factorCollection,filePath,weight,
- maxTargetPhrase[index],
- this->GetLanguageModel(Initial),
- this->GetWeightWordPenalty());
- m_phraseDictionary.push_back(pd);
- #endif
- }
-
- index++;
- timer.check("Finished loading PhraseTable");
- }
- }
- timer.check("Finished loading phrase tables");
-}
-
-void StaticData::LoadMapping()
-{
- // mapping
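- // each mapping entry is "T <index>" for a translation step; anything else
- // (conventionally "G <index>") selects a generation step, e.g. "T 0"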
- const vector<string> &mappingVector = m_parameter.GetParam("mapping");
- for(size_t i=0; i<mappingVector.size(); i++)
- {
- vector<string> token = Tokenize(mappingVector[i]);
- if (token.size() == 2)
- {
- DecodeType decodeType = token[0] == "T" ? Translate : Generate;
- size_t index = Scan<size_t>(token[1]);
- DecodeStep decodeStep (decodeType
- ,decodeType == Translate ? (Dictionary*) m_phraseDictionary[index] : (Dictionary*) m_generationDictionary[index]);
- m_decodeStepList.push_back(decodeStep);
- }
- }
-}
-
-void StaticData::CleanUpAfterSentenceProcessing()
-{
- for(size_t i=0;i<m_phraseDictionary.size();++i)
- m_phraseDictionary[i]->CleanUp();
- for(size_t i=0;i<m_generationDictionary.size();++i)
- m_generationDictionary[i]->CleanUp();
-}
-
-void StaticData::InitializeBeforeSentenceProcessing(InputType const& in)
-{
- for(size_t i=0;i<m_phraseDictionary.size();++i)
- m_phraseDictionary[i]->InitializeForInput(in);
-}
\ No newline at end of file