From bb941c01f6571524ecae01da5b51a4a4acf243aa Mon Sep 17 00:00:00 2001 From: dowobeha Date: Fri, 13 May 2011 18:07:21 +0000 Subject: Merge branch 'master' into local-trunk git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3971 1f5c12ca-751b-0410-a591-d2e778427230 --- .gitignore | 21 + config.h.in | 13 +- configure.in | 43 +- moses/src/Makefile.am | 8 + moses/src/Parameter.cpp | 6 + moses/src/ScoreIndexManager.cpp | 2 + moses/src/StaticData.cpp | 1365 ++++++++++++++++--------------- moses/src/StaticData.h | 15 + moses/src/SyntacticLanguageModel.cpp | 123 +++ moses/src/SyntacticLanguageModel.h | 52 ++ moses/src/SyntacticLanguageModelFiles.h | 95 +++ moses/src/SyntacticLanguageModelState.h | 303 +++++++ regenerate-makefiles.sh | 2 +- scripts/generic/balance-corpus | 392 +++++++++ scripts/training/mert-moses.pl | 13 +- 15 files changed, 1791 insertions(+), 662 deletions(-) create mode 100755 moses/src/SyntacticLanguageModel.cpp create mode 100755 moses/src/SyntacticLanguageModel.h create mode 100755 moses/src/SyntacticLanguageModelFiles.h create mode 100755 moses/src/SyntacticLanguageModelState.h create mode 100644 scripts/generic/balance-corpus diff --git a/.gitignore b/.gitignore index 77987e8ee..14834115d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,12 +1,23 @@ *.[oa] +*~ +CreateOnDisk/src/.deps +CreateOnDisk/src/CreateOnDiskPt Makefile Makefile.in +OnDiskPt/src/.deps aclocal.m4 autom4te.cache/ config.h config.log config.status configure +kenlm/.deps +kenlm/.libs +kenlm/*.la +kenlm/*.lo +kenlm/build_binary +kenlm/query +libtool mert/.deps/ mert/Makefile mert/Makefile.in @@ -18,16 +29,26 @@ misc/Makefile.in misc/processLexicalTable misc/processPhraseTable misc/queryLexicalTable +misc/queryPhraseTable +moses-chart/src/.deps +moses-chart-cmd/src/.deps +moses-chart-cmd/src/moses_chart moses-cmd/src/.deps/ moses-cmd/src/Makefile moses-cmd/src/Makefile.in +moses-cmd/src/checkplf +moses-cmd/src/lmbrgrid moses-cmd/src/moses moses/src/.deps/ +moses/src/.libs +moses/src/*.lo moses/src/Makefile moses/src/Makefile.in +moses/src/libmoses.la scripts/training/cmert-0.5/mert scripts/training/mbr/mbr scripts/training/phrase-extract/extract scripts/training/phrase-extract/score scripts/training/symal/symal +server/.deps stamp-h1 diff --git a/config.h.in b/config.h.in index 264a888a5..f2f6ed881 100644 --- a/config.h.in +++ b/config.h.in @@ -30,6 +30,9 @@ /* Define to 1 if you have the header file. */ #undef HAVE_MEMORY_H +/* Define to 1 if you have the header file. */ +#undef HAVE_NL_CPT_H + /* flag for protobuf */ #undef HAVE_PROTOBUF @@ -51,6 +54,9 @@ /* Define to 1 if you have the header file. */ #undef HAVE_STRING_H +/* flag for Syntactic Parser */ +#undef HAVE_SYNLM + /* Define to 1 if you have the header file. */ #undef HAVE_SYS_STAT_H @@ -60,10 +66,6 @@ /* Define to 1 if you have the header file. */ #undef HAVE_UNISTD_H -/* Define to the sub-directory in which libtool stores uninstalled libraries. - */ -#undef LT_OBJDIR - /* Name of package */ #undef PACKAGE @@ -79,9 +81,6 @@ /* Define to the one symbol short name of this package. */ #undef PACKAGE_TARNAME -/* Define to the home page for this package. */ -#undef PACKAGE_URL - /* Define to the version of this package. 
*/ #undef PACKAGE_VERSION diff --git a/configure.in b/configure.in index 8778fa91a..4bf0aea46 100644 --- a/configure.in +++ b/configure.in @@ -34,6 +34,13 @@ AC_ARG_WITH(srilm-dynamic, [with_srilm_dynamic=no] ) +AC_ARG_WITH(srilm-arch, + [AC_HELP_STRING([--with-srilm-arch=ARCH], [(optional) architecture for which SRILM was built])], + [with_srilm_arch=$withval], + [with_srilm_arch=no] + ) + + AC_ARG_WITH(irstlm, [AC_HELP_STRING([--with-irstlm=PATH], [(optional) path to IRST's LM toolkit])], [with_irstlm=$withval], @@ -52,6 +59,12 @@ AC_ARG_WITH(randlm, [with_randlm=no] ) +AC_ARG_WITH(synlm, + [AC_HELP_STRING([--with-synlm=PATH], [(optional) path to syntactic language model parser])], + [with_synlm=$withval], + [with_synlm=no] + ) + AC_ARG_WITH(notrace, [AC_HELP_STRING([--notrace], [disable trace])], [without_trace=yes], @@ -82,7 +95,7 @@ AC_ARG_ENABLE(boost, ) AC_ARG_WITH(zlib, - [AC_HELP_STRING([--with-zlib=PATH], [(optional) path to zlib])], +boost [AC_HELP_STRING([--with-zlib=PATH], [(optional) path to zlib])], [with_zlib=$withval], [with_zlib=no] ) @@ -94,7 +107,7 @@ AC_ARG_WITH(tcmalloc, ) require_boost=no -if test "x$enable_threads" != 'xno' || test "x$enable_boost" != 'xno' +if test "x$enable_threads" != 'xno' || test "x$enable_boost" != 'xno' || test "x$with_synlm" != 'xno' then require_boost=yes fi @@ -111,6 +124,7 @@ AM_CONDITIONAL([SRI_LM], false) AM_CONDITIONAL([IRST_LM], false) AM_CONDITIONAL([KEN_LM], false) AM_CONDITIONAL([RAND_LM], false) +AM_CONDITIONAL([SYN_LM], false) AM_CONDITIONAL([PROTOBUF], false) AM_CONDITIONAL([am__fastdepCC], false) AM_CONDITIONAL([WITH_THREADS],false) @@ -124,13 +138,13 @@ else CPPFLAGS="$CPPFLAGS -DTRACE_ENABLE=1" fi -if test "x$require_boost" = 'xyes' +if test "x$require_boost" = 'xyes' || test "x$with_synlm" then AC_MSG_NOTICE([Using Boost library]) BOOST_REQUIRE([1.36.0]) fi -if test "x$enable_threads" = 'xyes' +if test "x$enable_threads" = 'xyes' || test "x$with_synlm" then AC_MSG_NOTICE([Building threaded moses]) BOOST_THREADS @@ -172,7 +186,12 @@ then # ROOT/lib/i686-m64/liboolm.a # ROOT/lib/i686-m64/libdstruct.a # ROOT/lib/i686-m64/libmisc.a - MY_ARCH=`${with_srilm}/sbin/machine-type` + if test "x$with_srilm_arch" != 'xno' + then + MY_ARCH=${with_srilm_arch} + else + MY_ARCH=`${with_srilm}/sbin/machine-type` + fi LDFLAGS="$LDFLAGS -L${with_srilm}/lib/${MY_ARCH} -L${with_srilm}/flm/obj/${MY_ARCH}" LIBS="$LIBS $LIB_SRILM" FMTLIBS="$FMTLIBS liboolm.a libdstruct.a libmisc.a" @@ -260,6 +279,20 @@ then ) fi + +if test "x$with_synlm" != 'xno' +then + SAVE_CPPFLAGS="$CPPFLAGS" + CPPFLAGS="$CPPFLAGS -DWITH_THREADS -I${with_synlm}/rvtl/include -I${with_synlm}/wsjparse/include -lm" + + AC_CHECK_HEADERS(nl-cpt.h, + [AC_DEFINE([HAVE_SYNLM], [], [flag for Syntactic Parser])]) + + AM_CONDITIONAL([SYN_LM], true) + +fi + + AM_CONDITIONAL([WITH_MERT],false) AC_CHECK_HEADERS([getopt.h], [AM_CONDITIONAL([WITH_MERT],true)], diff --git a/moses/src/Makefile.am b/moses/src/Makefile.am index 3ca22d3ec..7cb610430 100644 --- a/moses/src/Makefile.am +++ b/moses/src/Makefile.am @@ -149,6 +149,10 @@ libmoses_la_HEADERS += LanguageModelInternal.h \ NGramNode.h endif +if SYN_LM +libmoses_la_HEADERS += SyntacticLanguageModel.h +endif + libmoses_la_SOURCES = \ AlignmentInfo.cpp \ BilingualDynSuffixArray.cpp \ @@ -306,6 +310,10 @@ libmoses_la_SOURCES += LanguageModelInternal.cpp \ NGramNode.cpp endif +if SYN_LM +libmoses_la_SOURCES += SyntacticLanguageModel.cpp +endif + if KEN_LM libmoses_la_SOURCES += LanguageModelKen.cpp endif diff --git a/moses/src/Parameter.cpp 
b/moses/src/Parameter.cpp index e0173e49a..27170e0f8 100644 --- a/moses/src/Parameter.cpp +++ b/moses/src/Parameter.cpp @@ -68,6 +68,12 @@ Parameter::Parameter() AddParam("report-all-factors", "report all factors in output, not just first"); AddParam("report-all-factors-in-n-best", "Report all factors in n-best-lists. Default is false"); AddParam("report-segmentation", "t", "report phrase segmentation in the output"); +#ifdef HAVE_SYNLM + AddParam("slmodel-file", "location of the syntactic language model file(s)"); + AddParam("weight-slm", "slm", "weight(s) for syntactic language model"); + AddParam("slmodel-factor", "factor to use with syntactic language model"); + AddParam("slmodel-beam", "beam width to use with syntactic language model's parser"); +#endif AddParam("stack", "s", "maximum stack size for histogram pruning"); AddParam("stack-diversity", "sd", "minimum number of hypothesis of each coverage in stack (default 0)"); AddParam("threads","th", "number of threads to use in decoding (defaults to single-threaded)"); diff --git a/moses/src/ScoreIndexManager.cpp b/moses/src/ScoreIndexManager.cpp index 3ef293039..16e8408f4 100644 --- a/moses/src/ScoreIndexManager.cpp +++ b/moses/src/ScoreIndexManager.cpp @@ -23,6 +23,7 @@ void ScoreIndexManager::AddScoreProducer(const ScoreProducer* sp) m_producers.push_back(sp); + m_begins.push_back(m_last); size_t numScoreCompsProduced = sp->GetNumScoreComponents(); assert(numScoreCompsProduced > 0); @@ -32,6 +33,7 @@ void ScoreIndexManager::AddScoreProducer(const ScoreProducer* sp) << " " << sp->GetScoreProducerDescription() << ") index=" << m_begins.back() << "-" << m_ends.back()-1 << std::endl); */ + } void ScoreIndexManager::PrintLabeledScores(std::ostream& os, const ScoreComponentCollection& scores) const diff --git a/moses/src/StaticData.cpp b/moses/src/StaticData.cpp index cb2981444..046381d8c 100644 --- a/moses/src/StaticData.cpp +++ b/moses/src/StaticData.cpp @@ -41,6 +41,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "DecodeGraph.h" #include "InputFileStream.h" +#ifdef HAVE_SYNLM +#include "SyntacticLanguageModel.h" +#endif + using namespace std; namespace Moses @@ -409,6 +413,12 @@ bool StaticData::LoadData(Parameter *parameter) } } +#ifdef HAVE_SYNLM + if (m_parameter->GetParam("slmodel-file").size() > 0) { + if (!LoadSyntacticLanguageModel()) return false; + } +#endif + // use of xml in input if (m_parameter->GetParam("xml-input").size() == 0) m_xmlInputType = XmlPassThrough; else if (m_parameter->GetParam("xml-input")[0]=="exclusive") m_xmlInputType = XmlExclusive; @@ -508,6 +518,11 @@ bool StaticData::LoadData(Parameter *parameter) //Add any other features here. 
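[Note on the syntactic LM wiring] The hunks above register the new slmodel-file, weight-slm, slmodel-factor and slmodel-beam options in Parameter.cpp and make StaticData::LoadData() call LoadSyntacticLanguageModel() whenever slmodel-file is set; the hunk that follows then attaches the constructed feature to the translation system. As a rough sketch of how that loader consumes the options — condensed from the commented-out LoadSyntacticLanguageModel body further down in this patch, with the TransformScore calls and error reporting simplified for illustration, and relying on the surrounding Moses classes (StaticData, Parameter, Scan<>) rather than being a standalone program:

#ifdef HAVE_SYNLM
// Illustrative sketch only; the patch's own (commented-out) version appears later in this diff.
bool StaticData::LoadSyntacticLanguageModel()
{
  const std::vector<float>       weights = Scan<float>(m_parameter->GetParam("weight-slm"));
  const std::vector<std::string> files   = m_parameter->GetParam("slmodel-file");

  // Optional factor index and parser beam width; fall back to factor 0 and a beam of 500.
  const FactorType factorType = m_parameter->GetParam("slmodel-factor").size() > 0
      ? Scan<size_t>(m_parameter->GetParam("slmodel-factor")[0]) : 0;
  const size_t beamWidth = m_parameter->GetParam("slmodel-beam").size() > 0
      ? Scan<size_t>(m_parameter->GetParam("slmodel-beam")[0]) : 500;

  if (files.empty()) {
    return false;                      // slmodel-file was specified but names no model files
  }
  if (!weights.empty()) {
    // Constructing the feature also registers it with the score index manager
    // (see the SyntacticLanguageModel constructor added by this patch).
    m_syntacticLanguageModel = new SyntacticLanguageModel(files, weights, factorType, beamWidth);
  }
  return true;
}
#endif

With this in place, the feature is driven entirely by the slmodel-file, weight-slm, slmodel-factor and slmodel-beam entries added in Parameter.cpp above, and the hunk below adds the resulting ScoreProducer to the active translation system.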
+#ifdef HAVE_SYNLM + if (m_syntacticLanguageModel != NULL) { + m_translationSystems.find(config[0])->second.AddFeatureFunction(m_syntacticLanguageModel); + } +#endif } @@ -538,651 +553,709 @@ void StaticData::SetBooleanParameter( bool *parameter, string parameterName, boo StaticData::~StaticData() { - RemoveAllInColl(m_phraseDictionary); - RemoveAllInColl(m_generationDictionary); - RemoveAllInColl(m_reorderModels); - RemoveAllInColl(m_globalLexicalModels); - RemoveAllInColl(m_decodeGraphs); - RemoveAllInColl(m_wordPenaltyProducers); - RemoveAllInColl(m_distortionScoreProducers); - m_languageModel.CleanUp(); - - // delete trans opt - map, std::pair< TranslationOptionList*, clock_t > >::iterator iterCache; - for (iterCache = m_transOptCache.begin() ; iterCache != m_transOptCache.end() ; ++iterCache) { - TranslationOptionList *transOptList = iterCache->second.first; - delete transOptList; - } - - // small score producers - delete m_unknownWordPenaltyProducer; - - //delete m_parameter; - - // memory pools - Phrase::FinalizeMemPool(); - -} - -bool StaticData::LoadLexicalReorderingModel() -{ - VERBOSE(1, "Loading lexical distortion models..."); - const vector fileStr = m_parameter->GetParam("distortion-file"); - bool hasWeightlr = (m_parameter->GetParam("weight-lr").size() != 0); - vector weightsStr; - if (hasWeightlr) { - weightsStr = m_parameter->GetParam("weight-lr"); - } else { - weightsStr = m_parameter->GetParam("weight-d"); - } - - std::vector weights; - size_t w = 1; //cur weight - if (hasWeightlr) { - w = 0; // if reading from weight-lr, don't have to count first as distortion penalty - } - size_t f = 0; //cur file - //get weights values - VERBOSE(1, "have " << fileStr.size() << " models" << std::endl); - for(size_t j = 0; j < weightsStr.size(); ++j) { - weights.push_back(Scan(weightsStr[j])); - } - //load all models - for(size_t i = 0; i < fileStr.size(); ++i) { - vector spec = Tokenize(fileStr[f], " "); - ++f; //mark file as consumed - if(spec.size() != 4) { - UserMessage::Add("Invalid Lexical Reordering Model Specification: " + fileStr[f]); - return false; - } - - // spec[0] = factor map - // spec[1] = name - // spec[2] = num weights - // spec[3] = fileName - - // decode factor map - - vector input, output; - vector inputfactors = Tokenize(spec[0],"-"); - if(inputfactors.size() == 2) { - input = Tokenize(inputfactors[0],","); - output = Tokenize(inputfactors[1],","); - } else if(inputfactors.size() == 1) { - //if there is only one side assume it is on e side... why? 
- output = Tokenize(inputfactors[0],","); - } else { - //format error - return false; - } - - string modelType = spec[1]; - - // decode num weights and fetch weights from array - std::vector mweights; - size_t numWeights = atoi(spec[2].c_str()); - for(size_t k = 0; k < numWeights; ++k, ++w) { - if(w >= weights.size()) { - UserMessage::Add("Lexicalized distortion model: Not enough weights, add to [weight-d]"); - return false; - } else - mweights.push_back(weights[w]); - } - - string filePath = spec[3]; - - m_reorderModels.push_back(new LexicalReordering(input, output, modelType, filePath, mweights)); - } - return true; -} - -bool StaticData::LoadGlobalLexicalModel() -{ - const vector &weight = Scan(m_parameter->GetParam("weight-lex")); - const vector &file = m_parameter->GetParam("global-lexical-file"); - - if (weight.size() != file.size()) { - std::cerr << "number of weights and models for the global lexical model does not match (" - << weight.size() << " != " << file.size() << ")" << std::endl; - return false; - } - - for (size_t i = 0; i < weight.size(); i++ ) { - vector spec = Tokenize(file[i], " "); - if ( spec.size() != 2 ) { - std::cerr << "wrong global lexical model specification: " << file[i] << endl; - return false; - } - vector< string > factors = Tokenize(spec[0],"-"); - if ( factors.size() != 2 ) { - std::cerr << "wrong factor definition for global lexical model: " << spec[0] << endl; - return false; - } - vector inputFactors = Tokenize(factors[0],","); - vector outputFactors = Tokenize(factors[1],","); - m_globalLexicalModels.push_back( new GlobalLexicalModel( spec[1], weight[i], inputFactors, outputFactors ) ); - } - return true; -} - -bool StaticData::LoadLanguageModels() -{ - if (m_parameter->GetParam("lmodel-file").size() > 0) { - // weights - vector weightAll = Scan(m_parameter->GetParam("weight-l")); - - for (size_t i = 0 ; i < weightAll.size() ; i++) { - m_allWeights.push_back(weightAll[i]); - } - - // dictionary upper-bounds fo all IRST LMs - vector LMdub = Scan(m_parameter->GetParam("lmodel-dub")); - if (m_parameter->GetParam("lmodel-dub").size() == 0) { - for(size_t i=0; iGetParam("lmodel-file").size(); i++) - LMdub.push_back(0); - } - - // initialize n-gram order for each factor. 
populated only by factored lm - const vector &lmVector = m_parameter->GetParam("lmodel-file"); - //prevent language models from being loaded twice - map languageModelsLoaded; - - for(size_t i=0; i token = Tokenize(lmVector[i]); - if (token.size() != 4 && token.size() != 5 ) { - UserMessage::Add("Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath [mapFilePath (only for IRSTLM)]'"); - return false; - } - // type = implementation, SRI, IRST etc - LMImplementation lmImplementation = static_cast(Scan(token[0])); - - // factorType = 0 = Surface, 1 = POS, 2 = Stem, 3 = Morphology, etc - vector factorTypes = Tokenize(token[1], ","); - - // nGramOrder = 2 = bigram, 3 = trigram, etc - size_t nGramOrder = Scan(token[2]); - - string &languageModelFile = token[3]; - if (token.size() == 5) { - if (lmImplementation==IRST) - languageModelFile += " " + token[4]; - else { - UserMessage::Add("Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath [mapFilePath (only for IRSTLM)]'"); - return false; - } - } - IFVERBOSE(1) - PrintUserTime(string("Start loading LanguageModel ") + languageModelFile); - - lm = LanguageModelFactory::CreateLanguageModel( - lmImplementation - , factorTypes - , nGramOrder - , languageModelFile - , m_scoreIndexManager - , LMdub[i]); - if (lm == NULL) { - UserMessage::Add("no LM created. We probably don't have it compiled"); - return false; - } - languageModelsLoaded[lmVector[i]] = lm; - } - - m_languageModel.Add(lm); - } - } - // flag indicating that language models were loaded, - // since phrase table loading requires their presence - m_fLMsLoaded = true; - IFVERBOSE(1) - PrintUserTime("Finished loading LanguageModels"); - return true; -} - -bool StaticData::LoadGenerationTables() -{ - if (m_parameter->GetParam("generation-file").size() > 0) { - const vector &generationVector = m_parameter->GetParam("generation-file"); - const vector &weight = Scan(m_parameter->GetParam("weight-generation")); - - IFVERBOSE(1) { - TRACE_ERR( "weight-generation: "); - for (size_t i = 0 ; i < weight.size() ; i++) { - TRACE_ERR( weight[i] << "\t"); - } - TRACE_ERR(endl); - } - size_t currWeightNum = 0; - - for(size_t currDict = 0 ; currDict < generationVector.size(); currDict++) { - vector token = Tokenize(generationVector[currDict]); - vector input = Tokenize(token[0], ",") - ,output = Tokenize(token[1], ","); - m_maxFactorIdx[1] = CalcMax(m_maxFactorIdx[1], input, output); - string filePath; - size_t numFeatures; - - numFeatures = Scan(token[2]); - filePath = token[3]; - - if (!FileExists(filePath) && FileExists(filePath + ".gz")) { - filePath += ".gz"; - } - - VERBOSE(1, filePath << endl); - - m_generationDictionary.push_back(new GenerationDictionary(numFeatures, m_scoreIndexManager, input,output)); - assert(m_generationDictionary.back() && "could not create GenerationDictionary"); - if (!m_generationDictionary.back()->Load(filePath, Output)) { - delete m_generationDictionary.back(); - return false; - } - for(size_t i = 0; i < numFeatures; i++) { - assert(currWeightNum < weight.size()); - m_allWeights.push_back(weight[currWeightNum++]); - } - } - if (currWeightNum != weight.size()) { - TRACE_ERR( " [WARNING] config file has " << weight.size() << " generation weights listed, but the configuration for generation files indicates there should be " << currWeightNum << "!\n"); - } - } - - return true; -} - -/* Doesn't load phrase tables any more. Just creates the features. 
*/ -bool StaticData::LoadPhraseTables() -{ - VERBOSE(2,"Creating phrase table features" << endl); - - // language models must be loaded prior to loading phrase tables - assert(m_fLMsLoaded); - // load phrase translation tables - if (m_parameter->GetParam("ttable-file").size() > 0) { - // weights - vector weightAll = Scan(m_parameter->GetParam("weight-t")); - - const vector &translationVector = m_parameter->GetParam("ttable-file"); - vector maxTargetPhrase = Scan(m_parameter->GetParam("ttable-limit")); - - if(maxTargetPhrase.size() == 1 && translationVector.size() > 1) { - VERBOSE(1, "Using uniform ttable-limit of " << maxTargetPhrase[0] << " for all translation tables." << endl); - for(size_t i = 1; i < translationVector.size(); i++) - maxTargetPhrase.push_back(maxTargetPhrase[0]); - } else if(maxTargetPhrase.size() != 1 && maxTargetPhrase.size() < translationVector.size()) { - stringstream strme; - strme << "You specified " << translationVector.size() << " translation tables, but only " << maxTargetPhrase.size() << " ttable-limits."; - UserMessage::Add(strme.str()); - return false; - } - - size_t index = 0; - size_t weightAllOffset = 0; - bool oldFileFormat = false; - for(size_t currDict = 0 ; currDict < translationVector.size(); currDict++) { - vector token = Tokenize(translationVector[currDict]); - - if(currDict == 0 && token.size() == 4) { - VERBOSE(1, "Warning: Phrase table specification in old 4-field format. Assuming binary phrase tables (type 1)!" << endl); - oldFileFormat = true; - } - - if((!oldFileFormat && token.size() < 5) || (oldFileFormat && token.size() != 4)) { - UserMessage::Add("invalid phrase table specification"); - return false; - } - - PhraseTableImplementation implementation = (PhraseTableImplementation) Scan(token[0]); - if(oldFileFormat) { - token.push_back(token[3]); - token[3] = token[2]; - token[2] = token[1]; - token[1] = token[0]; - token[0] = "1"; - implementation = Binary; - } else - implementation = (PhraseTableImplementation) Scan(token[0]); - - assert(token.size() >= 5); - //characteristics of the phrase table - - vector input = Tokenize(token[1], ",") - ,output = Tokenize(token[2], ","); - m_maxFactorIdx[0] = CalcMax(m_maxFactorIdx[0], input); - m_maxFactorIdx[1] = CalcMax(m_maxFactorIdx[1], output); - m_maxNumFactors = std::max(m_maxFactorIdx[0], m_maxFactorIdx[1]) + 1; - size_t numScoreComponent = Scan(token[3]); - string filePath= token[4]; - - assert(weightAll.size() >= weightAllOffset + numScoreComponent); - - // weights for this phrase dictionary - // first InputScores (if any), then translation scores - vector weight; - - if(currDict==0 && (m_inputType == ConfusionNetworkInput || m_inputType == WordLatticeInput)) { - // TODO. find what the assumptions made by confusion network about phrase table output which makes - // it only work with binrary file. 
This is a hack - - m_numInputScores=m_parameter->GetParam("weight-i").size(); - for(unsigned k=0; k(m_parameter->GetParam("weight-i")[k])); - - if(m_parameter->GetParam("link-param-count").size()) - m_numLinkParams = Scan(m_parameter->GetParam("link-param-count")[0]); - - //print some info about this interaction: - if (m_numLinkParams == m_numInputScores) { - VERBOSE(1,"specified equal numbers of link parameters and insertion weights, not using non-epsilon 'real' word link count.\n"); - } else if ((m_numLinkParams + 1) == m_numInputScores) { - VERBOSE(1,"WARN: "<< m_numInputScores << " insertion weights found and only "<< m_numLinkParams << " link parameters specified, applying non-epsilon 'real' word link count for last feature weight.\n"); - } else { - stringstream strme; - strme << "You specified " << m_numInputScores - << " input weights (weight-i), but you specified " << m_numLinkParams << " link parameters (link-param-count)!"; - UserMessage::Add(strme.str()); - return false; - } - - } - if (!m_inputType) { - m_numInputScores=0; - } - //this number changes depending on what phrase table we're talking about: only 0 has the weights on it - size_t tableInputScores = (currDict == 0 ? m_numInputScores : 0); - - for (size_t currScore = 0 ; currScore < numScoreComponent; currScore++) - weight.push_back(weightAll[weightAllOffset + currScore]); - - - if(weight.size() - tableInputScores != numScoreComponent) { - stringstream strme; - strme << "Your phrase table has " << numScoreComponent - << " scores, but you specified " << (weight.size() - tableInputScores) << " weights!"; - UserMessage::Add(strme.str()); - return false; - } - - weightAllOffset += numScoreComponent; - numScoreComponent += tableInputScores; - - string targetPath, alignmentsFile; - if (implementation == SuffixArray) { - targetPath = token[5]; - alignmentsFile= token[6]; - } - - assert(numScoreComponent==weight.size()); - - std::copy(weight.begin(),weight.end(),std::back_inserter(m_allWeights)); - - //This is needed for regression testing, but the phrase table - //might not really be loading here - IFVERBOSE(1) - PrintUserTime(string("Start loading PhraseTable ") + filePath); - VERBOSE(1,"filePath: " << filePath <GetParam("non-terminals").size() == 0) { - defaultNonTerminals = "X"; - } else { - vector tokens = Tokenize(m_parameter->GetParam("non-terminals")[0]); - defaultNonTerminals = tokens[0]; - } - - FactorCollection &factorCollection = FactorCollection::Instance(); - - m_inputDefaultNonTerminal.SetIsNonTerminal(true); - const Factor *sourceFactor = factorCollection.AddFactor(Input, 0, defaultNonTerminals); - m_inputDefaultNonTerminal.SetFactor(0, sourceFactor); - - m_outputDefaultNonTerminal.SetIsNonTerminal(true); - const Factor *targetFactor = factorCollection.AddFactor(Output, 0, defaultNonTerminals); - m_outputDefaultNonTerminal.SetFactor(0, targetFactor); - - // for unknwon words - if (m_parameter->GetParam("unknown-lhs").size() == 0) { - UnknownLHSEntry entry(defaultNonTerminals, 0.0f); - m_unknownLHS.push_back(entry); - } else { - const string &filePath = m_parameter->GetParam("unknown-lhs")[0]; - - InputFileStream inStream(filePath); - string line; - while(getline(inStream, line)) { - vector tokens = Tokenize(line); - assert(tokens.size() == 2); - UnknownLHSEntry entry(tokens[0], Scan(tokens[1])); - m_unknownLHS.push_back(entry); - } - - } - -} - -void StaticData::LoadChartDecodingParameters() -{ - LoadNonTerminals(); - - // source label overlap - if (m_parameter->GetParam("source-label-overlap").size() > 0) { - 
m_sourceLabelOverlap = (SourceLabelOverlap) Scan(m_parameter->GetParam("source-label-overlap")[0]); - } else { - m_sourceLabelOverlap = SourceLabelOverlapAdd; - } - - m_ruleLimit = (m_parameter->GetParam("rule-limit").size() > 0) - ? Scan(m_parameter->GetParam("rule-limit")[0]) : DEFAULT_MAX_TRANS_OPT_SIZE; -} - -void StaticData::LoadPhraseBasedParameters() -{ - const vector distortionWeights = m_parameter->GetParam("weight-d"); - size_t distortionWeightCount = distortionWeights.size(); - //if there's a lex-reordering model, and no separate weight set, then - //take just one of these weights for linear distortion - if (!m_parameter->GetParam("weight-lr").size() && m_parameter->GetParam("distortion-file").size()) { - distortionWeightCount = 1; - } - for (size_t i = 0; i < distortionWeightCount; ++i) { - float weightDistortion = Scan(distortionWeights[i]); - m_distortionScoreProducers.push_back(new DistortionScoreProducer(m_scoreIndexManager)); - m_allWeights.push_back(weightDistortion); - } -} - -bool StaticData::LoadDecodeGraphs() -{ - const vector &mappingVector = m_parameter->GetParam("mapping"); - const vector &maxChartSpans = Scan(m_parameter->GetParam("max-chart-span")); - - DecodeStep *prev = 0; - size_t prevDecodeGraphInd = 0; - for(size_t i=0; i token = Tokenize(mappingVector[i]); - size_t decodeGraphInd; - DecodeType decodeType; - size_t index; - if (token.size() == 2) { - decodeGraphInd = 0; - decodeType = token[0] == "T" ? Translate : Generate; - index = Scan(token[1]); - } else if (token.size() == 3) { - // For specifying multiple translation model - decodeGraphInd = Scan(token[0]); - //the vectorList index can only increment by one - assert(decodeGraphInd == prevDecodeGraphInd || decodeGraphInd == prevDecodeGraphInd + 1); - if (decodeGraphInd > prevDecodeGraphInd) { - prev = NULL; - } - decodeType = token[1] == "T" ? Translate : Generate; - index = Scan(token[2]); - } else { - UserMessage::Add("Malformed mapping!"); - assert(false); - } - - DecodeStep* decodeStep = NULL; - switch (decodeType) { - case Translate: - if(index>=m_phraseDictionary.size()) { - stringstream strme; - strme << "No phrase dictionary with index " - << index << " available!"; - UserMessage::Add(strme.str()); - assert(false); - } - decodeStep = new DecodeStepTranslation(m_phraseDictionary[index], prev); - break; - case Generate: - if(index>=m_generationDictionary.size()) { - stringstream strme; - strme << "No generation dictionary with index " - << index << " available!"; - UserMessage::Add(strme.str()); - assert(false); - } - decodeStep = new DecodeStepGeneration(m_generationDictionary[index], prev); - break; - case InsertNullFertilityWord: - assert(!"Please implement NullFertilityInsertion."); - break; - } - - assert(decodeStep); - if (m_decodeGraphs.size() < decodeGraphInd + 1) { - DecodeGraph *decodeGraph; - if (m_searchAlgorithm == ChartDecoding) { - size_t maxChartSpan = (decodeGraphInd < maxChartSpans.size()) ? 
maxChartSpans[decodeGraphInd] : DEFAULT_MAX_CHART_SPAN; - decodeGraph = new DecodeGraph(m_decodeGraphs.size(), maxChartSpan); - } else { - decodeGraph = new DecodeGraph(m_decodeGraphs.size()); - } - - m_decodeGraphs.push_back(decodeGraph); // TODO max chart span - } - - m_decodeGraphs[decodeGraphInd]->Add(decodeStep); - prev = decodeStep; - prevDecodeGraphInd = decodeGraphInd; - } - - // set maximum n-gram size for backoff approach to decoding paths - // default is always use subsequent paths (value = 0) - for(size_t i=0; i &backoffVector = m_parameter->GetParam("decoding-graph-backoff"); - for(size_t i=0; i(backoffVector[i]); - } - - return true; -} - - -void StaticData::SetWeightsForScoreProducer(const ScoreProducer* sp, const std::vector& weights) -{ - const size_t id = sp->GetScoreBookkeepingID(); - const size_t begin = m_scoreIndexManager.GetBeginIndex(id); - const size_t end = m_scoreIndexManager.GetEndIndex(id); - assert(end - begin == weights.size()); - if (m_allWeights.size() < end) - m_allWeights.resize(end); - std::vector::const_iterator weightIter = weights.begin(); - for (size_t i = begin; i < end; i++) - m_allWeights[i] = *weightIter++; -} - -const TranslationOptionList* StaticData::FindTransOptListInCache(const DecodeGraph &decodeGraph, const Phrase &sourcePhrase) const -{ - std::pair key(decodeGraph.GetPosition(), sourcePhrase); -#ifdef WITH_THREADS - boost::mutex::scoped_lock lock(m_transOptCacheMutex); -#endif - std::map, std::pair >::iterator iter - = m_transOptCache.find(key); - if (iter == m_transOptCache.end()) - return NULL; - iter->second.second = clock(); // update last used time - return iter->second.first; -} - -void StaticData::ReduceTransOptCache() const -{ - if (m_transOptCache.size() <= m_transOptCacheMaxSize) return; // not full - clock_t t = clock(); - - // find cutoff for last used time - priority_queue< clock_t > lastUsedTimes; - std::map, std::pair >::iterator iter; - iter = m_transOptCache.begin(); - while( iter != m_transOptCache.end() ) { - lastUsedTimes.push( iter->second.second ); - iter++; - } - for( size_t i=0; i < lastUsedTimes.size()-m_transOptCacheMaxSize/2; i++ ) - lastUsedTimes.pop(); - clock_t cutoffLastUsedTime = lastUsedTimes.top(); - - // remove all old entries - iter = m_transOptCache.begin(); - while( iter != m_transOptCache.end() ) { - if (iter->second.second < cutoffLastUsedTime) { - std::map, std::pair >::iterator iterRemove = iter++; - delete iterRemove->second.first; - m_transOptCache.erase(iterRemove); - } else iter++; - } - VERBOSE(2,"Reduced persistent translation option cache in " << ((clock()-t)/(float)CLOCKS_PER_SEC) << " seconds." 
<< std::endl); -} - -void StaticData::AddTransOptListToCache(const DecodeGraph &decodeGraph, const Phrase &sourcePhrase, const TranslationOptionList &transOptList) const -{ - if (m_transOptCacheMaxSize == 0) return; - std::pair key(decodeGraph.GetPosition(), sourcePhrase); - TranslationOptionList* storedTransOptList = new TranslationOptionList(transOptList); -#ifdef WITH_THREADS - boost::mutex::scoped_lock lock(m_transOptCacheMutex); -#endif - m_transOptCache[key] = make_pair( storedTransOptList, clock() ); - ReduceTransOptCache(); -} - -} +// RemoveAllInColl(m_phraseDictionary); +// RemoveAllInColl(m_generationDictionary); +// RemoveAllInColl(m_reorderModels); +// RemoveAllInColl(m_globalLexicalModels); +// RemoveAllInColl(m_decodeGraphs); +// RemoveAllInColl(m_wordPenaltyProducers); +// RemoveAllInColl(m_distortionScoreProducers); +// m_languageModel.CleanUp(); +// #ifdef HAVE_SYNLM +// delete m_syntacticLanguageModel; +// #endif + +// // delete trans opt +// map, std::pair< TranslationOptionList*, clock_t > >::iterator iterCache; +// for (iterCache = m_transOptCache.begin() ; iterCache != m_transOptCache.end() ; ++iterCache) { +// TranslationOptionList *transOptList = iterCache->second.first; +// delete transOptList; +// } + +// // small score producers +// delete m_unknownWordPenaltyProducer; + +// //delete m_parameter; + +// // memory pools +// Phrase::FinalizeMemPool(); + +// } + +// #ifdef HAVE_SYNLM +// bool StaticData::LoadSyntacticLanguageModel() { +// cerr << "Loading syntactic language models..." << std::endl; + +// const vector weights = Scan(m_parameter->GetParam("weight-slm")); +// const vector files = m_parameter->GetParam("slmodel-file"); + +// const FactorType factorType = (m_parameter->GetParam("slmodel-factor").size() > 0) ? +// TransformScore(Scan(m_parameter->GetParam("slmodel-factor")[0])) +// : 0; + +// const size_t beamWidth = (m_parameter->GetParam("slmodel-beam").size() > 0) ? +// TransformScore(Scan(m_parameter->GetParam("slmodel-beam")[0])) +// : 500; + +// if (files.size() < 1) { +// cerr << "No syntactic language model files specified!" 
<< std::endl; +// return false; +// } + +// // check if feature is used +// if (weights.size() >= 1) { + +// //cout.setf(ios::scientific,ios::floatfield); +// //cerr.setf(ios::scientific,ios::floatfield); + +// // create the feature +// m_syntacticLanguageModel = new SyntacticLanguageModel(files,weights,factorType,beamWidth); + +// /* +// ///////////////////////////////////////// +// // BEGIN LANE's UNSTABLE EXPERIMENT :) +// // + +// double ppl = m_syntacticLanguageModel->perplexity(); +// cerr << "Probability is " << ppl << endl; + + +// // +// // END LANE's UNSTABLE EXPERIMENT +// ///////////////////////////////////////// +// */ + + +// if (m_syntacticLanguageModel==NULL) { +// return false; +// } + +// } + +// return true; + +// } +// #endif + +// bool StaticData::LoadLexicalReorderingModel() +// { +// VERBOSE(1, "Loading lexical distortion models..."); +// const vector fileStr = m_parameter->GetParam("distortion-file"); +// bool hasWeightlr = (m_parameter->GetParam("weight-lr").size() != 0); +// vector weightsStr; +// if (hasWeightlr) { +// weightsStr = m_parameter->GetParam("weight-lr"); +// } else { +// weightsStr = m_parameter->GetParam("weight-d"); +// } + +// std::vector weights; +// size_t w = 1; //cur weight +// if (hasWeightlr) { +// w = 0; // if reading from weight-lr, don't have to count first as distortion penalty +// } +// size_t f = 0; //cur file +// //get weights values +// VERBOSE(1, "have " << fileStr.size() << " models" << std::endl); +// for(size_t j = 0; j < weightsStr.size(); ++j) { +// weights.push_back(Scan(weightsStr[j])); +// } +// //load all models +// for(size_t i = 0; i < fileStr.size(); ++i) { +// vector spec = Tokenize(fileStr[f], " "); +// ++f; //mark file as consumed +// if(spec.size() != 4) { +// UserMessage::Add("Invalid Lexical Reordering Model Specification: " + fileStr[f]); +// return false; +// } + +// // spec[0] = factor map +// // spec[1] = name +// // spec[2] = num weights +// // spec[3] = fileName + +// // decode factor map + +// vector input, output; +// vector inputfactors = Tokenize(spec[0],"-"); +// if(inputfactors.size() == 2) { +// input = Tokenize(inputfactors[0],","); +// output = Tokenize(inputfactors[1],","); +// } else if(inputfactors.size() == 1) { +// //if there is only one side assume it is on e side... why? 
+// output = Tokenize(inputfactors[0],","); +// } else { +// //format error +// return false; +// } + +// string modelType = spec[1]; + +// // decode num weights and fetch weights from array +// std::vector mweights; +// size_t numWeights = atoi(spec[2].c_str()); +// for(size_t k = 0; k < numWeights; ++k, ++w) { +// if(w >= weights.size()) { +// UserMessage::Add("Lexicalized distortion model: Not enough weights, add to [weight-d]"); +// return false; +// } else +// mweights.push_back(weights[w]); +// } + +// string filePath = spec[3]; + +// m_reorderModels.push_back(new LexicalReordering(input, output, modelType, filePath, mweights)); +// } +// return true; +// } + +// bool StaticData::LoadGlobalLexicalModel() +// { +// const vector &weight = Scan(m_parameter->GetParam("weight-lex")); +// const vector &file = m_parameter->GetParam("global-lexical-file"); + +// if (weight.size() != file.size()) { +// std::cerr << "number of weights and models for the global lexical model does not match (" +// << weight.size() << " != " << file.size() << ")" << std::endl; +// return false; +// } + +// for (size_t i = 0; i < weight.size(); i++ ) { +// vector spec = Tokenize(file[i], " "); +// if ( spec.size() != 2 ) { +// std::cerr << "wrong global lexical model specification: " << file[i] << endl; +// return false; +// } +// vector< string > factors = Tokenize(spec[0],"-"); +// if ( factors.size() != 2 ) { +// std::cerr << "wrong factor definition for global lexical model: " << spec[0] << endl; +// return false; +// } +// vector inputFactors = Tokenize(factors[0],","); +// vector outputFactors = Tokenize(factors[1],","); +// m_globalLexicalModels.push_back( new GlobalLexicalModel( spec[1], weight[i], inputFactors, outputFactors ) ); +// } +// return true; +// } + +// bool StaticData::LoadLanguageModels() +// { +// if (m_parameter->GetParam("lmodel-file").size() > 0) { +// // weights +// vector weightAll = Scan(m_parameter->GetParam("weight-l")); + +// for (size_t i = 0 ; i < weightAll.size() ; i++) { +// m_allWeights.push_back(weightAll[i]); +// } + +// // dictionary upper-bounds fo all IRST LMs +// vector LMdub = Scan(m_parameter->GetParam("lmodel-dub")); +// if (m_parameter->GetParam("lmodel-dub").size() == 0) { +// for(size_t i=0; iGetParam("lmodel-file").size(); i++) +// LMdub.push_back(0); +// } + +// // initialize n-gram order for each factor. 
populated only by factored lm +// const vector &lmVector = m_parameter->GetParam("lmodel-file"); +// //prevent language models from being loaded twice +// map languageModelsLoaded; + +// for(size_t i=0; i token = Tokenize(lmVector[i]); +// if (token.size() != 4 && token.size() != 5 ) { +// UserMessage::Add("Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath [mapFilePath (only for IRSTLM)]'"); +// return false; +// } +// // type = implementation, SRI, IRST etc +// LMImplementation lmImplementation = static_cast(Scan(token[0])); + +// // factorType = 0 = Surface, 1 = POS, 2 = Stem, 3 = Morphology, etc +// vector factorTypes = Tokenize(token[1], ","); + +// // nGramOrder = 2 = bigram, 3 = trigram, etc +// size_t nGramOrder = Scan(token[2]); + +// string &languageModelFile = token[3]; +// if (token.size() == 5) { +// if (lmImplementation==IRST) +// languageModelFile += " " + token[4]; +// else { +// UserMessage::Add("Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath [mapFilePath (only for IRSTLM)]'"); +// return false; +// } +// } +// IFVERBOSE(1) +// PrintUserTime(string("Start loading LanguageModel ") + languageModelFile); + +// lm = LanguageModelFactory::CreateLanguageModel( +// lmImplementation +// , factorTypes +// , nGramOrder +// , languageModelFile +// , m_scoreIndexManager +// , LMdub[i]); +// if (lm == NULL) { +// UserMessage::Add("no LM created. We probably don't have it compiled"); +// return false; +// } +// languageModelsLoaded[lmVector[i]] = lm; +// } + +// m_languageModel.Add(lm); +// } +// } +// // flag indicating that language models were loaded, +// // since phrase table loading requires their presence +// m_fLMsLoaded = true; +// IFVERBOSE(1) +// PrintUserTime("Finished loading LanguageModels"); +// return true; +// } + +// bool StaticData::LoadGenerationTables() +// { +// if (m_parameter->GetParam("generation-file").size() > 0) { +// const vector &generationVector = m_parameter->GetParam("generation-file"); +// const vector &weight = Scan(m_parameter->GetParam("weight-generation")); + +// IFVERBOSE(1) { +// TRACE_ERR( "weight-generation: "); +// for (size_t i = 0 ; i < weight.size() ; i++) { +// TRACE_ERR( weight[i] << "\t"); +// } +// TRACE_ERR(endl); +// } +// size_t currWeightNum = 0; + +// for(size_t currDict = 0 ; currDict < generationVector.size(); currDict++) { +// vector token = Tokenize(generationVector[currDict]); +// vector input = Tokenize(token[0], ",") +// ,output = Tokenize(token[1], ","); +// m_maxFactorIdx[1] = CalcMax(m_maxFactorIdx[1], input, output); +// string filePath; +// size_t numFeatures; + +// numFeatures = Scan(token[2]); +// filePath = token[3]; + +// if (!FileExists(filePath) && FileExists(filePath + ".gz")) { +// filePath += ".gz"; +// } + +// VERBOSE(1, filePath << endl); + +// m_generationDictionary.push_back(new GenerationDictionary(numFeatures, m_scoreIndexManager, input,output)); +// assert(m_generationDictionary.back() && "could not create GenerationDictionary"); +// if (!m_generationDictionary.back()->Load(filePath, Output)) { +// delete m_generationDictionary.back(); +// return false; +// } +// for(size_t i = 0; i < numFeatures; i++) { +// assert(currWeightNum < weight.size()); +// m_allWeights.push_back(weight[currWeightNum++]); +// } +// } +// if (currWeightNum != weight.size()) { +// TRACE_ERR( " [WARNING] config file has " << weight.size() << " generation weights listed, but the configuration for generation files indicates there should be " << currWeightNum << "!\n"); +// } +// } + +// return true; +// } + +// /* 
Doesn't load phrase tables any more. Just creates the features. */ +// bool StaticData::LoadPhraseTables() +// { +// VERBOSE(2,"Creating phrase table features" << endl); + +// // language models must be loaded prior to loading phrase tables +// assert(m_fLMsLoaded); +// // load phrase translation tables +// if (m_parameter->GetParam("ttable-file").size() > 0) { +// // weights +// vector weightAll = Scan(m_parameter->GetParam("weight-t")); + +// const vector &translationVector = m_parameter->GetParam("ttable-file"); +// vector maxTargetPhrase = Scan(m_parameter->GetParam("ttable-limit")); + +// if(maxTargetPhrase.size() == 1 && translationVector.size() > 1) { +// VERBOSE(1, "Using uniform ttable-limit of " << maxTargetPhrase[0] << " for all translation tables." << endl); +// for(size_t i = 1; i < translationVector.size(); i++) +// maxTargetPhrase.push_back(maxTargetPhrase[0]); +// } else if(maxTargetPhrase.size() != 1 && maxTargetPhrase.size() < translationVector.size()) { +// stringstream strme; +// strme << "You specified " << translationVector.size() << " translation tables, but only " << maxTargetPhrase.size() << " ttable-limits."; +// UserMessage::Add(strme.str()); +// return false; +// } + +// size_t index = 0; +// size_t weightAllOffset = 0; +// bool oldFileFormat = false; +// for(size_t currDict = 0 ; currDict < translationVector.size(); currDict++) { +// vector token = Tokenize(translationVector[currDict]); + +// if(currDict == 0 && token.size() == 4) { +// VERBOSE(1, "Warning: Phrase table specification in old 4-field format. Assuming binary phrase tables (type 1)!" << endl); +// oldFileFormat = true; +// } + +// if((!oldFileFormat && token.size() < 5) || (oldFileFormat && token.size() != 4)) { +// UserMessage::Add("invalid phrase table specification"); +// return false; +// } + +// PhraseTableImplementation implementation = (PhraseTableImplementation) Scan(token[0]); +// if(oldFileFormat) { +// token.push_back(token[3]); +// token[3] = token[2]; +// token[2] = token[1]; +// token[1] = token[0]; +// token[0] = "1"; +// implementation = Binary; +// } else +// implementation = (PhraseTableImplementation) Scan(token[0]); + +// assert(token.size() >= 5); +// //characteristics of the phrase table + +// vector input = Tokenize(token[1], ",") +// ,output = Tokenize(token[2], ","); +// m_maxFactorIdx[0] = CalcMax(m_maxFactorIdx[0], input); +// m_maxFactorIdx[1] = CalcMax(m_maxFactorIdx[1], output); +// m_maxNumFactors = std::max(m_maxFactorIdx[0], m_maxFactorIdx[1]) + 1; +// size_t numScoreComponent = Scan(token[3]); +// string filePath= token[4]; + +// assert(weightAll.size() >= weightAllOffset + numScoreComponent); + +// // weights for this phrase dictionary +// // first InputScores (if any), then translation scores +// vector weight; + +// if(currDict==0 && (m_inputType == ConfusionNetworkInput || m_inputType == WordLatticeInput)) { +// // TODO. find what the assumptions made by confusion network about phrase table output which makes +// // it only work with binrary file. 
This is a hack + +// m_numInputScores=m_parameter->GetParam("weight-i").size(); +// for(unsigned k=0; k(m_parameter->GetParam("weight-i")[k])); + +// if(m_parameter->GetParam("link-param-count").size()) +// m_numLinkParams = Scan(m_parameter->GetParam("link-param-count")[0]); + +// //print some info about this interaction: +// if (m_numLinkParams == m_numInputScores) { +// VERBOSE(1,"specified equal numbers of link parameters and insertion weights, not using non-epsilon 'real' word link count.\n"); +// } else if ((m_numLinkParams + 1) == m_numInputScores) { +// VERBOSE(1,"WARN: "<< m_numInputScores << " insertion weights found and only "<< m_numLinkParams << " link parameters specified, applying non-epsilon 'real' word link count for last feature weight.\n"); +// } else { +// stringstream strme; +// strme << "You specified " << m_numInputScores +// << " input weights (weight-i), but you specified " << m_numLinkParams << " link parameters (link-param-count)!"; +// UserMessage::Add(strme.str()); +// return false; +// } + +// } +// if (!m_inputType) { +// m_numInputScores=0; +// } +// //this number changes depending on what phrase table we're talking about: only 0 has the weights on it +// size_t tableInputScores = (currDict == 0 ? m_numInputScores : 0); + +// for (size_t currScore = 0 ; currScore < numScoreComponent; currScore++) +// weight.push_back(weightAll[weightAllOffset + currScore]); + + +// if(weight.size() - tableInputScores != numScoreComponent) { +// stringstream strme; +// strme << "Your phrase table has " << numScoreComponent +// << " scores, but you specified " << (weight.size() - tableInputScores) << " weights!"; +// UserMessage::Add(strme.str()); +// return false; +// } + +// weightAllOffset += numScoreComponent; +// numScoreComponent += tableInputScores; + +// string targetPath, alignmentsFile; +// if (implementation == SuffixArray) { +// targetPath = token[5]; +// alignmentsFile= token[6]; +// } + +// assert(numScoreComponent==weight.size()); + +// std::copy(weight.begin(),weight.end(),std::back_inserter(m_allWeights)); + +// //This is needed for regression testing, but the phrase table +// //might not really be loading here +// IFVERBOSE(1) +// PrintUserTime(string("Start loading PhraseTable ") + filePath); +// VERBOSE(1,"filePath: " << filePath <GetParam("non-terminals").size() == 0) { +// defaultNonTerminals = "X"; +// } else { +// vector tokens = Tokenize(m_parameter->GetParam("non-terminals")[0]); +// defaultNonTerminals = tokens[0]; +// } + +// FactorCollection &factorCollection = FactorCollection::Instance(); + +// m_inputDefaultNonTerminal.SetIsNonTerminal(true); +// const Factor *sourceFactor = factorCollection.AddFactor(Input, 0, defaultNonTerminals); +// m_inputDefaultNonTerminal.SetFactor(0, sourceFactor); + +// m_outputDefaultNonTerminal.SetIsNonTerminal(true); +// const Factor *targetFactor = factorCollection.AddFactor(Output, 0, defaultNonTerminals); +// m_outputDefaultNonTerminal.SetFactor(0, targetFactor); + +// // for unknwon words +// if (m_parameter->GetParam("unknown-lhs").size() == 0) { +// UnknownLHSEntry entry(defaultNonTerminals, 0.0f); +// m_unknownLHS.push_back(entry); +// } else { +// const string &filePath = m_parameter->GetParam("unknown-lhs")[0]; + +// InputFileStream inStream(filePath); +// string line; +// while(getline(inStream, line)) { +// vector tokens = Tokenize(line); +// assert(tokens.size() == 2); +// UnknownLHSEntry entry(tokens[0], Scan(tokens[1])); +// m_unknownLHS.push_back(entry); +// } + +// } + +// } + +// void 
StaticData::LoadChartDecodingParameters() +// { +// LoadNonTerminals(); + +// // source label overlap +// if (m_parameter->GetParam("source-label-overlap").size() > 0) { +// m_sourceLabelOverlap = (SourceLabelOverlap) Scan(m_parameter->GetParam("source-label-overlap")[0]); +// } else { +// m_sourceLabelOverlap = SourceLabelOverlapAdd; +// } + +// m_ruleLimit = (m_parameter->GetParam("rule-limit").size() > 0) +// ? Scan(m_parameter->GetParam("rule-limit")[0]) : DEFAULT_MAX_TRANS_OPT_SIZE; +// } + +// void StaticData::LoadPhraseBasedParameters() +// { +// const vector distortionWeights = m_parameter->GetParam("weight-d"); +// size_t distortionWeightCount = distortionWeights.size(); +// //if there's a lex-reordering model, and no separate weight set, then +// //take just one of these weights for linear distortion +// if (!m_parameter->GetParam("weight-lr").size() && m_parameter->GetParam("distortion-file").size()) { +// distortionWeightCount = 1; +// } +// for (size_t i = 0; i < distortionWeightCount; ++i) { +// float weightDistortion = Scan(distortionWeights[i]); +// m_distortionScoreProducers.push_back(new DistortionScoreProducer(m_scoreIndexManager)); +// m_allWeights.push_back(weightDistortion); +// } +// } + +// bool StaticData::LoadDecodeGraphs() +// { +// const vector &mappingVector = m_parameter->GetParam("mapping"); +// const vector &maxChartSpans = Scan(m_parameter->GetParam("max-chart-span")); + +// DecodeStep *prev = 0; +// size_t prevDecodeGraphInd = 0; +// for(size_t i=0; i token = Tokenize(mappingVector[i]); +// size_t decodeGraphInd; +// DecodeType decodeType; +// size_t index; +// if (token.size() == 2) { +// decodeGraphInd = 0; +// decodeType = token[0] == "T" ? Translate : Generate; +// index = Scan(token[1]); +// } else if (token.size() == 3) { +// // For specifying multiple translation model +// decodeGraphInd = Scan(token[0]); +// //the vectorList index can only increment by one +// assert(decodeGraphInd == prevDecodeGraphInd || decodeGraphInd == prevDecodeGraphInd + 1); +// if (decodeGraphInd > prevDecodeGraphInd) { +// prev = NULL; +// } +// decodeType = token[1] == "T" ? Translate : Generate; +// index = Scan(token[2]); +// } else { +// UserMessage::Add("Malformed mapping!"); +// assert(false); +// } + +// DecodeStep* decodeStep = NULL; +// switch (decodeType) { +// case Translate: +// if(index>=m_phraseDictionary.size()) { +// stringstream strme; +// strme << "No phrase dictionary with index " +// << index << " available!"; +// UserMessage::Add(strme.str()); +// assert(false); +// } +// decodeStep = new DecodeStepTranslation(m_phraseDictionary[index], prev); +// break; +// case Generate: +// if(index>=m_generationDictionary.size()) { +// stringstream strme; +// strme << "No generation dictionary with index " +// << index << " available!"; +// UserMessage::Add(strme.str()); +// assert(false); +// } +// decodeStep = new DecodeStepGeneration(m_generationDictionary[index], prev); +// break; +// case InsertNullFertilityWord: +// assert(!"Please implement NullFertilityInsertion."); +// break; +// } + +// assert(decodeStep); +// if (m_decodeGraphs.size() < decodeGraphInd + 1) { +// DecodeGraph *decodeGraph; +// if (m_searchAlgorithm == ChartDecoding) { +// size_t maxChartSpan = (decodeGraphInd < maxChartSpans.size()) ? 
maxChartSpans[decodeGraphInd] : DEFAULT_MAX_CHART_SPAN; +// decodeGraph = new DecodeGraph(m_decodeGraphs.size(), maxChartSpan); +// } else { +// decodeGraph = new DecodeGraph(m_decodeGraphs.size()); +// } + +// m_decodeGraphs.push_back(decodeGraph); // TODO max chart span +// } + +// m_decodeGraphs[decodeGraphInd]->Add(decodeStep); +// prev = decodeStep; +// prevDecodeGraphInd = decodeGraphInd; +// } + +// // set maximum n-gram size for backoff approach to decoding paths +// // default is always use subsequent paths (value = 0) +// for(size_t i=0; i &backoffVector = m_parameter->GetParam("decoding-graph-backoff"); +// for(size_t i=0; i(backoffVector[i]); +// } + +// return true; +// } + + +// void StaticData::SetWeightsForScoreProducer(const ScoreProducer* sp, const std::vector& weights) +// { +// const size_t id = sp->GetScoreBookkeepingID(); +// const size_t begin = m_scoreIndexManager.GetBeginIndex(id); +// const size_t end = m_scoreIndexManager.GetEndIndex(id); +// assert(end - begin == weights.size()); +// if (m_allWeights.size() < end) +// m_allWeights.resize(end); +// std::vector::const_iterator weightIter = weights.begin(); +// for (size_t i = begin; i < end; i++) +// m_allWeights[i] = *weightIter++; +// } + +// const TranslationOptionList* StaticData::FindTransOptListInCache(const DecodeGraph &decodeGraph, const Phrase &sourcePhrase) const +// { +// std::pair key(decodeGraph.GetPosition(), sourcePhrase); +// #ifdef WITH_THREADS +// boost::mutex::scoped_lock lock(m_transOptCacheMutex); +// #endif +// std::map, std::pair >::iterator iter +// = m_transOptCache.find(key); +// if (iter == m_transOptCache.end()) +// return NULL; +// iter->second.second = clock(); // update last used time +// return iter->second.first; +// } + +// void StaticData::ReduceTransOptCache() const +// { +// if (m_transOptCache.size() <= m_transOptCacheMaxSize) return; // not full +// clock_t t = clock(); + +// // find cutoff for last used time +// priority_queue< clock_t > lastUsedTimes; +// std::map, std::pair >::iterator iter; +// iter = m_transOptCache.begin(); +// while( iter != m_transOptCache.end() ) { +// lastUsedTimes.push( iter->second.second ); +// iter++; +// } +// for( size_t i=0; i < lastUsedTimes.size()-m_transOptCacheMaxSize/2; i++ ) +// lastUsedTimes.pop(); +// clock_t cutoffLastUsedTime = lastUsedTimes.top(); + +// // remove all old entries +// iter = m_transOptCache.begin(); +// while( iter != m_transOptCache.end() ) { +// if (iter->second.second < cutoffLastUsedTime) { +// std::map, std::pair >::iterator iterRemove = iter++; +// delete iterRemove->second.first; +// m_transOptCache.erase(iterRemove); +// } else iter++; +// } +// VERBOSE(2,"Reduced persistent translation option cache in " << ((clock()-t)/(float)CLOCKS_PER_SEC) << " seconds." 
<< std::endl); +// } + +// void StaticData::AddTransOptListToCache(const DecodeGraph &decodeGraph, const Phrase &sourcePhrase, const TranslationOptionList &transOptList) const +// { +// if (m_transOptCacheMaxSize == 0) return; +// std::pair key(decodeGraph.GetPosition(), sourcePhrase); +// TranslationOptionList* storedTransOptList = new TranslationOptionList(transOptList); +// #ifdef WITH_THREADS +// boost::mutex::scoped_lock lock(m_transOptCacheMutex); +// #endif +// m_transOptCache[key] = make_pair( storedTransOptList, clock() ); +// ReduceTransOptCache(); +// } + +// } diff --git a/moses/src/StaticData.h b/moses/src/StaticData.h index b3ea80a60..0d46d9531 100644 --- a/moses/src/StaticData.h +++ b/moses/src/StaticData.h @@ -63,6 +63,9 @@ class GenerationDictionary; class DistortionScoreProducer; class DecodeStep; class UnknownWordPenaltyProducer; +#ifdef HAVE_SYNLM +class SyntacticLanguageModel; +#endif class TranslationSystem; typedef std::pair UnknownLHSEntry; @@ -95,6 +98,11 @@ protected: m_earlyDiscardingThreshold, m_translationOptionThreshold, m_wordDeletionWeight; +#ifdef HAVE_SYNLM + SyntacticLanguageModel* m_syntacticLanguageModel; +#endif + + // PhraseTrans, Generation & LanguageModelScore has multiple weights. int m_maxDistortion; @@ -205,10 +213,16 @@ protected: void LoadChartDecodingParameters(); void LoadNonTerminals(); + //! helper fn to set bool param from ini file/command line void SetBooleanParameter(bool *paramter, std::string parameterName, bool defaultValue); //! load all language models as specified in ini file bool LoadLanguageModels(); +#ifdef HAVE_SYNLM + //! load syntactic language model + bool LoadSyntacticLanguageModel(); +#endif + //! load not only the main phrase table but also any auxiliary tables that depend on which features are being used (e.g., word-deletion, word-insertion tables) bool LoadPhraseTables(); //! 
load all generation tables as specified in ini file @@ -220,6 +234,7 @@ protected: void ReduceTransOptCache() const; bool m_continuePartialTranslation; + public: bool IsAlwaysCreateDirectTranslationOption() const { diff --git a/moses/src/SyntacticLanguageModel.cpp b/moses/src/SyntacticLanguageModel.cpp new file mode 100755 index 000000000..85c19bdc0 --- /dev/null +++ b/moses/src/SyntacticLanguageModel.cpp @@ -0,0 +1,123 @@ +// + +#include "StaticData.h" +#include "SyntacticLanguageModel.h" +#include "HHMMLangModel-gf.h" +#include "TextObsModel.h" +#include "SyntacticLanguageModelFiles.h" +#include "SyntacticLanguageModelState.h" + + +namespace Moses +{ + // asnteousntaoheisnthaoesntih + SyntacticLanguageModel::SyntacticLanguageModel(const std::vector& filePath, + const std::vector& weights, + const FactorType factorType, + size_t beamWidth) + // Initialize member variables + : m_NumScoreComponents(weights.size()) + , m_beamWidth(beamWidth) + , m_factorType(factorType) + , m_files(new SyntacticLanguageModelFiles(filePath)) { + + // Inform Moses score manager of this feature and its weight(s) + const_cast(StaticData::Instance().GetScoreIndexManager()).AddScoreProducer(this); + const_cast(StaticData::Instance()).SetWeightsForScoreProducer(this, weights); + VERBOSE(3,"Constructed SyntacticLanguageModel" << endl); + } + + SyntacticLanguageModel::~SyntacticLanguageModel() { + VERBOSE(3,"Destructing SyntacticLanguageModel" << std::endl); + // delete m_files; + } + + size_t SyntacticLanguageModel::GetNumScoreComponents() const { + return m_NumScoreComponents; + } + + std::string SyntacticLanguageModel::GetScoreProducerDescription() const { + return "Syntactic Language Model"; + } + + std::string SyntacticLanguageModel::GetScoreProducerWeightShortName() const { + return "slm"; + } + + const FFState* SyntacticLanguageModel::EmptyHypothesisState(const InputType &input) const { + + return new SyntacticLanguageModelState(m_files,m_beamWidth); + + } + + /* + double SyntacticLanguageModel::perplexity() { + + SyntacticLanguageModelState *prev = + new SyntacticLanguageModelState(m_files,m_beamWidth); + + std::cerr << "Initial prob:" << "\t" << prev->getProb() < words(3); + words[0] = "no"; + words[1] = ","; + words[2] = "zxvth"; + + + for (std::vector::iterator i=words.begin(); + i != words.end(); + i++) { + + prev = new SyntacticLanguageModelState(prev, *i); + std::cerr << *i << "\t" << prev->getProb() <getProb(); + + } + */ + FFState* SyntacticLanguageModel::Evaluate(const Hypothesis& cur_hypo, + const FFState* prev_state, + ScoreComponentCollection* accumulator) const { + + VERBOSE(3,"Evaluating SyntacticLanguageModel for a hypothesis" << endl); + + const SyntacticLanguageModelState& prev = + static_cast&>(*prev_state); + + const SyntacticLanguageModelState* currentState = &prev; + SyntacticLanguageModelState* nextState = NULL; + + + const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase(); + + for (size_t i=0, n=targetPhrase.GetSize(); iGetString(); + + if (i==0) { + nextState = new SyntacticLanguageModelState(&prev, string); + } else { + currentState = nextState; + nextState = new SyntacticLanguageModelState(currentState, string); + } + + double score = nextState->getScore(); + VERBOSE(3,"SynLM evaluated a score of " << score << endl); + accumulator->Assign( this, score ); + } + + + + return nextState; + + } + +} diff --git a/moses/src/SyntacticLanguageModel.h b/moses/src/SyntacticLanguageModel.h new file mode 100755 index 000000000..977a57680 --- /dev/null +++ 
b/moses/src/SyntacticLanguageModel.h @@ -0,0 +1,52 @@ +// + +#ifndef moses_SyntacticLanguageModel_h +#define moses_SyntacticLanguageModel_h + +#include "FeatureFunction.h" + + +class YModel; // hidden model +class XModel; // observed model + +namespace Moses +{ + + template class SyntacticLanguageModelFiles; + + class SyntacticLanguageModel : public StatefulFeatureFunction { + + public: + + SyntacticLanguageModel(const std::vector& filePaths, + const std::vector& weights, + const FactorType factorType, + const size_t beamWidth); + + ~SyntacticLanguageModel(); + + size_t GetNumScoreComponents() const; + std::string GetScoreProducerDescription() const; + std::string GetScoreProducerWeightShortName() const; + + const FFState* EmptyHypothesisState(const InputType &input) const; + + FFState* Evaluate(const Hypothesis& cur_hypo, + const FFState* prev_state, + ScoreComponentCollection* accumulator) const; + + // double perplexity(); + + private: + + const size_t m_NumScoreComponents; + SyntacticLanguageModelFiles* m_files; + const FactorType m_factorType; + const size_t m_beamWidth; + + }; + + +} + +#endif diff --git a/moses/src/SyntacticLanguageModelFiles.h b/moses/src/SyntacticLanguageModelFiles.h new file mode 100755 index 000000000..318e22636 --- /dev/null +++ b/moses/src/SyntacticLanguageModelFiles.h @@ -0,0 +1,95 @@ +// + +#ifndef moses_SyntacticLanguageModelFiles_h +#define moses_SyntacticLanguageModelFiles_h + +#include "nl-iomacros.h" +#include "nl-string.h" + +namespace Moses +{ + +template +class SyntacticLanguageModelFiles { + + public: + + SyntacticLanguageModelFiles(const std::vector& filePaths); + ~SyntacticLanguageModelFiles(); + + MH* getHiddenModel(); + MO* getObservedModel(); + + private: + MH* hiddenModel; + MO* observedModel; + +}; + + +template + SyntacticLanguageModelFiles::SyntacticLanguageModelFiles(const std::vector& filePaths) { + + this->hiddenModel = new MH(); + this->observedModel = new MO(); + + //// I. LOAD MODELS... + std::cerr << "Reading syntactic language model files...\n"; + // For each model file... 
+ for ( int a=0, n=filePaths.size(); a>*(this->hiddenModel)>>"\0"!=NULL + || si>>*(this->observedModel)>>"\0"!=NULL + )) + std::cerr<<"\nERROR: can't parse \'"< + SyntacticLanguageModelFiles::~SyntacticLanguageModelFiles() { + + std::cerr<<"Destructing syntactic language model files" << std::endl; + //delete hiddenModel; + //delete observedModel; + +} + + +template + MH* SyntacticLanguageModelFiles::getHiddenModel() { + + return this->hiddenModel; + +} + +template + MO* SyntacticLanguageModelFiles::getObservedModel() { + + return this->observedModel; + +} + + +} + +#endif diff --git a/moses/src/SyntacticLanguageModelState.h b/moses/src/SyntacticLanguageModelState.h new file mode 100755 index 000000000..0877a59b3 --- /dev/null +++ b/moses/src/SyntacticLanguageModelState.h @@ -0,0 +1,303 @@ +// + +#ifndef moses_SyntacticLanguageModelState_h +#define moses_SyntacticLanguageModelState_h + +#include "nl-iomacros.h" +#include "nl-cpt.h" +#include "nl-hmm.h" + +#include "SyntacticLanguageModelFiles.h" +#include "FFState.h" +#include + +namespace Moses +{ + +template > + class SyntacticLanguageModelState : public FFState { + public: + + // Initialize an empty LM state + SyntacticLanguageModelState( SyntacticLanguageModelFiles* modelData, int beamSize ); + + // Get the next LM state from an existing LM state and the next word + SyntacticLanguageModelState( const SyntacticLanguageModelState* prev, std::string word ); + + + ~SyntacticLanguageModelState() { + //cerr << "Deleting SyntacticLanguageModelState" << std::endl; + //delete randomVariableStore; + } + + virtual int Compare(const FFState& other) const; + + // Get the LM score from this LM state + double getScore() const; + + double getProb() const; + + private: + + void setScore(double score); + void printRV(); + + SafeArray1D,pair >* randomVariableStore; + double prob; + double score; + int beamSize; + SyntacticLanguageModelFiles* modelData; + bool sentenceStart; +}; + + +//////////////////////////////////////////////////////////////////////////////// + + + template + void SyntacticLanguageModelState::printRV() { + + cerr << "*********** BEGIN printRV() ******************" << endl; + int size=randomVariableStore->getSize(); + cerr << "randomVariableStore->getSize() == " << size << endl; + + for (int depth=0; depth *data = &(randomVariableStore->get(depth)); + std::cerr << "randomVariableStore[" << depth << "]\t" << data->first << "\tprob = " << data->second.toProb() << "\tlogProb = " << double(data->second.toInt())/100 << std::endl; + + } + cerr << "*********** END printRV() ******************" << endl; + + } + +// Initialize an empty LM state from grammar files +// +// nArgs is the number of model files +// argv is the list of model file names +// +template + SyntacticLanguageModelState::SyntacticLanguageModelState( SyntacticLanguageModelFiles* modelData, int beamSize ) { + + this->randomVariableStore = new SafeArray1D,pair >(); + this->modelData = modelData; + this->beamSize = beamSize; + + // Initialize an empty random variable value + YS xBEG; + StringInput(String(BEG_STATE).c_array())>>xBEG>>"\0"; + cerr<randomVariableStore->init(1,pair(xBEG,0)); + + this->sentenceStart = true; + + IFVERBOSE(3) { + VERBOSE(3,"Examining RV store just after RV init" << endl); + printRV(); + } + + // Get score of final frame in HHMM + LogProb l(1.0); + //score = l.toDouble(); + setScore(l.toDouble()); + // MY::F_ROOT_OBS = true; + // this->modelData->getHiddenModel()->setRootObs(true); + + +} + + +template + int SyntacticLanguageModelState::Compare(const 
FFState& other) const { + /* + const SyntacticLanguageModelState& o = + static_cast&>(other); + + if (o.score > score) return 1; + else if (o.score < score) return -1; + else return 0; + */ + return 0; + } + + +template + SyntacticLanguageModelState::SyntacticLanguageModelState( const SyntacticLanguageModelState* prev, std::string word ) { + + // Initialize member variables + this->randomVariableStore = new SafeArray1D,pair >(); + this->modelData = prev->modelData; + this->beamSize = prev->beamSize; + this->randomVariableStore->init(this->beamSize); + this->sentenceStart=false; + + YS ysEND; + StringInput(String(END_STATE).c_array())>>ysEND>>"\0"; + + // Get HHMM model files + MY& mH = *(modelData->getHiddenModel()); + MX& mO = *(modelData->getObservedModel()); + + // Initialize HHMM + HMM hmm(mH,mO); + int MAX_WORDS = 2; + hmm.init(MAX_WORDS,this->beamSize,prev->randomVariableStore); + typename MX::RandVarType x(word.c_str()); + // cout << "Examining HHMM just after hmm.init" << endl; + // hmm.debugPrint(); + + + /* cerr << "*********** BEGIN writeCurr() ******************" << endl; + hmm.writeCurr(cout,0); + hmm.writeCurr(cout,1); + cerr << "*********** END writeCurr() ******************" << endl; + */ +/* + { + + int wnum=1; + list > lys = hmm.getMLSnodes(ysEND); // get mls list + for ( typename list >::iterator i=lys.begin(); i!=lys.end(); i++, wnum++ ) { // for each frame + cout << "HYPOTH " << wnum + << " " << i->getBackData() + << " " << x + << " " << i->getId() + << " (" << i->getLogProb() << ")" + << endl; // print RV val + } + } + */ + + + /* + cerr << "Writing hmm.writeCurr" << endl; + hmm.writeCurr(cerr,0); + hmm.writeCurr(cerr,1); + cerr << "...done writing hmm.writeCurr" << endl; + */ + hmm.getCurrSum(); + + + + // Initialize observed variable + // typename MX::RandVarType ov; + // ov.set(word.c_str(),mO); + // MY::WORD = ov.getW(); + //bool endOfSentence = prev->sentenceStart;//true; + + // std::cerr << "About to give HHMM a word of input:\t" << word << std::endl; + + hmm.updateRanked(x, prev->sentenceStart); + + // cout << "Examining HHMM just after hmm.updateRanked(" << x << "," << prev->sentenceStart << ")" << endl; + // hmm.debugPrint(); +/* + cerr << "*********** BEGIN writeCurr() ******************" << endl; + hmm.writeCurr(cout,0); + hmm.writeCurr(cout,1); + cerr << "*********** END writeCurr() ******************" << endl; + */ +/* +{ + + int wnum=1; + list > lys = hmm.getMLSnodes(ysEND); // get mls list + for ( typename list >::iterator i=lys.begin(); i!=lys.end(); i++, wnum++ ) { // for each frame + cout << "HYPOTH " << wnum + << " " << i->getBackData() + << " " << x + << " " << i->getId() + << " (" << i->getLogProb() << ")" + << endl; // print RV val + } + } + */ +// X ov(word.c_str()); + //mH.setWord(ov); + // MY::WORD = ov;//ov.getW(); + + // Update HHMM based on observed variable + //hmm.updateRanked(ov); + //mH.setRootObs(true); + //MY::F_ROOT_OBS = false; + + // Get the current score + double currSum = hmm.getCurrSum(); + //VERBOSE(3,"Setting score using currSum for " << scientific << x << " = " << currSum << endl); + setScore(currSum); + // cout << "Examining RV store just before RV init via gatherElementsInBeam" << endl; + // printRV(); + + // Get new hidden random variable store from HHMM + hmm.gatherElementsInBeam(randomVariableStore); + // cout << "Examining RV store just after RV init via gatherElementsInBeam" << endl; + // printRV(); + /* + cerr << "Writing hmm.writeCurr..." 
<< endl; + hmm.writeCurr(cerr,0); + hmm.writeCurr(cerr,1); + cerr << "...done writing hmm.writeCurr" << endl; + */ +} + + +template +double SyntacticLanguageModelState::getProb() const { + + return prob; +} + +template +double SyntacticLanguageModelState::getScore() const { + + return score; +} + + +template + void SyntacticLanguageModelState::setScore(double score) { + + + + + this->prob = score; + + // We want values to range from -100 to 0 + // + // If the minimum positive value for a double is min=4.94065645841246544e-324 + // then to scale, we want a logarithmic base such that log_b(min)=-100 + // + // -100 = log(min) / log(b) + // + // log(b) = log(min) / -100 + // + // b = exp( log(min) / -100 ) + // + // b = 7.44440071921381 + + // Check for score==0 to avoid causing -infinity with log(score) + if (score==0) { + this->score = -100; + } else { + double x = log(score) / 7.44440071921381; + if ( x >= -100) { + this->score = x; + } else { + this->score = -100; + } + } + + VERBOSE(3,"\tSyntacticLanguageModelState has score=" << this->score << endl); + +} + + +} + +#endif diff --git a/regenerate-makefiles.sh b/regenerate-makefiles.sh index 747dabf88..c4c2e8bee 100755 --- a/regenerate-makefiles.sh +++ b/regenerate-makefiles.sh @@ -54,7 +54,7 @@ $LIBTOOLIZE || die "libtoolize failed" echo echo "You should now be able to configure and build:" -echo " ./configure [--with-srilm=/path/to/srilm] [--with-irstlm=/path/to/irstlm] [--with-randlm=/path/to/randlm] [--without-kenlm] [--with-xmlrpc-c=/path/to/xmlrpc-c-config]" +echo " ./configure [--with-srilm=/path/to/srilm] [--with-irstlm=/path/to/irstlm] [--with-randlm=/path/to/randlm] [--without-kenlm] [--with-synlm=/path/to/modelblocks] [--with-xmlrpc-c=/path/to/xmlrpc-c-config]" echo " make -j 4" echo diff --git a/scripts/generic/balance-corpus b/scripts/generic/balance-corpus new file mode 100644 index 000000000..647fa4502 --- /dev/null +++ b/scripts/generic/balance-corpus @@ -0,0 +1,392 @@ +#!/usr/bin/ruby -w + +require 'optparse' +require 'ostruct' +require 'pp' +require 'set' + +options = OpenStruct.new +OptionParser.new { |opts| + + opts.banner = "Usage: #{$0} [options]" + + opts.on("-n N","--num-parts N", Integer, "Number of parts into which the corpus should be split") { |v| + options.parts = v + options.parts_digits = options.parts.to_s.length + } + + opts.on("-i FILE", "--corpus", String, "Corpus to split") { |v| + options.corpus = v + } + + options.reference = Array.new + opts.on("-r FILE", "--reference", String, "Reference file") { |v| + options.reference << v + } + + options.put_all = false + opts.on("-a","--all","Output all lines into a single file, in addition to split files") { |v| + options.put_all = v + } + + options.max_words = 1.0/0.0 + opts.on("-m N","--max-words", Integer, "Maximum number of words allowed in a line") { |v| + options.max_words = v + } + + options.min_words = 1 + opts.on("--min-words N", Integer, "Minimum number of words allowed in a line") { |v| + options.min_words = v + } + + options.index_prefix = false + opts.on("--index-prefix FILE_PREFIX", String, "Index file name prefixing the part number") { |v| + options.index_prefix = v + } + + opts.on("-p FILE_PREFIX","--prefix FILE_PREFIX", String, "File name prefixing the part number") { |v| + options.output_prefix = v + } + + opts.on("-s FILE_SUFFIX","--suffix FILE_SUFFIX", String, "File name suffixing the part number") { |v| + options.output_suffix = v + } + + options.ref_prefix = Array.new + opts.on("--ref-prefix FILE_PREFIX", String, "File name prefixing the 
part number") { |v| + options.ref_prefix << v + } + + options.ref_suffix = Array.new + opts.on("--ref-suffix FILE_SUFFIX", String, "File name suffixing the part number") { |v| + options.ref_suffix << v + } + + options.balance_naive = false + opts.on("--balance-naive","Balance according to combined number of lines") { |v| + options.balance_naive = v + } + + options.balance_histogram = false + opts.on("-h","--balance-histogram","Balance according to sentence length histogram") { |v| + options.balance_histogram = v + } + + options.balance_word_count = true + opts.on("-w","--balance-words","Balance according to combined number of words") { |v| + options.balance_word_count = v + } + + options.balance_time = false + opts.on("-t TIMES","--balance-time TIMES","Balance according to estimated per-sentence processing time") { |v| + options.balance_time = v + } + + options.verbose = false + opts.on("-v","--[no-]verbose","Turn verbose on") { |v| + options.verbose = v + } + + options.zero_pad = true + opts.on("-z","--[no-]zeropad","Zero pad file names") { |v| + options.zero_pad = v + } + + if ARGV.length==0 + puts opts + exit + end + + +}.parse! + + + + +class LineSize + include Comparable + + attr_reader :size, :index + attr_writer :size + + @@max_index_digits = 0 + @@max_size_digits = 0 + + def initialize(line,index) + @index = index + @size = line.strip.split(/\s+/).length + + index_digits = @index.to_s.length + @@max_index_digits = index_digits if (index_digits > @@max_index_digits) + + size_digits = @size.to_s.length + @@max_size_digits = size_digits if (size_digits > @@max_size_digits) + end + + def <=>(other) + if @size==other.size + @index <=> other.index + else + size <=> other.size + end + end + + def to_s + sprintf("Line %#{@@max_index_digits}i: %#{@@max_size_digits}i words",@index, @size) + end +end + + + +def split_into_parts(file,part_for_line,parts,output_prefix,output_suffix,verbose,put_all,zeropad,index_prefix) + + if (zeropad) + parts_digits = parts.to_s.length + else + parts_digits = 0 + end + + out = Hash.new + all = File.new("#{output_prefix}_all#{output_suffix}","a") if put_all + index_out = Hash.new + + 1.upto(parts) {|v| + + file_name = sprintf("%s%0#{parts_digits}i%s",output_prefix,v,output_suffix) + out[v] = File.new(file_name,"w") + + unless index_prefix==false + index_file_name = sprintf("%s%0#{parts_digits}i",index_prefix,v) + index_out[v] = File.new(index_file_name,"w") + end + } + + + File.open(file).each_with_index { |line,index| + + + if (part_for_line.has_key?(index)) + puts "index==#{index}\tpart_for_line[#{index}]==#{part_for_line[index]}" if out[part_for_line[index]]==nil + if verbose + STDERR.puts "Line #{index} goes in #{out[part_for_line[index]].path} #{line}" + end + + out[part_for_line[index]].puts(line) + index_out[part_for_line[index]].puts(index) unless index_prefix==false + + elsif verbose + STDERR.puts "Line #{index} will be skipped #{line}" + end + } + + out.each_value { |file| + file.close + } + + + if (put_all) + 1.upto(parts) {|v| + + file_name = sprintf("%s%0#{parts_digits}i%s",output_prefix,v,output_suffix) + File.open(file_name,"r").each { |line| + all.puts(line) + } + + } + + all.close + end + +end + + +def index_of_least(array) + best=1.0/0 #Infinity + best_index=0 + array.each_with_index {|v,i| + if (v options.max_words + + STDERR.puts "Line #{index} is too long: #{line_size.size} words. 
Max allowed is #{options.max_words}" if options.verbose + skipped_lines.add(index) + + elsif line_size.size < options.min_words + + STDERR.puts "Line #{index} is too short: #{line_size.size} words. Min allowed is #{options.min_words}" if options.verbose + skipped_lines.add(index) + + else + + words_per_line.push(line_size) + + end +} + + +if (options.balance_naive) + + total_lines=words_per_line.size + + STDERR.puts "total_lines=#{total_lines}" if options.verbose + + ceil=(total_lines/options.parts.to_f).ceil + floor=(total_lines/options.parts.to_f).floor + + part_ceil = total_lines - floor*options.parts + part_floor = options.parts - part_ceil + + STDERR.puts "#{ceil}*#{part_ceil} + #{floor}*#{part_floor} = #{ceil*part_ceil + floor*part_floor}" if options.verbose + + + partition = 1 + lines_in_this_part = 0 + + 0.upto(total_lines-1) { |index| + + unless skipped_lines.include?(index) + if (partition <= part_ceil) + if (lines_in_this_part >= ceil) + STDERR.puts "Partition #{partition} has #{lines_in_this_part} lines" if options.verbose + lines_in_this_part=0 + partition += 1 + end + else + if (lines_in_this_part >= floor) + STDERR.puts "Partition #{partition} has #{lines_in_this_part} lines" if options.verbose + lines_in_this_part=0 + partition += 1 + end + end + + part_for_line[index] = partition + lines_in_this_part += 1 + puts "part_for_line[#{index}] = #{partition}" if options.verbose + end + + } + +elsif (options.balance_histogram) + + STDERR.puts "Balancing according to sentence length histogram" + + words_per_line.sort! + + + index=0 + + words_per_line.each { |lineSize| + if index x } + + # Store the number of words that have been placed in each partition + words_in_part = Array.new(options.parts,0) + + # At this point, words_per_line should be sorted with the longest sentences first + words_per_line.each { |lineSize| + partition = index_of_least(words_in_part) + STDERR.puts "Line #{lineSize.index}\t#{lineSize.size} #{measure_unit}\tPartition #{partition}" if options.verbose + part_for_line[lineSize.index] = partition+1 # part_for_line needs a 1-based partition index, so add 1 + words_in_part[partition] += lineSize.size + } + + if (options.verbose) + words_in_part.each_with_index { |words,partition| + STDERR.puts "Partition #{partition}\t#{words} #{measure_unit}" + } + end + +else + + + exit; + +end + + +split_into_parts( + options.corpus, + part_for_line, + options.parts, + options.output_prefix, + options.output_suffix, + options.verbose, + options.put_all, + options.zero_pad, + options.index_prefix) + + + +options.reference.each_with_index { |reference,index| + + split_into_parts( + reference, + part_for_line, + options.parts, + options.ref_prefix[index], + options.ref_suffix[index], + options.verbose, + options.put_all, + options.zero_pad, + false) + +} diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl index fca0c1d31..f4d0b4551 100755 --- a/scripts/training/mert-moses.pl +++ b/scripts/training/mert-moses.pl @@ -62,6 +62,7 @@ my $additional_triples = { # (due to additional tables) use the following values for them "d" => [ [ 1.0, 0.0, 2.0 ] ], # lexicalized reordering model "lm" => [ [ 1.0, 0.0, 2.0 ] ], # language model + "slm"=> [ [ 1.0, 0.0, 2.0 ] ], # language model "g" => [ [ 1.0, 0.0, 2.0 ], # generation model [ 1.0, 0.0, 2.0 ] ], "tm" => [ [ 0.3, 0.0, 0.5 ], # translation model @@ -79,14 +80,14 @@ my $additional_tripes_loop = { map { ($_, 1) } qw/ d I / }; # moses.ini file uses FULL names for lambdas, while this training script internally (and on 
the command line) # uses ABBR names. -my $ABBR_FULL_MAP = "d=weight-d lm=weight-l tm=weight-t w=weight-w g=weight-generation lex=weight-lex I=weight-i"; +my $ABBR_FULL_MAP = "d=weight-d lm=weight-l tm=weight-t w=weight-w g=weight-generation slm=weight-slm lex=weight-lex I=weight-i"; my %ABBR2FULL = map {split/=/,$_,2} split /\s+/, $ABBR_FULL_MAP; my %FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} split /\s+/, $ABBR_FULL_MAP; # We parse moses.ini to figure out how many weights do we need to optimize. # For this, we must know the correspondence between options defining files # for models and options assigning weights to these models. -my $TABLECONFIG_ABBR_MAP = "ttable-file=tm lmodel-file=lm distortion-file=d generation-file=g global-lexical-file=lex link-param-count=I"; +my $TABLECONFIG_ABBR_MAP = "ttable-file=tm lmodel-file=lm distortion-file=d slmodel-file=slm generation-file=g global-lexical-file=lex link-param-count=I"; my %TABLECONFIG2ABBR = map {split(/=/,$_,2)} split /\s+/, $TABLECONFIG_ABBR_MAP; # There are weights that do not correspond to any input file, they just increase the total number of lambdas we optimize @@ -901,7 +902,12 @@ sub run_decoder { my $decoder_cmd; if (defined $___JOBS && $___JOBS > 0) { - $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$parameters $decoder_config\" -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out"; + my $times_params="-timesfile run$run.times"; + if ($run>1) { + my $prevrun=$run-1; + $times_params.=" -existingtimesfile run$prevrun.times"; + } + $decoder_cmd = "$moses_parallel_cmd $pass_old_sge $times_params -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$parameters $decoder_config\" -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out"; } else { $decoder_cmd = "$___DECODER $parameters -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config -n-best-list $filename $___N_BEST_LIST_SIZE -input-file $___DEV_F > run$run.out"; } @@ -1107,6 +1113,7 @@ sub scan_config { "lmodel-file" => 3, "distortion-file" => 3, "global-lexical-file" => 1, + "slmodel-file" => 0, ); # by default, each line of each section means one lambda, but some sections # explicitly state a custom number of lambdas -- cgit v1.2.3
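
The Evaluate() method added in moses/src/SyntacticLanguageModel.cpp walks the words of the current target phrase, builds a new syntactic-LM state from the previous hypothesis state for each word, and credits each word's score to the feature via accumulator->Assign(); the last state constructed is returned and carried forward with the new hypothesis. The self-contained sketch below shows that per-word state-chaining pattern; State, Extend, and Accumulator are simplified stand-ins invented for illustration, not the actual Moses FFState/ScoreComponentCollection classes.

// Sketch of the per-word state chaining used by a stateful feature function.
// The types below are simplified stand-ins, not the Moses API.
#include <iostream>
#include <string>
#include <vector>

// Stand-in for an FFState: whatever the feature needs to keep scoring later words.
struct State {
  double score;        // log-domain score contributed by the most recent word
  std::string context; // toy "parser state": here just the previous word
};

// Derive the next state from the previous one and the next target word.
// A real syntactic LM would advance its incremental parser here.
static State Extend(const State& prev, const std::string& word) {
  State next;
  next.context = word;
  // Toy per-word cost: repeated words are penalised more heavily.
  next.score = (word == prev.context) ? -1.0 : -0.1 * word.size();
  return next;
}

// Stand-in for ScoreComponentCollection: accumulates this feature's score.
struct Accumulator {
  double total = 0.0;
  void Assign(double score) { total += score; }
};

// Mirrors the shape of Evaluate(): start from the previous hypothesis state,
// extend it once per word of the target phrase, and return the final state.
static State Evaluate(const State& prevState,
                      const std::vector<std::string>& targetPhrase,
                      Accumulator* accumulator) {
  State current = prevState;
  for (const std::string& word : targetPhrase) {
    current = Extend(current, word);     // chain: each word builds on the last state
    accumulator->Assign(current.score);  // credit this feature's per-word score
  }
  return current;                        // stored with the new hypothesis
}

int main() {
  State empty{0.0, "<s>"};  // analogue of EmptyHypothesisState()
  Accumulator acc;
  State last = Evaluate(empty, {"the", "cat", "sat"}, &acc);
  std::cout << "accumulated score = " << acc.total
            << ", final context = " << last.context << "\n";
  return 0;
}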
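
The SyntacticLanguageModelFiles constructor reads each model file named in the configuration and requires every line to be consumed by either the hidden (syntactic, YModel) model or the observed (lexical, XModel) model, reporting a parse error otherwise. The sketch below captures only that loading pattern; the toy parsers, the comment-skipping convenience, and the file names are assumptions made for illustration, since the real parsing classes come from the external modelblocks headers and are not reproduced here.

// Sketch of the model-file loading pattern in SyntacticLanguageModelFiles:
// every usable line of every file must be accepted by either the hidden
// (syntax) model or the observed (lexical) model.  ToyHidden/ToyObserved are
// illustrative stand-ins, not the real parser classes.
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

struct ToyHidden {
  bool parse(const std::string& line) { return line.rfind("Y ", 0) == 0; }
};
struct ToyObserved {
  bool parse(const std::string& line) { return line.rfind("X ", 0) == 0; }
};

template <class MH, class MO>
bool loadModelFiles(const std::vector<std::string>& paths, MH& hidden, MO& observed) {
  std::cerr << "Reading syntactic language model files...\n";
  for (const std::string& path : paths) {
    std::ifstream in(path);
    if (!in) {
      std::cerr << "ERROR: cannot open " << path << "\n";
      return false;
    }
    std::string line;
    while (std::getline(in, line)) {
      if (line.empty() || line[0] == '#') continue;        // skip blank/comment lines (sketch convenience)
      if (!hidden.parse(line) && !observed.parse(line)) {  // neither model accepts the line
        std::cerr << "ERROR: can't parse '" << line << "' in " << path << "\n";
        return false;
      }
    }
  }
  return true;
}

int main() {
  ToyHidden hidden;
  ToyObserved observed;
  // Hypothetical file names; the real paths come from the decoder configuration.
  std::vector<std::string> paths = {"hidden.model", "observed.model"};
  std::cout << (loadModelFiles(paths, hidden, observed) ? "loaded\n" : "failed\n");
  return 0;
}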
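
setScore() in moses/src/SyntacticLanguageModelState.h maps a raw probability into the range [-100, 0] by taking its logarithm in a base b chosen so that the smallest positive double comes out as exactly -100. The literal 7.44440071921381 is ln(b), i.e. ln(denorm_min)/-100, and dividing natural logs by it is the change-of-base formula, so the stored value is log_b(p), with p == 0 clamped to -100. A short check of that arithmetic, assuming IEEE-754 doubles:

// Check the scaling used by setScore(): scores are log_b(p) with b chosen so
// that the smallest positive double maps to -100 and p = 1 maps to 0.
#include <cmath>
#include <iostream>
#include <limits>

int main() {
  // Smallest positive (subnormal) double, 2^-1074, quoted in the code comment.
  const double tiny = std::numeric_limits<double>::denorm_min();

  // ln(b) such that ln(tiny)/ln(b) == -100.  This reproduces 7.44440071921381.
  const double log_b = std::log(tiny) / -100.0;
  const double b = std::exp(log_b);  // the base itself, roughly 1710

  auto scaled = [&](double p) { return std::log(p) / log_b; };  // == log_b(p)

  std::cout << "ln(b) = " << log_b << "\n";                 // ~7.4444007192
  std::cout << "b     = " << b << "\n";
  std::cout << "scaled(tiny) = " << scaled(tiny) << "\n";   // -100
  std::cout << "scaled(1.0)  = " << scaled(1.0) << "\n";    // 0
  std::cout << "scaled(0.5)  = " << scaled(0.5) << "\n";    // about -0.09
  return 0;
}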
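
The --balance-naive mode of scripts/generic/balance-corpus splits purely by line count: with T usable lines and P parts, the first part_ceil = T - floor(T/P)*P parts receive ceil(T/P) lines and the remaining part_floor = P - part_ceil parts receive floor(T/P) lines, which always adds back up to T. For example, 10 lines over 3 parts gives ceil = 4, floor = 3, part_ceil = 10 - 3*3 = 1, part_floor = 2, and 4*1 + 3*2 = 10. A minimal check of that identity, with example values:

// Verify the line-count arithmetic behind --balance-naive:
// part_ceil parts of size ceil(T/P) plus part_floor parts of size floor(T/P)
// always cover exactly T lines.
#include <iostream>

int main() {
  const long T = 100001;  // total usable lines (example value)
  const long P = 16;      // number of parts (example value)

  const long floorSize = T / P;
  const long ceilSize = (T + P - 1) / P;
  const long partCeil = T - floorSize * P;  // parts that get the larger share
  const long partFloor = P - partCeil;      // parts that get the smaller share

  std::cout << partCeil << " parts of " << ceilSize << " lines + "
            << partFloor << " parts of " << floorSize << " lines = "
            << partCeil * ceilSize + partFloor * floorSize
            << " (expected " << T << ")\n";
  return 0;
}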
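
The default --balance-words mode instead sorts lines longest-first and hands each one to whichever part currently holds the fewest words (the script's index_of_least helper), keeping the total word count of the parts nearly equal. The same greedy assignment, sketched in C++ rather than the script's Ruby so that all of these examples stay in one language:

// Greedy "balance by word count": longest lines first, each assigned to the
// part that currently has the fewest words.  Mirrors the --balance-words path.
#include <algorithm>
#include <iostream>
#include <utility>
#include <vector>

int main() {
  const int parts = 3;
  // (line index, word count) pairs standing in for the corpus lines.
  std::vector<std::pair<int, int>> lines = {
      {0, 12}, {1, 3}, {2, 25}, {3, 7}, {4, 9}, {5, 30}, {6, 5}, {7, 14}};

  // Longest sentences first, ties broken by line index (as in LineSize#<=>).
  std::sort(lines.begin(), lines.end(), [](const auto& a, const auto& b) {
    return a.second != b.second ? a.second > b.second : a.first < b.first;
  });

  std::vector<long> wordsInPart(parts, 0);
  std::vector<int> partForLine(lines.size(), -1);

  for (const auto& [index, words] : lines) {
    // index_of_least: the part with the smallest load so far.
    const int target = static_cast<int>(
        std::min_element(wordsInPart.begin(), wordsInPart.end()) - wordsInPart.begin());
    partForLine[index] = target + 1;  // the script uses 1-based part numbers
    wordsInPart[target] += words;
  }

  for (int p = 0; p < parts; ++p)
    std::cout << "part " << (p + 1) << ": " << wordsInPart[p] << " words\n";
  return 0;
}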
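
The mert-moses.pl changes register the new syntactic LM weight under the abbreviation slm: it is added to $ABBR_FULL_MAP as slm=weight-slm, tied to its table entry via slmodel-file=slm in $TABLECONFIG_ABBR_MAP, and given a default lambda triple of [1.0, 0.0, 2.0]. The script builds its lookup hashes by splitting those whitespace-separated abbr=full strings; the same parsing, sketched in C++ with the map contents taken from the patch:

// Build the abbreviation <-> full-name lookups from a whitespace-separated
// "abbr=full" string, as mert-moses.pl does with $ABBR_FULL_MAP.
#include <iostream>
#include <map>
#include <sstream>
#include <string>

int main() {
  // Entries copied from the patched $ABBR_FULL_MAP, including the new slm weight.
  const std::string abbrFullMap =
      "d=weight-d lm=weight-l tm=weight-t w=weight-w g=weight-generation "
      "slm=weight-slm lex=weight-lex I=weight-i";

  std::map<std::string, std::string> abbr2full, full2abbr;
  std::istringstream tokens(abbrFullMap);
  std::string entry;
  while (tokens >> entry) {
    const std::size_t eq = entry.find('=');
    const std::string abbr = entry.substr(0, eq);
    const std::string full = entry.substr(eq + 1);
    abbr2full[abbr] = full;
    full2abbr[full] = abbr;
  }

  std::cout << "slm -> " << abbr2full["slm"] << "\n";                // weight-slm
  std::cout << "weight-slm -> " << full2abbr["weight-slm"] << "\n";  // slm
  return 0;
}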
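
run_decoder() now also passes per-run timing files to moses-parallel when jobs are used: every run gets -timesfile runN.times, and from the second run on the previous run's file is supplied via -existingtimesfile, presumably so earlier timings can inform how later runs are balanced. A sketch of just that flag construction (again in C++, standing in for the Perl):

// How the patched mert-moses.pl chains per-run timing files for moses-parallel:
// run N writes runN.times; runs after the first also pass run(N-1).times.
#include <iostream>
#include <sstream>
#include <string>

std::string timesParams(int run) {
  std::ostringstream flags;
  flags << "-timesfile run" << run << ".times";
  if (run > 1)
    flags << " -existingtimesfile run" << (run - 1) << ".times";
  return flags.str();
}

int main() {
  std::cout << timesParams(1) << "\n";  // -timesfile run1.times
  std::cout << timesParams(2) << "\n";  // -timesfile run2.times -existingtimesfile run1.times
  return 0;
}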