github.com/moses-smt/mosesdecoder.git
-rw-r--r--  .gitignore                                21
-rw-r--r--  config.h.in                               13
-rw-r--r--  configure.in                              43
-rw-r--r--  moses/src/Makefile.am                      8
-rw-r--r--  moses/src/Parameter.cpp                    6
-rw-r--r--  moses/src/ScoreIndexManager.cpp            2
-rw-r--r--  moses/src/StaticData.cpp                1365
-rw-r--r--  moses/src/StaticData.h                    15
-rwxr-xr-x  moses/src/SyntacticLanguageModel.cpp     123
-rwxr-xr-x  moses/src/SyntacticLanguageModel.h        52
-rwxr-xr-x  moses/src/SyntacticLanguageModelFiles.h   95
-rwxr-xr-x  moses/src/SyntacticLanguageModelState.h  303
-rwxr-xr-x  regenerate-makefiles.sh                    2
-rw-r--r--  scripts/generic/balance-corpus           392
-rwxr-xr-x  scripts/training/mert-moses.pl            13
15 files changed, 662 insertions(+), 1791 deletions(-)
diff --git a/.gitignore b/.gitignore
index 14834115d..77987e8ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,23 +1,12 @@
*.[oa]
-*~
-CreateOnDisk/src/.deps
-CreateOnDisk/src/CreateOnDiskPt
Makefile
Makefile.in
-OnDiskPt/src/.deps
aclocal.m4
autom4te.cache/
config.h
config.log
config.status
configure
-kenlm/.deps
-kenlm/.libs
-kenlm/*.la
-kenlm/*.lo
-kenlm/build_binary
-kenlm/query
-libtool
mert/.deps/
mert/Makefile
mert/Makefile.in
@@ -29,26 +18,16 @@ misc/Makefile.in
misc/processLexicalTable
misc/processPhraseTable
misc/queryLexicalTable
-misc/queryPhraseTable
-moses-chart/src/.deps
-moses-chart-cmd/src/.deps
-moses-chart-cmd/src/moses_chart
moses-cmd/src/.deps/
moses-cmd/src/Makefile
moses-cmd/src/Makefile.in
-moses-cmd/src/checkplf
-moses-cmd/src/lmbrgrid
moses-cmd/src/moses
moses/src/.deps/
-moses/src/.libs
-moses/src/*.lo
moses/src/Makefile
moses/src/Makefile.in
-moses/src/libmoses.la
scripts/training/cmert-0.5/mert
scripts/training/mbr/mbr
scripts/training/phrase-extract/extract
scripts/training/phrase-extract/score
scripts/training/symal/symal
-server/.deps
stamp-h1
diff --git a/config.h.in b/config.h.in
index f2f6ed881..264a888a5 100644
--- a/config.h.in
+++ b/config.h.in
@@ -30,9 +30,6 @@
/* Define to 1 if you have the <memory.h> header file. */
#undef HAVE_MEMORY_H
-/* Define to 1 if you have the <nl-cpt.h> header file. */
-#undef HAVE_NL_CPT_H
-
/* flag for protobuf */
#undef HAVE_PROTOBUF
@@ -54,9 +51,6 @@
/* Define to 1 if you have the <string.h> header file. */
#undef HAVE_STRING_H
-/* flag for Syntactic Parser */
-#undef HAVE_SYNLM
-
/* Define to 1 if you have the <sys/stat.h> header file. */
#undef HAVE_SYS_STAT_H
@@ -66,6 +60,10 @@
/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H
+/* Define to the sub-directory in which libtool stores uninstalled libraries.
+ */
+#undef LT_OBJDIR
+
/* Name of package */
#undef PACKAGE
@@ -81,6 +79,9 @@
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME
+/* Define to the home page for this package. */
+#undef PACKAGE_URL
+
/* Define to the version of this package. */
#undef PACKAGE_VERSION
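For context: the two HAVE_* macros dropped here were the compile-time switches for the syntactic LM, and the guard pattern they enabled is exactly what the C++ hunks below remove. A minimal sketch of the consumer side, assuming the usual autoconf-generated config.h (the caller function is hypothetical):

    #include "config.h"  // generated by configure from config.h.in

    #ifdef HAVE_SYNLM
    #include "SyntacticLanguageModel.h"  // compiled in only when configure found nl-cpt.h
    #endif

    void LoadOptionalFeatures() {  // hypothetical caller, for illustration
    #ifdef HAVE_SYNLM
      // syntactic-LM setup runs only in builds configured with --with-synlm
    #endif
    }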
diff --git a/configure.in b/configure.in
index 4bf0aea46..8778fa91a 100644
--- a/configure.in
+++ b/configure.in
@@ -34,13 +34,6 @@ AC_ARG_WITH(srilm-dynamic,
[with_srilm_dynamic=no]
)
-AC_ARG_WITH(srilm-arch,
- [AC_HELP_STRING([--with-srilm-arch=ARCH], [(optional) architecture for which SRILM was built])],
- [with_srilm_arch=$withval],
- [with_srilm_arch=no]
- )
-
-
AC_ARG_WITH(irstlm,
[AC_HELP_STRING([--with-irstlm=PATH], [(optional) path to IRST's LM toolkit])],
[with_irstlm=$withval],
@@ -59,12 +52,6 @@ AC_ARG_WITH(randlm,
[with_randlm=no]
)
-AC_ARG_WITH(synlm,
- [AC_HELP_STRING([--with-synlm=PATH], [(optional) path to syntactic language model parser])],
- [with_synlm=$withval],
- [with_synlm=no]
- )
-
AC_ARG_WITH(notrace,
[AC_HELP_STRING([--notrace], [disable trace])],
[without_trace=yes],
@@ -95,7 +82,7 @@ AC_ARG_ENABLE(boost,
)
AC_ARG_WITH(zlib,
-boost [AC_HELP_STRING([--with-zlib=PATH], [(optional) path to zlib])],
+ [AC_HELP_STRING([--with-zlib=PATH], [(optional) path to zlib])],
[with_zlib=$withval],
[with_zlib=no]
)
@@ -107,7 +94,7 @@ AC_ARG_WITH(tcmalloc,
)
require_boost=no
-if test "x$enable_threads" != 'xno' || test "x$enable_boost" != 'xno' || test "x$with_synlm" != 'xno'
+if test "x$enable_threads" != 'xno' || test "x$enable_boost" != 'xno'
then
require_boost=yes
fi
@@ -124,7 +111,6 @@ AM_CONDITIONAL([SRI_LM], false)
AM_CONDITIONAL([IRST_LM], false)
AM_CONDITIONAL([KEN_LM], false)
AM_CONDITIONAL([RAND_LM], false)
-AM_CONDITIONAL([SYN_LM], false)
AM_CONDITIONAL([PROTOBUF], false)
AM_CONDITIONAL([am__fastdepCC], false)
AM_CONDITIONAL([WITH_THREADS],false)
@@ -138,13 +124,13 @@ else
CPPFLAGS="$CPPFLAGS -DTRACE_ENABLE=1"
fi
-if test "x$require_boost" = 'xyes' || test "x$with_synlm"
+if test "x$require_boost" = 'xyes'
then
AC_MSG_NOTICE([Using Boost library])
BOOST_REQUIRE([1.36.0])
fi
-if test "x$enable_threads" = 'xyes' || test "x$with_synlm"
+if test "x$enable_threads" = 'xyes'
then
AC_MSG_NOTICE([Building threaded moses])
BOOST_THREADS
@@ -186,12 +172,7 @@ then
# ROOT/lib/i686-m64/liboolm.a
# ROOT/lib/i686-m64/libdstruct.a
# ROOT/lib/i686-m64/libmisc.a
- if test "x$with_srilm_arch" != 'xno'
- then
- MY_ARCH=${with_srilm_arch}
- else
- MY_ARCH=`${with_srilm}/sbin/machine-type`
- fi
+ MY_ARCH=`${with_srilm}/sbin/machine-type`
LDFLAGS="$LDFLAGS -L${with_srilm}/lib/${MY_ARCH} -L${with_srilm}/flm/obj/${MY_ARCH}"
LIBS="$LIBS $LIB_SRILM"
FMTLIBS="$FMTLIBS liboolm.a libdstruct.a libmisc.a"
@@ -279,20 +260,6 @@ then
)
fi
-
-if test "x$with_synlm" != 'xno'
-then
- SAVE_CPPFLAGS="$CPPFLAGS"
- CPPFLAGS="$CPPFLAGS -DWITH_THREADS -I${with_synlm}/rvtl/include -I${with_synlm}/wsjparse/include -lm"
-
- AC_CHECK_HEADERS(nl-cpt.h,
- [AC_DEFINE([HAVE_SYNLM], [], [flag for Syntactic Parser])])
-
- AM_CONDITIONAL([SYN_LM], true)
-
-fi
-
-
AM_CONDITIONAL([WITH_MERT],false)
AC_CHECK_HEADERS([getopt.h],
[AM_CONDITIONAL([WITH_MERT],true)],
diff --git a/moses/src/Makefile.am b/moses/src/Makefile.am
index 7cb610430..3ca22d3ec 100644
--- a/moses/src/Makefile.am
+++ b/moses/src/Makefile.am
@@ -149,10 +149,6 @@ libmoses_la_HEADERS += LanguageModelInternal.h \
NGramNode.h
endif
-if SYN_LM
-libmoses_la_HEADERS += SyntacticLanguageModel.h
-endif
-
libmoses_la_SOURCES = \
AlignmentInfo.cpp \
BilingualDynSuffixArray.cpp \
@@ -310,10 +306,6 @@ libmoses_la_SOURCES += LanguageModelInternal.cpp \
NGramNode.cpp
endif
-if SYN_LM
-libmoses_la_SOURCES += SyntacticLanguageModel.cpp
-endif
-
if KEN_LM
libmoses_la_SOURCES += LanguageModelKen.cpp
endif
diff --git a/moses/src/Parameter.cpp b/moses/src/Parameter.cpp
index 27170e0f8..e0173e49a 100644
--- a/moses/src/Parameter.cpp
+++ b/moses/src/Parameter.cpp
@@ -68,12 +68,6 @@ Parameter::Parameter()
AddParam("report-all-factors", "report all factors in output, not just first");
AddParam("report-all-factors-in-n-best", "Report all factors in n-best-lists. Default is false");
AddParam("report-segmentation", "t", "report phrase segmentation in the output");
-#ifdef HAVE_SYNLM
- AddParam("slmodel-file", "location of the syntactic language model file(s)");
- AddParam("weight-slm", "slm", "weight(s) for syntactic language model");
- AddParam("slmodel-factor", "factor to use with syntactic language model");
- AddParam("slmodel-beam", "beam width to use with syntactic language model's parser");
-#endif
AddParam("stack", "s", "maximum stack size for histogram pruning");
AddParam("stack-diversity", "sd", "minimum number of hypothesis of each coverage in stack (default 0)");
AddParam("threads","th", "number of threads to use in decoding (defaults to single-threaded)");
diff --git a/moses/src/ScoreIndexManager.cpp b/moses/src/ScoreIndexManager.cpp
index 16e8408f4..3ef293039 100644
--- a/moses/src/ScoreIndexManager.cpp
+++ b/moses/src/ScoreIndexManager.cpp
@@ -23,7 +23,6 @@ void ScoreIndexManager::AddScoreProducer(const ScoreProducer* sp)
m_producers.push_back(sp);
-
m_begins.push_back(m_last);
size_t numScoreCompsProduced = sp->GetNumScoreComponents();
assert(numScoreCompsProduced > 0);
@@ -33,7 +32,6 @@ void ScoreIndexManager::AddScoreProducer(const ScoreProducer* sp)
<< " " << sp->GetScoreProducerDescription()
<< ") index=" << m_begins.back() << "-" << m_ends.back()-1 << std::endl);
*/
-
}
void ScoreIndexManager::PrintLabeledScores(std::ostream& os, const ScoreComponentCollection& scores) const
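AddScoreProducer hands each feature a contiguous slice of the global score/weight vector: m_begins records where the slice starts, and m_ends where it stops once m_last has been advanced by GetNumScoreComponents(). StaticData::SetWeightsForScoreProducer, restored later in this diff, consumes that bookkeeping. A condensed sketch of the pairing (the m_ends update is assumed from the surrounding code, which references m_ends.back()):

    // registration side (ScoreIndexManager::AddScoreProducer):
    m_begins.push_back(m_last);
    m_last += sp->GetNumScoreComponents();  // assumed: claim this feature's slots
    m_ends.push_back(m_last);

    // consumer side (StaticData::SetWeightsForScoreProducer, below):
    const size_t begin = m_scoreIndexManager.GetBeginIndex(id);
    const size_t end   = m_scoreIndexManager.GetEndIndex(id);
    // weights[0 .. end-begin) are written into m_allWeights[begin .. end)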
diff --git a/moses/src/StaticData.cpp b/moses/src/StaticData.cpp
index 046381d8c..cb2981444 100644
--- a/moses/src/StaticData.cpp
+++ b/moses/src/StaticData.cpp
@@ -41,10 +41,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "DecodeGraph.h"
#include "InputFileStream.h"
-#ifdef HAVE_SYNLM
-#include "SyntacticLanguageModel.h"
-#endif
-
using namespace std;
namespace Moses
@@ -413,12 +409,6 @@ bool StaticData::LoadData(Parameter *parameter)
}
}
-#ifdef HAVE_SYNLM
- if (m_parameter->GetParam("slmodel-file").size() > 0) {
- if (!LoadSyntacticLanguageModel()) return false;
- }
-#endif
-
// use of xml in input
if (m_parameter->GetParam("xml-input").size() == 0) m_xmlInputType = XmlPassThrough;
else if (m_parameter->GetParam("xml-input")[0]=="exclusive") m_xmlInputType = XmlExclusive;
@@ -518,11 +508,6 @@ bool StaticData::LoadData(Parameter *parameter)
//Add any other features here.
-#ifdef HAVE_SYNLM
- if (m_syntacticLanguageModel != NULL) {
- m_translationSystems.find(config[0])->second.AddFeatureFunction(m_syntacticLanguageModel);
- }
-#endif
}
@@ -553,709 +538,651 @@ void StaticData::SetBooleanParameter( bool *parameter, string parameterName, boo
StaticData::~StaticData()
{
-// RemoveAllInColl(m_phraseDictionary);
-// RemoveAllInColl(m_generationDictionary);
-// RemoveAllInColl(m_reorderModels);
-// RemoveAllInColl(m_globalLexicalModels);
-// RemoveAllInColl(m_decodeGraphs);
-// RemoveAllInColl(m_wordPenaltyProducers);
-// RemoveAllInColl(m_distortionScoreProducers);
-// m_languageModel.CleanUp();
-// #ifdef HAVE_SYNLM
-// delete m_syntacticLanguageModel;
-// #endif
-
-// // delete trans opt
-// map<std::pair<size_t, Phrase>, std::pair< TranslationOptionList*, clock_t > >::iterator iterCache;
-// for (iterCache = m_transOptCache.begin() ; iterCache != m_transOptCache.end() ; ++iterCache) {
-// TranslationOptionList *transOptList = iterCache->second.first;
-// delete transOptList;
-// }
-
-// // small score producers
-// delete m_unknownWordPenaltyProducer;
-
-// //delete m_parameter;
-
-// // memory pools
-// Phrase::FinalizeMemPool();
-
-// }
-
-// #ifdef HAVE_SYNLM
-// bool StaticData::LoadSyntacticLanguageModel() {
-// cerr << "Loading syntactic language models..." << std::endl;
-
-// const vector<float> weights = Scan<float>(m_parameter->GetParam("weight-slm"));
-// const vector<string> files = m_parameter->GetParam("slmodel-file");
-
-// const FactorType factorType = (m_parameter->GetParam("slmodel-factor").size() > 0) ?
-// TransformScore(Scan<int>(m_parameter->GetParam("slmodel-factor")[0]))
-// : 0;
-
-// const size_t beamWidth = (m_parameter->GetParam("slmodel-beam").size() > 0) ?
-// TransformScore(Scan<int>(m_parameter->GetParam("slmodel-beam")[0]))
-// : 500;
-
-// if (files.size() < 1) {
-// cerr << "No syntactic language model files specified!" << std::endl;
-// return false;
-// }
-
-// // check if feature is used
-// if (weights.size() >= 1) {
-
-// //cout.setf(ios::scientific,ios::floatfield);
-// //cerr.setf(ios::scientific,ios::floatfield);
-
-// // create the feature
-// m_syntacticLanguageModel = new SyntacticLanguageModel(files,weights,factorType,beamWidth);
-
-// /*
-// /////////////////////////////////////////
-// // BEGIN LANE's UNSTABLE EXPERIMENT :)
-// //
-
-// double ppl = m_syntacticLanguageModel->perplexity();
-// cerr << "Probability is " << ppl << endl;
-
-
-// //
-// // END LANE's UNSTABLE EXPERIMENT
-// /////////////////////////////////////////
-// */
-
-
-// if (m_syntacticLanguageModel==NULL) {
-// return false;
-// }
-
-// }
-
-// return true;
-
-// }
-// #endif
-
-// bool StaticData::LoadLexicalReorderingModel()
-// {
-// VERBOSE(1, "Loading lexical distortion models...");
-// const vector<string> fileStr = m_parameter->GetParam("distortion-file");
-// bool hasWeightlr = (m_parameter->GetParam("weight-lr").size() != 0);
-// vector<string> weightsStr;
-// if (hasWeightlr) {
-// weightsStr = m_parameter->GetParam("weight-lr");
-// } else {
-// weightsStr = m_parameter->GetParam("weight-d");
-// }
-
-// std::vector<float> weights;
-// size_t w = 1; //cur weight
-// if (hasWeightlr) {
-// w = 0; // if reading from weight-lr, don't have to count first as distortion penalty
-// }
-// size_t f = 0; //cur file
-// //get weights values
-// VERBOSE(1, "have " << fileStr.size() << " models" << std::endl);
-// for(size_t j = 0; j < weightsStr.size(); ++j) {
-// weights.push_back(Scan<float>(weightsStr[j]));
-// }
-// //load all models
-// for(size_t i = 0; i < fileStr.size(); ++i) {
-// vector<string> spec = Tokenize<string>(fileStr[f], " ");
-// ++f; //mark file as consumed
-// if(spec.size() != 4) {
-// UserMessage::Add("Invalid Lexical Reordering Model Specification: " + fileStr[f]);
-// return false;
-// }
-
-// // spec[0] = factor map
-// // spec[1] = name
-// // spec[2] = num weights
-// // spec[3] = fileName
-
-// // decode factor map
-
-// vector<FactorType> input, output;
-// vector<string> inputfactors = Tokenize(spec[0],"-");
-// if(inputfactors.size() == 2) {
-// input = Tokenize<FactorType>(inputfactors[0],",");
-// output = Tokenize<FactorType>(inputfactors[1],",");
-// } else if(inputfactors.size() == 1) {
-// //if there is only one side assume it is on e side... why?
-// output = Tokenize<FactorType>(inputfactors[0],",");
-// } else {
-// //format error
-// return false;
-// }
-
-// string modelType = spec[1];
-
-// // decode num weights and fetch weights from array
-// std::vector<float> mweights;
-// size_t numWeights = atoi(spec[2].c_str());
-// for(size_t k = 0; k < numWeights; ++k, ++w) {
-// if(w >= weights.size()) {
-// UserMessage::Add("Lexicalized distortion model: Not enough weights, add to [weight-d]");
-// return false;
-// } else
-// mweights.push_back(weights[w]);
-// }
-
-// string filePath = spec[3];
-
-// m_reorderModels.push_back(new LexicalReordering(input, output, modelType, filePath, mweights));
-// }
-// return true;
-// }
-
-// bool StaticData::LoadGlobalLexicalModel()
-// {
-// const vector<float> &weight = Scan<float>(m_parameter->GetParam("weight-lex"));
-// const vector<string> &file = m_parameter->GetParam("global-lexical-file");
-
-// if (weight.size() != file.size()) {
-// std::cerr << "number of weights and models for the global lexical model does not match ("
-// << weight.size() << " != " << file.size() << ")" << std::endl;
-// return false;
-// }
-
-// for (size_t i = 0; i < weight.size(); i++ ) {
-// vector<string> spec = Tokenize<string>(file[i], " ");
-// if ( spec.size() != 2 ) {
-// std::cerr << "wrong global lexical model specification: " << file[i] << endl;
-// return false;
-// }
-// vector< string > factors = Tokenize(spec[0],"-");
-// if ( factors.size() != 2 ) {
-// std::cerr << "wrong factor definition for global lexical model: " << spec[0] << endl;
-// return false;
-// }
-// vector<FactorType> inputFactors = Tokenize<FactorType>(factors[0],",");
-// vector<FactorType> outputFactors = Tokenize<FactorType>(factors[1],",");
-// m_globalLexicalModels.push_back( new GlobalLexicalModel( spec[1], weight[i], inputFactors, outputFactors ) );
-// }
-// return true;
-// }
-
-// bool StaticData::LoadLanguageModels()
-// {
-// if (m_parameter->GetParam("lmodel-file").size() > 0) {
-// // weights
-// vector<float> weightAll = Scan<float>(m_parameter->GetParam("weight-l"));
-
-// for (size_t i = 0 ; i < weightAll.size() ; i++) {
-// m_allWeights.push_back(weightAll[i]);
-// }
-
-// // dictionary upper-bounds fo all IRST LMs
-// vector<int> LMdub = Scan<int>(m_parameter->GetParam("lmodel-dub"));
-// if (m_parameter->GetParam("lmodel-dub").size() == 0) {
-// for(size_t i=0; i<m_parameter->GetParam("lmodel-file").size(); i++)
-// LMdub.push_back(0);
-// }
-
-// // initialize n-gram order for each factor. populated only by factored lm
-// const vector<string> &lmVector = m_parameter->GetParam("lmodel-file");
-// //prevent language models from being loaded twice
-// map<string,LanguageModel*> languageModelsLoaded;
-
-// for(size_t i=0; i<lmVector.size(); i++) {
-// LanguageModel* lm = NULL;
-// if (languageModelsLoaded.find(lmVector[i]) != languageModelsLoaded.end()) {
-// lm = new LanguageModel(m_scoreIndexManager, languageModelsLoaded[lmVector[i]]);
-// } else {
-// vector<string> token = Tokenize(lmVector[i]);
-// if (token.size() != 4 && token.size() != 5 ) {
-// UserMessage::Add("Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath [mapFilePath (only for IRSTLM)]'");
-// return false;
-// }
-// // type = implementation, SRI, IRST etc
-// LMImplementation lmImplementation = static_cast<LMImplementation>(Scan<int>(token[0]));
-
-// // factorType = 0 = Surface, 1 = POS, 2 = Stem, 3 = Morphology, etc
-// vector<FactorType> factorTypes = Tokenize<FactorType>(token[1], ",");
-
-// // nGramOrder = 2 = bigram, 3 = trigram, etc
-// size_t nGramOrder = Scan<int>(token[2]);
-
-// string &languageModelFile = token[3];
-// if (token.size() == 5) {
-// if (lmImplementation==IRST)
-// languageModelFile += " " + token[4];
-// else {
-// UserMessage::Add("Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath [mapFilePath (only for IRSTLM)]'");
-// return false;
-// }
-// }
-// IFVERBOSE(1)
-// PrintUserTime(string("Start loading LanguageModel ") + languageModelFile);
-
-// lm = LanguageModelFactory::CreateLanguageModel(
-// lmImplementation
-// , factorTypes
-// , nGramOrder
-// , languageModelFile
-// , m_scoreIndexManager
-// , LMdub[i]);
-// if (lm == NULL) {
-// UserMessage::Add("no LM created. We probably don't have it compiled");
-// return false;
-// }
-// languageModelsLoaded[lmVector[i]] = lm;
-// }
-
-// m_languageModel.Add(lm);
-// }
-// }
-// // flag indicating that language models were loaded,
-// // since phrase table loading requires their presence
-// m_fLMsLoaded = true;
-// IFVERBOSE(1)
-// PrintUserTime("Finished loading LanguageModels");
-// return true;
-// }
-
-// bool StaticData::LoadGenerationTables()
-// {
-// if (m_parameter->GetParam("generation-file").size() > 0) {
-// const vector<string> &generationVector = m_parameter->GetParam("generation-file");
-// const vector<float> &weight = Scan<float>(m_parameter->GetParam("weight-generation"));
-
-// IFVERBOSE(1) {
-// TRACE_ERR( "weight-generation: ");
-// for (size_t i = 0 ; i < weight.size() ; i++) {
-// TRACE_ERR( weight[i] << "\t");
-// }
-// TRACE_ERR(endl);
-// }
-// size_t currWeightNum = 0;
-
-// for(size_t currDict = 0 ; currDict < generationVector.size(); currDict++) {
-// vector<string> token = Tokenize(generationVector[currDict]);
-// vector<FactorType> input = Tokenize<FactorType>(token[0], ",")
-// ,output = Tokenize<FactorType>(token[1], ",");
-// m_maxFactorIdx[1] = CalcMax(m_maxFactorIdx[1], input, output);
-// string filePath;
-// size_t numFeatures;
-
-// numFeatures = Scan<size_t>(token[2]);
-// filePath = token[3];
-
-// if (!FileExists(filePath) && FileExists(filePath + ".gz")) {
-// filePath += ".gz";
-// }
-
-// VERBOSE(1, filePath << endl);
-
-// m_generationDictionary.push_back(new GenerationDictionary(numFeatures, m_scoreIndexManager, input,output));
-// assert(m_generationDictionary.back() && "could not create GenerationDictionary");
-// if (!m_generationDictionary.back()->Load(filePath, Output)) {
-// delete m_generationDictionary.back();
-// return false;
-// }
-// for(size_t i = 0; i < numFeatures; i++) {
-// assert(currWeightNum < weight.size());
-// m_allWeights.push_back(weight[currWeightNum++]);
-// }
-// }
-// if (currWeightNum != weight.size()) {
-// TRACE_ERR( " [WARNING] config file has " << weight.size() << " generation weights listed, but the configuration for generation files indicates there should be " << currWeightNum << "!\n");
-// }
-// }
-
-// return true;
-// }
-
-// /* Doesn't load phrase tables any more. Just creates the features. */
-// bool StaticData::LoadPhraseTables()
-// {
-// VERBOSE(2,"Creating phrase table features" << endl);
-
-// // language models must be loaded prior to loading phrase tables
-// assert(m_fLMsLoaded);
-// // load phrase translation tables
-// if (m_parameter->GetParam("ttable-file").size() > 0) {
-// // weights
-// vector<float> weightAll = Scan<float>(m_parameter->GetParam("weight-t"));
-
-// const vector<string> &translationVector = m_parameter->GetParam("ttable-file");
-// vector<size_t> maxTargetPhrase = Scan<size_t>(m_parameter->GetParam("ttable-limit"));
-
-// if(maxTargetPhrase.size() == 1 && translationVector.size() > 1) {
-// VERBOSE(1, "Using uniform ttable-limit of " << maxTargetPhrase[0] << " for all translation tables." << endl);
-// for(size_t i = 1; i < translationVector.size(); i++)
-// maxTargetPhrase.push_back(maxTargetPhrase[0]);
-// } else if(maxTargetPhrase.size() != 1 && maxTargetPhrase.size() < translationVector.size()) {
-// stringstream strme;
-// strme << "You specified " << translationVector.size() << " translation tables, but only " << maxTargetPhrase.size() << " ttable-limits.";
-// UserMessage::Add(strme.str());
-// return false;
-// }
-
-// size_t index = 0;
-// size_t weightAllOffset = 0;
-// bool oldFileFormat = false;
-// for(size_t currDict = 0 ; currDict < translationVector.size(); currDict++) {
-// vector<string> token = Tokenize(translationVector[currDict]);
-
-// if(currDict == 0 && token.size() == 4) {
-// VERBOSE(1, "Warning: Phrase table specification in old 4-field format. Assuming binary phrase tables (type 1)!" << endl);
-// oldFileFormat = true;
-// }
-
-// if((!oldFileFormat && token.size() < 5) || (oldFileFormat && token.size() != 4)) {
-// UserMessage::Add("invalid phrase table specification");
-// return false;
-// }
-
-// PhraseTableImplementation implementation = (PhraseTableImplementation) Scan<int>(token[0]);
-// if(oldFileFormat) {
-// token.push_back(token[3]);
-// token[3] = token[2];
-// token[2] = token[1];
-// token[1] = token[0];
-// token[0] = "1";
-// implementation = Binary;
-// } else
-// implementation = (PhraseTableImplementation) Scan<int>(token[0]);
-
-// assert(token.size() >= 5);
-// //characteristics of the phrase table
-
-// vector<FactorType> input = Tokenize<FactorType>(token[1], ",")
-// ,output = Tokenize<FactorType>(token[2], ",");
-// m_maxFactorIdx[0] = CalcMax(m_maxFactorIdx[0], input);
-// m_maxFactorIdx[1] = CalcMax(m_maxFactorIdx[1], output);
-// m_maxNumFactors = std::max(m_maxFactorIdx[0], m_maxFactorIdx[1]) + 1;
-// size_t numScoreComponent = Scan<size_t>(token[3]);
-// string filePath= token[4];
-
-// assert(weightAll.size() >= weightAllOffset + numScoreComponent);
-
-// // weights for this phrase dictionary
-// // first InputScores (if any), then translation scores
-// vector<float> weight;
-
-// if(currDict==0 && (m_inputType == ConfusionNetworkInput || m_inputType == WordLatticeInput)) {
-// // TODO. find what the assumptions made by confusion network about phrase table output which makes
-// // it only work with binrary file. This is a hack
-
-// m_numInputScores=m_parameter->GetParam("weight-i").size();
-// for(unsigned k=0; k<m_numInputScores; ++k)
-// weight.push_back(Scan<float>(m_parameter->GetParam("weight-i")[k]));
-
-// if(m_parameter->GetParam("link-param-count").size())
-// m_numLinkParams = Scan<size_t>(m_parameter->GetParam("link-param-count")[0]);
-
-// //print some info about this interaction:
-// if (m_numLinkParams == m_numInputScores) {
-// VERBOSE(1,"specified equal numbers of link parameters and insertion weights, not using non-epsilon 'real' word link count.\n");
-// } else if ((m_numLinkParams + 1) == m_numInputScores) {
-// VERBOSE(1,"WARN: "<< m_numInputScores << " insertion weights found and only "<< m_numLinkParams << " link parameters specified, applying non-epsilon 'real' word link count for last feature weight.\n");
-// } else {
-// stringstream strme;
-// strme << "You specified " << m_numInputScores
-// << " input weights (weight-i), but you specified " << m_numLinkParams << " link parameters (link-param-count)!";
-// UserMessage::Add(strme.str());
-// return false;
-// }
-
-// }
-// if (!m_inputType) {
-// m_numInputScores=0;
-// }
-// //this number changes depending on what phrase table we're talking about: only 0 has the weights on it
-// size_t tableInputScores = (currDict == 0 ? m_numInputScores : 0);
-
-// for (size_t currScore = 0 ; currScore < numScoreComponent; currScore++)
-// weight.push_back(weightAll[weightAllOffset + currScore]);
-
-
-// if(weight.size() - tableInputScores != numScoreComponent) {
-// stringstream strme;
-// strme << "Your phrase table has " << numScoreComponent
-// << " scores, but you specified " << (weight.size() - tableInputScores) << " weights!";
-// UserMessage::Add(strme.str());
-// return false;
-// }
-
-// weightAllOffset += numScoreComponent;
-// numScoreComponent += tableInputScores;
-
-// string targetPath, alignmentsFile;
-// if (implementation == SuffixArray) {
-// targetPath = token[5];
-// alignmentsFile= token[6];
-// }
-
-// assert(numScoreComponent==weight.size());
-
-// std::copy(weight.begin(),weight.end(),std::back_inserter(m_allWeights));
-
-// //This is needed for regression testing, but the phrase table
-// //might not really be loading here
-// IFVERBOSE(1)
-// PrintUserTime(string("Start loading PhraseTable ") + filePath);
-// VERBOSE(1,"filePath: " << filePath <<endl);
-
-// PhraseDictionaryFeature* pdf = new PhraseDictionaryFeature(
-// implementation
-// , numScoreComponent
-// , (currDict==0 ? m_numInputScores : 0)
-// , input
-// , output
-// , filePath
-// , weight
-// , maxTargetPhrase[index]
-// , targetPath, alignmentsFile);
-
-// m_phraseDictionary.push_back(pdf);
-
-
-
-
-
-// index++;
-// }
-// }
-
-// IFVERBOSE(1)
-// PrintUserTime("Finished loading phrase tables");
-// return true;
-// }
-
-// void StaticData::LoadNonTerminals()
-// {
-// string defaultNonTerminals;
-
-// if (m_parameter->GetParam("non-terminals").size() == 0) {
-// defaultNonTerminals = "X";
-// } else {
-// vector<std::string> tokens = Tokenize(m_parameter->GetParam("non-terminals")[0]);
-// defaultNonTerminals = tokens[0];
-// }
-
-// FactorCollection &factorCollection = FactorCollection::Instance();
-
-// m_inputDefaultNonTerminal.SetIsNonTerminal(true);
-// const Factor *sourceFactor = factorCollection.AddFactor(Input, 0, defaultNonTerminals);
-// m_inputDefaultNonTerminal.SetFactor(0, sourceFactor);
-
-// m_outputDefaultNonTerminal.SetIsNonTerminal(true);
-// const Factor *targetFactor = factorCollection.AddFactor(Output, 0, defaultNonTerminals);
-// m_outputDefaultNonTerminal.SetFactor(0, targetFactor);
-
-// // for unknwon words
-// if (m_parameter->GetParam("unknown-lhs").size() == 0) {
-// UnknownLHSEntry entry(defaultNonTerminals, 0.0f);
-// m_unknownLHS.push_back(entry);
-// } else {
-// const string &filePath = m_parameter->GetParam("unknown-lhs")[0];
-
-// InputFileStream inStream(filePath);
-// string line;
-// while(getline(inStream, line)) {
-// vector<string> tokens = Tokenize(line);
-// assert(tokens.size() == 2);
-// UnknownLHSEntry entry(tokens[0], Scan<float>(tokens[1]));
-// m_unknownLHS.push_back(entry);
-// }
-
-// }
-
-// }
-
-// void StaticData::LoadChartDecodingParameters()
-// {
-// LoadNonTerminals();
-
-// // source label overlap
-// if (m_parameter->GetParam("source-label-overlap").size() > 0) {
-// m_sourceLabelOverlap = (SourceLabelOverlap) Scan<int>(m_parameter->GetParam("source-label-overlap")[0]);
-// } else {
-// m_sourceLabelOverlap = SourceLabelOverlapAdd;
-// }
-
-// m_ruleLimit = (m_parameter->GetParam("rule-limit").size() > 0)
-// ? Scan<size_t>(m_parameter->GetParam("rule-limit")[0]) : DEFAULT_MAX_TRANS_OPT_SIZE;
-// }
-
-// void StaticData::LoadPhraseBasedParameters()
-// {
-// const vector<string> distortionWeights = m_parameter->GetParam("weight-d");
-// size_t distortionWeightCount = distortionWeights.size();
-// //if there's a lex-reordering model, and no separate weight set, then
-// //take just one of these weights for linear distortion
-// if (!m_parameter->GetParam("weight-lr").size() && m_parameter->GetParam("distortion-file").size()) {
-// distortionWeightCount = 1;
-// }
-// for (size_t i = 0; i < distortionWeightCount; ++i) {
-// float weightDistortion = Scan<float>(distortionWeights[i]);
-// m_distortionScoreProducers.push_back(new DistortionScoreProducer(m_scoreIndexManager));
-// m_allWeights.push_back(weightDistortion);
-// }
-// }
-
-// bool StaticData::LoadDecodeGraphs()
-// {
-// const vector<string> &mappingVector = m_parameter->GetParam("mapping");
-// const vector<size_t> &maxChartSpans = Scan<size_t>(m_parameter->GetParam("max-chart-span"));
-
-// DecodeStep *prev = 0;
-// size_t prevDecodeGraphInd = 0;
-// for(size_t i=0; i<mappingVector.size(); i++) {
-// vector<string> token = Tokenize(mappingVector[i]);
-// size_t decodeGraphInd;
-// DecodeType decodeType;
-// size_t index;
-// if (token.size() == 2) {
-// decodeGraphInd = 0;
-// decodeType = token[0] == "T" ? Translate : Generate;
-// index = Scan<size_t>(token[1]);
-// } else if (token.size() == 3) {
-// // For specifying multiple translation model
-// decodeGraphInd = Scan<size_t>(token[0]);
-// //the vectorList index can only increment by one
-// assert(decodeGraphInd == prevDecodeGraphInd || decodeGraphInd == prevDecodeGraphInd + 1);
-// if (decodeGraphInd > prevDecodeGraphInd) {
-// prev = NULL;
-// }
-// decodeType = token[1] == "T" ? Translate : Generate;
-// index = Scan<size_t>(token[2]);
-// } else {
-// UserMessage::Add("Malformed mapping!");
-// assert(false);
-// }
-
-// DecodeStep* decodeStep = NULL;
-// switch (decodeType) {
-// case Translate:
-// if(index>=m_phraseDictionary.size()) {
-// stringstream strme;
-// strme << "No phrase dictionary with index "
-// << index << " available!";
-// UserMessage::Add(strme.str());
-// assert(false);
-// }
-// decodeStep = new DecodeStepTranslation(m_phraseDictionary[index], prev);
-// break;
-// case Generate:
-// if(index>=m_generationDictionary.size()) {
-// stringstream strme;
-// strme << "No generation dictionary with index "
-// << index << " available!";
-// UserMessage::Add(strme.str());
-// assert(false);
-// }
-// decodeStep = new DecodeStepGeneration(m_generationDictionary[index], prev);
-// break;
-// case InsertNullFertilityWord:
-// assert(!"Please implement NullFertilityInsertion.");
-// break;
-// }
-
-// assert(decodeStep);
-// if (m_decodeGraphs.size() < decodeGraphInd + 1) {
-// DecodeGraph *decodeGraph;
-// if (m_searchAlgorithm == ChartDecoding) {
-// size_t maxChartSpan = (decodeGraphInd < maxChartSpans.size()) ? maxChartSpans[decodeGraphInd] : DEFAULT_MAX_CHART_SPAN;
-// decodeGraph = new DecodeGraph(m_decodeGraphs.size(), maxChartSpan);
-// } else {
-// decodeGraph = new DecodeGraph(m_decodeGraphs.size());
-// }
-
-// m_decodeGraphs.push_back(decodeGraph); // TODO max chart span
-// }
-
-// m_decodeGraphs[decodeGraphInd]->Add(decodeStep);
-// prev = decodeStep;
-// prevDecodeGraphInd = decodeGraphInd;
-// }
-
-// // set maximum n-gram size for backoff approach to decoding paths
-// // default is always use subsequent paths (value = 0)
-// for(size_t i=0; i<m_decodeGraphs.size(); i++) {
-// m_decodeGraphBackoff.push_back( 0 );
-// }
-// // if specified, record maxmimum unseen n-gram size
-// const vector<string> &backoffVector = m_parameter->GetParam("decoding-graph-backoff");
-// for(size_t i=0; i<m_decodeGraphs.size() && i<backoffVector.size(); i++) {
-// m_decodeGraphBackoff[i] = Scan<size_t>(backoffVector[i]);
-// }
-
-// return true;
-// }
-
-
-// void StaticData::SetWeightsForScoreProducer(const ScoreProducer* sp, const std::vector<float>& weights)
-// {
-// const size_t id = sp->GetScoreBookkeepingID();
-// const size_t begin = m_scoreIndexManager.GetBeginIndex(id);
-// const size_t end = m_scoreIndexManager.GetEndIndex(id);
-// assert(end - begin == weights.size());
-// if (m_allWeights.size() < end)
-// m_allWeights.resize(end);
-// std::vector<float>::const_iterator weightIter = weights.begin();
-// for (size_t i = begin; i < end; i++)
-// m_allWeights[i] = *weightIter++;
-// }
-
-// const TranslationOptionList* StaticData::FindTransOptListInCache(const DecodeGraph &decodeGraph, const Phrase &sourcePhrase) const
-// {
-// std::pair<size_t, Phrase> key(decodeGraph.GetPosition(), sourcePhrase);
-// #ifdef WITH_THREADS
-// boost::mutex::scoped_lock lock(m_transOptCacheMutex);
-// #endif
-// std::map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*,clock_t> >::iterator iter
-// = m_transOptCache.find(key);
-// if (iter == m_transOptCache.end())
-// return NULL;
-// iter->second.second = clock(); // update last used time
-// return iter->second.first;
-// }
-
-// void StaticData::ReduceTransOptCache() const
-// {
-// if (m_transOptCache.size() <= m_transOptCacheMaxSize) return; // not full
-// clock_t t = clock();
-
-// // find cutoff for last used time
-// priority_queue< clock_t > lastUsedTimes;
-// std::map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*,clock_t> >::iterator iter;
-// iter = m_transOptCache.begin();
-// while( iter != m_transOptCache.end() ) {
-// lastUsedTimes.push( iter->second.second );
-// iter++;
-// }
-// for( size_t i=0; i < lastUsedTimes.size()-m_transOptCacheMaxSize/2; i++ )
-// lastUsedTimes.pop();
-// clock_t cutoffLastUsedTime = lastUsedTimes.top();
-
-// // remove all old entries
-// iter = m_transOptCache.begin();
-// while( iter != m_transOptCache.end() ) {
-// if (iter->second.second < cutoffLastUsedTime) {
-// std::map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*,clock_t> >::iterator iterRemove = iter++;
-// delete iterRemove->second.first;
-// m_transOptCache.erase(iterRemove);
-// } else iter++;
-// }
-// VERBOSE(2,"Reduced persistent translation option cache in " << ((clock()-t)/(float)CLOCKS_PER_SEC) << " seconds." << std::endl);
-// }
-
-// void StaticData::AddTransOptListToCache(const DecodeGraph &decodeGraph, const Phrase &sourcePhrase, const TranslationOptionList &transOptList) const
-// {
-// if (m_transOptCacheMaxSize == 0) return;
-// std::pair<size_t, Phrase> key(decodeGraph.GetPosition(), sourcePhrase);
-// TranslationOptionList* storedTransOptList = new TranslationOptionList(transOptList);
-// #ifdef WITH_THREADS
-// boost::mutex::scoped_lock lock(m_transOptCacheMutex);
-// #endif
-// m_transOptCache[key] = make_pair( storedTransOptList, clock() );
-// ReduceTransOptCache();
-// }
-
-// }
+ RemoveAllInColl(m_phraseDictionary);
+ RemoveAllInColl(m_generationDictionary);
+ RemoveAllInColl(m_reorderModels);
+ RemoveAllInColl(m_globalLexicalModels);
+ RemoveAllInColl(m_decodeGraphs);
+ RemoveAllInColl(m_wordPenaltyProducers);
+ RemoveAllInColl(m_distortionScoreProducers);
+ m_languageModel.CleanUp();
+
+ // delete trans opt
+ map<std::pair<size_t, Phrase>, std::pair< TranslationOptionList*, clock_t > >::iterator iterCache;
+ for (iterCache = m_transOptCache.begin() ; iterCache != m_transOptCache.end() ; ++iterCache) {
+ TranslationOptionList *transOptList = iterCache->second.first;
+ delete transOptList;
+ }
+
+ // small score producers
+ delete m_unknownWordPenaltyProducer;
+
+ //delete m_parameter;
+
+ // memory pools
+ Phrase::FinalizeMemPool();
+
+}
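RemoveAllInColl, called on each owning container above, is Moses's delete-and-clear helper (defined in Util.h); a minimal sketch of the idiom:

    template<class COLL>
    void RemoveAllInColl(COLL &coll) {
      // delete every owned pointer, then empty the container
      for (typename COLL::const_iterator iter = coll.begin(); iter != coll.end(); ++iter)
        delete *iter;
      coll.clear();
    }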
+
+bool StaticData::LoadLexicalReorderingModel()
+{
+ VERBOSE(1, "Loading lexical distortion models...");
+ const vector<string> fileStr = m_parameter->GetParam("distortion-file");
+ bool hasWeightlr = (m_parameter->GetParam("weight-lr").size() != 0);
+ vector<string> weightsStr;
+ if (hasWeightlr) {
+ weightsStr = m_parameter->GetParam("weight-lr");
+ } else {
+ weightsStr = m_parameter->GetParam("weight-d");
+ }
+
+ std::vector<float> weights;
+ size_t w = 1; //cur weight
+ if (hasWeightlr) {
+ w = 0; // if reading from weight-lr, don't have to count first as distortion penalty
+ }
+ size_t f = 0; //cur file
+ //get weights values
+ VERBOSE(1, "have " << fileStr.size() << " models" << std::endl);
+ for(size_t j = 0; j < weightsStr.size(); ++j) {
+ weights.push_back(Scan<float>(weightsStr[j]));
+ }
+ //load all models
+ for(size_t i = 0; i < fileStr.size(); ++i) {
+ vector<string> spec = Tokenize<string>(fileStr[f], " ");
+ ++f; //mark file as consumed
+ if(spec.size() != 4) {
+ UserMessage::Add("Invalid Lexical Reordering Model Specification: " + fileStr[f]);
+ return false;
+ }
+
+ // spec[0] = factor map
+ // spec[1] = name
+ // spec[2] = num weights
+ // spec[3] = fileName
+
+ // decode factor map
+
+ vector<FactorType> input, output;
+ vector<string> inputfactors = Tokenize(spec[0],"-");
+ if(inputfactors.size() == 2) {
+ input = Tokenize<FactorType>(inputfactors[0],",");
+ output = Tokenize<FactorType>(inputfactors[1],",");
+ } else if(inputfactors.size() == 1) {
+ //if there is only one side, assume it is the e (output) side... why?
+ output = Tokenize<FactorType>(inputfactors[0],",");
+ } else {
+ //format error
+ return false;
+ }
+
+ string modelType = spec[1];
+
+ // decode num weights and fetch weights from array
+ std::vector<float> mweights;
+ size_t numWeights = atoi(spec[2].c_str());
+ for(size_t k = 0; k < numWeights; ++k, ++w) {
+ if(w >= weights.size()) {
+ UserMessage::Add("Lexicalized distortion model: Not enough weights, add to [weight-d]");
+ return false;
+ } else
+ mweights.push_back(weights[w]);
+ }
+
+ string filePath = spec[3];
+
+ m_reorderModels.push_back(new LexicalReordering(input, output, modelType, filePath, mweights));
+ }
+ return true;
+}
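Concretely, each [distortion-file] entry must carry the four whitespace-separated fields unpacked above. A hypothetical example (model name and path are illustrative only):

    // 0-0 msd-bidirectional-fe 6 /path/to/reordering-table
    //
    // spec[0] "0-0"                   factor map: input factor(s) "-" output factor(s)
    // spec[1] "msd-bidirectional-fe"  model type
    // spec[2] "6"                     number of weights taken from [weight-lr] (or [weight-d])
    // spec[3]                         path to the reordering table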
+
+bool StaticData::LoadGlobalLexicalModel()
+{
+ const vector<float> &weight = Scan<float>(m_parameter->GetParam("weight-lex"));
+ const vector<string> &file = m_parameter->GetParam("global-lexical-file");
+
+ if (weight.size() != file.size()) {
+ std::cerr << "number of weights and models for the global lexical model does not match ("
+ << weight.size() << " != " << file.size() << ")" << std::endl;
+ return false;
+ }
+
+ for (size_t i = 0; i < weight.size(); i++ ) {
+ vector<string> spec = Tokenize<string>(file[i], " ");
+ if ( spec.size() != 2 ) {
+ std::cerr << "wrong global lexical model specification: " << file[i] << endl;
+ return false;
+ }
+ vector< string > factors = Tokenize(spec[0],"-");
+ if ( factors.size() != 2 ) {
+ std::cerr << "wrong factor definition for global lexical model: " << spec[0] << endl;
+ return false;
+ }
+ vector<FactorType> inputFactors = Tokenize<FactorType>(factors[0],",");
+ vector<FactorType> outputFactors = Tokenize<FactorType>(factors[1],",");
+ m_globalLexicalModels.push_back( new GlobalLexicalModel( spec[1], weight[i], inputFactors, outputFactors ) );
+ }
+ return true;
+}
+
+bool StaticData::LoadLanguageModels()
+{
+ if (m_parameter->GetParam("lmodel-file").size() > 0) {
+ // weights
+ vector<float> weightAll = Scan<float>(m_parameter->GetParam("weight-l"));
+
+ for (size_t i = 0 ; i < weightAll.size() ; i++) {
+ m_allWeights.push_back(weightAll[i]);
+ }
+
+ // dictionary upper-bounds for all IRST LMs
+ vector<int> LMdub = Scan<int>(m_parameter->GetParam("lmodel-dub"));
+ if (m_parameter->GetParam("lmodel-dub").size() == 0) {
+ for(size_t i=0; i<m_parameter->GetParam("lmodel-file").size(); i++)
+ LMdub.push_back(0);
+ }
+
+ // initialize n-gram order for each factor. populated only by factored lm
+ const vector<string> &lmVector = m_parameter->GetParam("lmodel-file");
+ //prevent language models from being loaded twice
+ map<string,LanguageModel*> languageModelsLoaded;
+
+ for(size_t i=0; i<lmVector.size(); i++) {
+ LanguageModel* lm = NULL;
+ if (languageModelsLoaded.find(lmVector[i]) != languageModelsLoaded.end()) {
+ lm = new LanguageModel(m_scoreIndexManager, languageModelsLoaded[lmVector[i]]);
+ } else {
+ vector<string> token = Tokenize(lmVector[i]);
+ if (token.size() != 4 && token.size() != 5 ) {
+ UserMessage::Add("Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath [mapFilePath (only for IRSTLM)]'");
+ return false;
+ }
+ // type = implementation, SRI, IRST etc
+ LMImplementation lmImplementation = static_cast<LMImplementation>(Scan<int>(token[0]));
+
+ // factorType = 0 = Surface, 1 = POS, 2 = Stem, 3 = Morphology, etc
+ vector<FactorType> factorTypes = Tokenize<FactorType>(token[1], ",");
+
+ // nGramOrder = 2 = bigram, 3 = trigram, etc
+ size_t nGramOrder = Scan<int>(token[2]);
+
+ string &languageModelFile = token[3];
+ if (token.size() == 5) {
+ if (lmImplementation==IRST)
+ languageModelFile += " " + token[4];
+ else {
+ UserMessage::Add("Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath [mapFilePath (only for IRSTLM)]'");
+ return false;
+ }
+ }
+ IFVERBOSE(1)
+ PrintUserTime(string("Start loading LanguageModel ") + languageModelFile);
+
+ lm = LanguageModelFactory::CreateLanguageModel(
+ lmImplementation
+ , factorTypes
+ , nGramOrder
+ , languageModelFile
+ , m_scoreIndexManager
+ , LMdub[i]);
+ if (lm == NULL) {
+ UserMessage::Add("no LM created. We probably don't have it compiled");
+ return false;
+ }
+ languageModelsLoaded[lmVector[i]] = lm;
+ }
+
+ m_languageModel.Add(lm);
+ }
+ }
+ // flag indicating that language models were loaded,
+ // since phrase table loading requires their presence
+ m_fLMsLoaded = true;
+ IFVERBOSE(1)
+ PrintUserTime("Finished loading LanguageModels");
+ return true;
+}
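Each [lmodel-file] entry follows the format in the error message above: LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath, plus an optional fifth map-file field for IRSTLM only. A hypothetical entry (which integer selects which toolkit is defined by the LMImplementation enum):

    // 0 0 3 /path/to/trigram.lm
    //
    // token[0] "0"  LM implementation (enum value: SRI, IRST, Ken, ...)
    // token[1] "0"  factor type(s), comma-separated: 0 = surface, 1 = POS, ...
    // token[2] "3"  n-gram order (3 = trigram)
    // token[3]      model file; a fifth token (map file) is legal only for IRSTLM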
+
+bool StaticData::LoadGenerationTables()
+{
+ if (m_parameter->GetParam("generation-file").size() > 0) {
+ const vector<string> &generationVector = m_parameter->GetParam("generation-file");
+ const vector<float> &weight = Scan<float>(m_parameter->GetParam("weight-generation"));
+
+ IFVERBOSE(1) {
+ TRACE_ERR( "weight-generation: ");
+ for (size_t i = 0 ; i < weight.size() ; i++) {
+ TRACE_ERR( weight[i] << "\t");
+ }
+ TRACE_ERR(endl);
+ }
+ size_t currWeightNum = 0;
+
+ for(size_t currDict = 0 ; currDict < generationVector.size(); currDict++) {
+ vector<string> token = Tokenize(generationVector[currDict]);
+ vector<FactorType> input = Tokenize<FactorType>(token[0], ",")
+ ,output = Tokenize<FactorType>(token[1], ",");
+ m_maxFactorIdx[1] = CalcMax(m_maxFactorIdx[1], input, output);
+ string filePath;
+ size_t numFeatures;
+
+ numFeatures = Scan<size_t>(token[2]);
+ filePath = token[3];
+
+ if (!FileExists(filePath) && FileExists(filePath + ".gz")) {
+ filePath += ".gz";
+ }
+
+ VERBOSE(1, filePath << endl);
+
+ m_generationDictionary.push_back(new GenerationDictionary(numFeatures, m_scoreIndexManager, input,output));
+ assert(m_generationDictionary.back() && "could not create GenerationDictionary");
+ if (!m_generationDictionary.back()->Load(filePath, Output)) {
+ delete m_generationDictionary.back();
+ return false;
+ }
+ for(size_t i = 0; i < numFeatures; i++) {
+ assert(currWeightNum < weight.size());
+ m_allWeights.push_back(weight[currWeightNum++]);
+ }
+ }
+ if (currWeightNum != weight.size()) {
+ TRACE_ERR( " [WARNING] config file has " << weight.size() << " generation weights listed, but the configuration for generation files indicates there should be " << currWeightNum << "!\n");
+ }
+ }
+
+ return true;
+}
+
+/* Doesn't load phrase tables any more. Just creates the features. */
+bool StaticData::LoadPhraseTables()
+{
+ VERBOSE(2,"Creating phrase table features" << endl);
+
+ // language models must be loaded prior to loading phrase tables
+ assert(m_fLMsLoaded);
+ // load phrase translation tables
+ if (m_parameter->GetParam("ttable-file").size() > 0) {
+ // weights
+ vector<float> weightAll = Scan<float>(m_parameter->GetParam("weight-t"));
+
+ const vector<string> &translationVector = m_parameter->GetParam("ttable-file");
+ vector<size_t> maxTargetPhrase = Scan<size_t>(m_parameter->GetParam("ttable-limit"));
+
+ if(maxTargetPhrase.size() == 1 && translationVector.size() > 1) {
+ VERBOSE(1, "Using uniform ttable-limit of " << maxTargetPhrase[0] << " for all translation tables." << endl);
+ for(size_t i = 1; i < translationVector.size(); i++)
+ maxTargetPhrase.push_back(maxTargetPhrase[0]);
+ } else if(maxTargetPhrase.size() != 1 && maxTargetPhrase.size() < translationVector.size()) {
+ stringstream strme;
+ strme << "You specified " << translationVector.size() << " translation tables, but only " << maxTargetPhrase.size() << " ttable-limits.";
+ UserMessage::Add(strme.str());
+ return false;
+ }
+
+ size_t index = 0;
+ size_t weightAllOffset = 0;
+ bool oldFileFormat = false;
+ for(size_t currDict = 0 ; currDict < translationVector.size(); currDict++) {
+ vector<string> token = Tokenize(translationVector[currDict]);
+
+ if(currDict == 0 && token.size() == 4) {
+ VERBOSE(1, "Warning: Phrase table specification in old 4-field format. Assuming binary phrase tables (type 1)!" << endl);
+ oldFileFormat = true;
+ }
+
+ if((!oldFileFormat && token.size() < 5) || (oldFileFormat && token.size() != 4)) {
+ UserMessage::Add("invalid phrase table specification");
+ return false;
+ }
+
+ PhraseTableImplementation implementation = (PhraseTableImplementation) Scan<int>(token[0]);
+ if(oldFileFormat) {
+ token.push_back(token[3]);
+ token[3] = token[2];
+ token[2] = token[1];
+ token[1] = token[0];
+ token[0] = "1";
+ implementation = Binary;
+ } else
+ implementation = (PhraseTableImplementation) Scan<int>(token[0]);
+
+ assert(token.size() >= 5);
+ //characteristics of the phrase table
+
+ vector<FactorType> input = Tokenize<FactorType>(token[1], ",")
+ ,output = Tokenize<FactorType>(token[2], ",");
+ m_maxFactorIdx[0] = CalcMax(m_maxFactorIdx[0], input);
+ m_maxFactorIdx[1] = CalcMax(m_maxFactorIdx[1], output);
+ m_maxNumFactors = std::max(m_maxFactorIdx[0], m_maxFactorIdx[1]) + 1;
+ size_t numScoreComponent = Scan<size_t>(token[3]);
+ string filePath= token[4];
+
+ assert(weightAll.size() >= weightAllOffset + numScoreComponent);
+
+ // weights for this phrase dictionary
+ // first InputScores (if any), then translation scores
+ vector<float> weight;
+
+ if(currDict==0 && (m_inputType == ConfusionNetworkInput || m_inputType == WordLatticeInput)) {
+ // TODO: find out what assumptions the confusion network makes about phrase table output
+ // that make it work only with a binary file. This is a hack
+
+ m_numInputScores=m_parameter->GetParam("weight-i").size();
+ for(unsigned k=0; k<m_numInputScores; ++k)
+ weight.push_back(Scan<float>(m_parameter->GetParam("weight-i")[k]));
+
+ if(m_parameter->GetParam("link-param-count").size())
+ m_numLinkParams = Scan<size_t>(m_parameter->GetParam("link-param-count")[0]);
+
+ //print some info about this interaction:
+ if (m_numLinkParams == m_numInputScores) {
+ VERBOSE(1,"specified equal numbers of link parameters and insertion weights, not using non-epsilon 'real' word link count.\n");
+ } else if ((m_numLinkParams + 1) == m_numInputScores) {
+ VERBOSE(1,"WARN: "<< m_numInputScores << " insertion weights found and only "<< m_numLinkParams << " link parameters specified, applying non-epsilon 'real' word link count for last feature weight.\n");
+ } else {
+ stringstream strme;
+ strme << "You specified " << m_numInputScores
+ << " input weights (weight-i), but you specified " << m_numLinkParams << " link parameters (link-param-count)!";
+ UserMessage::Add(strme.str());
+ return false;
+ }
+
+ }
+ if (!m_inputType) {
+ m_numInputScores=0;
+ }
+ //this number changes depending on what phrase table we're talking about: only 0 has the weights on it
+ size_t tableInputScores = (currDict == 0 ? m_numInputScores : 0);
+
+ for (size_t currScore = 0 ; currScore < numScoreComponent; currScore++)
+ weight.push_back(weightAll[weightAllOffset + currScore]);
+
+
+ if(weight.size() - tableInputScores != numScoreComponent) {
+ stringstream strme;
+ strme << "Your phrase table has " << numScoreComponent
+ << " scores, but you specified " << (weight.size() - tableInputScores) << " weights!";
+ UserMessage::Add(strme.str());
+ return false;
+ }
+
+ weightAllOffset += numScoreComponent;
+ numScoreComponent += tableInputScores;
+
+ string targetPath, alignmentsFile;
+ if (implementation == SuffixArray) {
+ targetPath = token[5];
+ alignmentsFile= token[6];
+ }
+
+ assert(numScoreComponent==weight.size());
+
+ std::copy(weight.begin(),weight.end(),std::back_inserter(m_allWeights));
+
+ //This is needed for regression testing, but the phrase table
+ //might not really be loading here
+ IFVERBOSE(1)
+ PrintUserTime(string("Start loading PhraseTable ") + filePath);
+ VERBOSE(1,"filePath: " << filePath <<endl);
+
+ PhraseDictionaryFeature* pdf = new PhraseDictionaryFeature(
+ implementation
+ , numScoreComponent
+ , (currDict==0 ? m_numInputScores : 0)
+ , input
+ , output
+ , filePath
+ , weight
+ , maxTargetPhrase[index]
+ , targetPath, alignmentsFile);
+
+ m_phraseDictionary.push_back(pdf);
+
+ index++;
+ }
+ }
+
+ IFVERBOSE(1)
+ PrintUserTime("Finished loading phrase tables");
+ return true;
+}
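A current-format [ttable-file] entry carries at least the five fields read above; SuffixArray tables append a target path and alignments file as fields six and seven, and the old four-field format omits the implementation and is assumed binary. A hypothetical five-field entry:

    // 0 0 0 5 /path/to/phrase-table.gz
    //
    // token[0] "0"  PhraseTableImplementation (enum value)
    // token[1] "0"  input factor(s)
    // token[2] "0"  output factor(s)
    // token[3] "5"  scores per phrase pair (must be matched by [weight-t])
    // token[4]      path to the phrase table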
+
+void StaticData::LoadNonTerminals()
+{
+ string defaultNonTerminals;
+
+ if (m_parameter->GetParam("non-terminals").size() == 0) {
+ defaultNonTerminals = "X";
+ } else {
+ vector<std::string> tokens = Tokenize(m_parameter->GetParam("non-terminals")[0]);
+ defaultNonTerminals = tokens[0];
+ }
+
+ FactorCollection &factorCollection = FactorCollection::Instance();
+
+ m_inputDefaultNonTerminal.SetIsNonTerminal(true);
+ const Factor *sourceFactor = factorCollection.AddFactor(Input, 0, defaultNonTerminals);
+ m_inputDefaultNonTerminal.SetFactor(0, sourceFactor);
+
+ m_outputDefaultNonTerminal.SetIsNonTerminal(true);
+ const Factor *targetFactor = factorCollection.AddFactor(Output, 0, defaultNonTerminals);
+ m_outputDefaultNonTerminal.SetFactor(0, targetFactor);
+
+ // for unknown words
+ if (m_parameter->GetParam("unknown-lhs").size() == 0) {
+ UnknownLHSEntry entry(defaultNonTerminals, 0.0f);
+ m_unknownLHS.push_back(entry);
+ } else {
+ const string &filePath = m_parameter->GetParam("unknown-lhs")[0];
+
+ InputFileStream inStream(filePath);
+ string line;
+ while(getline(inStream, line)) {
+ vector<string> tokens = Tokenize(line);
+ assert(tokens.size() == 2);
+ UnknownLHSEntry entry(tokens[0], Scan<float>(tokens[1]));
+ m_unknownLHS.push_back(entry);
+ }
+
+ }
+
+}
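The unknown-lhs file read above is two columns per line, a non-terminal label and a float, and the assert rejects anything else. A hypothetical excerpt:

    // NP 0.5
    // VP 0.3
    // X  0.2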
+
+void StaticData::LoadChartDecodingParameters()
+{
+ LoadNonTerminals();
+
+ // source label overlap
+ if (m_parameter->GetParam("source-label-overlap").size() > 0) {
+ m_sourceLabelOverlap = (SourceLabelOverlap) Scan<int>(m_parameter->GetParam("source-label-overlap")[0]);
+ } else {
+ m_sourceLabelOverlap = SourceLabelOverlapAdd;
+ }
+
+ m_ruleLimit = (m_parameter->GetParam("rule-limit").size() > 0)
+ ? Scan<size_t>(m_parameter->GetParam("rule-limit")[0]) : DEFAULT_MAX_TRANS_OPT_SIZE;
+}
+
+void StaticData::LoadPhraseBasedParameters()
+{
+ const vector<string> distortionWeights = m_parameter->GetParam("weight-d");
+ size_t distortionWeightCount = distortionWeights.size();
+ //if there's a lex-reordering model, and no separate weight set, then
+ //take just one of these weights for linear distortion
+ if (!m_parameter->GetParam("weight-lr").size() && m_parameter->GetParam("distortion-file").size()) {
+ distortionWeightCount = 1;
+ }
+ for (size_t i = 0; i < distortionWeightCount; ++i) {
+ float weightDistortion = Scan<float>(distortionWeights[i]);
+ m_distortionScoreProducers.push_back(new DistortionScoreProducer(m_scoreIndexManager));
+ m_allWeights.push_back(weightDistortion);
+ }
+}
+
+bool StaticData::LoadDecodeGraphs()
+{
+ const vector<string> &mappingVector = m_parameter->GetParam("mapping");
+ const vector<size_t> &maxChartSpans = Scan<size_t>(m_parameter->GetParam("max-chart-span"));
+
+ DecodeStep *prev = 0;
+ size_t prevDecodeGraphInd = 0;
+ for(size_t i=0; i<mappingVector.size(); i++) {
+ vector<string> token = Tokenize(mappingVector[i]);
+ size_t decodeGraphInd;
+ DecodeType decodeType;
+ size_t index;
+ if (token.size() == 2) {
+ decodeGraphInd = 0;
+ decodeType = token[0] == "T" ? Translate : Generate;
+ index = Scan<size_t>(token[1]);
+ } else if (token.size() == 3) {
+ // For specifying multiple translation models
+ decodeGraphInd = Scan<size_t>(token[0]);
+ //the vectorList index can only increment by one
+ assert(decodeGraphInd == prevDecodeGraphInd || decodeGraphInd == prevDecodeGraphInd + 1);
+ if (decodeGraphInd > prevDecodeGraphInd) {
+ prev = NULL;
+ }
+ decodeType = token[1] == "T" ? Translate : Generate;
+ index = Scan<size_t>(token[2]);
+ } else {
+ UserMessage::Add("Malformed mapping!");
+ assert(false);
+ }
+
+ DecodeStep* decodeStep = NULL;
+ switch (decodeType) {
+ case Translate:
+ if(index>=m_phraseDictionary.size()) {
+ stringstream strme;
+ strme << "No phrase dictionary with index "
+ << index << " available!";
+ UserMessage::Add(strme.str());
+ assert(false);
+ }
+ decodeStep = new DecodeStepTranslation(m_phraseDictionary[index], prev);
+ break;
+ case Generate:
+ if(index>=m_generationDictionary.size()) {
+ stringstream strme;
+ strme << "No generation dictionary with index "
+ << index << " available!";
+ UserMessage::Add(strme.str());
+ assert(false);
+ }
+ decodeStep = new DecodeStepGeneration(m_generationDictionary[index], prev);
+ break;
+ case InsertNullFertilityWord:
+ assert(!"Please implement NullFertilityInsertion.");
+ break;
+ }
+
+ assert(decodeStep);
+ if (m_decodeGraphs.size() < decodeGraphInd + 1) {
+ DecodeGraph *decodeGraph;
+ if (m_searchAlgorithm == ChartDecoding) {
+ size_t maxChartSpan = (decodeGraphInd < maxChartSpans.size()) ? maxChartSpans[decodeGraphInd] : DEFAULT_MAX_CHART_SPAN;
+ decodeGraph = new DecodeGraph(m_decodeGraphs.size(), maxChartSpan);
+ } else {
+ decodeGraph = new DecodeGraph(m_decodeGraphs.size());
+ }
+
+ m_decodeGraphs.push_back(decodeGraph); // TODO max chart span
+ }
+
+ m_decodeGraphs[decodeGraphInd]->Add(decodeStep);
+ prev = decodeStep;
+ prevDecodeGraphInd = decodeGraphInd;
+ }
+
+ // set maximum n-gram size for backoff approach to decoding paths
+ // default is always use subsequent paths (value = 0)
+ for(size_t i=0; i<m_decodeGraphs.size(); i++) {
+ m_decodeGraphBackoff.push_back( 0 );
+ }
+ // if specified, record maximum unseen n-gram size
+ const vector<string> &backoffVector = m_parameter->GetParam("decoding-graph-backoff");
+ for(size_t i=0; i<m_decodeGraphs.size() && i<backoffVector.size(); i++) {
+ m_decodeGraphBackoff[i] = Scan<size_t>(backoffVector[i]);
+ }
+
+ return true;
+}
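The [mapping] entries parsed above come in a two-token form (decode graph 0 implied) and a three-token form that names the graph explicitly; as the assert enforces, graph indices may only repeat or grow by one. Hypothetical examples:

    // T 0     -> graph 0: translate with phrase table 0
    // G 0     -> graph 0: then generate with generation table 0
    //
    // 0 T 0   -> graph 0: translate with phrase table 0
    // 1 T 1   -> graph 1: translate with phrase table 1 (index grew by exactly one)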
+
+
+void StaticData::SetWeightsForScoreProducer(const ScoreProducer* sp, const std::vector<float>& weights)
+{
+ const size_t id = sp->GetScoreBookkeepingID();
+ const size_t begin = m_scoreIndexManager.GetBeginIndex(id);
+ const size_t end = m_scoreIndexManager.GetEndIndex(id);
+ assert(end - begin == weights.size());
+ if (m_allWeights.size() < end)
+ m_allWeights.resize(end);
+ std::vector<float>::const_iterator weightIter = weights.begin();
+ for (size_t i = begin; i < end; i++)
+ m_allWeights[i] = *weightIter++;
+}
+
+const TranslationOptionList* StaticData::FindTransOptListInCache(const DecodeGraph &decodeGraph, const Phrase &sourcePhrase) const
+{
+ std::pair<size_t, Phrase> key(decodeGraph.GetPosition(), sourcePhrase);
+#ifdef WITH_THREADS
+ boost::mutex::scoped_lock lock(m_transOptCacheMutex);
+#endif
+ std::map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*,clock_t> >::iterator iter
+ = m_transOptCache.find(key);
+ if (iter == m_transOptCache.end())
+ return NULL;
+ iter->second.second = clock(); // update last used time
+ return iter->second.first;
+}
+
+void StaticData::ReduceTransOptCache() const
+{
+ if (m_transOptCache.size() <= m_transOptCacheMaxSize) return; // not full
+ clock_t t = clock();
+
+ // find cutoff for last used time
+ priority_queue< clock_t > lastUsedTimes;
+ std::map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*,clock_t> >::iterator iter;
+ iter = m_transOptCache.begin();
+ while( iter != m_transOptCache.end() ) {
+ lastUsedTimes.push( iter->second.second );
+ iter++;
+ }
+ for( size_t i=0; i < lastUsedTimes.size()-m_transOptCacheMaxSize/2; i++ )
+ lastUsedTimes.pop();
+ clock_t cutoffLastUsedTime = lastUsedTimes.top();
+
+ // remove all old entries
+ iter = m_transOptCache.begin();
+ while( iter != m_transOptCache.end() ) {
+ if (iter->second.second < cutoffLastUsedTime) {
+ std::map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*,clock_t> >::iterator iterRemove = iter++;
+ delete iterRemove->second.first;
+ m_transOptCache.erase(iterRemove);
+ } else iter++;
+ }
+ VERBOSE(2,"Reduced persistent translation option cache in " << ((clock()-t)/(float)CLOCKS_PER_SEC) << " seconds." << std::endl);
+}
+
+void StaticData::AddTransOptListToCache(const DecodeGraph &decodeGraph, const Phrase &sourcePhrase, const TranslationOptionList &transOptList) const
+{
+ if (m_transOptCacheMaxSize == 0) return;
+ std::pair<size_t, Phrase> key(decodeGraph.GetPosition(), sourcePhrase);
+ TranslationOptionList* storedTransOptList = new TranslationOptionList(transOptList);
+#ifdef WITH_THREADS
+ boost::mutex::scoped_lock lock(m_transOptCacheMutex);
+#endif
+ m_transOptCache[key] = make_pair( storedTransOptList, clock() );
+ ReduceTransOptCache();
+}
+
+}
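
AddTransOptListToCache stores a heap-allocated copy of the option list (the cache owns its payloads, which is why eviction deletes them) and runs the reduction pass after every insert. A sketch of that insert path, continuing the toy cache from the two sketches above (same CacheKey/CacheEntry, cacheMutex, and Reduce); the delete-on-replace line is an addition of this sketch, not something the code above does:

    // Insert a copy of `value` under (graphPos, phrase), stamped with the
    // current time, then shrink the cache if it exceeds its bound.
    void Add(std::map<CacheKey, CacheEntry> &cache, size_t graphPos,
             const std::string &phrase, const std::string &value,
             size_t maxSize) {
      if (maxSize == 0) return; // caching disabled
      boost::mutex::scoped_lock lock(cacheMutex);
      const CacheKey key(graphPos, phrase);
      std::map<CacheKey, CacheEntry>::iterator iter = cache.find(key);
      if (iter != cache.end()) delete iter->second.first; // drop a stale copy
      cache[key] = std::make_pair(new std::string(value), clock());
      if (cache.size() > maxSize) Reduce(cache, maxSize / 2);
    }
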
diff --git a/moses/src/StaticData.h b/moses/src/StaticData.h
index 0d46d9531..b3ea80a60 100644
--- a/moses/src/StaticData.h
+++ b/moses/src/StaticData.h
@@ -63,9 +63,6 @@ class GenerationDictionary;
class DistortionScoreProducer;
class DecodeStep;
class UnknownWordPenaltyProducer;
-#ifdef HAVE_SYNLM
-class SyntacticLanguageModel;
-#endif
class TranslationSystem;
typedef std::pair<std::string, float> UnknownLHSEntry;
@@ -98,11 +95,6 @@ protected:
m_earlyDiscardingThreshold,
m_translationOptionThreshold,
m_wordDeletionWeight;
-#ifdef HAVE_SYNLM
- SyntacticLanguageModel* m_syntacticLanguageModel;
-#endif
-
-
// PhraseTrans, Generation & LanguageModelScore has multiple weights.
int m_maxDistortion;
@@ -213,16 +205,10 @@ protected:
void LoadChartDecodingParameters();
void LoadNonTerminals();
-
//! helper fn to set bool param from ini file/command line
void SetBooleanParameter(bool *paramter, std::string parameterName, bool defaultValue);
//! load all language models as specified in ini file
bool LoadLanguageModels();
-#ifdef HAVE_SYNLM
- //! load syntactic language model
- bool LoadSyntacticLanguageModel();
-#endif
-
//! load not only the main phrase table but also any auxiliary tables that depend on which features are being used (e.g., word-deletion, word-insertion tables)
bool LoadPhraseTables();
//! load all generation tables as specified in ini file
@@ -234,7 +220,6 @@ protected:
void ReduceTransOptCache() const;
bool m_continuePartialTranslation;
-
public:
bool IsAlwaysCreateDirectTranslationOption() const {
diff --git a/moses/src/SyntacticLanguageModel.cpp b/moses/src/SyntacticLanguageModel.cpp
deleted file mode 100755
index 85c19bdc0..000000000
--- a/moses/src/SyntacticLanguageModel.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-//
-
-#include "StaticData.h"
-#include "SyntacticLanguageModel.h"
-#include "HHMMLangModel-gf.h"
-#include "TextObsModel.h"
-#include "SyntacticLanguageModelFiles.h"
-#include "SyntacticLanguageModelState.h"
-
-
-namespace Moses
-{
- // Constructor
- SyntacticLanguageModel::SyntacticLanguageModel(const std::vector<std::string>& filePath,
- const std::vector<float>& weights,
- const FactorType factorType,
- size_t beamWidth)
- // Initialize member variables
- : m_NumScoreComponents(weights.size())
- , m_beamWidth(beamWidth)
- , m_factorType(factorType)
- , m_files(new SyntacticLanguageModelFiles<YModel,XModel>(filePath)) {
-
- // Inform Moses score manager of this feature and its weight(s)
- const_cast<ScoreIndexManager&>(StaticData::Instance().GetScoreIndexManager()).AddScoreProducer(this);
- const_cast<StaticData&>(StaticData::Instance()).SetWeightsForScoreProducer(this, weights);
- VERBOSE(3,"Constructed SyntacticLanguageModel" << endl);
- }
-
- SyntacticLanguageModel::~SyntacticLanguageModel() {
- VERBOSE(3,"Destructing SyntacticLanguageModel" << std::endl);
- // delete m_files;
- }
-
- size_t SyntacticLanguageModel::GetNumScoreComponents() const {
- return m_NumScoreComponents;
- }
-
- std::string SyntacticLanguageModel::GetScoreProducerDescription() const {
- return "Syntactic Language Model";
- }
-
- std::string SyntacticLanguageModel::GetScoreProducerWeightShortName() const {
- return "slm";
- }
-
- const FFState* SyntacticLanguageModel::EmptyHypothesisState(const InputType &input) const {
-
- return new SyntacticLanguageModelState<YModel,XModel,S,R>(m_files,m_beamWidth);
-
- }
-
- /*
- double SyntacticLanguageModel::perplexity() {
-
- SyntacticLanguageModelState<YModel,XModel,S,R> *prev =
- new SyntacticLanguageModelState<YModel,XModel,S,R>(m_files,m_beamWidth);
-
- std::cerr << "Initial prob:" << "\t" << prev->getProb() <<std::endl;
-
-
- std::vector<std::string> words(3);
- words[0] = "no";
- words[1] = ",";
- words[2] = "zxvth";
-
-
- for (std::vector<std::string>::iterator i=words.begin();
- i != words.end();
- i++) {
-
- prev = new SyntacticLanguageModelState<YModel,XModel,S,R>(prev, *i);
- std::cerr << *i << "\t" << prev->getProb() <<std::endl;
-
- }
-
- if (true) exit(-1);
-
- return prev->getProb();
-
- }
- */
- FFState* SyntacticLanguageModel::Evaluate(const Hypothesis& cur_hypo,
- const FFState* prev_state,
- ScoreComponentCollection* accumulator) const {
-
- VERBOSE(3,"Evaluating SyntacticLanguageModel for a hypothesis" << endl);
-
- const SyntacticLanguageModelState<YModel,XModel,S,R>& prev =
- static_cast<const SyntacticLanguageModelState<YModel,XModel,S,R>&>(*prev_state);
-
- const SyntacticLanguageModelState<YModel,XModel,S,R>* currentState = &prev;
- SyntacticLanguageModelState<YModel,XModel,S,R>* nextState = NULL;
-
-
- const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();
-
- for (size_t i=0, n=targetPhrase.GetSize(); i<n; i++) {
-
- const Word& word = targetPhrase.GetWord(i);
- const Factor* factor = word.GetFactor(m_factorType);
-
- const std::string& string = factor->GetString();
-
- if (i==0) {
- nextState = new SyntacticLanguageModelState<YModel,XModel,S,R>(&prev, string);
- } else {
- currentState = nextState;
- nextState = new SyntacticLanguageModelState<YModel,XModel,S,R>(currentState, string);
- }
-
- double score = nextState->getScore();
- VERBOSE(3,"SynLM evaluated a score of " << score << endl);
- accumulator->Assign( this, score );
- }
-
-
-
- return nextState;
-
- }
-
-}
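
The deleted Evaluate shows the canonical shape of a stateful feature function in Moses: thread the previous hypothesis state through each target word, allocate a new state per word, and hand the running score to the accumulator (the state's score is cumulative, so the last assignment carries the whole phrase). A self-contained toy with the same control flow; the State, Extend, and Accumulator names are illustrative, not Moses' API:

    #include <cstddef>
    #include <string>
    #include <vector>

    // Toy state: remembers the last word and a cumulative score.
    struct State {
      std::string lastWord;
      double score;
    };

    struct Accumulator {
      double total;
      void Assign(double s) { total = s; } // overwrite; the state is cumulative
    };

    // Toy per-word scorer, standing in for the HHMM update.
    State *Extend(const State *prev, const std::string &word) {
      State *next = new State;
      next->lastWord = word;
      next->score = prev->score - 1.0; // placeholder log-prob decrement
      return next;
    }

    // Same shape as the deleted Evaluate: chain one new state per target word.
    State *Evaluate(const State *prevState,
                    const std::vector<std::string> &targetWords,
                    Accumulator *acc) {
      const State *current = prevState;
      State *next = NULL;
      for (size_t i = 0; i < targetWords.size(); ++i) {
        next = Extend(current, targetWords[i]);
        acc->Assign(next->score);
        if (i > 0) delete current; // free intermediates (this sketch's addition)
        current = next;
      }
      return next; // the decoder owns the returned state
    }
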
diff --git a/moses/src/SyntacticLanguageModel.h b/moses/src/SyntacticLanguageModel.h
deleted file mode 100755
index 977a57680..000000000
--- a/moses/src/SyntacticLanguageModel.h
+++ /dev/null
@@ -1,52 +0,0 @@
-//
-
-#ifndef moses_SyntacticLanguageModel_h
-#define moses_SyntacticLanguageModel_h
-
-#include "FeatureFunction.h"
-
-
-class YModel; // hidden model
-class XModel; // observed model
-
-namespace Moses
-{
-
- template <class MH, class MO> class SyntacticLanguageModelFiles;
-
- class SyntacticLanguageModel : public StatefulFeatureFunction {
-
- public:
-
- SyntacticLanguageModel(const std::vector<std::string>& filePaths,
- const std::vector<float>& weights,
- const FactorType factorType,
- const size_t beamWidth);
-
- ~SyntacticLanguageModel();
-
- size_t GetNumScoreComponents() const;
- std::string GetScoreProducerDescription() const;
- std::string GetScoreProducerWeightShortName() const;
-
- const FFState* EmptyHypothesisState(const InputType &input) const;
-
- FFState* Evaluate(const Hypothesis& cur_hypo,
- const FFState* prev_state,
- ScoreComponentCollection* accumulator) const;
-
- // double perplexity();
-
- private:
-
- const size_t m_NumScoreComponents;
- SyntacticLanguageModelFiles<YModel,XModel>* m_files;
- const FactorType m_factorType;
- const size_t m_beamWidth;
-
- };
-
-
-}
-
-#endif
diff --git a/moses/src/SyntacticLanguageModelFiles.h b/moses/src/SyntacticLanguageModelFiles.h
deleted file mode 100755
index 318e22636..000000000
--- a/moses/src/SyntacticLanguageModelFiles.h
+++ /dev/null
@@ -1,95 +0,0 @@
-//
-
-#ifndef moses_SyntacticLanguageModelFiles_h
-#define moses_SyntacticLanguageModelFiles_h
-
-#include "nl-iomacros.h"
-#include "nl-string.h"
-
-namespace Moses
-{
-
-template <class MH, class MO>
-class SyntacticLanguageModelFiles {
-
- public:
-
- SyntacticLanguageModelFiles(const std::vector<std::string>& filePaths);
- ~SyntacticLanguageModelFiles();
-
- MH* getHiddenModel();
- MO* getObservedModel();
-
- private:
- MH* hiddenModel;
- MO* observedModel;
-
-};
-
-
-template <class MH, class MO>
- SyntacticLanguageModelFiles<MH,MO>::SyntacticLanguageModelFiles(const std::vector<std::string>& filePaths) {
-
- this->hiddenModel = new MH();
- this->observedModel = new MO();
-
- //// I. LOAD MODELS...
- std::cerr << "Reading syntactic language model files...\n";
- // For each model file...
- for ( int a=0, n=filePaths.size(); a<n; a++ ) { // read models
- FILE* pf = fopen(filePaths[a].c_str(),"r"); //assert(pf); // Read model file
- if(!pf){
- std::cerr << "Error loading model file " << filePaths[a] << std::endl;
- return;
- }
- std::cerr << "Loading model \'" << filePaths[a] << "\'...\n";
- int c=' '; int i=0; int line=1; String sBuff(1000); // Lookahead/ctrs/buffers
- CONSUME_ALL ( pf, c, WHITESPACE(c), line); // Get to first record
- while ( c!=-1 && c!='\0' && c!='\5' ) { // For each record
- CONSUME_STR ( pf, c, (c!='\n' && c!='\0' && c!='\5'), sBuff, i, line ); // Consume line
- StringInput si(sBuff.c_array());
- if ( !( sBuff[0]=='#' // Accept comments/fields
- || si>>*(this->hiddenModel)>>"\0"!=NULL
- || si>>*(this->observedModel)>>"\0"!=NULL
- ))
- std::cerr<<"\nERROR: can't parse \'"<<sBuff<<"\' in line "<<line<<"\n\n";
- CONSUME_ALL ( pf, c, WHITESPACE(c), line); // Consume whitespace
- if ( line%100000==0 ) std::cerr<<" "<<line<<" lines read...\n"; // Progress for big models
- }
- std::cerr << "Model \'" << filePaths[a] << "\' loaded.\n";
- }
-
- std::cerr << "...reading syntactic language model files completed\n";
-
-
-}
-
-
-template <class MH, class MO>
- SyntacticLanguageModelFiles<MH,MO>::~SyntacticLanguageModelFiles() {
-
- std::cerr<<"Destructing syntactic language model files" << std::endl;
- //delete hiddenModel;
- //delete observedModel;
-
-}
-
-
-template <class MH, class MO>
- MH* SyntacticLanguageModelFiles<MH,MO>::getHiddenModel() {
-
- return this->hiddenModel;
-
-}
-
-template <class MH, class MO>
- MO* SyntacticLanguageModelFiles<MH,MO>::getObservedModel() {
-
- return this->observedModel;
-
-}
-
-
-}
-
-#endif
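
The deleted loader drives modelblocks' CONSUME_* macros, but the underlying pattern is generic: stream a model file record by record, skip comments, try each model's parser in turn, report the line number of anything unparseable, and print progress for big files. A plain-iostream sketch of the same loop; ParseHidden and ParseObserved are hypothetical stand-ins (with toy bodies) for the two `si >> model` parsers:

    #include <fstream>
    #include <iostream>
    #include <string>

    // Hypothetical record parsers for the hidden and observed models;
    // the toy rules here just check a leading tag.
    bool ParseHidden(const std::string &line)   { return line.compare(0, 2, "H ") == 0; }
    bool ParseObserved(const std::string &line) { return line.compare(0, 2, "O ") == 0; }

    bool LoadModelFile(const std::string &path) {
      std::ifstream in(path.c_str());
      if (!in) {
        std::cerr << "Error loading model file " << path << std::endl;
        return false;
      }
      std::string line;
      size_t lineNo = 0;
      while (std::getline(in, line)) {
        ++lineNo;
        if (line.empty() || line[0] == '#') continue; // blank/comment line
        if (!ParseHidden(line) && !ParseObserved(line))
          std::cerr << "ERROR: can't parse '" << line << "' in line "
                    << lineNo << std::endl;
        if (lineNo % 100000 == 0) // progress for big models
          std::cerr << "  " << lineNo << " lines read..." << std::endl;
      }
      return true;
    }
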
diff --git a/moses/src/SyntacticLanguageModelState.h b/moses/src/SyntacticLanguageModelState.h
deleted file mode 100755
index 0877a59b3..000000000
--- a/moses/src/SyntacticLanguageModelState.h
+++ /dev/null
@@ -1,303 +0,0 @@
-//
-
-#ifndef moses_SyntacticLanguageModelState_h
-#define moses_SyntacticLanguageModelState_h
-
-#include "nl-iomacros.h"
-#include "nl-cpt.h"
-#include "nl-hmm.h"
-
-#include "SyntacticLanguageModelFiles.h"
-#include "FFState.h"
-#include <string>
-
-namespace Moses
-{
-
-template <class MY, class MX, class YS=typename MY::RandVarType, class B=NullBackDat<typename MY::RandVarType> >
- class SyntacticLanguageModelState : public FFState {
- public:
-
- // Initialize an empty LM state
- SyntacticLanguageModelState( SyntacticLanguageModelFiles<MY,MX>* modelData, int beamSize );
-
- // Get the next LM state from an existing LM state and the next word
- SyntacticLanguageModelState( const SyntacticLanguageModelState* prev, std::string word );
-
-
- ~SyntacticLanguageModelState() {
- //cerr << "Deleting SyntacticLanguageModelState" << std::endl;
- //delete randomVariableStore;
- }
-
- virtual int Compare(const FFState& other) const;
-
- // Get the LM score from this LM state
- double getScore() const;
-
- double getProb() const;
-
- private:
-
- void setScore(double score);
- void printRV();
-
- SafeArray1D<Id<int>,pair<YS,LogProb> >* randomVariableStore;
- double prob;
- double score;
- int beamSize;
- SyntacticLanguageModelFiles<MY,MX>* modelData;
- bool sentenceStart;
-};
-
-
-////////////////////////////////////////////////////////////////////////////////
-
-
- template <class MY, class MX, class YS, class B>
- void SyntacticLanguageModelState<MY,MX,YS,B>::printRV() {
-
- cerr << "*********** BEGIN printRV() ******************" << endl;
- int size=randomVariableStore->getSize();
- cerr << "randomVariableStore->getSize() == " << size << endl;
-
- for (int depth=0; depth<size; depth+=1) {
-
-
- const pair<YS,LogProb> *data = &(randomVariableStore->get(depth));
- std::cerr << "randomVariableStore[" << depth << "]\t" << data->first << "\tprob = " << data->second.toProb() << "\tlogProb = " << double(data->second.toInt())/100 << std::endl;
-
- }
- cerr << "*********** END printRV() ******************" << endl;
-
- }
-
-// Initialize an empty LM state from grammar files
-//
-// nArgs is the number of model files
-// argv is the list of model file names
-//
-template <class MY, class MX, class YS, class B>
- SyntacticLanguageModelState<MY,MX,YS,B>::SyntacticLanguageModelState( SyntacticLanguageModelFiles<MY,MX>* modelData, int beamSize ) {
-
- this->randomVariableStore = new SafeArray1D<Id<int>,pair<YS,LogProb> >();
- this->modelData = modelData;
- this->beamSize = beamSize;
-
- // Initialize an empty random variable value
- YS xBEG;
- StringInput(String(BEG_STATE).c_array())>>xBEG>>"\0";
- cerr<<xBEG<<"\n";
-
- // cout << "Examining RV store just before RV init" << endl;
- //printRV();
-
- // Initialize the random variable store
- this->randomVariableStore->init(1,pair<YS,LogProb>(xBEG,0));
-
- this->sentenceStart = true;
-
- IFVERBOSE(3) {
- VERBOSE(3,"Examining RV store just after RV init" << endl);
- printRV();
- }
-
- // Get score of final frame in HHMM
- LogProb l(1.0);
- //score = l.toDouble();
- setScore(l.toDouble());
- // MY::F_ROOT_OBS = true;
- // this->modelData->getHiddenModel()->setRootObs(true);
-
-
-}
-
-
-template <class MY, class MX, class YS, class B>
- int SyntacticLanguageModelState<MY,MX,YS,B>::Compare(const FFState& other) const {
- /*
- const SyntacticLanguageModelState<MY,MX,YS,B>& o =
- static_cast<const SyntacticLanguageModelState<MY,MX,YS,B>&>(other);
-
- if (o.score > score) return 1;
- else if (o.score < score) return -1;
- else return 0;
- */
- return 0;
- }
-
-
-template <class MY, class MX, class YS, class B>
- SyntacticLanguageModelState<MY,MX,YS,B>::SyntacticLanguageModelState( const SyntacticLanguageModelState* prev, std::string word ) {
-
- // Initialize member variables
- this->randomVariableStore = new SafeArray1D<Id<int>,pair<YS,LogProb> >();
- this->modelData = prev->modelData;
- this->beamSize = prev->beamSize;
- this->randomVariableStore->init(this->beamSize);
- this->sentenceStart=false;
-
- YS ysEND;
- StringInput(String(END_STATE).c_array())>>ysEND>>"\0";
-
- // Get HHMM model files
- MY& mH = *(modelData->getHiddenModel());
- MX& mO = *(modelData->getObservedModel());
-
- // Initialize HHMM
- HMM<MY,MX,YS,B> hmm(mH,mO);
- int MAX_WORDS = 2;
- hmm.init(MAX_WORDS,this->beamSize,prev->randomVariableStore);
- typename MX::RandVarType x(word.c_str());
- // cout << "Examining HHMM just after hmm.init" << endl;
- // hmm.debugPrint();
-
-
- /* cerr << "*********** BEGIN writeCurr() ******************" << endl;
- hmm.writeCurr(cout,0);
- hmm.writeCurr(cout,1);
- cerr << "*********** END writeCurr() ******************" << endl;
- */
-/*
- {
-
- int wnum=1;
- list<TrellNode<YS,B> > lys = hmm.getMLSnodes(ysEND); // get mls list
- for ( typename list<TrellNode<YS,B> >::iterator i=lys.begin(); i!=lys.end(); i++, wnum++ ) { // for each frame
- cout << "HYPOTH " << wnum
- << " " << i->getBackData()
- << " " << x
- << " " << i->getId()
- << " (" << i->getLogProb() << ")"
- << endl; // print RV val
- }
- }
- */
-
-
- /*
- cerr << "Writing hmm.writeCurr" << endl;
- hmm.writeCurr(cerr,0);
- hmm.writeCurr(cerr,1);
- cerr << "...done writing hmm.writeCurr" << endl;
- */
- hmm.getCurrSum();
-
-
-
- // Initialize observed variable
- // typename MX::RandVarType ov;
- // ov.set(word.c_str(),mO);
- // MY::WORD = ov.getW();
- //bool endOfSentence = prev->sentenceStart;//true;
-
- // std::cerr << "About to give HHMM a word of input:\t" << word << std::endl;
-
- hmm.updateRanked(x, prev->sentenceStart);
-
- // cout << "Examining HHMM just after hmm.updateRanked(" << x << "," << prev->sentenceStart << ")" << endl;
- // hmm.debugPrint();
-/*
- cerr << "*********** BEGIN writeCurr() ******************" << endl;
- hmm.writeCurr(cout,0);
- hmm.writeCurr(cout,1);
- cerr << "*********** END writeCurr() ******************" << endl;
- */
-/*
-{
-
- int wnum=1;
- list<TrellNode<YS,B> > lys = hmm.getMLSnodes(ysEND); // get mls list
- for ( typename list<TrellNode<YS,B> >::iterator i=lys.begin(); i!=lys.end(); i++, wnum++ ) { // for each frame
- cout << "HYPOTH " << wnum
- << " " << i->getBackData()
- << " " << x
- << " " << i->getId()
- << " (" << i->getLogProb() << ")"
- << endl; // print RV val
- }
- }
- */
-// X ov(word.c_str());
- //mH.setWord(ov);
- // MY::WORD = ov;//ov.getW();
-
- // Update HHMM based on observed variable
- //hmm.updateRanked(ov);
- //mH.setRootObs(true);
- //MY::F_ROOT_OBS = false;
-
- // Get the current score
- double currSum = hmm.getCurrSum();
- //VERBOSE(3,"Setting score using currSum for " << scientific << x << " = " << currSum << endl);
- setScore(currSum);
- // cout << "Examining RV store just before RV init via gatherElementsInBeam" << endl;
- // printRV();
-
- // Get new hidden random variable store from HHMM
- hmm.gatherElementsInBeam(randomVariableStore);
- // cout << "Examining RV store just after RV init via gatherElementsInBeam" << endl;
- // printRV();
- /*
- cerr << "Writing hmm.writeCurr..." << endl;
- hmm.writeCurr(cerr,0);
- hmm.writeCurr(cerr,1);
- cerr << "...done writing hmm.writeCurr" << endl;
- */
-}
-
-
-template <class MY, class MX, class YS, class B>
-double SyntacticLanguageModelState<MY,MX,YS,B>::getProb() const {
-
- return prob;
-}
-
-template <class MY, class MX, class YS, class B>
-double SyntacticLanguageModelState<MY,MX,YS,B>::getScore() const {
-
- return score;
-}
-
-
-template <class MY, class MX, class YS, class B>
- void SyntacticLanguageModelState<MY,MX,YS,B>::setScore(double score) {
-
-
-
-
- this->prob = score;
-
- // We want values to range from -100 to 0
- //
- // If the minimum positive value for a double is min=4.94065645841246544e-324
- // then to scale, we want a logarithmic base such that log_b(min)=-100
- //
- // -100 = log(min) / log(b)
- //
- // log(b) = log(min) / -100
- //
- // b = exp( log(min) / -100 )
- //
- // log(b) = log(min) / -100 = 7.44440071921381
- // (the code below divides log(score) by log(b), i.e. computes log_b(score))
-
- // Check for score==0 to avoid causing -infinity with log(score)
- if (score==0) {
- this->score = -100;
- } else {
- double x = log(score) / 7.44440071921381;
- if ( x >= -100) {
- this->score = x;
- } else {
- this->score = -100;
- }
- }
-
- VERBOSE(3,"\tSyntacticLanguageModelState has score=" << this->score << endl);
-
-}
-
-
-}
-
-#endif
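
The setScore comment block derives a score scale: pick a log base b so that the smallest positive double maps to exactly -100, i.e. log(b) = log(min)/(-100) ≈ 7.4444, so that log_b(p) = log(p)/log(b) lands in [-100, 0] for every representable probability. Note that the constant 7.44440071921381 is log(b), not b itself (b = e^7.4444 ≈ 1710). A sketch that computes the constant rather than hard-coding it, with a tiny check in main (the check is this sketch's addition, not part of the deleted code):

    #include <cmath>
    #include <iostream>

    // Map a probability onto [-100, 0]: choose base b so the smallest
    // positive double scores exactly -100, i.e. log(b) = log(min) / -100.
    double ScaledLogScore(double prob) {
      static const double logB = std::log(4.94065645841246544e-324) / -100.0;
      if (prob == 0.0) return -100.0;          // avoid log(0) = -infinity
      const double x = std::log(prob) / logB;  // log_b(prob)
      return x >= -100.0 ? x : -100.0;         // clamp, as in the deleted code
    }

    int main() {
      std::cout << ScaledLogScore(1.0)   << std::endl; // 0
      std::cout << ScaledLogScore(1e-10) << std::endl; // about -3.09
      std::cout << ScaledLogScore(0.0)   << std::endl; // -100
    }
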
diff --git a/regenerate-makefiles.sh b/regenerate-makefiles.sh
index c4c2e8bee..747dabf88 100755
--- a/regenerate-makefiles.sh
+++ b/regenerate-makefiles.sh
@@ -54,7 +54,7 @@ $LIBTOOLIZE || die "libtoolize failed"
echo
echo "You should now be able to configure and build:"
-echo " ./configure [--with-srilm=/path/to/srilm] [--with-irstlm=/path/to/irstlm] [--with-randlm=/path/to/randlm] [--without-kenlm] [--with-synlm=/path/to/modelblocks] [--with-xmlrpc-c=/path/to/xmlrpc-c-config]"
+echo " ./configure [--with-srilm=/path/to/srilm] [--with-irstlm=/path/to/irstlm] [--with-randlm=/path/to/randlm] [--without-kenlm] [--with-xmlrpc-c=/path/to/xmlrpc-c-config]"
echo " make -j 4"
echo
diff --git a/scripts/generic/balance-corpus b/scripts/generic/balance-corpus
deleted file mode 100644
index 647fa4502..000000000
--- a/scripts/generic/balance-corpus
+++ /dev/null
@@ -1,392 +0,0 @@
-#!/usr/bin/ruby -w
-
-require 'optparse'
-require 'ostruct'
-require 'pp'
-require 'set'
-
-options = OpenStruct.new
-OptionParser.new { |opts|
-
- opts.banner = "Usage: #{$0} [options]"
-
- opts.on("-n N","--num-parts N", Integer, "Number of parts into which the corpus should be split") { |v|
- options.parts = v
- options.parts_digits = options.parts.to_s.length
- }
-
- opts.on("-i FILE", "--corpus", String, "Corpus to split") { |v|
- options.corpus = v
- }
-
- options.reference = Array.new
- opts.on("-r FILE", "--reference", String, "Reference file") { |v|
- options.reference << v
- }
-
- options.put_all = false
- opts.on("-a","--all","Output all lines into a single file, in addition to split files") { |v|
- options.put_all = v
- }
-
- options.max_words = 1.0/0.0
- opts.on("-m N","--max-words", Integer, "Maximum number of words allowed in a line") { |v|
- options.max_words = v
- }
-
- options.min_words = 1
- opts.on("--min-words N", Integer, "Minimum number of words allowed in a line") { |v|
- options.min_words = v
- }
-
- options.index_prefix = false
- opts.on("--index-prefix FILE_PREFIX", String, "Index file name prefix (the part number is appended)") { |v|
- options.index_prefix = v
- }
-
- opts.on("-p FILE_PREFIX","--prefix FILE_PREFIX", String, "Output file name prefix (precedes the part number)") { |v|
- options.output_prefix = v
- }
-
- opts.on("-s FILE_SUFFIX","--suffix FILE_SUFFIX", String, "Output file name suffix (follows the part number)") { |v|
- options.output_suffix = v
- }
-
- options.ref_prefix = Array.new
- opts.on("--ref-prefix FILE_PREFIX", String, "Reference file name prefix (precedes the part number)") { |v|
- options.ref_prefix << v
- }
-
- options.ref_suffix = Array.new
- opts.on("--ref-suffix FILE_SUFFIX", String, "Reference file name suffix (follows the part number)") { |v|
- options.ref_suffix << v
- }
-
- options.balance_naive = false
- opts.on("--balance-naive","Balance according to combined number of lines") { |v|
- options.balance_naive = v
- }
-
- options.balance_histogram = false
- opts.on("-h","--balance-histogram","Balance according to sentence length histogram") { |v|
- options.balance_histogram = v
- }
-
- options.balance_word_count = true
- opts.on("-w","--balance-words","Balance according to combined number of words") { |v|
- options.balance_word_count = v
- }
-
- options.balance_time = false
- opts.on("-t TIMES","--balance-time TIMES","Balance according to estimated per-sentence processing time") { |v|
- options.balance_time = v
- }
-
- options.verbose = false
- opts.on("-v","--[no-]verbose","Turn verbose on") { |v|
- options.verbose = v
- }
-
- options.zero_pad = true
- opts.on("-z","--[no-]zeropad","Zero pad file names") { |v|
- options.zero_pad = v
- }
-
- if ARGV.length==0
- puts opts
- exit
- end
-
-
-}.parse!
-
-
-
-
-class LineSize
- include Comparable
-
- attr_reader :size, :index
- attr_writer :size
-
- @@max_index_digits = 0
- @@max_size_digits = 0
-
- def initialize(line,index)
- @index = index
- @size = line.strip.split(/\s+/).length
-
- index_digits = @index.to_s.length
- @@max_index_digits = index_digits if (index_digits > @@max_index_digits)
-
- size_digits = @size.to_s.length
- @@max_size_digits = size_digits if (size_digits > @@max_size_digits)
- end
-
- def <=>(other)
- if @size==other.size
- @index <=> other.index
- else
- size <=> other.size
- end
- end
-
- def to_s
- sprintf("Line %#{@@max_index_digits}i: %#{@@max_size_digits}i words",@index, @size)
- end
-end
-
-
-
-def split_into_parts(file,part_for_line,parts,output_prefix,output_suffix,verbose,put_all,zeropad,index_prefix)
-
- if (zeropad)
- parts_digits = parts.to_s.length
- else
- parts_digits = 0
- end
-
- out = Hash.new
- all = File.new("#{output_prefix}_all#{output_suffix}","a") if put_all
- index_out = Hash.new
-
- 1.upto(parts) {|v|
-
- file_name = sprintf("%s%0#{parts_digits}i%s",output_prefix,v,output_suffix)
- out[v] = File.new(file_name,"w")
-
- unless index_prefix==false
- index_file_name = sprintf("%s%0#{parts_digits}i",index_prefix,v)
- index_out[v] = File.new(index_file_name,"w")
- end
- }
-
-
- File.open(file).each_with_index { |line,index|
-
-
- if (part_for_line.has_key?(index))
- puts "index==#{index}\tpart_for_line[#{index}]==#{part_for_line[index]}" if out[part_for_line[index]]==nil
- if verbose
- STDERR.puts "Line #{index} goes in #{out[part_for_line[index]].path} #{line}"
- end
-
- out[part_for_line[index]].puts(line)
- index_out[part_for_line[index]].puts(index) unless index_prefix==false
-
- elsif verbose
- STDERR.puts "Line #{index} will be skipped #{line}"
- end
- }
-
- out.each_value { |file|
- file.close
- }
-
-
- if (put_all)
- 1.upto(parts) {|v|
-
- file_name = sprintf("%s%0#{parts_digits}i%s",output_prefix,v,output_suffix)
- File.open(file_name,"r").each { |line|
- all.puts(line)
- }
-
- }
-
- all.close
- end
-
-end
-
-
-def index_of_least(array)
- best=1.0/0 #Infinity
- best_index=0
- array.each_with_index {|v,i|
- if (v<best)
- best=v
- best_index=i
- end
- }
- return best_index
-end
-
-
-# Used to store which partition each line should be placed in
-#
-# So, part_for_line[74] = 3 would mean that
-# line number 74 should go into partition 3
-#
-part_for_line = Hash.new
-
-# Used to store how many words are in each line
-#
-# So, words_per_line[74] = 15 would mean that
-# line number 74 contains 15 words
-#
-words_per_line=Array.new
-
-skipped_lines=Set.new
-
-File.open(options.corpus).each_with_index { |line,index|
-
- line_size = LineSize.new(line,index)
-
- if line_size.size > options.max_words
-
- STDERR.puts "Line #{index} is too long: #{line_size.size} words. Max allowed is #{options.max_words}" if options.verbose
- skipped_lines.add(index)
-
- elsif line_size.size < options.min_words
-
- STDERR.puts "Line #{index} is too short: #{line_size.size} words. Min allowed is #{options.min_words}" if options.verbose
- skipped_lines.add(index)
-
- else
-
- words_per_line.push(line_size)
-
- end
-}
-
-
-if (options.balance_naive)
-
- total_lines=words_per_line.size
-
- STDERR.puts "total_lines=#{total_lines}" if options.verbose
-
- ceil=(total_lines/options.parts.to_f).ceil
- floor=(total_lines/options.parts.to_f).floor
-
- part_ceil = total_lines - floor*options.parts
- part_floor = options.parts - part_ceil
-
- STDERR.puts "#{ceil}*#{part_ceil} + #{floor}*#{part_floor} = #{ceil*part_ceil + floor*part_floor}" if options.verbose
-
-
- partition = 1
- lines_in_this_part = 0
-
- 0.upto(total_lines-1) { |index|
-
- unless skipped_lines.include?(index)
- if (partition <= part_ceil)
- if (lines_in_this_part >= ceil)
- STDERR.puts "Partition #{partition} has #{lines_in_this_part} lines" if options.verbose
- lines_in_this_part=0
- partition += 1
- end
- else
- if (lines_in_this_part >= floor)
- STDERR.puts "Partition #{partition} has #{lines_in_this_part} lines" if options.verbose
- lines_in_this_part=0
- partition += 1
- end
- end
-
- part_for_line[index] = partition
- lines_in_this_part += 1
- puts "part_for_line[#{index}] = #{partition}" if options.verbose
- end
-
- }
-
-elsif (options.balance_histogram)
-
- STDERR.puts "Balancing according to sentence length histogram"
-
- words_per_line.sort!
-
-
- index=0
-
- words_per_line.each { |lineSize|
- if index<options.parts
- index+=1
- else
- index=1
- end
-
- part_for_line[lineSize.index] = index
-
- }
-
-elsif (options.balance_word_count || options.balance_time)
-
- measure_unit = ""
-
- if (options.balance_time)
- STDERR.puts "Balancing according to time estimates"
- measure_unit = "seconds"
-
- index = 0
- File.open(options.balance_time).each_with_index { |time,line_index|
- unless (skipped_lines.include?(line_index))
- words_per_line[index].size = time.strip.to_f
- index += 1
- end
- }
-
-
- elsif (options.balance_word_count)
- STDERR.puts "Balancing according to word count"
- measure_unit = "words"
- end
-
- # Sort in reverse order
- words_per_line.sort! {|x,y| y <=> x }
-
- # Store the number of words that have been placed in each partition
- words_in_part = Array.new(options.parts,0)
-
- # At this point, words_per_line should be sorted with the longest sentences first
- words_per_line.each { |lineSize|
- partition = index_of_least(words_in_part)
- STDERR.puts "Line #{lineSize.index}\t#{lineSize.size} #{measure_unit}\tPartition #{partition}" if options.verbose
- part_for_line[lineSize.index] = partition+1 # part_for_line needs a 1-based partition index, so add 1
- words_in_part[partition] += lineSize.size
- }
-
- if (options.verbose)
- words_in_part.each_with_index { |words,partition|
- STDERR.puts "Partition #{partition}\t#{words} #{measure_unit}"
- }
- end
-
-else
-
-
- exit;
-
-end
-
-
-split_into_parts(
- options.corpus,
- part_for_line,
- options.parts,
- options.output_prefix,
- options.output_suffix,
- options.verbose,
- options.put_all,
- options.zero_pad,
- options.index_prefix)
-
-
-
-options.reference.each_with_index { |reference,index|
-
- split_into_parts(
- reference,
- part_for_line,
- options.parts,
- options.ref_prefix[index],
- options.ref_suffix[index],
- options.verbose,
- options.put_all,
- options.zero_pad,
- false)
-
-}
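
The word-count and time balancing modes in the deleted script are greedy longest-processing-time scheduling: sort the lines by weight descending, then repeatedly drop the next line into whichever partition is currently lightest (index_of_least). A compact C++ sketch of that assignment step, with weights standing in for word counts or time estimates (a sketch of the strategy, not a port of the script):

    #include <algorithm>
    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <utility>
    #include <vector>

    // Assign each (weight, line index) pair to the currently lightest of
    // `parts` partitions; returns a 0-based part index per line.
    std::vector<size_t> Balance(std::vector<std::pair<double, size_t> > lines,
                                size_t parts) {
      std::sort(lines.begin(), lines.end(),
                std::greater<std::pair<double, size_t> >()); // heaviest first
      std::vector<double> load(parts, 0.0);
      std::vector<size_t> partForLine(lines.size());
      for (size_t i = 0; i < lines.size(); ++i) {
        const size_t lightest =
            std::min_element(load.begin(), load.end()) - load.begin();
        partForLine[lines[i].second] = lightest;
        load[lightest] += lines[i].first;
      }
      return partForLine;
    }

    int main() {
      // Toy corpus: five lines with word counts 9, 7, 5, 3, 1 over 2 parts.
      std::pair<double, size_t> raw[] = {
        std::make_pair(9.0, 0), std::make_pair(7.0, 1), std::make_pair(5.0, 2),
        std::make_pair(3.0, 3), std::make_pair(1.0, 4)
      };
      std::vector<std::pair<double, size_t> > lines(raw, raw + 5);
      std::vector<size_t> part = Balance(lines, 2);
      for (size_t i = 0; i < part.size(); ++i)
        std::cout << "line " << i << " -> part " << part[i] << std::endl;
      // Loads come out as 9+3+1=13 vs 7+5=12.
    }
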
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index f4d0b4551..fca0c1d31 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -62,7 +62,6 @@ my $additional_triples = {
# (due to additional tables) use the following values for them
"d" => [ [ 1.0, 0.0, 2.0 ] ], # lexicalized reordering model
"lm" => [ [ 1.0, 0.0, 2.0 ] ], # language model
- "slm"=> [ [ 1.0, 0.0, 2.0 ] ], # language model
"g" => [ [ 1.0, 0.0, 2.0 ], # generation model
[ 1.0, 0.0, 2.0 ] ],
"tm" => [ [ 0.3, 0.0, 0.5 ], # translation model
@@ -80,14 +79,14 @@ my $additional_tripes_loop = { map { ($_, 1) } qw/ d I / };
# moses.ini file uses FULL names for lambdas, while this training script internally (and on the command line)
# uses ABBR names.
-my $ABBR_FULL_MAP = "d=weight-d lm=weight-l tm=weight-t w=weight-w g=weight-generation slm=weight-slm lex=weight-lex I=weight-i";
+my $ABBR_FULL_MAP = "d=weight-d lm=weight-l tm=weight-t w=weight-w g=weight-generation lex=weight-lex I=weight-i";
my %ABBR2FULL = map {split/=/,$_,2} split /\s+/, $ABBR_FULL_MAP;
my %FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} split /\s+/, $ABBR_FULL_MAP;
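
The %ABBR2FULL and %FULL2ABBR hashes above are built by splitting one space-separated "abbr=full" string in both directions. The same double-map construction in C++ (a sketch; the mapping string is copied from the script after the slm entries were dropped):

    #include <cstddef>
    #include <iostream>
    #include <map>
    #include <sstream>
    #include <string>

    int main() {
      const std::string mapStr =
          "d=weight-d lm=weight-l tm=weight-t w=weight-w "
          "g=weight-generation lex=weight-lex I=weight-i";
      std::map<std::string, std::string> abbr2full, full2abbr;
      std::istringstream tokens(mapStr);
      std::string entry;
      while (tokens >> entry) {
        const size_t eq = entry.find('=');   // split on the first '='
        const std::string abbr = entry.substr(0, eq);
        const std::string full = entry.substr(eq + 1);
        abbr2full[abbr] = full;              // abbreviation -> full name
        full2abbr[full] = abbr;              // full name -> abbreviation
      }
      std::cout << abbr2full["lm"] << std::endl;       // weight-l
      std::cout << full2abbr["weight-d"] << std::endl; // d
    }
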
# We parse moses.ini to figure out how many weights we need to optimize.
# For this, we must know the correspondence between options defining files
# for models and options assigning weights to these models.
-my $TABLECONFIG_ABBR_MAP = "ttable-file=tm lmodel-file=lm distortion-file=d slmodel-file=slm generation-file=g global-lexical-file=lex link-param-count=I";
+my $TABLECONFIG_ABBR_MAP = "ttable-file=tm lmodel-file=lm distortion-file=d generation-file=g global-lexical-file=lex link-param-count=I";
my %TABLECONFIG2ABBR = map {split(/=/,$_,2)} split /\s+/, $TABLECONFIG_ABBR_MAP;
# There are weights that do not correspond to any input file, they just increase the total number of lambdas we optimize
@@ -902,12 +901,7 @@ sub run_decoder {
my $decoder_cmd;
if (defined $___JOBS && $___JOBS > 0) {
- my $times_params="-timesfile run$run.times";
- if ($run>1) {
- my $prevrun=$run-1;
- $times_params.=" -existingtimesfile run$prevrun.times";
- }
- $decoder_cmd = "$moses_parallel_cmd $pass_old_sge $times_params -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$parameters $decoder_config\" -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
+ $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$parameters $decoder_config\" -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
} else {
$decoder_cmd = "$___DECODER $parameters -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config -n-best-list $filename $___N_BEST_LIST_SIZE -input-file $___DEV_F > run$run.out";
}
@@ -1113,7 +1107,6 @@ sub scan_config {
"lmodel-file" => 3,
"distortion-file" => 3,
"global-lexical-file" => 1,
- "slmodel-file" => 0,
);
# by default, each line of each section means one lambda, but some sections
# explicitly state a custom number of lambdas