diff options
-rw-r--r-- | Jamroot | 3 | ||||
-rw-r--r-- | contrib/other-builds/moses/.project | 20 | ||||
-rw-r--r-- | moses/FF/Factory.cpp | 11 | ||||
-rw-r--r-- | moses/LM/Ken.cpp | 14 | ||||
-rw-r--r-- | moses/LM/Ken.h | 5 | ||||
-rw-r--r-- | moses/LM/Reloading.cpp | 112 | ||||
-rw-r--r-- | moses/LM/Reloading.h | 220 | ||||
-rw-r--r-- | moses/TranslationModel/UG/Jamfile | 10 | ||||
-rw-r--r-- | moses/TranslationModel/UG/filter-pt.cc | 669 | ||||
-rw-r--r-- | moses/TranslationModel/UG/mmsapt.cpp | 2 | ||||
-rw-r--r-- | moses/TrellisPath.cpp | 10 | ||||
-rwxr-xr-x | run-regtests.sh | 8 | ||||
-rw-r--r-- | util/file_stream.hh | 1 |
13 files changed, 1044 insertions, 41 deletions
@@ -208,7 +208,7 @@ if [ option.get "with-icu" : : "yes" ] # for probing pt external-lib boost_serialization ; -requirements += <library>boost_serialization ; +requirements += <library>boost_serialization/<runtime-link>static ; if [ option.get "with-vw" ] { requirements += <define>HAVE_VW ; @@ -247,6 +247,7 @@ if [ option.get "with-mm-extras" : : "yes" ] moses/TranslationModel/UG//bitext-find moses/TranslationModel/UG//ptable-describe-features moses/TranslationModel/UG//count-ptable-features + moses/TranslationModel/UG//ptable-sigtest-filter moses/TranslationModel/UG//ptable-lookup moses/TranslationModel/UG//ptable-lookup-corpus moses/TranslationModel/UG//check-coverage diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index 32bfa1927..e8651529d 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -1636,16 +1636,6 @@ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetNgramFeature.h</locationURI> </link> <link> - <name>FF/TargetPreferencesFeature.cpp</name> - <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetPreferencesFeature.cpp</locationURI> - </link> - <link> - <name>FF/TargetPreferencesFeature.h</name> - <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetPreferencesFeature.h</locationURI> - </link> - <link> <name>FF/TargetWordInsertionFeature.cpp</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetWordInsertionFeature.cpp</locationURI> @@ -2006,16 +1996,6 @@ <locationURI>PARENT-3-PROJECT_LOC/moses/PP/SpanLengthPhraseProperty.h</locationURI> </link> <link> - <name>PP/TargetPreferencesPhraseProperty.cpp</name> - <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/moses/PP/TargetPreferencesPhraseProperty.cpp</locationURI> - </link> - <link> - <name>PP/TargetPreferencesPhraseProperty.h</name> - <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/moses/PP/TargetPreferencesPhraseProperty.h</locationURI> - </link> - <link> 
<name>PP/TreeStructurePhraseProperty.h</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/moses/PP/TreeStructurePhraseProperty.h</locationURI> diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp index e44c5c509..c2d8d3363 100644 --- a/moses/FF/Factory.cpp +++ b/moses/FF/Factory.cpp @@ -93,6 +93,7 @@ #endif #include "moses/LM/Ken.h" +#include "moses/LM/Reloading.h" #ifdef LM_IRST #include "moses/LM/IRST.h" #endif @@ -203,6 +204,14 @@ public: } }; +class ReloadingFactory : public FeatureFactory +{ +public: + void Create(const std::string &line) { + DefaultSetup(ConstructReloadingLM(line)); + } +}; + } // namespace FeatureRegistry::FeatureRegistry() @@ -332,7 +341,7 @@ FeatureRegistry::FeatureRegistry() MOSES_FNAME2("OxSourceFactoredLM", SourceOxLM); MOSES_FNAME2("OxTreeLM", OxLM<oxlm::FactoredTreeLM>); #endif - + Add("ReloadingLM", new ReloadingFactory()); Add("KENLM", new KenFactory()); } diff --git a/moses/LM/Ken.cpp b/moses/LM/Ken.cpp index 428640290..c81f3b859 100644 --- a/moses/LM/Ken.cpp +++ b/moses/LM/Ken.cpp @@ -148,12 +148,8 @@ private: } // namespace -template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, bool lazy) - :LanguageModel(line) - ,m_factorType(factorType) +template <class Model> void LanguageModelKen<Model>::LoadModel(const std::string &file, bool lazy) { - ReadParameters(); - lm::ngram::Config config; if(this->m_verbosity >= 1) { config.messages = &std::cerr; @@ -170,6 +166,14 @@ template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::stri m_beginSentenceFactor = collection.AddFactor(BOS_); } +template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, bool lazy) + :LanguageModel(line) + ,m_factorType(factorType) +{ + ReadParameters(); + LoadModel(file, lazy); +} + template <class Model> LanguageModelKen<Model>::LanguageModelKen(const LanguageModelKen<Model> 
&copy_from) :LanguageModel(copy_from.GetArgLine()), m_ngram(copy_from.m_ngram), diff --git a/moses/LM/Ken.h b/moses/LM/Ken.h index 73a957e93..3a94e4c0b 100644 --- a/moses/LM/Ken.h +++ b/moses/LM/Ken.h @@ -73,11 +73,15 @@ protected: FactorType m_factorType; + void LoadModel(const std::string &file, bool lazy); + lm::WordIndex TranslateID(const Word &word) const { std::size_t factor = word.GetFactor(m_factorType)->GetId(); return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]); } + std::vector<lm::WordIndex> m_lmIdLookup; + private: LanguageModelKen(const LanguageModelKen<Model> &copy_from); @@ -96,7 +100,6 @@ private: } } - std::vector<lm::WordIndex> m_lmIdLookup; protected: //bool m_oovFeatureEnabled; /// originally from LanguageModel, copied here to separate the interfaces. Called m_enableOOVFeature there diff --git a/moses/LM/Reloading.cpp b/moses/LM/Reloading.cpp new file mode 100644 index 000000000..0f9d80a70 --- /dev/null +++ b/moses/LM/Reloading.cpp @@ -0,0 +1,112 @@ +// $Id$ + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "lm/binary_format.hh" +#include "lm/enumerate_vocab.hh" +#include "lm/left.hh" +#include "lm/model.hh" + +#include "moses/FF/FFState.h" +#include "moses/Hypothesis.h" +#include "moses/Phrase.h" + +#include "moses/LM/Ken.h" +#include "moses/LM/Reloading.h" +#include "util/exception.hh" + +//#include "moses/Util.h" +//#include "moses/StaticData.h" +//#include <iostream> +/* +namespace Moses +{ +namespace +{ + +struct ReloadingLMState : public FFState { + lm::ngram::State state; + virtual size_t hash() const { + return 0; + } + virtual bool operator==(const FFState& o) const { + return true; + } + +}; +} // namespace + + +template <class Model> ReloadingLanguageModel<Model>::ReloadingLanguageModel(const std::string &line, const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen<Model>(line,file,factorType,lazy) +{ + // + // This space intentionally left blank + // +} +template <class Model> const FFState *ReloadingLanguageModel<Model>::EmptyHypothesisState(const InputType &input) const +{ + ReloadingLMState *ret = new ReloadingLMState(); + ret->state = m_ngram->BeginSentenceState(); + return ret; +} + + +template <class Model> FFState *ReloadingLanguageModel<Model>::EvaluateWhenApplied(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const +{ + + std::auto_ptr<FFState> kenlmState(LanguageModelKen<Model>::EvaluateWhenApplied(hypo, ps, out)); + const lm::ngram::State &out_state = static_cast<const ReloadingLMState&>(*kenlmState).state; + + + std::auto_ptr<ReloadingLMState> ret(new ReloadingLMState()); + ret->state = out_state; + + kenlmState.release(); + return ret.release(); +} + + +LanguageModel *ConstructReloadingLM(const 
std::string &line, const std::string &file, FactorType factorType, bool lazy) +{ + lm::ngram::ModelType model_type; + if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) { + switch(model_type) { + case lm::ngram::PROBING: + return new ReloadingLanguageModel<lm::ngram::ProbingModel>(line, file, factorType, lazy); + case lm::ngram::REST_PROBING: + return new ReloadingLanguageModel<lm::ngram::RestProbingModel>(line, file, factorType, lazy); + case lm::ngram::TRIE: + return new ReloadingLanguageModel<lm::ngram::TrieModel>(line, file, factorType, lazy); + case lm::ngram::QUANT_TRIE: + return new ReloadingLanguageModel<lm::ngram::QuantTrieModel>(line, file, factorType, lazy); + case lm::ngram::ARRAY_TRIE: + return new ReloadingLanguageModel<lm::ngram::ArrayTrieModel>(line, file, factorType, lazy); + case lm::ngram::QUANT_ARRAY_TRIE: + return new ReloadingLanguageModel<lm::ngram::QuantArrayTrieModel>(line, file, factorType, lazy); + default: + UTIL_THROW2("Unrecognized kenlm model type " << model_type); + } + } else { + return new ReloadingLanguageModel<lm::ngram::ProbingModel>(line, file, factorType, lazy); + } +} + +} // namespace Moses +*/ diff --git a/moses/LM/Reloading.h b/moses/LM/Reloading.h new file mode 100644 index 000000000..3993fe9d7 --- /dev/null +++ b/moses/LM/Reloading.h @@ -0,0 +1,220 @@ +// $Id$ + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#ifndef moses_LanguageModelReloading_h +#define moses_LanguageModelReloading_h + +#include <string> + +#include "moses/LM/Base.h" +#include "moses/LM/Ken.h" + +#include "util/tokenize_piece.hh" +#include "util/string_stream.hh" + +#include <iostream> +namespace Moses +{ + +class FFState; + +//LanguageModel *ConstructReloadingLM(const std::string &line); +//LanguageModel *ConstructReloadingLM(const std::string &line, const std::string &file, FactorType factorType, bool lazy); +/* + namespace { +class MappingBuilder : public lm::EnumerateVocab +{ +public: + MappingBuilder(FactorCollection &factorCollection, std::vector<lm::WordIndex> &mapping) + : m_factorCollection(factorCollection), m_mapping(mapping) {} + + void Add(lm::WordIndex index, const StringPiece &str) { + std::size_t factorId = m_factorCollection.AddFactor(str)->GetId(); + if (m_mapping.size() <= factorId) { + // 0 is <unk> :-) + m_mapping.resize(factorId + 1); + } + m_mapping[factorId] = index; + } + +private: + FactorCollection &m_factorCollection; + std::vector<lm::WordIndex> &m_mapping; +}; + } +*/ +template <class Model> class ReloadingLanguageModel : public LanguageModelKen<Model> +{ +public: + + ReloadingLanguageModel(const std::string &line, const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen<Model>(line, file, factorType, lazy), m_file(file), m_lazy(lazy) { + + std::cerr << "ReloadingLM constructor: " << m_file << std::endl; + // std::cerr << std::string(line).replace(0,11,"KENLM") << std::endl; + + } + + virtual void InitializeForInput(ttasksptr const& ttask) { + std::cerr << "ReloadingLM InitializeForInput" << std::endl; + 
LanguageModelKen<Model>::LoadModel(m_file, m_lazy); + /* + lm::ngram::Config config; + if(this->m_verbosity >= 1) { + config.messages = &std::cerr; + } else { + config.messages = NULL; + } + FactorCollection &collection = FactorCollection::Instance(); + MappingBuilder builder(collection, m_lmIdLookup); + config.enumerate_vocab = &builder; + config.load_method = m_lazy ? util::LAZY : util::POPULATE_OR_READ; + + m_ngram.reset(new Model(m_file.c_str(), config)); + + m_beginSentenceFactor = collection.AddFactor(BOS_); + */ + }; + + /* + ReloadingLanguageModel(const std::string &line) : LanguageModelKen<Model>(ConstructKenLM(std::string(line).replace(0,11,"KENLM"))) { + std::cerr << "ReloadingLM constructor" << std::endl; + std::cerr << std::string(line).replace(0,11,"KENLM") << std::endl; + } + */ + /* + ~ReloadingLanguageModel() { + delete m_lm; + } + + virtual const FFState *EmptyHypothesisState(const InputType &input) const { + return m_lm->EmptyHypothesisState(input); + } + + virtual void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const { + m_lm->CalcScore(phrase, fullScore, ngramScore, oovCount); + } + + virtual FFState *EvaluateWhenApplied(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const { + return m_lm->EvaluateWhenApplied(hypo, ps, out); + } + + virtual FFState *EvaluateWhenApplied(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const { + return m_lm->EvaluateWhenApplied(cur_hypo, featureID, accumulator); + } + + virtual FFState *EvaluateWhenApplied(const Syntax::SHyperedge& hyperedge, int featureID, ScoreComponentCollection *accumulator) const { + return m_lm->EvaluateWhenApplied(hyperedge, featureID, accumulator); + } + + virtual void IncrementalCallback(Incremental::Manager &manager) const { + m_lm->IncrementalCallback(manager); + } + + virtual void ReportHistoryOrder(std::ostream &out,const Phrase &phrase) const { + m_lm->ReportHistoryOrder(out, 
phrase); + } + + virtual bool IsUseable(const FactorMask &mask) const { + return m_lm->IsUseable(mask); + } + + + private: + + LanguageModel *m_lm; + */ + +protected: + + using LanguageModelKen<Model>::m_ngram; + using LanguageModelKen<Model>::m_lmIdLookup; + using LanguageModelKen<Model>::m_beginSentenceFactor; + + const std::string m_file; + bool m_lazy; +}; + + +LanguageModel *ConstructReloadingLM(const std::string &line, const std::string &file, FactorType factorType, bool lazy) +{ + lm::ngram::ModelType model_type; + if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) { + switch(model_type) { + case lm::ngram::PROBING: + return new ReloadingLanguageModel<lm::ngram::ProbingModel>(line, file, factorType, lazy); + case lm::ngram::REST_PROBING: + return new ReloadingLanguageModel<lm::ngram::RestProbingModel>(line, file, factorType, lazy); + case lm::ngram::TRIE: + return new ReloadingLanguageModel<lm::ngram::TrieModel>(line, file, factorType, lazy); + case lm::ngram::QUANT_TRIE: + return new ReloadingLanguageModel<lm::ngram::QuantTrieModel>(line, file, factorType, lazy); + case lm::ngram::ARRAY_TRIE: + return new ReloadingLanguageModel<lm::ngram::ArrayTrieModel>(line, file, factorType, lazy); + case lm::ngram::QUANT_ARRAY_TRIE: + return new ReloadingLanguageModel<lm::ngram::QuantArrayTrieModel>(line, file, factorType, lazy); + default: + UTIL_THROW2("Unrecognized kenlm model type " << model_type); + } + } else { + return new ReloadingLanguageModel<lm::ngram::ProbingModel>(line, file, factorType, lazy); + } +} + +LanguageModel *ConstructReloadingLM(const std::string &lineOrig) +{ + FactorType factorType = 0; + std::string filePath; + bool lazy = false; + + util::TokenIter<util::SingleCharacter, true> argument(lineOrig, ' '); + ++argument; // KENLM + + util::StringStream line; + line << "KENLM"; + + for (; argument; ++argument) { + const char *equals = std::find(argument->data(), argument->data() + argument->size(), '='); + UTIL_THROW_IF2(equals == 
argument->data() + argument->size(), + "Expected = in ReloadingLM argument " << *argument); + StringPiece name(argument->data(), equals - argument->data()); + StringPiece value(equals + 1, argument->data() + argument->size() - equals - 1); + if (name == "factor") { + factorType = boost::lexical_cast<FactorType>(value); + } else if (name == "order") { + // Ignored + } else if (name == "path") { + filePath.assign(value.data(), value.size()); + } else if (name == "lazyken") { + lazy = boost::lexical_cast<bool>(value); + } else { + // pass to base class to interpret + line << " " << name << "=" << value; + } + } + + return ConstructReloadingLM(line.str(), filePath, factorType, lazy); +} + + +} // namespace Moses + +#endif + diff --git a/moses/TranslationModel/UG/Jamfile b/moses/TranslationModel/UG/Jamfile index d41e0f5ca..34e0b6663 100644 --- a/moses/TranslationModel/UG/Jamfile +++ b/moses/TranslationModel/UG/Jamfile @@ -1,3 +1,13 @@ +exe ptable-sigtest-filter : +filter-pt.cc +$(TOP)/moses//moses +$(TOP)/moses/TranslationModel/UG/generic//generic +$(TOP)//boost_iostreams +$(TOP)//boost_program_options +$(TOP)/moses/TranslationModel/UG/mm//mm +$(TOP)/moses/TranslationModel/UG//mmsapt +$(TOP)/util//kenutil +; exe try-align : try-align.cc $(TOP)/moses//moses diff --git a/moses/TranslationModel/UG/filter-pt.cc b/moses/TranslationModel/UG/filter-pt.cc new file mode 100644 index 000000000..cb288d534 --- /dev/null +++ b/moses/TranslationModel/UG/filter-pt.cc @@ -0,0 +1,669 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- +// significance filtering for phrase tables as described in +// H. Johnson, et al. (2007) Improving Translation Quality +// by Discarding Most of the Phrasetable. EMNLP 2007. 
+// Implemented by Marcin Junczys-Dowmunt +// recommended use: -l a+e -n <ttable-limit> +#include <cstring> +#include <cassert> +#include <cstdio> +#include <cstdlib> +#include <algorithm> +#include <fstream> +#include <sstream> + +#include <vector> +#include <iostream> +#include <set> + +#include <boost/thread/tss.hpp> +#include <boost/thread.hpp> +#include <boost/unordered_map.hpp> +#include <boost/program_options.hpp> +#include <boost/shared_ptr.hpp> +#include <boost/foreach.hpp> + +#ifdef WIN32 +#include "WIN32_functions.h" +#else +#include <unistd.h> +#endif + +#include "mm/ug_bitext.h" + +// constants +const size_t MINIMUM_SIZE_TO_KEEP = 10000; // increase this to improve memory usage, +// reduce for speed +const std::string SEPARATOR = " ||| "; + +const double ALPHA_PLUS_EPS = -1000.0; // dummy value +const double ALPHA_MINUS_EPS = -2000.0; // dummy value + +// configuration params +int pfe_filter_limit = 0; // 0 = don't filter anything based on P(f|e) +bool print_cooc_counts = false; // add cooc counts to phrase table? +bool print_neglog_significance = false; // add -log(p) to phrase table? 
+double sig_filter_limit = 0; // keep phrase pairs with -log(sig) > sig_filter_limit +// higher = filter-more +bool pef_filter_only = false; // only filter based on pef +bool hierarchical = false; + +double p_111 = 0.0; // alpha +size_t pt_lines = 0; +size_t nremoved_sigfilter = 0; +size_t nremoved_pfefilter = 0; + +typedef sapt::L2R_Token<sapt::SimpleWordId> Token; +typedef sapt::mmTtrack<Token> ttrack_t; +typedef sapt::mmTSA<Token> tsa_t; +typedef sapt::TokenIndex tind_t; + +int num_lines; + +boost::mutex in_mutex; +boost::mutex out_mutex; +boost::mutex err_mutex; + +typedef size_t TextLenType; + +typedef boost::shared_ptr<std::vector<TextLenType> > SentIdSet; + +class Cache { + typedef std::pair<SentIdSet, clock_t> ClockedSet; + typedef boost::unordered_map<std::string, ClockedSet> ClockedMap; + + public: + + SentIdSet get(const std::string& phrase) { + boost::shared_lock<boost::shared_mutex> lock(m_mutex); + if(m_cont.count(phrase)) { + ClockedSet& set = m_cont[phrase]; + set.second = clock(); + return set.first; + } + return SentIdSet( new SentIdSet::element_type() ); + } + + void put(const std::string& phrase, const SentIdSet set) { + boost::unique_lock<boost::shared_mutex> lock(m_mutex); + m_cont[phrase] = std::make_pair(set, clock()); + } + + static void set_max_cache(size_t max_cache) { + s_max_cache = max_cache; + } + + void prune() { + if(s_max_cache > 0) { + boost::upgrade_lock<boost::shared_mutex> lock(m_mutex); + if(m_cont.size() > s_max_cache) { + std::vector<clock_t> clocks; + for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++) + clocks.push_back(it->second.second); + + std::sort(clocks.begin(), clocks.end()); + clock_t out = clocks[m_cont.size() - s_max_cache]; + + boost::upgrade_to_unique_lock<boost::shared_mutex> uniq_lock(lock); + for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++) + if(it->second.second < out) + m_cont.erase(it); + } + } + } + + private: + ClockedMap m_cont; + boost::shared_mutex m_mutex; 
+ static size_t s_max_cache; +}; + +size_t Cache::s_max_cache = 0; + +struct SA { + tind_t V; + boost::shared_ptr<ttrack_t> T; + tsa_t I; + Cache cache; +}; + +std::vector<boost::shared_ptr<SA> > e_sas; +std::vector<boost::shared_ptr<SA> > f_sas; + +#undef min + +void usage() +{ + std::cerr << "\nFilter phrase table using significance testing as described\n" + << "in H. Johnson, et al. (2007) Improving Translation Quality\n" + << "by Discarding Most of the Phrasetable. EMNLP 2007.\n"; +} + +struct PTEntry { + PTEntry(const std::string& str, int index); + std::string f_phrase; + std::string e_phrase; + std::string extra; + std::string scores; + float pfe; + int cf; + int ce; + int cfe; + float nlog_pte; + void set_cooc_stats(int _cef, int _cf, int _ce, float nlp) { + cfe = _cef; + cf = _cf; + ce = _ce; + nlog_pte = nlp; + } + +}; + +PTEntry::PTEntry(const std::string& str, int index) : + cf(0), ce(0), cfe(0), nlog_pte(0.0) +{ + size_t pos = 0; + std::string::size_type nextPos = str.find(SEPARATOR, pos); + this->f_phrase = str.substr(pos,nextPos); + + pos = nextPos + SEPARATOR.size(); + nextPos = str.find(SEPARATOR, pos); + this->e_phrase = str.substr(pos,nextPos-pos); + + pos = nextPos + SEPARATOR.size(); + nextPos = str.find(SEPARATOR, pos); + if (nextPos < str.size()) { + this->scores = str.substr(pos,nextPos-pos); + + pos = nextPos + SEPARATOR.size(); + this->extra = str.substr(pos); + } + else { + this->scores = str.substr(pos,str.size()-pos); + } + + int c = 0; + std::string::iterator i=scores.begin(); + if (index > 0) { + for (; i != scores.end(); ++i) { + if ((*i) == ' ') { + c++; + if (c == index) break; + } + } + } + if (i != scores.end()) { + ++i; + } + char f[24]; + char *fp=f; + while (i != scores.end() && *i != ' ') { + *fp++=*i++; + } + *fp++=0; + + this->pfe = atof(f); +} + +struct PfeComparer { + bool operator()(const PTEntry* a, const PTEntry* b) const { + return a->pfe > b->pfe; + } +}; + +struct NlogSigThresholder { + NlogSigThresholder(float 
threshold) : t(threshold) {} + float t; + bool operator()(const PTEntry* a) const { + if (a->nlog_pte < t) { + delete a; + return true; + } else return false; + } +}; + +std::ostream& operator << (std::ostream& os, const PTEntry& pp) +{ + os << pp.f_phrase << " ||| " << pp.e_phrase; + os << " ||| " << pp.scores; + if (pp.extra.size()>0) os << " ||| " << pp.extra; + if (print_cooc_counts) os << " ||| " << pp.cfe << " " << pp.cf << " " << pp.ce; + if (print_neglog_significance) os << " ||| " << pp.nlog_pte; + return os; +} + +void print(int a, int b, int c, int d, float p) +{ + std::cerr << a << "\t" << b << "\t P=" << p << "\n" + << c << "\t" << d << "\t xf=" + << (double)(b)*(double)(c)/(double)(a+1)/(double)(d+1) << "\n\n"; +} + +// 2x2 (one-sided) Fisher's exact test +// see B. Moore. (2004) On Log Likelihood and the Significance of Rare Events +double fisher_exact(int cfe, int ce, int cf) +{ + assert(cfe <= ce); + assert(cfe <= cf); + + int a = cfe; + int b = (cf - cfe); + int c = (ce - cfe); + int d = (num_lines - ce - cf + cfe); + int n = a + b + c + d; + + double cp = exp(lgamma(1+a+c) + lgamma(1+b+d) + lgamma(1+a+b) + lgamma(1+c+d) + - lgamma(1+n) - lgamma(1+a) - lgamma(1+b) - lgamma(1+c) + - lgamma(1+d)); + double total_p = 0.0; + int tc = std::min(b,c); + for (int i=0; i<=tc; i++) { + total_p += cp; + double coef = (double)(b)*(double)(c)/(double)(a+1)/(double)(d+1); + cp *= coef; + ++a; + --c; + ++d; + --b; + } + return total_p; +} + +template <class setType> +void ordered_set_intersect(setType& out, const setType set_1, const setType set_2) +{ + std::set_intersection(set_1->begin(), set_1->end(), set_2->begin(), + set_2->end(), inserter(*out, out->begin()) ); +} + + +void lookup_phrase(SentIdSet& ids, const std::string& phrase, + tsa_t &my_sa, tind_t &my_v, Cache& cache) +{ + ids = cache.get(phrase); + if(ids->empty()) { + + std::vector<sapt::id_type> snt; + my_v.fillIdSeq(phrase, snt); + + tsa_t::tree_iterator m(&my_sa); + size_t k = 0; + while (k < 
snt.size() && m.extend(snt[k])) ++k; + if(k == snt.size()) { + ids->reserve(m.approxOccurrenceCount()+10); + sapt::tsa::ArrayEntry I(m.lower_bound(-1)); + char const* stop = m.upper_bound(-1); + do { + m.root->readEntry(I.next,I); + ids->push_back(I.sid); + } while (I.next != stop); + + std::sort(ids->begin(), ids->end()); + SentIdSet::element_type::iterator it = + std::unique(ids->begin(), ids->end()); + ids->resize(it - ids->begin()); + + if(ids->size() >= MINIMUM_SIZE_TO_KEEP) + cache.put(phrase, ids); + } + } +} + +void lookup_multiple_phrases(SentIdSet& ids, std::vector<std::string> & phrases, + tsa_t & my_sa, tind_t &my_v, + const std::string & rule, Cache& cache) +{ + + if (phrases.size() == 1) { + lookup_phrase(ids, phrases.front(), my_sa, my_v, cache); + } + else { + SentIdSet main_set( new SentIdSet::element_type() ); + bool first = true; + SentIdSet first_set( new SentIdSet::element_type() ); + lookup_phrase(first_set, phrases.front(), my_sa, my_v, cache); + for (std::vector<std::string>::iterator phrase=phrases.begin()+1; + phrase != phrases.end(); ++phrase) { + SentIdSet temp_set( new SentIdSet::element_type() ); + lookup_phrase(temp_set, *phrase, my_sa, my_v, cache); + if (first) { + ordered_set_intersect(main_set, first_set, temp_set); + first = false; + } + else { + SentIdSet new_set( new SentIdSet::element_type() ); + ordered_set_intersect(new_set, main_set, temp_set); + main_set->swap(*new_set); + } + } + ids->swap(*main_set); + } +} + + +void find_occurrences(SentIdSet& ids, const std::string& rule, + tsa_t& my_sa, tind_t &my_v, Cache& cache) +{ + // we search for hierarchical rules by stripping away NT and looking for terminals sequences + // if a rule contains multiple sequences of terminals, we intersect their occurrences. 
+ if (hierarchical) { + // std::cerr << "splitting up phrase: " << phrase << "\n"; + int pos = 0; + int NTStartPos, NTEndPos; + std::vector<std::string> phrases; + while (rule.find("] ", pos) < rule.size()) { + NTStartPos = rule.find("[",pos) - 1; // -1 to cut space before NT + NTEndPos = rule.find("] ",pos); + if (NTStartPos < pos) { // no space: NT at start of rule (or two consecutive NTs) + pos = NTEndPos + 2; + continue; + } + phrases.push_back(rule.substr(pos,NTStartPos-pos)); + pos = NTEndPos + 2; + } + + NTStartPos = rule.find("[",pos) - 1; // LHS of rule + if (NTStartPos > pos) { + phrases.push_back(rule.substr(pos,NTStartPos-pos)); + } + + lookup_multiple_phrases(ids, phrases, my_sa, my_v, rule, cache); + } + else { + lookup_phrase(ids, rule, my_sa, my_v, cache); + } +} + + +// input: unordered list of translation options for a single source phrase +void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options) +{ + if (pfe_filter_limit > 0 && options.size() > pfe_filter_limit) { + nremoved_pfefilter += (options.size() - pfe_filter_limit); + std::nth_element(options.begin(), options.begin() + pfe_filter_limit, + options.end(), PfeComparer()); + for (std::vector<PTEntry*>::iterator i = options.begin() + pfe_filter_limit; + i != options.end(); ++i) + delete *i; + options.erase(options.begin() + pfe_filter_limit,options.end()); + } + + if (pef_filter_only) + return; + + if (options.empty()) + return; + + size_t cf = 0; + std::vector<SentIdSet> fsets; + BOOST_FOREACH(boost::shared_ptr<SA>& f_sa, f_sas) { + fsets.push_back( boost::shared_ptr<SentIdSet::element_type>(new SentIdSet::element_type()) ); + find_occurrences(fsets.back(), options.front()->f_phrase, f_sa->I, f_sa->V, f_sa->cache); + cf += fsets.back()->size(); + } + + for (std::vector<PTEntry*>::iterator i = options.begin(); + i != options.end(); ++i) { + const std::string& e_phrase = (*i)->e_phrase; + + size_t ce = 0; + std::vector<SentIdSet> esets; + BOOST_FOREACH(boost::shared_ptr<SA>& e_sa, 
e_sas) { + esets.push_back( boost::shared_ptr<SentIdSet::element_type>(new SentIdSet::element_type()) ); + find_occurrences(esets.back(), e_phrase, e_sa->I, e_sa->V, e_sa->cache); + ce += esets.back()->size(); + } + + size_t cef = 0; + for(size_t j = 0; j < fsets.size(); ++j) { + SentIdSet efset( new SentIdSet::element_type() ); + ordered_set_intersect(efset, fsets[j], esets[j]); + cef += efset->size(); + } + + double nlp = -log(fisher_exact(cef, cf, ce)); + (*i)->set_cooc_stats(cef, cf, ce, nlp); + } + + std::vector<PTEntry*>::iterator new_end = + std::remove_if(options.begin(), options.end(), + NlogSigThresholder(sig_filter_limit)); + nremoved_sigfilter += (options.end() - new_end); + options.erase(new_end,options.end()); +} + +void filter_thread(std::istream* in, std::ostream* out, int pfe_index) { + + std::vector<std::string> lines; + std::string prev = ""; + std::vector<PTEntry*> options; + while(true) { + { + boost::mutex::scoped_lock lock(in_mutex); + if(in->eof()) + break; + + lines.clear(); + std::string line; + while(getline(*in, line) && lines.size() < 500000) + lines.push_back(line); + } + + std::stringstream out_temp; + for(std::vector<std::string>::iterator it = lines.begin(); it != lines.end(); it++) { + size_t tmp_lines = ++pt_lines; + if(tmp_lines % 10000 == 0) { + boost::mutex::scoped_lock lock(err_mutex); + std::cerr << "."; + + if(tmp_lines % 500000 == 0) + std::cerr << "[n:" << tmp_lines << "]\n"; + + if(tmp_lines % 10000000 == 0) { + float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines; + float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines; + std::cerr << "------------------------------------------------------\n" + << " unfiltered phrases pairs: " << pt_lines << "\n" + << "\n" + << " P(f|e) filter [first]: " << nremoved_pfefilter << " (" << pfefper << "%)\n" + << " significance filter: " << nremoved_sigfilter << " (" << sigfper << "%)\n" + << " TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << " (" << 
(sigfper + pfefper) << "%)\n" + << "\n" + << " FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << " (" << (100.0-sigfper - pfefper) << "%)\n" + << "------------------------------------------------------\n"; + } + } + + if(pt_lines % 10000 == 0) { + BOOST_FOREACH(boost::shared_ptr<SA> f_sa, f_sas) + f_sa->cache.prune(); + BOOST_FOREACH(boost::shared_ptr<SA> e_sa, e_sas) + e_sa->cache.prune(); + } + + if(it->length() > 0) { + PTEntry* pp = new PTEntry(it->c_str(), pfe_index); + if (prev != pp->f_phrase) { + prev = pp->f_phrase; + + if (!options.empty()) { // always true after first line + compute_cooc_stats_and_filter(options); + } + + for (std::vector<PTEntry*>::iterator i = options.begin(); + i != options.end(); ++i) { + out_temp << **i << '\n'; + delete *i; + } + + options.clear(); + options.push_back(pp); + + } else { + options.push_back(pp); + } + } + } + boost::mutex::scoped_lock lock(out_mutex); + *out << out_temp.str() << std::flush; + } + compute_cooc_stats_and_filter(options); + + boost::mutex::scoped_lock lock(out_mutex); + for (std::vector<PTEntry*>::iterator i = options.begin(); + i != options.end(); ++i) { + *out << **i << '\n'; + delete *i; + } + *out << std::flush; +} + +namespace po = boost::program_options; + +int main(int argc, char * argv[]) +{ + bool help; + std::vector<std::string> efiles; + std::vector<std::string> ffiles; + int pfe_index = 2; + int threads = 1; + size_t max_cache = 0; + std::string str_sig_filter_limit; + + po::options_description general("General options"); + general.add_options() + ("english,e", po::value<std::vector<std::string> >(&efiles)->multitoken(), + "english.suf-arr") + ("french,f", po::value<std::vector<std::string> >(&ffiles)->multitoken(), + "french.suf-arr") + ("pfe-index,i", po::value(&pfe_index)->default_value(2), + "Index of P(f|e) in phrase table") + ("pfe-filter-limit,n", po::value(&pfe_filter_limit)->default_value(0), + "0, 1...: 0=no filtering, >0 sort by P(e|f) and keep 
the top num elements") + ("threads,t", po::value(&threads)->default_value(1), + "number of threads to use") + ("max-cache,m", po::value(&max_cache)->default_value(0), + "limit cache to arg most recent phrases") + ("print-cooc,c", po::value(&print_cooc_counts)->zero_tokens()->default_value(false), + "add the coocurrence counts to the phrase table") + ("print-significance,p", po::value(&print_neglog_significance)->zero_tokens()->default_value(false), + "add -log(significance) to the phrase table") + ("hierarchical,x", po::value(&hierarchical)->zero_tokens()->default_value(false), + "filter hierarchical rule table") + ("sig-filter-limit,l", po::value(&str_sig_filter_limit), + ">0.0, a+e, or a-e: keep values that have a -log significance > this") + ("help,h", po::value(&help)->zero_tokens()->default_value(false), + "display this message") + ; + + po::options_description cmdline_options("Allowed options"); + cmdline_options.add(general); + po::variables_map vm; + + try { + po::store(po::command_line_parser(argc,argv). 
+ options(cmdline_options).run(), vm); + po::notify(vm); + } + catch (std::exception& e) { + std::cout << "Error: " << e.what() << std::endl << std::endl; + + usage(); + std::cout << cmdline_options << std::endl; + exit(0); + } + + if(vm["help"].as<bool>()) { + usage(); + std::cout << cmdline_options << std::endl; + exit(0); + } + + if(vm.count("pfe-filter-limit")) + std::cerr << "P(f|e) filter limit: " << pfe_filter_limit << std::endl; + if(vm.count("threads")) + std::cerr << "Using threads: " << threads << std::endl; + if(vm.count("max-cache")) + std::cerr << "Using max phrases in caches: " << max_cache << std::endl; + + if (strcmp(str_sig_filter_limit.c_str(),"a+e") == 0) { + sig_filter_limit = ALPHA_PLUS_EPS; + } else if (strcmp(str_sig_filter_limit.c_str(),"a-e") == 0) { + sig_filter_limit = ALPHA_MINUS_EPS; + } else { + char *x; + sig_filter_limit = strtod(str_sig_filter_limit.c_str(), &x); + if (sig_filter_limit < 0.0) { + std::cerr << "Filter limit (-l) must be either 'a+e', 'a-e' or a real number >= 0.0\n"; + usage(); + } + } + + if (sig_filter_limit == 0.0) pef_filter_only = true; + //----------------------------------------------------------------------------- + if (optind != argc || ((efiles.empty() || ffiles.empty()) && !pef_filter_only)) { + usage(); + } + + if (!pef_filter_only) { + size_t elines = 0; + BOOST_FOREACH(std::string& efile, efiles) { + e_sas.push_back(boost::shared_ptr<SA>(new SA())); + e_sas.back()->V.open(efile + ".tdx"); + e_sas.back()->T.reset(new ttrack_t()); + e_sas.back()->T->open(efile + ".mct"); + e_sas.back()->I.open(efile + ".sfa", e_sas.back()->T); + elines += e_sas.back()->T->size(); + } + + size_t flines = 0; + BOOST_FOREACH(std::string& ffile, ffiles) { + f_sas.push_back(boost::shared_ptr<SA>(new SA())); + f_sas.back()->V.open(ffile + ".tdx"); + f_sas.back()->T.reset(new ttrack_t()); + f_sas.back()->T->open(ffile + ".mct"); + f_sas.back()->I.open(ffile + ".sfa", f_sas.back()->T); + flines += f_sas.back()->T->size(); + } + 
+ if (elines != flines) { + std::cerr << "Number of lines in e-corpus != number of lines in f-corpus!\n"; + usage(); + exit(1); + } else { + std::cerr << "Training corpus: " << elines << " lines\n"; + num_lines = elines; + } + p_111 = -log(fisher_exact(1,1,1)); + std::cerr << "\\alpha = " << p_111 << "\n"; + if (sig_filter_limit == ALPHA_MINUS_EPS) { + sig_filter_limit = p_111 - 0.001; + } else if (sig_filter_limit == ALPHA_PLUS_EPS) { + sig_filter_limit = p_111 + 0.001; + } + std::cerr << "Sig filter threshold is = " << sig_filter_limit << "\n"; + } else { + std::cerr << "Filtering using P(e|f) only. n=" << pfe_filter_limit << std::endl; + } + + Cache::set_max_cache(max_cache); + std::ios_base::sync_with_stdio(false); + + boost::thread_group threadGroup; + for(int i = 0; i < threads; i++) + threadGroup.add_thread(new boost::thread(filter_thread, &std::cin, &std::cout, pfe_index)); + threadGroup.join_all(); + + float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines; + float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines; + + std::cerr << "\n\n------------------------------------------------------\n" + << " unfiltered phrases pairs: " << pt_lines << "\n" + << "\n" + << " P(f|e) filter [first]: " << nremoved_pfefilter << " (" << pfefper << "%)\n" + << " significance filter: " << nremoved_sigfilter << " (" << sigfper << "%)\n" + << " TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << " (" << (sigfper + pfefper) << "%)\n" + << "\n" + << " FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << " (" << (100.0-sigfper - pfefper) << "%)\n" + << "------------------------------------------------------\n"; +} diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index ed60771ae..024ae44d3 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -188,7 +188,7 @@ namespace Moses dflt = pair<string,string>("workers","0"); m_workers = 
atoi(param.insert(dflt).first->second.c_str()); - if (m_workers == 0) m_workers = boost::thread::hardware_concurrency(); + if (m_workers == 0) m_workers = StaticData::Instance().ThreadCount(); else m_workers = min(m_workers,size_t(boost::thread::hardware_concurrency())); dflt = pair<string,string>("bias-loglevel","0"); diff --git a/moses/TrellisPath.cpp b/moses/TrellisPath.cpp index 1f09b2eed..012b9a7af 100644 --- a/moses/TrellisPath.cpp +++ b/moses/TrellisPath.cpp @@ -42,7 +42,7 @@ TrellisPath::TrellisPath(const Hypothesis *hypo) void TrellisPath::InitTotalScore() { - m_totalScore = m_path[0]->GetWinningHypo()->GetFutureScore(); + m_totalScore = m_path[0]->GetWinningHypo()->GetFutureScore(); //calc score size_t sizePath = m_path.size(); @@ -50,7 +50,7 @@ void TrellisPath::InitTotalScore() const Hypothesis *hypo = m_path[pos]; const Hypothesis *winningHypo = hypo->GetWinningHypo(); if (hypo != winningHypo) { - m_totalScore = m_totalScore - winningHypo->GetFutureScore() + hypo->GetFutureScore(); + m_totalScore += hypo->GetFutureScore() - winningHypo->GetFutureScore(); } } } @@ -169,9 +169,6 @@ TrellisPath:: GetScoreBreakdown() const { if (!m_scoreBreakdown) { - float totalScore = m_path[0]->GetWinningHypo()->GetFutureScore(); - // calculated for sanity check only - m_scoreBreakdown.reset(new ScoreComponentCollection()); m_scoreBreakdown->PlusEquals(m_path[0]->GetWinningHypo()->GetScoreBreakdown()); @@ -184,13 +181,10 @@ GetScoreBreakdown() const const Hypothesis *hypo = m_path[pos]; const Hypothesis *winningHypo = hypo->GetWinningHypo(); if (hypo != winningHypo) { - totalScore += hypo->GetFutureScore() - winningHypo->GetFutureScore(); m_scoreBreakdown->MinusEquals(winningHypo->GetScoreBreakdown()); m_scoreBreakdown->PlusEquals(hypo->GetScoreBreakdown()); } } - - assert(totalScore == m_totalScore); } return m_scoreBreakdown; diff --git a/run-regtests.sh b/run-regtests.sh index 3d93741d5..843ee3a94 100755 --- a/run-regtests.sh +++ b/run-regtests.sh @@ -53,18 +53,18 @@ 
git submodule update regtest # -- compile from scratch with server, run regtests set -x if [ "$full" == true ] ; then - ./bjam -j$j --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph --no-xmlrpc-c --with-regtest=$regtest -a $skipcompact $@ $q || exit $? + ./bjam -j$j --with-mm --with-mm-extras --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph --no-xmlrpc-c --with-regtest=$regtest -a $skipcompact $@ $q || exit $? if ./regression-testing/run-single-test.perl --server --startuptest ; then - ./bjam -j$j --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph $xmlrpc --with-regtest=$regtest -a $skipcompact $@ $q + ./bjam -j$j --with-mm --with-mm-extras --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph $xmlrpc --with-regtest=$regtest -a $skipcompact $@ $q fi else # when investigating failures, always run single-threaded if [ "$q" == "-q" ] ; then j=1; fi if ./regression-testing/run-single-test.perl --server --startuptest ; then - ./bjam -j$j $q $a --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph $xmlrpc --with-regtest=$regtest $skipcompact $@ + ./bjam -j$j --with-mm $q $a --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph $xmlrpc --with-regtest=$regtest $skipcompact $@ else - ./bjam -j$j $q $a --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph --no-xmlrpc-c --with-regtest=$regtest $skipcompact $@ + ./bjam -j$j --with-mm --with-mm-extras $q $a --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph --no-xmlrpc-c --with-regtest=$regtest $skipcompact $@ fi fi diff --git a/util/file_stream.hh b/util/file_stream.hh index ae9ad5aa7..be26a0921 100644 --- a/util/file_stream.hh +++ b/util/file_stream.hh @@ -58,6 +58,7 @@ class FileStream : public FakeOStream<FileStream> { } FileStream &seekp(uint64_t to) { + flush(); util::SeekOrThrow(fd_, to); return *this; } |