diff options
author | Ulrich Germann <Ulrich.Germann@gmail.com> | 2015-03-26 22:45:18 +0300 |
---|---|---|
committer | Ulrich Germann <Ulrich.Germann@gmail.com> | 2015-03-26 22:45:18 +0300 |
commit | 0c49fb9a006db97008ade5488c793bea0d057fd4 (patch) | |
tree | be07fddd65c53121a3a271c23a3fb4bf32ef9866 | |
parent | 9dc75bfd8ad3092f08f7a6d2a6492323b7803e56 (diff) | |
parent | 206d0c969885817521e941eaec879517e39a5b59 (diff) |
Merge branch 'master' of https://github.com/moses-smt/mosesdecoder into mmt-dev
-rw-r--r-- | lm/Jamfile | 8 | ||||
-rw-r--r-- | lm/builder/adjust_counts.cc | 3 | ||||
-rw-r--r-- | lm/builder/adjust_counts_test.cc | 2 | ||||
-rw-r--r-- | lm/builder/corpus_count_test.cc | 3 | ||||
-rw-r--r-- | lm/builder/lmplz_main.cc | 2 | ||||
-rw-r--r-- | lm/config.hh | 7 | ||||
-rw-r--r-- | lm/wrappers/nplm.cc | 57 | ||||
-rw-r--r-- | lm/wrappers/nplm.hh | 4 | ||||
-rw-r--r-- | util/fake_ofstream.hh | 30 |
9 files changed, 78 insertions(+), 38 deletions(-)
diff --git a/lm/Jamfile b/lm/Jamfile index edc3751a7..0b5bbf259 100644 --- a/lm/Jamfile +++ b/lm/Jamfile @@ -14,12 +14,12 @@ update-if-changed $(ORDER-LOG) $(max-order) ; max-order += <dependency>$(ORDER-LOG) ; wrappers = ; -local with-nplm = [ option.get "with-nplm-0.1" ] ; +local with-nplm = [ option.get "with-nplm" ] ; if $(with-nplm) { - lib neuralLM : : <search>$(with-nplm)/src ; + lib nplm : : <search>$(with-nplm)/src ; obj nplm.o : wrappers/nplm.cc : <include>.. <include>$(with-nplm)/src <cxxflags>-fopenmp ; - alias nplm : nplm.o neuralLM ..//boost_thread : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>WITH_NPLM <library>..//boost_thread ; - wrappers += nplm ; + alias nplm-all : nplm.o nplm ..//boost_thread : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>WITH_NPLM <library>..//boost_thread ; + wrappers += nplm-all ; } fakelib kenlm : $(wrappers) [ glob *.cc : *main.cc *test.cc ] ../util//kenutil : <include>.. $(max-order) : : <include>.. $(max-order) ; diff --git a/lm/builder/adjust_counts.cc b/lm/builder/adjust_counts.cc index 03ccbb934..fa77d45a7 100644 --- a/lm/builder/adjust_counts.cc +++ b/lm/builder/adjust_counts.cc @@ -48,7 +48,8 @@ class StatCollector { // TODO: Specialize error message for j == 3, meaning 3+ UTIL_THROW_IF(s.n[j] == 0, BadDiscountException, "Could not calculate Kneser-Ney discounts for " << (i+1) << "-grams with adjusted count " << (j+1) << " because we didn't observe any " - << (i+1) << "-grams with adjusted count " << j << "; Is this small or artificial data?"); + << (i+1) << "-grams with adjusted count " << j << "; Is this small or artificial data?\n" + << "Try deduplicating the input. To override this error for e.g. a class-based model, rerun with --discount_fallback\n"); } // See equation (26) in Chen and Goodman. 
diff --git a/lm/builder/adjust_counts_test.cc b/lm/builder/adjust_counts_test.cc index 073c5dfeb..353e3dd35 100644 --- a/lm/builder/adjust_counts_test.cc +++ b/lm/builder/adjust_counts_test.cc @@ -78,7 +78,7 @@ BOOST_AUTO_TEST_CASE(Simple) { DiscountConfig discount_config; discount_config.fallback = Discount(); discount_config.bad_action = THROW_UP; - BOOST_CHECK_THROW(AdjustCounts(prune_thresholds, counts, counts_pruned, discount_config, discount).Run(for_adjust), BadDiscountException); + BOOST_CHECK_THROW(AdjustCounts(prune_thresholds, counts, counts_pruned, std::vector<bool>(), discount_config, discount).Run(for_adjust), BadDiscountException); } BOOST_REQUIRE_EQUAL(4UL, counts.size()); BOOST_CHECK_EQUAL(4UL, counts[0]); diff --git a/lm/builder/corpus_count_test.cc b/lm/builder/corpus_count_test.cc index 26cb63469..18301656f 100644 --- a/lm/builder/corpus_count_test.cc +++ b/lm/builder/corpus_count_test.cc @@ -45,7 +45,8 @@ BOOST_AUTO_TEST_CASE(Short) { NGramStream stream; uint64_t token_count; WordIndex type_count = 10; - CorpusCount counter(input_piece, vocab.get(), token_count, type_count, chain.BlockSize() / chain.EntrySize(), SILENT); + std::vector<bool> prune_words; + CorpusCount counter(input_piece, vocab.get(), token_count, type_count, prune_words, "", chain.BlockSize() / chain.EntrySize(), SILENT); chain >> boost::ref(counter) >> stream >> util::stream::kRecycle; const char *v[] = {"<unk>", "<s>", "</s>", "looking", "on", "a", "little", "more", "loin", "foo", "bar"}; diff --git a/lm/builder/lmplz_main.cc b/lm/builder/lmplz_main.cc index d3bd99d23..65ec55729 100644 --- a/lm/builder/lmplz_main.cc +++ b/lm/builder/lmplz_main.cc @@ -202,6 +202,7 @@ int main(int argc, char *argv[]) { initial.adder_out.block_count = 2; pipeline.read_backoffs = initial.adder_out; + // Read from stdin, write to stdout by default util::scoped_fd in(0), out(1); if (vm.count("text")) { in.reset(util::OpenReadOrThrow(text.c_str())); @@ -210,7 +211,6 @@ int main(int argc, char 
*argv[]) { out.reset(util::CreateOrThrow(arpa.c_str())); } - // Read from stdin try { lm::builder::Output output; output.Add(new lm::builder::PrintARPA(out.release(), verbose_header)); diff --git a/lm/config.hh b/lm/config.hh index a4238cd9a..21b9e7eeb 100644 --- a/lm/config.hh +++ b/lm/config.hh @@ -30,9 +30,10 @@ struct Config { return show_progress ? messages : 0; } - // This will be called with every string in the vocabulary. See - // enumerate_vocab.hh for more detail. Config does not take ownership; you - // are still responsible for deleting it (or stack allocating). + // This will be called with every string in the vocabulary by the + // constructor; it need only exist for the lifetime of the constructor. + // See enumerate_vocab.hh for more detail. Config does not take ownership; + // just delete/let it go out of scope after the constructor exits. EnumerateVocab *enumerate_vocab; diff --git a/lm/wrappers/nplm.cc b/lm/wrappers/nplm.cc index 70622bd2b..44fd75a83 100644 --- a/lm/wrappers/nplm.cc +++ b/lm/wrappers/nplm.cc @@ -21,6 +21,26 @@ WordIndex Vocabulary::Index(const std::string &str) const { return vocab_.lookup_word(str); } +class Backend { + public: + Backend(const nplm::neuralLM &from, const std::size_t cache_size) : lm_(from), ngram_(from.get_order()) { + lm_.set_cache(cache_size); + } + + nplm::neuralLM &LM() { return lm_; } + const nplm::neuralLM &LM() const { return lm_; } + + Eigen::Matrix<int,Eigen::Dynamic,1> &staging_ngram() { return ngram_; } + + double lookup_from_staging() { return lm_.lookup_ngram(ngram_); } + + int order() const { return lm_.get_order(); } + + private: + nplm::neuralLM lm_; + Eigen::Matrix<int,Eigen::Dynamic,1> ngram_; +}; + bool Model::Recognize(const std::string &name) { try { util::scoped_fd file(util::OpenReadOrThrow(name.c_str())); @@ -31,10 +51,18 @@ bool Model::Recognize(const std::string &name) { } catch (const util::Exception &) { return false; } -} +} + +namespace { +nplm::neuralLM *LoadNPLM(const std::string 
&file) { + util::scoped_ptr<nplm::neuralLM> ret(new nplm::neuralLM()); + ret->read(file); + return ret.release(); +} +} // namespace Model::Model(const std::string &file, std::size_t cache) - : base_instance_(new nplm::neuralLM(file)), vocab_(base_instance_->get_vocabulary()), cache_size_(cache) { + : base_instance_(LoadNPLM(file)), vocab_(base_instance_->get_vocabulary()), cache_size_(cache) { UTIL_THROW_IF(base_instance_->get_order() > NPLM_MAX_ORDER, util::Exception, "This NPLM has order " << (unsigned int)base_instance_->get_order() << " but the KenLM wrapper was compiled with " << NPLM_MAX_ORDER << ". Change the defintion of NPLM_MAX_ORDER and recompile."); // log10 compatible with backoff models. base_instance_->set_log_base(10.0); @@ -49,26 +77,25 @@ Model::Model(const std::string &file, std::size_t cache) Model::~Model() {} FullScoreReturn Model::FullScore(const State &from, const WordIndex new_word, State &out_state) const { - nplm::neuralLM *lm = backend_.get(); - if (!lm) { - lm = new nplm::neuralLM(*base_instance_); - backend_.reset(lm); - lm->set_cache(cache_size_); + Backend *backend = backend_.get(); + if (!backend) { + backend = new Backend(*base_instance_, cache_size_); + backend_.reset(backend); } // State is in natural word order. FullScoreReturn ret; - for (int i = 0; i < lm->get_order() - 1; ++i) { - lm->staging_ngram()(i) = from.words[i]; + for (int i = 0; i < backend->order() - 1; ++i) { + backend->staging_ngram()(i) = from.words[i]; } - lm->staging_ngram()(lm->get_order() - 1) = new_word; - ret.prob = lm->lookup_from_staging(); + backend->staging_ngram()(backend->order() - 1) = new_word; + ret.prob = backend->lookup_from_staging(); // Always say full order. - ret.ngram_length = lm->get_order(); + ret.ngram_length = backend->order(); // Shift everything down by one. 
- memcpy(out_state.words, from.words + 1, sizeof(WordIndex) * (lm->get_order() - 2)); - out_state.words[lm->get_order() - 2] = new_word; + memcpy(out_state.words, from.words + 1, sizeof(WordIndex) * (backend->order() - 2)); + out_state.words[backend->order() - 2] = new_word; // Fill in trailing words with zeros so state comparison works. - memset(out_state.words + lm->get_order() - 1, 0, sizeof(WordIndex) * (NPLM_MAX_ORDER - lm->get_order())); + memset(out_state.words + backend->order() - 1, 0, sizeof(WordIndex) * (NPLM_MAX_ORDER - backend->order())); return ret; } diff --git a/lm/wrappers/nplm.hh b/lm/wrappers/nplm.hh index b7dd4a21e..416281de2 100644 --- a/lm/wrappers/nplm.hh +++ b/lm/wrappers/nplm.hh @@ -49,6 +49,8 @@ struct State { WordIndex words[NPLM_MAX_ORDER - 1]; }; +class Backend; + class Model : public lm::base::ModelFacade<Model, State, Vocabulary> { private: typedef lm::base::ModelFacade<Model, State, Vocabulary> P; @@ -68,7 +70,7 @@ class Model : public lm::base::ModelFacade<Model, State, Vocabulary> { private: boost::scoped_ptr<nplm::neuralLM> base_instance_; - mutable boost::thread_specific_ptr<nplm::neuralLM> backend_; + mutable boost::thread_specific_ptr<Backend> backend_; Vocabulary vocab_; diff --git a/util/fake_ofstream.hh b/util/fake_ofstream.hh index 987fa8015..8299ba9ac 100644 --- a/util/fake_ofstream.hh +++ b/util/fake_ofstream.hh @@ -36,6 +36,25 @@ class FakeOFStream { fd_ = to; } + FakeOFStream &Write(const void *data, std::size_t length) { + // Dominant case + if (static_cast<std::size_t>(builder_.size() - builder_.position()) > length) { + builder_.AddSubstring((const char*)data, length); + return *this; + } + Flush(); + if (length > buffer_size_) { + util::WriteOrThrow(fd_, data, length); + } else { + builder_.AddSubstring((const char*)data, length); + } + return *this; + } + + FakeOFStream &operator<<(StringPiece str) { + return Write(str.data(), str.size()); + } + FakeOFStream &operator<<(float value) { // Odd, but this is the 
largest number found in the comments. EnsureRemaining(double_conversion::DoubleToStringConverter::kMaxPrecisionDigits + 8); @@ -49,17 +68,6 @@ class FakeOFStream { return *this; } - FakeOFStream &operator<<(StringPiece str) { - if (str.size() > buffer_size_) { - Flush(); - util::WriteOrThrow(fd_, str.data(), str.size()); - } else { - EnsureRemaining(str.size()); - builder_.AddSubstring(str.data(), str.size()); - } - return *this; - } - // Inefficient! TODO: more efficient implementation FakeOFStream &operator<<(unsigned value) { return *this << boost::lexical_cast<std::string>(value); |