diff options
author | Ulrich Germann <Ulrich.Germann@gmail.com> | 2015-03-26 22:45:18 +0300 |
---|---|---|
committer | Ulrich Germann <Ulrich.Germann@gmail.com> | 2015-03-26 22:45:18 +0300 |
commit | 0c49fb9a006db97008ade5488c793bea0d057fd4 (patch) | |
tree | be07fddd65c53121a3a271c23a3fb4bf32ef9866 | |
parent | 9dc75bfd8ad3092f08f7a6d2a6492323b7803e56 (diff) | |
parent | 206d0c969885817521e941eaec879517e39a5b59 (diff) |
Merge branch 'master' of https://github.com/moses-smt/mosesdecoder into mmt-dev
-rw-r--r-- | lm/Jamfile | 8 | ||||
-rw-r--r-- | lm/builder/adjust_counts.cc | 3 | ||||
-rw-r--r-- | lm/builder/adjust_counts_test.cc | 2 | ||||
-rw-r--r-- | lm/builder/corpus_count_test.cc | 3 | ||||
-rw-r--r-- | lm/builder/lmplz_main.cc | 2 | ||||
-rw-r--r-- | lm/config.hh | 7 | ||||
-rw-r--r-- | lm/wrappers/nplm.cc | 57 | ||||
-rw-r--r-- | lm/wrappers/nplm.hh | 4 | ||||
-rw-r--r-- | util/fake_ofstream.hh | 30 |
9 files changed, 78 insertions(+), 38 deletions(-)
diff --git a/lm/Jamfile b/lm/Jamfile index edc3751a7..0b5bbf259 100644 --- a/lm/Jamfile +++ b/lm/Jamfile @@ -14,12 +14,12 @@ update-if-changed $(ORDER-LOG) $(max-order) ; max-order += <dependency>$(ORDER-LOG) ; wrappers = ; -local with-nplm = [ option.get "with-nplm-0.1" ] ; +local with-nplm = [ option.get "with-nplm" ] ; if $(with-nplm) { - lib neuralLM : : <search>$(with-nplm)/src ; + lib nplm : : <search>$(with-nplm)/src ; obj nplm.o : wrappers/nplm.cc : <include>.. <include>$(with-nplm)/src <cxxflags>-fopenmp ; - alias nplm : nplm.o neuralLM ..//boost_thread : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>WITH_NPLM <library>..//boost_thread ; - wrappers += nplm ; + alias nplm-all : nplm.o nplm ..//boost_thread : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>WITH_NPLM <library>..//boost_thread ; + wrappers += nplm-all ; } fakelib kenlm : $(wrappers) [ glob *.cc : *main.cc *test.cc ] ../util//kenutil : <include>.. $(max-order) : : <include>.. $(max-order) ; diff --git a/lm/builder/adjust_counts.cc b/lm/builder/adjust_counts.cc index 03ccbb934..fa77d45a7 100644 --- a/lm/builder/adjust_counts.cc +++ b/lm/builder/adjust_counts.cc @@ -48,7 +48,8 @@ class StatCollector { // TODO: Specialize error message for j == 3, meaning 3+ UTIL_THROW_IF(s.n[j] == 0, BadDiscountException, "Could not calculate Kneser-Ney discounts for " << (i+1) << "-grams with adjusted count " << (j+1) << " because we didn't observe any " - << (i+1) << "-grams with adjusted count " << j << "; Is this small or artificial data?"); + << (i+1) << "-grams with adjusted count " << j << "; Is this small or artificial data?\n" + << "Try deduplicating the input. To override this error for e.g. a class-based model, rerun with --discount_fallback\n"); } // See equation (26) in Chen and Goodman. 
diff --git a/lm/builder/adjust_counts_test.cc b/lm/builder/adjust_counts_test.cc index 073c5dfeb..353e3dd35 100644 --- a/lm/builder/adjust_counts_test.cc +++ b/lm/builder/adjust_counts_test.cc @@ -78,7 +78,7 @@ BOOST_AUTO_TEST_CASE(Simple) { DiscountConfig discount_config; discount_config.fallback = Discount(); discount_config.bad_action = THROW_UP; - BOOST_CHECK_THROW(AdjustCounts(prune_thresholds, counts, counts_pruned, discount_config, discount).Run(for_adjust), BadDiscountException); + BOOST_CHECK_THROW(AdjustCounts(prune_thresholds, counts, counts_pruned, std::vector<bool>(), discount_config, discount).Run(for_adjust), BadDiscountException); } BOOST_REQUIRE_EQUAL(4UL, counts.size()); BOOST_CHECK_EQUAL(4UL, counts[0]); diff --git a/lm/builder/corpus_count_test.cc b/lm/builder/corpus_count_test.cc index 26cb63469..18301656f 100644 --- a/lm/builder/corpus_count_test.cc +++ b/lm/builder/corpus_count_test.cc @@ -45,7 +45,8 @@ BOOST_AUTO_TEST_CASE(Short) { NGramStream stream; uint64_t token_count; WordIndex type_count = 10; - CorpusCount counter(input_piece, vocab.get(), token_count, type_count, chain.BlockSize() / chain.EntrySize(), SILENT); + std::vector<bool> prune_words; + CorpusCount counter(input_piece, vocab.get(), token_count, type_count, prune_words, "", chain.BlockSize() / chain.EntrySize(), SILENT); chain >> boost::ref(counter) >> stream >> util::stream::kRecycle; const char *v[] = {"<unk>", "<s>", "</s>", "looking", "on", "a", "little", "more", "loin", "foo", "bar"}; diff --git a/lm/builder/lmplz_main.cc b/lm/builder/lmplz_main.cc index d3bd99d23..65ec55729 100644 --- a/lm/builder/lmplz_main.cc +++ b/lm/builder/lmplz_main.cc @@ -202,6 +202,7 @@ int main(int argc, char *argv[]) { initial.adder_out.block_count = 2; pipeline.read_backoffs = initial.adder_out; + // Read from stdin, write to stdout by default util::scoped_fd in(0), out(1); if (vm.count("text")) { in.reset(util::OpenReadOrThrow(text.c_str())); @@ -210,7 +211,6 @@ int main(int argc, char 
*argv[]) { out.reset(util::CreateOrThrow(arpa.c_str())); } - // Read from stdin try { lm::builder::Output output; output.Add(new lm::builder::PrintARPA(out.release(), verbose_header)); diff --git a/lm/config.hh b/lm/config.hh index a4238cd9a..21b9e7eeb 100644 --- a/lm/config.hh +++ b/lm/config.hh @@ -30,9 +30,10 @@ struct Config { return show_progress ? messages : 0; } - // This will be called with every string in the vocabulary. See - // enumerate_vocab.hh for more detail. Config does not take ownership; you - // are still responsible for deleting it (or stack allocating). + // This will be called with every string in the vocabulary by the + // constructor; it need only exist for the lifetime of the constructor. + // See enumerate_vocab.hh for more detail. Config does not take ownership; + // just delete/let it go out of scope after the constructor exits. EnumerateVocab *enumerate_vocab; diff --git a/lm/wrappers/nplm.cc b/lm/wrappers/nplm.cc index 70622bd2b..44fd75a83 100644 --- a/lm/wrappers/nplm.cc +++ b/lm/wrappers/nplm.cc @@ -21,6 +21,26 @@ WordIndex Vocabulary::Index(const std::string &str) const { return vocab_.lookup_word(str); } +class Backend { + public: + Backend(const nplm::neuralLM &from, const std::size_t cache_size) : lm_(from), ngram_(from.get_order()) { + lm_.set_cache(cache_size); + } + + nplm::neuralLM &LM() { return lm_; } + const nplm::neuralLM &LM() const { return lm_; } + + Eigen::Matrix<int,Eigen::Dynamic,1> &staging_ngram() { return ngram_; } + + double lookup_from_staging() { return lm_.lookup_ngram(ngram_); } + + int order() const { return lm_.get_order(); } + + private: + nplm::neuralLM lm_; + Eigen::Matrix<int,Eigen::Dynamic,1> ngram_; +}; + bool Model::Recognize(const std::string &name) { try { util::scoped_fd file(util::OpenReadOrThrow(name.c_str())); @@ -31,10 +51,18 @@ bool Model::Recognize(const std::string &name) { } catch (const util::Exception &) { return false; } -} +} + +namespace { +nplm::neuralLM *LoadNPLM(const std::string 
&file) { + util::scoped_ptr<nplm::neuralLM> ret(new nplm::neuralLM()); + ret->read(file); + return ret.release(); +} +} // namespace Model::Model(const std::string &file, std::size_t cache) - : base_instance_(new nplm::neuralLM(file)), vocab_(base_instance_->get_vocabulary()), cache_size_(cache) { + : base_instance_(LoadNPLM(file)), vocab_(base_instance_->get_vocabulary()), cache_size_(cache) { UTIL_THROW_IF(base_instance_->get_order() > NPLM_MAX_ORDER, util::Exception, "This NPLM has order " << (unsigned int)base_instance_->get_order() << " but the KenLM wrapper was compiled with " << NPLM_MAX_ORDER << ". Change the defintion of NPLM_MAX_ORDER and recompile."); // log10 compatible with backoff models. base_instance_->set_log_base(10.0); @@ -49,26 +77,25 @@ Model::Model(const std::string &file, std::size_t cache) Model::~Model() {} FullScoreReturn Model::FullScore(const State &from, const WordIndex new_word, State &out_state) const { - nplm::neuralLM *lm = backend_.get(); - if (!lm) { - lm = new nplm::neuralLM(*base_instance_); - backend_.reset(lm); - lm->set_cache(cache_size_); + Backend *backend = backend_.get(); + if (!backend) { + backend = new Backend(*base_instance_, cache_size_); + backend_.reset(backend); } // State is in natural word order. FullScoreReturn ret; - for (int i = 0; i < lm->get_order() - 1; ++i) { - lm->staging_ngram()(i) = from.words[i]; + for (int i = 0; i < backend->order() - 1; ++i) { + backend->staging_ngram()(i) = from.words[i]; } - lm->staging_ngram()(lm->get_order() - 1) = new_word; - ret.prob = lm->lookup_from_staging(); + backend->staging_ngram()(backend->order() - 1) = new_word; + ret.prob = backend->lookup_from_staging(); // Always say full order. - ret.ngram_length = lm->get_order(); + ret.ngram_length = backend->order(); // Shift everything down by one. 
- memcpy(out_state.words, from.words + 1, sizeof(WordIndex) * (lm->get_order() - 2)); - out_state.words[lm->get_order() - 2] = new_word; + memcpy(out_state.words, from.words + 1, sizeof(WordIndex) * (backend->order() - 2)); + out_state.words[backend->order() - 2] = new_word; // Fill in trailing words with zeros so state comparison works. - memset(out_state.words + lm->get_order() - 1, 0, sizeof(WordIndex) * (NPLM_MAX_ORDER - lm->get_order())); + memset(out_state.words + backend->order() - 1, 0, sizeof(WordIndex) * (NPLM_MAX_ORDER - backend->order())); return ret; } diff --git a/lm/wrappers/nplm.hh b/lm/wrappers/nplm.hh index b7dd4a21e..416281de2 100644 --- a/lm/wrappers/nplm.hh +++ b/lm/wrappers/nplm.hh @@ -49,6 +49,8 @@ struct State { WordIndex words[NPLM_MAX_ORDER - 1]; }; +class Backend; + class Model : public lm::base::ModelFacade<Model, State, Vocabulary> { private: typedef lm::base::ModelFacade<Model, State, Vocabulary> P; @@ -68,7 +70,7 @@ class Model : public lm::base::ModelFacade<Model, State, Vocabulary> { private: boost::scoped_ptr<nplm::neuralLM> base_instance_; - mutable boost::thread_specific_ptr<nplm::neuralLM> backend_; + mutable boost::thread_specific_ptr<Backend> backend_; Vocabulary vocab_; diff --git a/util/fake_ofstream.hh b/util/fake_ofstream.hh index 987fa8015..8299ba9ac 100644 --- a/util/fake_ofstream.hh +++ b/util/fake_ofstream.hh @@ -36,6 +36,25 @@ class FakeOFStream { fd_ = to; } + FakeOFStream &Write(const void *data, std::size_t length) { + // Dominant case + if (static_cast<std::size_t>(builder_.size() - builder_.position()) > length) { + builder_.AddSubstring((const char*)data, length); + return *this; + } + Flush(); + if (length > buffer_size_) { + util::WriteOrThrow(fd_, data, length); + } else { + builder_.AddSubstring((const char*)data, length); + } + return *this; + } + + FakeOFStream &operator<<(StringPiece str) { + return Write(str.data(), str.size()); + } + FakeOFStream &operator<<(float value) { // Odd, but this is the 
largest number found in the comments. EnsureRemaining(double_conversion::DoubleToStringConverter::kMaxPrecisionDigits + 8); @@ -49,17 +68,6 @@ class FakeOFStream { return *this; } - FakeOFStream &operator<<(StringPiece str) { - if (str.size() > buffer_size_) { - Flush(); - util::WriteOrThrow(fd_, str.data(), str.size()); - } else { - EnsureRemaining(str.size()); - builder_.AddSubstring(str.data(), str.size()); - } - return *this; - } - // Inefficient! TODO: more efficient implementation FakeOFStream &operator<<(unsigned value) { return *this << boost::lexical_cast<std::string>(value); |