KenLM 10ddf7d923355b35a7de9a5219673eca9e18be98 except Hieu's slow string_piece_hash

author: Kenneth Heafield <github@kheafield.com> 2013-05-19 18:12:06 +0400
committer: Kenneth Heafield <github@kheafield.com> 2013-05-19 18:12:06 +0400
commit: 50652382e9285740de73654a7f47a8f4a9d993a1 (patch)
tree: 31f37b7f09559678c3f4661290287ce39d34da39 /lm
parent: 41da56364565e0aa9d40cce018e5ef82f9766430 (diff)
9 files changed, 24 insertions, 95 deletions
diff --git a/lm/Jamfile b/lm/Jamfile
index 3f25d9ce4..eaf629fd0 100644
--- a/lm/Jamfile
+++ b/lm/Jamfile
@@ -21,9 +21,11 @@ run left_test.cc kenlm /top//boost_unit_test_framework : : test.arpa ;
 run model_test.cc kenlm /top//boost_unit_test_framework : : test.arpa test_nounk.arpa ;
 run partial_test.cc kenlm /top//boost_unit_test_framework : : test.arpa ;
 
-exe query : query_main.cc kenlm ../util//kenutil ;
-exe build_binary : build_binary_main.cc kenlm ../util//kenutil ;
-exe kenlm_max_order : kenlm_max_order_main.cc : <include>.. $(max-order) ;
-exe fragment : fragment_main.cc kenlm ;
+exes = ;
+for local p in [ glob *_main.cc ] {
+  local name = [ MATCH "(.*)\_main.cc" : $(p) ] ;
+  exe $(name) : $(p) kenlm ;
+  exes += $(name) ;
+}
 
-alias programs : query build_binary kenlm_max_order fragment filter//filter : <threading>multi:<source>builder//lmplz ;
+alias programs : $(exes) filter//filter : <threading>multi:<source>builder//lmplz ;
diff --git a/lm/builder/corpus_count.cc b/lm/builder/corpus_count.cc
index 3714dddad..aea93ad10 100644
--- a/lm/builder/corpus_count.cc
+++ b/lm/builder/corpus_count.cc
@@ -3,6 +3,7 @@
 #include "lm/builder/ngram.hh"
 #include "lm/lm_exception.hh"
 #include "lm/word_index.hh"
+#include "util/fake_ofstream.hh"
 #include "util/file.hh"
 #include "util/file_piece.hh"
 #include "util/murmur_hash.hh"
@@ -48,10 +49,8 @@ class VocabHandout {
     explicit VocabHandout(int fd, WordIndex initial_guess) :
         table_backing_(util::CallocOrThrow(MemUsage(initial_guess))),
         table_(table_backing_.get(), MemUsage(initial_guess)),
-        double_cutoff_(std::max<std::size_t>(initial_guess * 1.1, 1)) {
-      util::scoped_fd duped(util::DupOrThrow(fd));
-      word_list_.reset(util::FDOpenOrThrow(duped));
-      
+        double_cutoff_(std::max<std::size_t>(initial_guess * 1.1, 1)),
+        word_list_(fd) {
       Lookup("<unk>"); // Force 0
       Lookup("<s>"); // Force 1
       Lookup("</s>"); // Force 2
@@ -65,9 +64,7 @@ class VocabHandout {
       Table::MutableIterator it;
       if (table_.FindOrInsert(entry, it))
         return it->value;
-      char null_delimit = 0;
-      util::WriteOrThrow(word_list_.get(), word.data(), word.size());
-      util::WriteOrThrow(word_list_.get(), &null_delimit, 1);
+      word_list_ << word << '\0';
       UTIL_THROW_IF(Size() >= std::numeric_limits<lm::WordIndex>::max(), VocabLoadException, "Too many vocabulary words.  Change WordIndex to uint64_t in lm/word_index.hh.");
       if (Size() >= double_cutoff_) {
         table_backing_.call_realloc(table_.DoubleTo());
@@ -90,8 +87,8 @@ class VocabHandout {
     Table table_;
 
     std::size_t double_cutoff_;
-
-    util::scoped_FILE word_list_;
+    
+    util::FakeOFStream word_list_;
 };
 
 class DedupeHash : public std::unary_function<const WordIndex *, bool> {
diff --git a/lm/builder/pipeline.hh b/lm/builder/pipeline.hh
index fc3314bf1..845e5481d 100644
--- a/lm/builder/pipeline.hh
+++ b/lm/builder/pipeline.hh
@@ -34,7 +34,7 @@ struct PipelineConfig {
   std::size_t TotalMemory() const { return sort.total_memory; }
 };
 
-// Takes ownership of text_file.
+// Takes ownership of text_file and out_arpa.
 void Pipeline(PipelineConfig config, int text_file, int out_arpa);
 
 }} // namespaces
diff --git a/lm/builder/print.cc b/lm/builder/print.cc
index b0323221a..84bd81cad 100644
--- a/lm/builder/print.cc
+++ b/lm/builder/print.cc
@@ -1,15 +1,11 @@
 #include "lm/builder/print.hh"
 
-#include "util/double-conversion/double-conversion.h"
-#include "util/double-conversion/utils.h"
+#include "util/fake_ofstream.hh"
 #include "util/file.hh"
 #include "util/mmap.hh"
 #include "util/scoped.hh"
 #include "util/stream/timer.hh"
 
-#define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
-#include <boost/lexical_cast.hpp>
-
 #include <sstream>
 
 #include <string.h>
@@ -28,71 +24,6 @@ VocabReconstitute::VocabReconstitute(int fd) {
   map_.push_back(i);
 }
 
-namespace {
-class OutputManager {
-  public:
-    static const std::size_t kOutBuf = 1048576;
-
-    // Does not take ownership of out.
-    explicit OutputManager(int out)
-      : buf_(util::MallocOrThrow(kOutBuf)),
-        builder_(static_cast<char*>(buf_.get()), kOutBuf),
-        // Mostly the default but with inf instead.  And no flags.
-        convert_(double_conversion::DoubleToStringConverter::NO_FLAGS, "inf", "NaN", 'e', -6, 21, 6, 0),
-        fd_(out) {}
-
-    ~OutputManager() {
-      Flush();
-    }
-
-    OutputManager &operator<<(float value) {
-      // Odd, but this is the largest number found in the comments.
-      EnsureRemaining(double_conversion::DoubleToStringConverter::kMaxPrecisionDigits + 8);
-      convert_.ToShortestSingle(value, &builder_);
-      return *this;
-    }
-
-    OutputManager &operator<<(StringPiece str) {
-      if (str.size() > kOutBuf) {
-        Flush();
-        util::WriteOrThrow(fd_, str.data(), str.size());
-      } else {
-        EnsureRemaining(str.size());
-        builder_.AddSubstring(str.data(), str.size());
-      }
-      return *this;
-    }
-
-    // Inefficient!
-    OutputManager &operator<<(unsigned val) {
-      return *this << boost::lexical_cast<std::string>(val);
-    }
-
-    OutputManager &operator<<(char c) {
-      EnsureRemaining(1);
-      builder_.AddCharacter(c);
-      return *this;
-    }
-
-    void Flush() {
-      util::WriteOrThrow(fd_, buf_.get(), builder_.position());
-      builder_.Reset();
-    }
-
-  private:
-    void EnsureRemaining(std::size_t amount) {
-      if (static_cast<std::size_t>(builder_.size() - builder_.position()) < amount) {
-        Flush();
-      }
-    }
-
-    util::scoped_malloc buf_;
-    double_conversion::StringBuilder builder_;
-    double_conversion::DoubleToStringConverter convert_;
-    int fd_;
-};
-} // namespace
-
 PrintARPA::PrintARPA(const VocabReconstitute &vocab, const std::vector<uint64_t> &counts, const HeaderInfo* header_info, int out_fd) 
   : vocab_(vocab), out_fd_(out_fd) {
   std::stringstream stream;
@@ -112,8 +43,9 @@ PrintARPA::PrintARPA(const VocabReconstitute &vocab, const std::vector<uint64_t>
 }
 
 void PrintARPA::Run(const ChainPositions &positions) {
+  util::scoped_fd closer(out_fd_);
   UTIL_TIMER("(%w s) Wrote ARPA file\n");
-  OutputManager out(out_fd_);
+  util::FakeOFStream out(out_fd_);
   for (unsigned order = 1; order <= positions.size(); ++order) {
     out << "\\" << order << "-grams:" << '\n';
     for (NGramStream stream(positions[order - 1]); stream; ++stream) {
diff --git a/lm/builder/print.hh b/lm/builder/print.hh
index aa932e757..adbbb94a9 100644
--- a/lm/builder/print.hh
+++ b/lm/builder/print.hh
@@ -88,7 +88,8 @@ template <class V> class Print {
 
 class PrintARPA {
   public:
-    // header_info may be NULL to disable the header
+    // header_info may be NULL to disable the header.
+    // Takes ownership of out_fd upon Run().
     explicit PrintARPA(const VocabReconstitute &vocab, const std::vector<uint64_t> &counts, const HeaderInfo* header_info, int out_fd);
 
     void Run(const ChainPositions &positions);
diff --git a/lm/filter/Jamfile b/lm/filter/Jamfile
index adee3d233..bcf62da78 100644
--- a/lm/filter/Jamfile
+++ b/lm/filter/Jamfile
@@ -3,3 +3,5 @@ fakelib lm_filter : phrase.cc vocab.cc arpa_io.cc ../../util//kenutil : <threadi
 obj main : filter_main.cc : <threading>single:<define>NTHREAD <include>../.. ;
 
 exe filter : main lm_filter ../../util//kenutil ..//kenlm : <threading>multi:<library>/top//boost_thread ;
+
+exe phrase_table_vocab : phrase_table_vocab_main.cc ../../util//kenutil ;
diff --git a/lm/filter/filter_main.cc b/lm/filter/filter_main.cc
index 1a4ba84fe..1736bc405 100644
--- a/lm/filter/filter_main.cc
+++ b/lm/filter/filter_main.cc
@@ -25,8 +25,8 @@ void DisplayHelp(const char *name) {
     "    parser.\n"
     "single mode treats the entire input as a single sentence.\n"
     "multiple mode filters to multiple sentences in parallel.  Each sentence is on\n"
-    "    a separate line.  A separate file is created for each file by appending the\n"
-    "    0-indexed line number to the output file name.\n"
+    "    a separate line.  A separate file is created for each sentence by appending\n"
+    "    the 0-indexed line number to the output file name.\n"
     "union mode produces one filtered model that is the union of models created by\n"
     "    multiple mode.\n\n"
     "context means only the context (all but last word) has to pass the filter, but\n"
diff --git a/lm/kenlm_max_order_main.cc b/lm/kenlm_max_order_main.cc
deleted file mode 100644
index 94221201c..000000000
--- a/lm/kenlm_max_order_main.cc
+++ /dev/null
@@ -1,6 +0,0 @@
-#include "lm/max_order.hh"
-#include <iostream>
-
-int main(int argc, char *argv[]) {
-  std::cerr << "KenLM was compiled with a maximum supported n-gram order set to " << KENLM_MAX_ORDER << "." << std::endl;
-}
diff --git a/lm/query_main.cc b/lm/query_main.cc
index 49757d9aa..27d3a1a56 100644
--- a/lm/query_main.cc
+++ b/lm/query_main.cc
@@ -2,6 +2,7 @@
 
 int main(int argc, char *argv[]) {
   if (!(argc == 2 || (argc == 3 && !strcmp(argv[2], "null")))) {
+    std::cerr << "KenLM was compiled with maximum order " << KENLM_MAX_ORDER << "." << std::endl;
     std::cerr << "Usage: " << argv[0] << " lm_file [null]" << std::endl;
     std::cerr << "Input is wrapped in <s> and </s> unless null is passed." << std::endl;
     return 1;
author	Kenneth Heafield <github@kheafield.com>	2013-05-19 18:12:06 +0400
committer	Kenneth Heafield <github@kheafield.com>	2013-05-19 18:12:06 +0400
commit	50652382e9285740de73654a7f47a8f4a9d993a1 (patch)
tree	31f37b7f09559678c3f4661290287ce39d34da39 /lm
parent	41da56364565e0aa9d40cce018e5ef82f9766430 (diff)