KenLM 10ddf7d923355b35a7de9a5219673eca9e18be98 except Hieu's slow string_piece_hash

author: Kenneth Heafield <github@kheafield.com> 2013-05-19 18:12:06 +0400
committer: Kenneth Heafield <github@kheafield.com> 2013-05-19 18:12:06 +0400
commit: 50652382e9285740de73654a7f47a8f4a9d993a1 (patch)
tree: 31f37b7f09559678c3f4661290287ce39d34da39 /util
parent: 41da56364565e0aa9d40cce018e5ef82f9766430 (diff)
8 files changed, 191 insertions, 33 deletions
diff --git a/util/double-conversion/utils.h b/util/double-conversion/utils.h
index 767094b8b..2bd716050 100644
--- a/util/double-conversion/utils.h
+++ b/util/double-conversion/utils.h
@@ -218,7 +218,8 @@ class StringBuilder {
   // 0-characters; use the Finalize() method to terminate the string
   // instead.
   void AddCharacter(char c) {
-    ASSERT(c != '\0');
+    // Local change: the contents are extracted as raw bytes, not as a C string, so a '\0' character is permitted here.
+    //ASSERT(c != '\0');
     ASSERT(!is_finalized() && position_ < buffer_.length());
     buffer_[position_++] = c;
   }
@@ -233,7 +234,8 @@ class StringBuilder {
   // builder. The input string must have enough characters.
   void AddSubstring(const char* s, int n) {
     ASSERT(!is_finalized() && position_ + n < buffer_.length());
-    ASSERT(static_cast<size_t>(n) <= strlen(s));
+    // Local change: s is copied as n raw bytes, not as a C string, so the strlen() check below is disabled.
+    //ASSERT(static_cast<size_t>(n) <= strlen(s));
     memmove(&buffer_[position_], s, n * kCharSize);
     position_ += n;
   }
@@ -253,7 +255,8 @@ class StringBuilder {
     buffer_[position_] = '\0';
     // Make sure nobody managed to add a 0-character to the
     // buffer while building the string.
-    ASSERT(strlen(buffer_.start()) == static_cast<size_t>(position_));
+    // I just extract raw data not a cstr so null is fine.
+    //ASSERT(strlen(buffer_.start()) == static_cast<size_t>(position_));
     position_ = -1;
     ASSERT(is_finalized());
     return buffer_.start();
diff --git a/util/fake_ofstream.hh b/util/fake_ofstream.hh
new file mode 100644
index 000000000..bcdebe455
--- /dev/null
+++ b/util/fake_ofstream.hh
@@ -0,0 +1,94 @@
+/* Like std::ofstream but without being incredibly slow.  Backed by a raw fd.
+ * Does not support many data types.  Currently, it's targeted at writing ARPA
+ * files quickly.
+ */
+#include "util/double-conversion/double-conversion.h"
+#include "util/double-conversion/utils.h"
+#include "util/file.hh"
+#include "util/scoped.hh"
+#include "util/string_piece.hh"
+
+#define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
+#include <boost/lexical_cast.hpp>
+
+namespace util {
+class FakeOFStream {
+  public:
+    static const std::size_t kOutBuf = 1048576;  // Size in bytes of the in-memory output buffer.
+
+    // Buffers output destined for file descriptor out.  Does not take ownership of out.
+    explicit FakeOFStream(int out)
+      : buf_(util::MallocOrThrow(kOutBuf)),
+        builder_(static_cast<char*>(buf_.get()), kOutBuf),
+        // Mostly the default but with inf instead.  And no flags.
+        convert_(double_conversion::DoubleToStringConverter::NO_FLAGS, "inf", "NaN", 'e', -6, 21, 6, 0),
+        fd_(out) {}
+
+    ~FakeOFStream() {
+      if (buf_.get()) Flush();  // Skipped after Finish(), which releases buf_.  NOTE(review): Flush() calls WriteOrThrow, which presumably can throw from this destructor.
+    }
+
+    FakeOFStream &operator<<(float value) {
+      // Odd, but this is the largest number found in the comments.
+      EnsureRemaining(double_conversion::DoubleToStringConverter::kMaxPrecisionDigits + 8);
+      convert_.ToShortestSingle(value, &builder_);
+      return *this;
+    }
+
+    FakeOFStream &operator<<(double value) {
+      EnsureRemaining(double_conversion::DoubleToStringConverter::kMaxPrecisionDigits + 8);
+      convert_.ToShortest(value, &builder_);
+      return *this;
+    }
+
+    FakeOFStream &operator<<(StringPiece str) {
+      if (str.size() >= kOutBuf) {  // AddSubstring requires position_ + n < buffer length, so a piece of kOutBuf bytes or more bypasses the buffer.
+        Flush();
+        util::WriteOrThrow(fd_, str.data(), str.size());
+      } else {
+        EnsureRemaining(str.size());
+        builder_.AddSubstring(str.data(), str.size());
+      }
+      return *this;
+    }
+
+    // Inefficient!  TODO: more efficient implementation
+    FakeOFStream &operator<<(unsigned value) {
+      return *this << boost::lexical_cast<std::string>(value);
+    }
+
+    FakeOFStream &operator<<(char c) {
+      EnsureRemaining(1);
+      builder_.AddCharacter(c);
+      return *this;
+    }
+
+    // Writes the buffered bytes to fd_ and resets the builder.  Note this does not sync.
+    void Flush() {
+      util::WriteOrThrow(fd_, buf_.get(), builder_.position());
+      builder_.Reset();
+    }
+
+    // Not necessary, but does assure the data is cleared: flushes, releases the buffer, then syncs fd_.
+    void Finish() {
+      Flush();
+      // It will segfault trying to null terminate otherwise.
+      builder_.Finalize();
+      buf_.reset();
+      util::FSyncOrThrow(fd_);
+    }
+
+  private:
+    void EnsureRemaining(std::size_t amount) {
+      if (static_cast<std::size_t>(builder_.size() - builder_.position()) <= amount) {  // Flush unless strictly more than amount bytes are free.
+        Flush();
+      }
+    }
+
+    util::scoped_malloc buf_;
+    double_conversion::StringBuilder builder_;
+    double_conversion::DoubleToStringConverter convert_;
+    int fd_;
+};
+
+} // namespace
diff --git a/util/file_piece.cc b/util/file_piece.cc
index bed5f85af..b5961bea6 100644
--- a/util/file_piece.cc
+++ b/util/file_piece.cc
@@ -218,7 +218,7 @@ void FilePiece::MMapShift(uint64_t desired_begin) {
   // Use mmap.  
   uint64_t ignore = desired_begin % page_;
   // Duplicate request for Shift means give more data.  
-  if (position_ == data_.begin() + ignore) {
+  if (position_ == data_.begin() + ignore && position_) {
     default_map_size_ *= 2;
   }
   // Local version so that in case of failure it doesn't overwrite the class variable.  
diff --git a/util/have.hh b/util/have.hh
index ab1393074..6e18529d2 100644
--- a/util/have.hh
+++ b/util/have.hh
@@ -2,6 +2,10 @@
 #ifndef UTIL_HAVE__
 #define UTIL_HAVE__
 
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
 #ifndef HAVE_ICU
 //#define HAVE_ICU
 #endif
diff --git a/util/mmap.cc b/util/mmap.cc
index bc9e3f815..6f79f26f5 100644
--- a/util/mmap.cc
+++ b/util/mmap.cc
@@ -6,6 +6,7 @@
 
 #include "util/exception.hh"
 #include "util/file.hh"
+#include "util/scoped.hh"
 
 #include <iostream>
 
@@ -110,8 +111,14 @@ void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int
   UTIL_THROW_IF(!ret, ErrnoException, "MapViewOfFile failed");
 #else
   int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ;
-  void *ret = mmap(NULL, size, protect, flags, fd, offset);
-  UTIL_THROW_IF(ret == MAP_FAILED, ErrnoException, "mmap failed for size " << size << " at offset " << offset);
+  void *ret;
+  UTIL_THROW_IF((ret = mmap(NULL, size, protect, flags, fd, offset)) == MAP_FAILED, ErrnoException, "mmap failed for size " << size << " at offset " << offset);
+#  ifdef MADV_HUGEPAGE
+  /* We like huge pages but it's fine if we can't have them.  Note that huge
+   * pages are not supported for file-backed mmap on linux.
+   */
+  madvise(ret, size, MADV_HUGEPAGE);
+#  endif
 #endif
   return ret;
 }
@@ -141,8 +148,7 @@ void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scope
     case POPULATE_OR_READ:
 #endif
     case READ:
-      out.reset(malloc(size), size, scoped_memory::MALLOC_ALLOCATED);
-      if (!out.get()) UTIL_THROW(util::ErrnoException, "Allocating " << size << " bytes with malloc");
+      out.reset(MallocOrThrow(size), size, scoped_memory::MALLOC_ALLOCATED);
       SeekOrThrow(fd, offset);
       ReadOrThrow(fd, out.get(), size);
       break;
diff --git a/util/scoped.cc b/util/scoped.cc
index b6972f14b..6c5b0c2db 100644
--- a/util/scoped.cc
+++ b/util/scoped.cc
@@ -1,6 +1,9 @@
 #include "util/scoped.hh"
 
 #include <cstdlib>
+#if !defined(_WIN32) && !defined(_WIN64)
+#include <sys/mman.h>
+#endif
 
 namespace util {
 
@@ -10,26 +13,31 @@ MallocException::MallocException(std::size_t requested) throw() {
 
 MallocException::~MallocException() throw() {}
 
+namespace {
+void *InspectAddr(void *addr, std::size_t requested, const char *func_name) {
+  UTIL_THROW_IF_ARG(!addr && requested, MallocException, (requested), "in " << func_name);
+  // These routines are often used for large chunks of memory where huge pages help.  The hint is best effort; its result is ignored.
+#ifdef MADV_HUGEPAGE
+  madvise(addr, requested, MADV_HUGEPAGE);
+#endif
+  return addr;
+}
+} // namespace
+
 void *MallocOrThrow(std::size_t requested) {
-  void *ret;
-  UTIL_THROW_IF_ARG(!(ret = std::malloc(requested)), MallocException, (requested), "in malloc");
-  return ret;
+  return InspectAddr(std::malloc(requested), requested, "malloc");  // Throws MallocException if malloc returns NULL for a non-zero request.
 }
 
 void *CallocOrThrow(std::size_t requested) {
-  void *ret;
-  UTIL_THROW_IF_ARG(!(ret = std::calloc(1, requested)), MallocException, (requested), "in calloc");
-  return ret;
+  return InspectAddr(std::calloc(1, requested), requested, "calloc");  // Throws MallocException if calloc returns NULL for a non-zero request.
 }
 
 scoped_malloc::~scoped_malloc() {
   std::free(p_);
 }
 
-void scoped_malloc::call_realloc(std::size_t to) {
-  void *ret;
-  UTIL_THROW_IF_ARG(!(ret = std::realloc(p_, to)) && to, MallocException, (to), "in realloc");
-  p_ = ret;
+void scoped_malloc::call_realloc(std::size_t requested) {
+  p_ = InspectAddr(std::realloc(p_, requested), requested, "realloc");  // If InspectAddr throws, the assignment never happens and p_ keeps its old value.
 }
 
 } // namespace util
diff --git a/util/sized_iterator.hh b/util/sized_iterator.hh
index aabcc5319..cf998953b 100644
--- a/util/sized_iterator.hh
+++ b/util/sized_iterator.hh
@@ -3,6 +3,7 @@
 
 #include "util/proxy_iterator.hh"
 
+#include <algorithm>
 #include <functional>
 #include <string>
 
@@ -63,6 +64,13 @@ class SizedProxy {
     const void *Data() const { return inner_.Data(); }
     void *Data() { return inner_.Data(); }
 
+    friend void swap(SizedProxy &first, SizedProxy &second) {
+      // Exchange the two entries byte by byte, using first's EntrySize() as the length.
+      char *const lhs = static_cast<char*>(first.inner_.Data());
+      char *const rhs = static_cast<char*>(second.inner_.Data());
+      std::swap_ranges(lhs, lhs + first.inner_.EntrySize(), rhs);
+    }
+
   private:
     friend class util::ProxyIterator<SizedProxy>;
 
diff --git a/util/usage.cc b/util/usage.cc
index b8e125d0d..5fa3cc9ae 100644
--- a/util/usage.cc
+++ b/util/usage.cc
@@ -5,45 +5,80 @@
 #include <fstream>
 #include <ostream>
 #include <sstream>
+#include <set>
+#include <string>
 
 #include <string.h>
 #include <ctype.h>
 #if !defined(_WIN32) && !defined(_WIN64)
 #include <sys/resource.h>
 #include <sys/time.h>
+#include <time.h>
 #include <unistd.h>
 #endif
 
 namespace util {
 
-namespace {
 #if !defined(_WIN32) && !defined(_WIN64)
+namespace {
 float FloatSec(const struct timeval &tv) {
   return static_cast<float>(tv.tv_sec) + (static_cast<float>(tv.tv_usec) / 1000000.0);
 }
-#endif
+// Seconds elapsed between two clock readings.  Subtracts before converting so precision is not lost on large clock values.
+double ElapsedSec(const struct timespec &from, const struct timespec &to) {
+  return static_cast<double>(to.tv_sec - from.tv_sec) + static_cast<double>(to.tv_nsec - from.tv_nsec) / 1000000000.0;
+}
+const char *SkipSpaces(const char *at) {
+  for (; *at == ' ' || *at == '\t'; ++at) {}
+  return at;
+}
+
+class RecordStart {
+  public:
+    RecordStart() {
+      clock_gettime(CLOCK_MONOTONIC, &started_);
+    }
+
+    const struct timespec &Started() const {
+      return started_;
+    }
+
+  private:
+    struct timespec started_;
+};
+
+const RecordStart kRecordStart;
 } // namespace
+#endif
 
 void PrintUsage(std::ostream &out) {
 #if !defined(_WIN32) && !defined(_WIN64)
+  // Linux doesn't set memory usage in getrusage :-(
+  std::set<std::string> headers;
+  headers.insert("VmPeak:");
+  headers.insert("VmRSS:");
+  headers.insert("Name:");
+
+  std::ifstream status("/proc/self/status", std::ios::in);
+  std::string header, value;
+  while ((status >> header) && getline(status, value)) {
+    if (headers.find(header) != headers.end()) {
+      out << header << SkipSpaces(value.c_str()) << '\t';
+    }
+  }
+  // Resource usage of this process itself (RUSAGE_SELF), matching the /proc/self figures above.
   struct rusage usage;
   if (getrusage(RUSAGE_SELF, &usage)) {
     perror("getrusage");
     return;
   }
-  out << "user\t" << FloatSec(usage.ru_utime) << "\nsys\t" << FloatSec(usage.ru_stime) << '\n';
+  out << "RSSMax:" << usage.ru_maxrss << " kB" << '\t';
+  out << "user:" << FloatSec(usage.ru_utime) << "\tsys:" << FloatSec(usage.ru_stime) << '\t';
+  out << "CPU:" << (FloatSec(usage.ru_utime) + FloatSec(usage.ru_stime));
 
-  // Linux doesn't set memory usage :-(.  
-  std::ifstream status("/proc/self/status", std::ios::in);
-  std::string line;
-  while (getline(status, line)) {
-    if (!strncmp(line.c_str(), "VmRSS:\t", 7)) {
-      out << "VmRSS:  " << (line.c_str() + 7) << '\n';
-      break;
-    } else if (!strncmp(line.c_str(), "VmPeak:\t", 8)) {
-      out << "VmPeak: " << (line.c_str() + 8) << '\n';
-    }
-  }
+  struct timespec current;
+  clock_gettime(CLOCK_MONOTONIC, &current);
+  out << "\treal:" << ElapsedSec(kRecordStart.Started(), current) << '\n';
 #endif
 }
author	Kenneth Heafield <github@kheafield.com>	2013-05-19 18:12:06 +0400
committer	Kenneth Heafield <github@kheafield.com>	2013-05-19 18:12:06 +0400
commit	50652382e9285740de73654a7f47a8f4a9d993a1 (patch)
tree	31f37b7f09559678c3f4661290287ce39d34da39 /util
parent	41da56364565e0aa9d40cce018e5ef82f9766430 (diff)