Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/util
diff options
context:
space:
mode:
author: Kenneth Heafield <github@kheafield.com> 2013-05-19 18:12:06 +0400
committer: Kenneth Heafield <github@kheafield.com> 2013-05-19 18:12:06 +0400
commit: 50652382e9285740de73654a7f47a8f4a9d993a1 (patch)
tree: 31f37b7f09559678c3f4661290287ce39d34da39 /util
parent: 41da56364565e0aa9d40cce018e5ef82f9766430 (diff)
KenLM 10ddf7d923355b35a7de9a5219673eca9e18be98 except Hieu's slow string_piece_hash
Diffstat (limited to 'util')
-rw-r--r-- util/double-conversion/utils.h 9
-rw-r--r-- util/fake_ofstream.hh 94
-rw-r--r-- util/file_piece.cc 2
-rw-r--r-- util/have.hh 4
-rw-r--r-- util/mmap.cc 14
-rw-r--r-- util/scoped.cc 28
-rw-r--r-- util/sized_iterator.hh 8
-rw-r--r-- util/usage.cc 65
8 files changed, 191 insertions, 33 deletions
diff --git a/util/double-conversion/utils.h b/util/double-conversion/utils.h
index 767094b8b..2bd716050 100644
--- a/util/double-conversion/utils.h
+++ b/util/double-conversion/utils.h
@@ -218,7 +218,8 @@ class StringBuilder {
// 0-characters; use the Finalize() method to terminate the string
// instead.
void AddCharacter(char c) {
- ASSERT(c != '\0');
+ // I just extract raw data not a cstr so null is fine.
+ //ASSERT(c != '\0');
ASSERT(!is_finalized() && position_ < buffer_.length());
buffer_[position_++] = c;
}
@@ -233,7 +234,8 @@ class StringBuilder {
// builder. The input string must have enough characters.
void AddSubstring(const char* s, int n) {
ASSERT(!is_finalized() && position_ + n < buffer_.length());
- ASSERT(static_cast<size_t>(n) <= strlen(s));
+ // I just extract raw data not a cstr so null is fine.
+ //ASSERT(static_cast<size_t>(n) <= strlen(s));
memmove(&buffer_[position_], s, n * kCharSize);
position_ += n;
}
@@ -253,7 +255,8 @@ class StringBuilder {
buffer_[position_] = '\0';
// Make sure nobody managed to add a 0-character to the
// buffer while building the string.
- ASSERT(strlen(buffer_.start()) == static_cast<size_t>(position_));
+ // I just extract raw data not a cstr so null is fine.
+ //ASSERT(strlen(buffer_.start()) == static_cast<size_t>(position_));
position_ = -1;
ASSERT(is_finalized());
return buffer_.start();
diff --git a/util/fake_ofstream.hh b/util/fake_ofstream.hh
new file mode 100644
index 000000000..bcdebe455
--- /dev/null
+++ b/util/fake_ofstream.hh
@@ -0,0 +1,94 @@
+/* Like std::ofstream but without being incredibly slow. Backed by a raw fd.
+ * Does not support many data types. Currently, it's targeted at writing ARPA
+ * files quickly.
+ */
+#include "util/double-conversion/double-conversion.h"
+#include "util/double-conversion/utils.h"
+#include "util/file.hh"
+#include "util/scoped.hh"
+#include "util/string_piece.hh"
+
+#define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
+#include <boost/lexical_cast.hpp>
+
+namespace util {
+class FakeOFStream {
+ public:
+ static const std::size_t kOutBuf = 1048576;
+
+ // Does not take ownership of out.
+ explicit FakeOFStream(int out)
+ : buf_(util::MallocOrThrow(kOutBuf)),
+ builder_(static_cast<char*>(buf_.get()), kOutBuf),
+ // Mostly the default but with inf instead. And no flags.
+ convert_(double_conversion::DoubleToStringConverter::NO_FLAGS, "inf", "NaN", 'e', -6, 21, 6, 0),
+ fd_(out) {}
+
+ ~FakeOFStream() {
+ if (buf_.get()) Flush();
+ }
+
+ FakeOFStream &operator<<(float value) {
+ // Odd, but this is the largest number found in the comments.
+ EnsureRemaining(double_conversion::DoubleToStringConverter::kMaxPrecisionDigits + 8);
+ convert_.ToShortestSingle(value, &builder_);
+ return *this;
+ }
+
+ FakeOFStream &operator<<(double value) {
+ EnsureRemaining(double_conversion::DoubleToStringConverter::kMaxPrecisionDigits + 8);
+ convert_.ToShortest(value, &builder_);
+ return *this;
+ }
+
+ FakeOFStream &operator<<(StringPiece str) {
+ if (str.size() > kOutBuf) {
+ Flush();
+ util::WriteOrThrow(fd_, str.data(), str.size());
+ } else {
+ EnsureRemaining(str.size());
+ builder_.AddSubstring(str.data(), str.size());
+ }
+ return *this;
+ }
+
+ // Inefficient! TODO: more efficient implementation
+ FakeOFStream &operator<<(unsigned value) {
+ return *this << boost::lexical_cast<std::string>(value);
+ }
+
+ FakeOFStream &operator<<(char c) {
+ EnsureRemaining(1);
+ builder_.AddCharacter(c);
+ return *this;
+ }
+
+ // Note this does not sync.
+ void Flush() {
+ util::WriteOrThrow(fd_, buf_.get(), builder_.position());
+ builder_.Reset();
+ }
+
+ // Not necessary, but does assure the data is cleared.
+ void Finish() {
+ Flush();
+ // It will segfault trying to null terminate otherwise.
+ builder_.Finalize();
+ buf_.reset();
+ util::FSyncOrThrow(fd_);
+ }
+
+ private:
+ void EnsureRemaining(std::size_t amount) {
+ if (static_cast<std::size_t>(builder_.size() - builder_.position()) <= amount) {
+ Flush();
+ }
+ }
+
+ util::scoped_malloc buf_;
+ double_conversion::StringBuilder builder_;
+ double_conversion::DoubleToStringConverter convert_;
+ int fd_;
+};
+
+} // namespace
diff --git a/util/file_piece.cc b/util/file_piece.cc
index bed5f85af..b5961bea6 100644
--- a/util/file_piece.cc
+++ b/util/file_piece.cc
@@ -218,7 +218,7 @@ void FilePiece::MMapShift(uint64_t desired_begin) {
// Use mmap.
uint64_t ignore = desired_begin % page_;
// Duplicate request for Shift means give more data.
- if (position_ == data_.begin() + ignore) {
+ if (position_ == data_.begin() + ignore && position_) {
default_map_size_ *= 2;
}
// Local version so that in case of failure it doesn't overwrite the class variable.
diff --git a/util/have.hh b/util/have.hh
index ab1393074..6e18529d2 100644
--- a/util/have.hh
+++ b/util/have.hh
@@ -2,6 +2,10 @@
#ifndef UTIL_HAVE__
#define UTIL_HAVE__
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
#ifndef HAVE_ICU
//#define HAVE_ICU
#endif
diff --git a/util/mmap.cc b/util/mmap.cc
index bc9e3f815..6f79f26f5 100644
--- a/util/mmap.cc
+++ b/util/mmap.cc
@@ -6,6 +6,7 @@
#include "util/exception.hh"
#include "util/file.hh"
+#include "util/scoped.hh"
#include <iostream>
@@ -110,8 +111,14 @@ void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int
UTIL_THROW_IF(!ret, ErrnoException, "MapViewOfFile failed");
#else
int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ;
- void *ret = mmap(NULL, size, protect, flags, fd, offset);
- UTIL_THROW_IF(ret == MAP_FAILED, ErrnoException, "mmap failed for size " << size << " at offset " << offset);
+ void *ret;
+ UTIL_THROW_IF((ret = mmap(NULL, size, protect, flags, fd, offset)) == MAP_FAILED, ErrnoException, "mmap failed for size " << size << " at offset " << offset);
+# ifdef MADV_HUGEPAGE
+ /* We like huge pages but it's fine if we can't have them. Note that huge
+ * pages are not supported for file-backed mmap on linux.
+ */
+ madvise(ret, size, MADV_HUGEPAGE);
+# endif
#endif
return ret;
}
@@ -141,8 +148,7 @@ void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scope
case POPULATE_OR_READ:
#endif
case READ:
- out.reset(malloc(size), size, scoped_memory::MALLOC_ALLOCATED);
- if (!out.get()) UTIL_THROW(util::ErrnoException, "Allocating " << size << " bytes with malloc");
+ out.reset(MallocOrThrow(size), size, scoped_memory::MALLOC_ALLOCATED);
SeekOrThrow(fd, offset);
ReadOrThrow(fd, out.get(), size);
break;
diff --git a/util/scoped.cc b/util/scoped.cc
index b6972f14b..6c5b0c2db 100644
--- a/util/scoped.cc
+++ b/util/scoped.cc
@@ -1,6 +1,9 @@
#include "util/scoped.hh"
#include <cstdlib>
+#if !defined(_WIN32) && !defined(_WIN64)
+#include <sys/mman.h>
+#endif
namespace util {
@@ -10,26 +13,31 @@ MallocException::MallocException(std::size_t requested) throw() {
MallocException::~MallocException() throw() {}
+namespace {
+void *InspectAddr(void *addr, std::size_t requested, const char *func_name) {
+ UTIL_THROW_IF_ARG(!addr && requested, MallocException, (requested), "in " << func_name);
+ // These routines are often used for large chunks of memory where huge pages help.
+#if MADV_HUGEPAGE
+ madvise(addr, requested, MADV_HUGEPAGE);
+#endif
+ return addr;
+}
+} // namespace
+
void *MallocOrThrow(std::size_t requested) {
- void *ret;
- UTIL_THROW_IF_ARG(!(ret = std::malloc(requested)), MallocException, (requested), "in malloc");
- return ret;
+ return InspectAddr(std::malloc(requested), requested, "malloc");
}
void *CallocOrThrow(std::size_t requested) {
- void *ret;
- UTIL_THROW_IF_ARG(!(ret = std::calloc(1, requested)), MallocException, (requested), "in calloc");
- return ret;
+ return InspectAddr(std::calloc(1, requested), requested, "calloc");
}
scoped_malloc::~scoped_malloc() {
std::free(p_);
}
-void scoped_malloc::call_realloc(std::size_t to) {
- void *ret;
- UTIL_THROW_IF_ARG(!(ret = std::realloc(p_, to)) && to, MallocException, (to), "in realloc");
- p_ = ret;
+void scoped_malloc::call_realloc(std::size_t requested) {
+ p_ = InspectAddr(std::realloc(p_, requested), requested, "realloc");
}
} // namespace util
diff --git a/util/sized_iterator.hh b/util/sized_iterator.hh
index aabcc5319..cf998953b 100644
--- a/util/sized_iterator.hh
+++ b/util/sized_iterator.hh
@@ -3,6 +3,7 @@
#include "util/proxy_iterator.hh"
+#include <algorithm>
#include <functional>
#include <string>
@@ -63,6 +64,13 @@ class SizedProxy {
const void *Data() const { return inner_.Data(); }
void *Data() { return inner_.Data(); }
+ friend void swap(SizedProxy &first, SizedProxy &second) {
+ std::swap_ranges(
+ static_cast<char*>(first.inner_.Data()),
+ static_cast<char*>(first.inner_.Data()) + first.inner_.EntrySize(),
+ static_cast<char*>(second.inner_.Data()));
+ }
+
private:
friend class util::ProxyIterator<SizedProxy>;
diff --git a/util/usage.cc b/util/usage.cc
index b8e125d0d..5fa3cc9ae 100644
--- a/util/usage.cc
+++ b/util/usage.cc
@@ -5,45 +5,80 @@
#include <fstream>
#include <ostream>
#include <sstream>
+#include <set>
+#include <string>
#include <string.h>
#include <ctype.h>
#if !defined(_WIN32) && !defined(_WIN64)
#include <sys/resource.h>
#include <sys/time.h>
+#include <time.h>
#include <unistd.h>
#endif
namespace util {
-namespace {
#if !defined(_WIN32) && !defined(_WIN64)
+namespace {
float FloatSec(const struct timeval &tv) {
return static_cast<float>(tv.tv_sec) + (static_cast<float>(tv.tv_usec) / 1000000.0);
}
-#endif
+float FloatSec(const struct timespec &tv) {
+ return static_cast<float>(tv.tv_sec) + (static_cast<float>(tv.tv_nsec) / 1000000000.0);
+}
+
+const char *SkipSpaces(const char *at) {
+ for (; *at == ' ' || *at == '\t'; ++at) {}
+ return at;
+}
+
+class RecordStart {
+ public:
+ RecordStart() {
+ clock_gettime(CLOCK_MONOTONIC, &started_);
+ }
+
+ const struct timespec &Started() const {
+ return started_;
+ }
+
+ private:
+ struct timespec started_;
+};
+
+const RecordStart kRecordStart;
} // namespace
+#endif
void PrintUsage(std::ostream &out) {
#if !defined(_WIN32) && !defined(_WIN64)
+ // Linux doesn't set memory usage in getrusage :-(
+ std::set<std::string> headers;
+ headers.insert("VmPeak:");
+ headers.insert("VmRSS:");
+ headers.insert("Name:");
+
+ std::ifstream status("/proc/self/status", std::ios::in);
+ std::string header, value;
+ while ((status >> header) && getline(status, value)) {
+ if (headers.find(header) != headers.end()) {
+ out << header << SkipSpaces(value.c_str()) << '\t';
+ }
+ }
+
struct rusage usage;
- if (getrusage(RUSAGE_SELF, &usage)) {
+ if (getrusage(RUSAGE_CHILDREN, &usage)) {
perror("getrusage");
return;
}
- out << "user\t" << FloatSec(usage.ru_utime) << "\nsys\t" << FloatSec(usage.ru_stime) << '\n';
+ out << "RSSMax:" << usage.ru_maxrss << " kB" << '\t';
+ out << "user:" << FloatSec(usage.ru_utime) << "\tsys:" << FloatSec(usage.ru_stime) << '\t';
+ out << "CPU:" << (FloatSec(usage.ru_utime) + FloatSec(usage.ru_stime));
- // Linux doesn't set memory usage :-(.
- std::ifstream status("/proc/self/status", std::ios::in);
- std::string line;
- while (getline(status, line)) {
- if (!strncmp(line.c_str(), "VmRSS:\t", 7)) {
- out << "VmRSS: " << (line.c_str() + 7) << '\n';
- break;
- } else if (!strncmp(line.c_str(), "VmPeak:\t", 8)) {
- out << "VmPeak: " << (line.c_str() + 8) << '\n';
- }
- }
+ struct timespec current;
+ clock_gettime(CLOCK_MONOTONIC, &current);
+ out << "\treal:" << (FloatSec(current) - FloatSec(kRecordStart.Started())) << '\n';
#endif
}