github.com/moses-smt/mosesdecoder.git
author      Kenneth Heafield <github@kheafield.com>    2011-11-11 00:46:59 +0400
committer   Kenneth Heafield <github@kheafield.com>    2011-11-11 00:46:59 +0400
commit      d732f63ec2f8d54092da298fff289ee8bba1e419 (patch)
tree        2b9299039fe1841acae93d4bc8932d06e20e8907 /kenlm
parent      9903a239ea33d58484cdb625f89135d96869dd5c (diff)
KenLM update including progress on ARM and MinGW from NICT
Diffstat (limited to 'kenlm')
-rw-r--r--  kenlm/lm/bhiksha.cc | 13
-rw-r--r--  kenlm/lm/bhiksha.hh | 12
-rw-r--r--  kenlm/lm/binary_format.cc | 74
-rw-r--r--  kenlm/lm/binary_format.hh | 19
-rw-r--r--  kenlm/lm/blank.hh | 5
-rw-r--r--  kenlm/lm/build_binary.cc | 5
-rw-r--r--  kenlm/lm/model.cc | 11
-rw-r--r--  kenlm/lm/model.hh | 12
-rw-r--r--  kenlm/lm/ngram_query.cc | 11
-rw-r--r--  kenlm/lm/quantize.cc | 32
-rw-r--r--  kenlm/lm/quantize.hh | 6
-rw-r--r--  kenlm/lm/read_arpa.cc | 2
-rw-r--r--  kenlm/lm/return.hh | 2
-rw-r--r--  kenlm/lm/search_hashed.hh | 2
-rw-r--r--  kenlm/lm/search_trie.cc | 70
-rw-r--r--  kenlm/lm/search_trie.hh | 11
-rw-r--r--  kenlm/lm/sri.cc | 108
-rw-r--r--  kenlm/lm/sri.hh | 102
-rw-r--r--  kenlm/lm/trie.hh | 2
-rw-r--r--  kenlm/lm/trie_sort.cc | 202
-rw-r--r--  kenlm/lm/trie_sort.hh | 55
-rw-r--r--  kenlm/lm/vocab.cc | 35
-rw-r--r--  kenlm/lm/vocab.hh | 8
-rw-r--r--  kenlm/util/bit_packing.hh | 54
-rw-r--r--  kenlm/util/exception.cc | 6
-rw-r--r--  kenlm/util/file.cc | 225
-rw-r--r--  kenlm/util/file.hh | 64
-rw-r--r--  kenlm/util/file_piece.cc | 58
-rw-r--r--  kenlm/util/file_piece.hh | 22
-rw-r--r--  kenlm/util/key_value_packing.hh | 2
-rw-r--r--  kenlm/util/mmap.cc | 119
-rw-r--r--  kenlm/util/mmap.hh | 13
-rw-r--r--  kenlm/util/murmur_hash.hh | 2
-rw-r--r--  kenlm/util/portability.cc | 74
-rw-r--r--  kenlm/util/portability.hh | 127
-rw-r--r--  kenlm/util/probing_hash_table.hh | 4
-rw-r--r--  kenlm/util/sized_iterator.hh | 2
-rw-r--r--  kenlm/util/sorted_uniform.hh | 2
38 files changed, 678 insertions, 895 deletions
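
Editor's note: the recurring pattern in this commit is replacing raw POSIX calls guarded by WIN32 #ifdef blocks (read, lseek, msync, sysconf) with checked helpers from util/file.hh and util/mmap.hh (util::ReadOrThrow, util::SeekOrThrow, util::AdvanceOrThrow, util::SyncOrThrow, util::SizePage), along with deleting util/portability.{cc,hh} and the SRI wrapper. The hunks for kenlm/util/file.cc are not included below, so the following is only a sketch of what a ReadOrThrow-style helper does, modeled on the old ReadLoop removed from binary_format.cc; the real implementation throws util::ErrnoException and also covers the MinGW/Windows path.

// Minimal sketch of a ReadOrThrow-style helper (assumes POSIX read(2)).
// Illustrative only; not the actual kenlm/util/file.cc code.
#include <cstddef>
#include <stdexcept>
#include <stdint.h>
#include <unistd.h>

inline void ReadOrThrowSketch(int fd, void *to_void, std::size_t amount) {
  uint8_t *to = static_cast<uint8_t*>(to_void);
  while (amount) {
    ssize_t got = read(fd, to, amount);
    if (got == -1) throw std::runtime_error("read failed");           // errno-based exception in KenLM
    if (got == 0) throw std::runtime_error("unexpected end of file"); // file too short
    to += got;
    amount -= static_cast<std::size_t>(got);
  }
}
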
diff --git a/kenlm/lm/bhiksha.cc b/kenlm/lm/bhiksha.cc
index 0c187960d..cdeafb478 100644
--- a/kenlm/lm/bhiksha.cc
+++ b/kenlm/lm/bhiksha.cc
@@ -1,5 +1,6 @@
#include "lm/bhiksha.hh"
#include "lm/config.hh"
+#include "util/file.hh"
#include <limits>
@@ -12,16 +13,12 @@ DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_
const uint8_t kArrayBhikshaVersion = 0;
-void ArrayBhiksha::UpdateConfigFromBinary(FD fd, Config &config) {
+// TODO: put this in binary file header instead when I change the binary file format again.
+void ArrayBhiksha::UpdateConfigFromBinary(int fd, Config &config) {
uint8_t version;
uint8_t configured_bits;
-#ifdef WIN32
-#else
- if (read(fd, &version, 1) != 1 || read(fd, &configured_bits, 1) != 1) {
- UTIL_THROW(util::ErrnoException, "Could not read from binary file");
- }
-#endif
-
+ util::ReadOrThrow(fd, &version, 1);
+ util::ReadOrThrow(fd, &configured_bits, 1);
if (version != kArrayBhikshaVersion) UTIL_THROW(FormatLoadException, "This file has sorted array compression version " << (unsigned) version << " but the code expects version " << (unsigned)kArrayBhikshaVersion);
config.pointer_bhiksha_bits = configured_bits;
}
diff --git a/kenlm/lm/bhiksha.hh b/kenlm/lm/bhiksha.hh
index 9f615a477..3df43dda9 100644
--- a/kenlm/lm/bhiksha.hh
+++ b/kenlm/lm/bhiksha.hh
@@ -10,14 +10,16 @@
* Currently only used for next pointers.
*/
-#include <stdint.h>
+#ifndef LM_BHIKSHA__
+#define LM_BHIKSHA__
+
+#include <inttypes.h>
#include <assert.h>
#include "lm/model_type.hh"
#include "lm/trie.hh"
#include "util/bit_packing.hh"
#include "util/sorted_uniform.hh"
-#include "util/portability.hh"
namespace lm {
namespace ngram {
@@ -29,7 +31,7 @@ class DontBhiksha {
public:
static const ModelType kModelTypeAdd = static_cast<ModelType>(0);
- static void UpdateConfigFromBinary(FD /*fd*/, Config &/*config*/) {}
+ static void UpdateConfigFromBinary(int /*fd*/, Config &/*config*/) {}
static std::size_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; }
@@ -63,7 +65,7 @@ class ArrayBhiksha {
public:
static const ModelType kModelTypeAdd = kArrayAdd;
- static void UpdateConfigFromBinary(FD fd, Config &config);
+ static void UpdateConfigFromBinary(int fd, Config &config);
static std::size_t Size(uint64_t max_offset, uint64_t max_next, const Config &config);
@@ -109,3 +111,5 @@ class ArrayBhiksha {
} // namespace trie
} // namespace ngram
} // namespace lm
+
+#endif // LM_BHIKSHA__
diff --git a/kenlm/lm/binary_format.cc b/kenlm/lm/binary_format.cc
index 5fea35118..e7f9cd048 100644
--- a/kenlm/lm/binary_format.cc
+++ b/kenlm/lm/binary_format.cc
@@ -1,17 +1,15 @@
#include "lm/binary_format.hh"
#include "lm/lm_exception.hh"
+#include "util/file.hh"
#include "util/file_piece.hh"
+#include <cstddef>
+#include <cstring>
#include <limits>
#include <string>
-#include <fcntl.h>
-#include <errno.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
+#include <inttypes.h>
namespace lm {
namespace ngram {
@@ -30,6 +28,7 @@ struct Sanity {
uint64_t one_uint64;
void SetToReference() {
+ std::memset(this, 0, sizeof(Sanity));
std::memcpy(magic, kMagicBytes, sizeof(magic));
zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5;
one_word_index = 1;
@@ -41,28 +40,13 @@ struct Sanity {
const char *kModelNames[6] = {"hashed n-grams with probing", "hashed n-grams with sorted uniform find", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"};
std::size_t TotalHeaderSize(unsigned char order) {
- return Align8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order);
-}
-
-void ReadLoop(FD fd, void *to_void, std::size_t size) {
- uint8_t *to = static_cast<uint8_t*>(to_void);
- while (size) {
-#ifdef WIN32
- ssize_t ret;
-#else
- ssize_t ret = read(fd, to, size);
-#endif
- if (ret == -1) UTIL_THROW(util::ErrnoException, "Failed to read from binary file");
- if (ret == 0) UTIL_THROW(util::ErrnoException, "Binary file too short");
- to += ret;
- size -= ret;
- }
+ return ALIGN8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order);
}
void WriteHeader(void *to, const Parameters &params) {
Sanity header = Sanity();
header.SetToReference();
- memcpy(to, &header, sizeof(Sanity));
+ std::memcpy(to, &header, sizeof(Sanity));
char *out = reinterpret_cast<char*>(to) + sizeof(Sanity);
*reinterpret_cast<FixedWidthParameters*>(out) = params.fixed;
@@ -76,20 +60,6 @@ void WriteHeader(void *to, const Parameters &params) {
} // namespace
-void SeekOrThrow(FD fd, off_t off) {
-#ifdef WIN32
-#else
- if ((off_t)-1 == lseek(fd, off, SEEK_SET)) UTIL_THROW(util::ErrnoException, "Seek failed");
-#endif
-}
-
-void AdvanceOrThrow(FD fd, off_t off) {
-#ifdef WIN32
-#else
- if ((off_t)-1 == lseek(fd, off, SEEK_CUR)) UTIL_THROW(util::ErrnoException, "Seek failed");
-#endif
-}
-
uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing) {
if (config.write_mmap) {
std::size_t total = TotalHeaderSize(order) + memory_size;
@@ -110,8 +80,8 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
UTIL_THROW(util::ErrnoException, "ftruncate on " << config.write_mmap << " to " << (adjusted_vocab + memory_size) << " failed");
// We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down.
- off_t page_size = sysconf(_SC_PAGE_SIZE);
- off_t alignment_cruft = adjusted_vocab % page_size;
+ std::size_t page_size = util::SizePage();
+ std::size_t alignment_cruft = adjusted_vocab % page_size;
backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED);
return reinterpret_cast<uint8_t*>(backing.search.get()) + alignment_cruft;
@@ -123,8 +93,8 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, Backing &backing) {
if (config.write_mmap) {
- if (msync(backing.search.get(), backing.search.size(), MS_SYNC) || msync(backing.vocab.get(), backing.vocab.size(), MS_SYNC))
- UTIL_THROW(util::ErrnoException, "msync failed for " << config.write_mmap);
+ util::SyncOrThrow(backing.search.get(), backing.search.size());
+ util::SyncOrThrow(backing.vocab.get(), backing.vocab.size());
// header and vocab share the same mmap. The header is written here because we know the counts.
Parameters params;
params.counts = counts;
@@ -139,9 +109,9 @@ void FinishFile(const Config &config, ModelType model_type, unsigned int search_
namespace detail {
-bool IsBinaryFormat(FD fd) {
- const off_t size = util::SizeFile(fd);
- if (size == util::kBadSize || (size <= static_cast<off_t>(sizeof(Sanity)))) return false;
+bool IsBinaryFormat(int fd) {
+ const uint64_t size = util::SizeFile(fd);
+ if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false;
// Try reading the header.
util::scoped_memory memory;
try {
@@ -167,14 +137,14 @@ bool IsBinaryFormat(FD fd) {
return false;
}
-void ReadHeader(FD fd, Parameters &out) {
- SeekOrThrow(fd, sizeof(Sanity));
- ReadLoop(fd, &out.fixed, sizeof(out.fixed));
+void ReadHeader(int fd, Parameters &out) {
+ util::SeekOrThrow(fd, sizeof(Sanity));
+ util::ReadOrThrow(fd, &out.fixed, sizeof(out.fixed));
if (out.fixed.probing_multiplier < 1.0)
UTIL_THROW(FormatLoadException, "Binary format claims to have a probing multiplier of " << out.fixed.probing_multiplier << " which is < 1.0.");
out.counts.resize(static_cast<std::size_t>(out.fixed.order));
- ReadLoop(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order);
+ util::ReadOrThrow(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order);
}
void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters &params) {
@@ -186,12 +156,12 @@ void MatchCheck(ModelType model_type, unsigned int search_version, const Paramet
UTIL_THROW_IF(search_version != params.fixed.search_version, FormatLoadException, "The binary file has " << kModelNames[params.fixed.model_type] << " version " << params.fixed.search_version << " but this code expects " << kModelNames[params.fixed.model_type] << " version " << search_version);
}
-void SeekPastHeader(FD fd, const Parameters &params) {
- SeekOrThrow(fd, TotalHeaderSize(params.counts.size()));
+void SeekPastHeader(int fd, const Parameters &params) {
+ util::SeekOrThrow(fd, TotalHeaderSize(params.counts.size()));
}
uint8_t *SetupBinary(const Config &config, const Parameters &params, std::size_t memory_size, Backing &backing) {
- const off_t file_size = util::SizeFile(backing.file.get());
+ const uint64_t file_size = util::SizeFile(backing.file.get());
// The header is smaller than a page, so we have to map the whole header as well.
std::size_t total_map = TotalHeaderSize(params.counts.size()) + memory_size;
if (file_size != util::kBadSize && static_cast<uint64_t>(file_size) < total_map)
@@ -203,7 +173,7 @@ uint8_t *SetupBinary(const Config &config, const Parameters &params, std::size_t
UTIL_THROW(FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. You may need to rebuild the binary file with an updated version of build_binary.");
if (config.enumerate_vocab) {
- SeekOrThrow(backing.file.get(), total_map);
+ util::SeekOrThrow(backing.file.get(), total_map);
}
return reinterpret_cast<uint8_t*>(backing.search.get()) + TotalHeaderSize(params.counts.size());
}
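
Editor's note: the GrowForSearch hunk above rounds the vocab-end offset down to a page boundary because mmap wants page-aligned offsets, then maps the leftover bytes along with the search area. A small worked example of that arithmetic, using made-up sizes (the 4096-byte page is an assumption; util::SizePage() returns the real value):

// Worked example of the page-alignment arithmetic; nothing is actually mapped.
#include <cstddef>
#include <iostream>

int main() {
  const std::size_t page_size = 4096;        // stand-in for util::SizePage()
  const std::size_t adjusted_vocab = 10000;  // bytes of header + vocab already in the file
  const std::size_t memory_size = 50000;     // bytes needed for the search structure

  const std::size_t alignment_cruft = adjusted_vocab % page_size;  // 10000 % 4096 = 1808
  const std::size_t map_offset = adjusted_vocab - alignment_cruft; // 8192, a page boundary
  const std::size_t map_length = alignment_cruft + memory_size;    // 51808 bytes mapped

  // The search structure then begins alignment_cruft bytes into the new mapping.
  std::cout << "offset " << map_offset << ", length " << map_length
            << ", data begins at +" << alignment_cruft << "\n";
}
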
diff --git a/kenlm/lm/binary_format.hh b/kenlm/lm/binary_format.hh
index fc5995a96..8adb1ec48 100644
--- a/kenlm/lm/binary_format.hh
+++ b/kenlm/lm/binary_format.hh
@@ -8,12 +8,11 @@
#include "util/file_piece.hh"
#include "util/mmap.hh"
#include "util/scoped.hh"
-#include "util/portability.hh"
#include <cstddef>
#include <vector>
-#include <stdint.h>
+#include <inttypes.h>
namespace lm {
namespace ngram {
@@ -34,10 +33,8 @@ struct FixedWidthParameters {
unsigned int search_version;
};
-inline std::size_t Align8(std::size_t in) {
- std::size_t off = in % 8;
- return off ? (in + 8 - off) : in;
-}
+// This is a macro instead of an inline function so constants can be assigned using it.
+#define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8)
// Parameters stored in the header of a binary file.
struct Parameters {
@@ -54,10 +51,6 @@ struct Backing {
util::scoped_memory search;
};
-void SeekOrThrow(FD fd, off_t off);
-// Seek forward
-void AdvanceOrThrow(FD fd, off_t off);
-
// Create just enough of a binary file to write vocabulary to it.
uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing);
// Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin.
@@ -69,13 +62,13 @@ void FinishFile(const Config &config, ModelType model_type, unsigned int search_
namespace detail {
-bool IsBinaryFormat(FD fd);
+bool IsBinaryFormat(int fd);
-void ReadHeader(FD fd, Parameters &params);
+void ReadHeader(int fd, Parameters &params);
void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters &params);
-void SeekPastHeader(FD fd, const Parameters &params);
+void SeekPastHeader(int fd, const Parameters &params);
uint8_t *SetupBinary(const Config &config, const Parameters &params, std::size_t memory_size, Backing &backing);
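
Editor's note: the ALIGN8 macro introduced above rounds a byte count up to the next multiple of 8; being a macro rather than the old inline Align8(), it can appear where the compiler needs a constant. A quick self-contained check of the arithmetic (the macro is re-defined here only so the snippet stands alone):

#include <cassert>
#include <cstddef>

#define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8)

// As a macro it can be used where a constant is required, e.g. an array bound:
char padded_header[ALIGN8(13)];   // 13 rounds up to 16 bytes

int main() {
  assert(ALIGN8(1) == 8);
  assert(ALIGN8(8) == 8);         // multiples of 8 are unchanged
  assert(ALIGN8(9) == 16);
  assert(sizeof(padded_header) == 16);
  return 0;
}
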
diff --git a/kenlm/lm/blank.hh b/kenlm/lm/blank.hh
index 68a809a01..2fb64cd03 100644
--- a/kenlm/lm/blank.hh
+++ b/kenlm/lm/blank.hh
@@ -1,7 +1,10 @@
#ifndef LM_BLANK__
#define LM_BLANK__
-#include <stdint.h>
+#include <limits>
+
+#include <inttypes.h>
+#include <math.h>
namespace lm {
namespace ngram {
diff --git a/kenlm/lm/build_binary.cc b/kenlm/lm/build_binary.cc
index f4172f23c..5a0d98dc6 100644
--- a/kenlm/lm/build_binary.cc
+++ b/kenlm/lm/build_binary.cc
@@ -8,14 +8,13 @@
#include <math.h>
#include <stdlib.h>
-#include "util/portability.hh"
namespace lm {
namespace ngram {
namespace {
void Usage(const char *name) {
- std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-c bits] [type] input.arpa [output.mmap]\n\n"
+ std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n"
"-u sets the log10 probability for <unk> if the ARPA file does not have one.\n"
" Default is -100. The ARPA file will always take precedence.\n"
"-s allows models to be built even if they do not have <s> and </s>.\n"
@@ -87,7 +86,7 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) {
prefix = 'G';
divide = 1 << 30;
}
- long int length = std::max<long int>(2, lrint(ceil(log10((double) max_length / (double)divide))));
+ long int length = std::max<long int>(2, lrint(ceil(log10(max_length / divide))));
std::cout << "Memory estimate:\ntype ";
// right align bytes.
for (long int i = 0; i < length - 2; ++i) std::cout << ' ';
diff --git a/kenlm/lm/model.cc b/kenlm/lm/model.cc
index e11d36148..042955efd 100644
--- a/kenlm/lm/model.cc
+++ b/kenlm/lm/model.cc
@@ -44,7 +44,7 @@ template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::Ge
P::Init(begin_sentence, null_context, vocab_, search_.MiddleEnd() - search_.MiddleBegin() + 2);
}
-template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromBinary(void *start, const Parameters &params, const Config &config, FD fd) {
+template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromBinary(void *start, const Parameters &params, const Config &config, int fd) {
SetupMemory(start, params.counts, config);
vocab_.LoadedBinary(fd, config.enumerate_vocab);
search_.LoadedBinary();
@@ -89,10 +89,15 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
}
}
+template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config) {
+ util::AdvanceOrThrow(fd, VocabularyT::Size(counts[0], config));
+ Search::UpdateConfigFromBinary(fd, counts, config);
+}
+
template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const {
FullScoreReturn ret = ScoreExceptBackoff(in_state.words, in_state.words + in_state.length, new_word, out_state);
- if (ret.ngram_length - 1 < in_state.length) {
- ret.prob = std::accumulate(in_state.backoff + ret.ngram_length - 1, in_state.backoff + in_state.length, ret.prob);
+ for (const float *i = in_state.backoff + ret.ngram_length - 1; i < in_state.backoff + in_state.length; ++i) {
+ ret.prob += *i;
}
return ret;
}
diff --git a/kenlm/lm/model.hh b/kenlm/lm/model.hh
index df9541fc7..1196a0c48 100644
--- a/kenlm/lm/model.hh
+++ b/kenlm/lm/model.hh
@@ -13,7 +13,6 @@
#include "lm/weights.hh"
#include "util/murmur_hash.hh"
-#include "util/portability.hh"
#include <algorithm>
#include <vector>
@@ -138,21 +137,16 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
unsigned char &next_use) const;
private:
- friend void lm::ngram::LoadLM<>(const char *file, const Config &config, GenericModel<Search, VocabularyT> &to);
+ friend void LoadLM<>(const char *file, const Config &config, GenericModel<Search, VocabularyT> &to);
- static void UpdateConfigFromBinary(FD fd, const std::vector<uint64_t> &counts, Config &config) {
- AdvanceOrThrow(fd, VocabularyT::Size(counts[0], config));
- Search::UpdateConfigFromBinary(fd, counts, config);
- }
-
- float SlowBackoffLookup(const WordIndex *const context_rbegin, const WordIndex *const context_rend, unsigned char start) const;
+ static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config);
FullScoreReturn ScoreExceptBackoff(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;
// Appears after Size in the cc file.
void SetupMemory(void *start, const std::vector<uint64_t> &counts, const Config &config);
- void InitializeFromBinary(void *start, const Parameters &params, const Config &config, FD fd);
+ void InitializeFromBinary(void *start, const Parameters &params, const Config &config, int fd);
void InitializeFromARPA(const char *file, const Config &config);
diff --git a/kenlm/lm/ngram_query.cc b/kenlm/lm/ngram_query.cc
index 2f5bd0725..50ceef5c8 100644
--- a/kenlm/lm/ngram_query.cc
+++ b/kenlm/lm/ngram_query.cc
@@ -7,16 +7,17 @@
#include <string>
#include <ctype.h>
-
-#include "util/portability.hh"
+#if !defined(_WIN32) && !defined(_WIN64)
+#include <sys/resource.h>
+#endif
+#include <sys/time.h>
float FloatSec(const struct timeval &tv) {
return static_cast<float>(tv.tv_sec) + (static_cast<float>(tv.tv_usec) / 1000000000.0);
}
void PrintUsage(const char *message) {
-#ifdef WIN32
-#else
+#if !defined(_WIN32) && !defined(_WIN64)
struct rusage usage;
if (getrusage(RUSAGE_SELF, &usage)) {
perror("getrusage");
@@ -24,7 +25,6 @@ void PrintUsage(const char *message) {
}
std::cerr << message;
std::cerr << "user\t" << FloatSec(usage.ru_utime) << "\nsys\t" << FloatSec(usage.ru_stime) << '\n';
-#endif
// Linux doesn't set memory usage :-(.
std::ifstream status("/proc/self/status", std::ios::in);
@@ -35,6 +35,7 @@ void PrintUsage(const char *message) {
break;
}
}
+#endif
}
template <class Model> void Query(const Model &model, bool sentence_context) {
diff --git a/kenlm/lm/quantize.cc b/kenlm/lm/quantize.cc
index 3a9eccf3a..8de37e827 100644
--- a/kenlm/lm/quantize.cc
+++ b/kenlm/lm/quantize.cc
@@ -1,25 +1,23 @@
+/* Quantize into bins of equal size as described in
+ * M. Federico and N. Bertoldi. 2006. How many bits are needed
+ * to store probabilities for phrase-based translation? In Proc.
+ * of the Workshop on Statistical Machine Translation, pages
+ * 94–101, New York City, June. Association for Computa-
+ * tional Linguistics.
+ */
+
#include "lm/quantize.hh"
#include "lm/binary_format.hh"
#include "lm/lm_exception.hh"
+#include "util/file.hh"
#include <algorithm>
#include <numeric>
-#include <limits>
-
-#include "util/portability.hh"
namespace lm {
namespace ngram {
-/* Quantize into bins of equal size as described in
- * M. Federico and N. Bertoldi. 2006. How many bits are needed
- * to store probabilities for phrase-based translation? In Proc.
- * of the Workshop on Statistical Machine Translation, pages
- * 94–101, New York City, June. Association for Computa-
- * tional Linguistics.
- */
-
namespace {
void MakeBins(float *values, float *values_end, float *centers, uint32_t bins) {
@@ -40,15 +38,13 @@ const char kSeparatelyQuantizeVersion = 2;
} // namespace
-void SeparatelyQuantize::UpdateConfigFromBinary(FD fd, const std::vector<uint64_t> &/*counts*/, Config &config) {
+void SeparatelyQuantize::UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &/*counts*/, Config &config) {
char version;
-#ifdef WIN32
-#else
- if (read(fd, &version, 1) != 1 || read(fd, &config.prob_bits, 1) != 1 || read(fd, &config.backoff_bits, 1) != 1)
- UTIL_THROW(util::ErrnoException, "Failed to read header for quantization.");
-#endif
+ util::ReadOrThrow(fd, &version, 1);
+ util::ReadOrThrow(fd, &config.prob_bits, 1);
+ util::ReadOrThrow(fd, &config.backoff_bits, 1);
if (version != kSeparatelyQuantizeVersion) UTIL_THROW(FormatLoadException, "This file has quantization version " << (unsigned)version << " but the code expects version " << (unsigned)kSeparatelyQuantizeVersion);
- AdvanceOrThrow(fd, -3);
+ util::AdvanceOrThrow(fd, -3);
}
void SeparatelyQuantize::SetupMemory(void *start, const Config &config) {
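
Editor's note: the comment moved to the top of quantize.cc cites Federico and Bertoldi's "bins of equal size" scheme, and the hunk references MakeBins(values, values_end, centers, bins), whose body is not shown above. The sketch below only illustrates the equal-population idea under that reading; it is not the actual KenLM MakeBins.

// Sketch of equal-population binning: sort the values, split them into `bins`
// groups of roughly equal count, and use each group's mean as its center.
// Remainder and tie handling in the real MakeBins() may differ.
#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<float> EqualSizeBins(std::vector<float> values, std::size_t bins) {
  std::sort(values.begin(), values.end());
  std::vector<float> centers(bins, 0.0f);
  const std::size_t per_bin = values.size() / bins;   // assumes values.size() >= bins
  for (std::size_t b = 0; b < bins; ++b) {
    const std::size_t begin = b * per_bin;
    const std::size_t end = (b + 1 == bins) ? values.size() : begin + per_bin;
    double sum = 0.0;
    for (std::size_t i = begin; i < end; ++i) sum += values[i];
    centers[b] = static_cast<float>(sum / double(end - begin));
  }
  return centers;
}
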
diff --git a/kenlm/lm/quantize.hh b/kenlm/lm/quantize.hh
index 855c8ba66..4cf4236eb 100644
--- a/kenlm/lm/quantize.hh
+++ b/kenlm/lm/quantize.hh
@@ -9,7 +9,7 @@
#include <algorithm>
#include <vector>
-#include <stdint.h>
+#include <inttypes.h>
#include <iostream>
@@ -22,7 +22,7 @@ class Config;
class DontQuantize {
public:
static const ModelType kModelTypeAdd = static_cast<ModelType>(0);
- static void UpdateConfigFromBinary(FD, const std::vector<uint64_t> &, Config &) {}
+ static void UpdateConfigFromBinary(int, const std::vector<uint64_t> &, Config &) {}
static std::size_t Size(uint8_t /*order*/, const Config &/*config*/) { return 0; }
static uint8_t MiddleBits(const Config &/*config*/) { return 63; }
static uint8_t LongestBits(const Config &/*config*/) { return 31; }
@@ -113,7 +113,7 @@ class SeparatelyQuantize {
public:
static const ModelType kModelTypeAdd = kQuantAdd;
- static void UpdateConfigFromBinary(FD fd, const std::vector<uint64_t> &counts, Config &config);
+ static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config);
static std::size_t Size(uint8_t order, const Config &config) {
size_t longest_table = (static_cast<size_t>(1) << static_cast<size_t>(config.prob_bits)) * sizeof(float);
diff --git a/kenlm/lm/read_arpa.cc b/kenlm/lm/read_arpa.cc
index 05f761be6..dce73f771 100644
--- a/kenlm/lm/read_arpa.cc
+++ b/kenlm/lm/read_arpa.cc
@@ -8,7 +8,7 @@
#include <ctype.h>
#include <string.h>
-#include <stdint.h>
+#include <inttypes.h>
namespace lm {
diff --git a/kenlm/lm/return.hh b/kenlm/lm/return.hh
index 1b55091b2..155719605 100644
--- a/kenlm/lm/return.hh
+++ b/kenlm/lm/return.hh
@@ -1,7 +1,7 @@
#ifndef LM_RETURN__
#define LM_RETURN__
-#include <stdint.h>
+#include <inttypes.h>
namespace lm {
/* Structure returned by scoring routines. */
diff --git a/kenlm/lm/search_hashed.hh b/kenlm/lm/search_hashed.hh
index 30a86fb72..e289fd114 100644
--- a/kenlm/lm/search_hashed.hh
+++ b/kenlm/lm/search_hashed.hh
@@ -78,7 +78,7 @@ template <class MiddleT, class LongestT> class TemplateHashedSearch : public Has
static const unsigned int kVersion = 0;
// TODO: move probing_multiplier here with next binary file format update.
- static void UpdateConfigFromBinary(FD, const std::vector<uint64_t> &, Config &) {}
+ static void UpdateConfigFromBinary(int, const std::vector<uint64_t> &, Config &) {}
static std::size_t Size(const std::vector<uint64_t> &counts, const Config &config) {
std::size_t ret = Unigram::Size(counts[0]);
diff --git a/kenlm/lm/search_trie.cc b/kenlm/lm/search_trie.cc
index 633bcdf45..8cb6984b0 100644
--- a/kenlm/lm/search_trie.cc
+++ b/kenlm/lm/search_trie.cc
@@ -13,6 +13,7 @@
#include "lm/weights.hh"
#include "lm/word_index.hh"
#include "util/ersatz_progress.hh"
+#include "util/mmap.hh"
#include "util/proxy_iterator.hh"
#include "util/scoped.hh"
#include "util/sized_iterator.hh"
@@ -20,12 +21,15 @@
#include <algorithm>
#include <cstring>
#include <cstdio>
+#include <cstdlib>
#include <queue>
#include <limits>
#include <numeric>
#include <vector>
-#include "util/portability.hh"
+#if defined(_WIN32) || defined(_WIN64)
+#include <windows.h>
+#endif
namespace lm {
namespace ngram {
@@ -269,7 +273,7 @@ template <class Quant, class Bhiksha> class WriteEntries {
contexts_(contexts),
unigrams_(unigrams),
middle_(middle),
- longest_(longest),
+ longest_(longest),
bigram_pack_((order == 2) ? static_cast<BitPacked&>(longest_) : static_cast<BitPacked&>(*middle_)),
order_(order),
sri_(sri) {}
@@ -332,7 +336,6 @@ template <class Doing> class BlankManager {
void Visit(const WordIndex *to, unsigned char length, float prob) {
basis_[length - 1] = prob;
- // Try to match everything except the last word, which is expected to be different.
unsigned char overlap = std::min<unsigned char>(length - 1, been_length_);
const WordIndex *cur;
WordIndex *pre;
@@ -349,9 +352,9 @@ template <class Doing> class BlankManager {
UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context.");
const float *lower_basis;
for (lower_basis = basis_ + blank - 2; *lower_basis == kBadProb; --lower_basis) {}
- assert(*lower_basis != kBadProb);
unsigned char based_on = lower_basis - basis_ + 1;
for (; cur != to + length - 1; ++blank, ++cur, ++pre) {
+ assert(*lower_basis != kBadProb);
doing_.MiddleBlank(blank, to, based_on, *lower_basis);
*pre = *cur;
// Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram.
@@ -460,42 +463,32 @@ void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &c
} // namespace
-template <class Quant, class Bhiksha> void BuildTrie(const std::string &file_prefix, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) {
+template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) {
RecordReader inputs[kMaxOrder - 1];
RecordReader contexts[kMaxOrder - 1];
for (unsigned char i = 2; i <= counts.size(); ++i) {
- std::stringstream assembled;
- assembled << file_prefix << static_cast<unsigned int>(i) << "_merged";
- inputs[i-2].Init(assembled.str(), i * sizeof(WordIndex) + (i == counts.size() ? sizeof(Prob) : sizeof(ProbBackoff)));
- util::RemoveOrThrow(assembled.str().c_str());
- assembled << kContextSuffix;
- contexts[i-2].Init(assembled.str(), (i-1) * sizeof(WordIndex));
- util::RemoveOrThrow(assembled.str().c_str());
+ inputs[i-2].Init(files.Full(i), i * sizeof(WordIndex) + (i == counts.size() ? sizeof(Prob) : sizeof(ProbBackoff)));
+ contexts[i-2].Init(files.Context(i), (i-1) * sizeof(WordIndex));
}
SRISucks sri;
std::vector<uint64_t> fixed_counts(counts.size());
+ util::scoped_FILE unigram_file;
+ util::scoped_fd unigram_fd(files.StealUnigram());
{
- std::string temp(file_prefix); temp += "unigrams";
- util::scoped_fd unigram_file(util::OpenReadOrThrow(temp.c_str()));
util::scoped_memory unigrams;
- MapRead(util::POPULATE_OR_READ, unigram_file.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams);
+ MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams);
FindBlanks finder(&*fixed_counts.begin(), counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri);
RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Identifying n-grams omitted by SRI", finder);
}
+ unigram_file.reset(util::FDOpenOrThrow(unigram_fd));
for (const RecordReader *i = inputs; i != inputs + counts.size() - 2; ++i) {
if (*i) UTIL_THROW(FormatLoadException, "There's a bug in the trie implementation: the " << (i - inputs + 2) << "-gram table did not complete reading");
}
SanityCheckCounts(counts, fixed_counts);
counts = fixed_counts;
- util::scoped_FILE unigram_file;
- {
- std::string name(file_prefix + "unigrams");
- unigram_file.reset(OpenOrThrow(name.c_str(), "r+"));
- util::RemoveOrThrow(name.c_str());
- }
sri.ObtainBackoffs(counts.size(), unigram_file.get(), inputs);
out.SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), TrieSearch<Quant, Bhiksha>::Size(fixed_counts, config), backing), fixed_counts, config);
@@ -586,42 +579,19 @@ template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::LoadedBin
longest.LoadedBinary();
}
-namespace {
-bool IsDirectory(const char *path) {
- struct stat info;
- if (0 != stat(path, &info)) return false;
- return S_ISDIR(info.st_mode);
-}
-} // namespace
-
template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, Backing &backing) {
- std::string temporary_directory;
+ std::string temporary_prefix;
if (config.temporary_directory_prefix) {
- temporary_directory = config.temporary_directory_prefix;
- if (!temporary_directory.empty() && temporary_directory[temporary_directory.size() - 1] != '/' && IsDirectory(temporary_directory.c_str()))
- temporary_directory += '/';
+ temporary_prefix = config.temporary_directory_prefix;
} else if (config.write_mmap) {
- temporary_directory = config.write_mmap;
+ temporary_prefix = config.write_mmap;
} else {
- temporary_directory = file;
+ temporary_prefix = file;
}
- // Null on end is kludge to ensure null termination.
- temporary_directory += "_trie_tmp_XXXXXX";
- temporary_directory += '\0';
- if (!mkdtemp(&temporary_directory[0])) {
- UTIL_THROW(util::ErrnoException, "Failed to make a temporary directory based on the name " << temporary_directory.c_str());
- }
- // Chop off null kludge.
- temporary_directory.resize(strlen(temporary_directory.c_str()));
- // Add directory delimiter. Assumes a real operating system.
- temporary_directory += '/';
// At least 1MB sorting memory.
- ARPAToSortedFiles(config, f, counts, std::max<size_t>(config.building_memory, 1048576), temporary_directory.c_str(), vocab);
+ SortedFiles sorted(config, f, counts, std::max<size_t>(config.building_memory, 1048576), temporary_prefix, vocab);
- BuildTrie(temporary_directory, counts, config, *this, quant_, vocab, backing);
- if (rmdir(temporary_directory.c_str()) && config.messages) {
- *config.messages << "Failed to delete " << temporary_directory << std::endl;
- }
+ BuildTrie(sorted, counts, config, *this, quant_, vocab, backing);
}
template class TrieSearch<DontQuantize, DontBhiksha>;
diff --git a/kenlm/lm/search_trie.hh b/kenlm/lm/search_trie.hh
index 4a9fab648..caa7a05e2 100644
--- a/kenlm/lm/search_trie.hh
+++ b/kenlm/lm/search_trie.hh
@@ -7,8 +7,8 @@
#include "lm/trie.hh"
#include "lm/weights.hh"
+#include "util/file.hh"
#include "util/file_piece.hh"
-#include "util/portability.hh"
#include <vector>
@@ -21,7 +21,8 @@ class SortedVocabulary;
namespace trie {
template <class Quant, class Bhiksha> class TrieSearch;
-template <class Quant, class Bhiksha> void BuildTrie(const std::string &file_prefix, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing);
+class SortedFiles;
+template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing);
template <class Quant, class Bhiksha> class TrieSearch {
public:
@@ -39,9 +40,9 @@ template <class Quant, class Bhiksha> class TrieSearch {
static const unsigned int kVersion = 1;
- static void UpdateConfigFromBinary(FD fd, const std::vector<uint64_t> &counts, Config &config) {
+ static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config) {
Quant::UpdateConfigFromBinary(fd, counts, config);
- AdvanceOrThrow(fd, Quant::Size(counts.size(), config) + Unigram::Size(counts[0]));
+ util::AdvanceOrThrow(fd, Quant::Size(counts.size(), config) + Unigram::Size(counts[0]));
Bhiksha::UpdateConfigFromBinary(fd, config);
}
@@ -109,7 +110,7 @@ template <class Quant, class Bhiksha> class TrieSearch {
}
private:
- friend void BuildTrie<Quant, Bhiksha>(const std::string &file_prefix, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing);
+ friend void BuildTrie<Quant, Bhiksha>(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing);
// Middles are managed manually so we can delay construction and they don't have to be copyable.
void FreeMiddles() {
diff --git a/kenlm/lm/sri.cc b/kenlm/lm/sri.cc
deleted file mode 100644
index 825f699bc..000000000
--- a/kenlm/lm/sri.cc
+++ /dev/null
@@ -1,108 +0,0 @@
-#include "lm/lm_exception.hh"
-#include "lm/sri.hh"
-
-#include <Ngram.h>
-#include <Vocab.h>
-
-#include <errno.h>
-
-namespace lm {
-namespace sri {
-
-Vocabulary::Vocabulary() : sri_(new Vocab) {}
-
-Vocabulary::~Vocabulary() {}
-
-WordIndex Vocabulary::Index(const char *str) const {
- WordIndex ret = sri_->getIndex(str);
- // NGram wants the index of Vocab_Unknown for unknown words, but for some reason SRI returns Vocab_None here :-(.
- if (ret == Vocab_None) {
- return not_found_;
- } else {
- return ret;
- }
-}
-
-const char *Vocabulary::Word(WordIndex index) const {
- return sri_->getWord(index);
-}
-
-void Vocabulary::FinishedLoading() {
- SetSpecial(
- sri_->ssIndex(),
- sri_->seIndex(),
- sri_->unkIndex());
-}
-
-namespace {
-Ngram *MakeSRIModel(const char *file_name, unsigned int ngram_length, Vocab &sri_vocab) {
- sri_vocab.unkIsWord() = true;
- std::auto_ptr<Ngram> ret(new Ngram(sri_vocab, ngram_length));
- File file(file_name, "r");
- errno = 0;
- if (!ret->read(file)) {
- UTIL_THROW(FormatLoadException, "reading file " << file_name << " with SRI failed.");
- }
- return ret.release();
-}
-} // namespace
-
-Model::Model(const char *file_name, unsigned int ngram_length) : sri_(MakeSRIModel(file_name, ngram_length, *vocab_.sri_)) {
- if (!sri_->setorder()) {
- UTIL_THROW(FormatLoadException, "Can't have an SRI model with order 0.");
- }
- vocab_.FinishedLoading();
- State begin_state = State();
- begin_state.valid_length_ = 1;
- if (kMaxOrder > 1) {
- begin_state.history_[0] = vocab_.BeginSentence();
- if (kMaxOrder > 2) begin_state.history_[1] = Vocab_None;
- }
- State null_state = State();
- null_state.valid_length_ = 0;
- if (kMaxOrder > 1) null_state.history_[0] = Vocab_None;
- Init(begin_state, null_state, vocab_, sri_->setorder());
- not_found_ = vocab_.NotFound();
-}
-
-Model::~Model() {}
-
-namespace {
-
-/* Argh SRI's wordProb knows the ngram length but doesn't return it. One more
- * reason you should use my model. */
-// TODO(stolcke): fix SRILM so I don't have to do this.
-unsigned int MatchedLength(Ngram &model, const WordIndex new_word, const SRIVocabIndex *const_history) {
- unsigned int out_length = 0;
- // This gets the length of context used, which is ngram_length - 1 unless new_word is OOV in which case it is 0.
- model.contextID(new_word, const_history, out_length);
- return out_length + 1;
-}
-
-} // namespace
-
-FullScoreReturn Model::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const {
- // If you get a compiler in this function, change SRIVocabIndex in sri.hh to match the one found in SRI's Vocab.h.
- const SRIVocabIndex *const_history;
- SRIVocabIndex local_history[Order()];
- if (in_state.valid_length_ < kMaxOrder - 1) {
- const_history = in_state.history_;
- } else {
- std::copy(in_state.history_, in_state.history_ + in_state.valid_length_, local_history);
- local_history[in_state.valid_length_] = Vocab_None;
- const_history = local_history;
- }
- FullScoreReturn ret;
- ret.ngram_length = MatchedLength(*sri_, new_word, const_history);
- out_state.history_[0] = new_word;
- out_state.valid_length_ = std::min<unsigned char>(ret.ngram_length, Order() - 1);
- std::copy(const_history, const_history + out_state.valid_length_ - 1, out_state.history_ + 1);
- if (out_state.valid_length_ < kMaxOrder - 1) {
- out_state.history_[out_state.valid_length_] = Vocab_None;
- }
- ret.prob = sri_->wordProb(new_word, const_history);
- return ret;
-}
-
-} // namespace sri
-} // namespace lm
diff --git a/kenlm/lm/sri.hh b/kenlm/lm/sri.hh
deleted file mode 100644
index b57e9b73a..000000000
--- a/kenlm/lm/sri.hh
+++ /dev/null
@@ -1,102 +0,0 @@
-#ifndef LM_SRI__
-#define LM_SRI__
-
-#include "lm/facade.hh"
-#include "util/murmur_hash.hh"
-
-#include <cmath>
-#include <exception>
-#include <memory>
-
-class Ngram;
-class Vocab;
-
-/* The ngram length reported uses some random API I found and may be wrong.
- *
- * See ngram, which should return equivalent results.
- */
-
-namespace lm {
-namespace sri {
-
-static const unsigned int kMaxOrder = 6;
-
-/* This should match VocabIndex found in SRI's Vocab.h
- * The reason I define this here independently is that SRI's headers
- * pollute and increase compile time.
- * It's difficult to extract this from their header and anyway would
- * break packaging.
- * If these differ there will be a compiler error in ActuallyCall.
- */
-typedef unsigned int SRIVocabIndex;
-
-class State {
- public:
- // You shouldn't need to touch these, but they're public so State will be a POD.
- // If valid_length_ < kMaxOrder - 1 then history_[valid_length_] == Vocab_None.
- SRIVocabIndex history_[kMaxOrder - 1];
- unsigned char valid_length_;
-};
-
-inline bool operator==(const State &left, const State &right) {
- if (left.valid_length_ != right.valid_length_) {
- return false;
- }
- for (const SRIVocabIndex *l = left.history_, *r = right.history_;
- l != left.history_ + left.valid_length_;
- ++l, ++r) {
- if (*l != *r) return false;
- }
- return true;
-}
-
-inline size_t hash_value(const State &state) {
- return util::MurmurHashNative(&state.history_, sizeof(SRIVocabIndex) * state.valid_length_);
-}
-
-class Vocabulary : public base::Vocabulary {
- public:
- Vocabulary();
-
- ~Vocabulary();
-
- WordIndex Index(const StringPiece &str) const {
- std::string temp(str.data(), str.length());
- return Index(temp.c_str());
- }
- WordIndex Index(const std::string &str) const {
- return Index(str.c_str());
- }
- WordIndex Index(const char *str) const;
-
- const char *Word(WordIndex index) const;
-
- private:
- friend class Model;
- void FinishedLoading();
-
- // The parent class isn't copyable so auto_ptr is the same as scoped_ptr
- // but without the boost dependence.
- mutable std::auto_ptr<Vocab> sri_;
-};
-
-class Model : public base::ModelFacade<Model, State, Vocabulary> {
- public:
- Model(const char *file_name, unsigned int ngram_length);
-
- ~Model();
-
- FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const;
-
- private:
- Vocabulary vocab_;
-
- mutable std::auto_ptr<Ngram> sri_;
-
- WordIndex not_found_;
-};
-
-} // namespace sri
-} // namespace lm
-
-#endif // LM_SRI__
diff --git a/kenlm/lm/trie.hh b/kenlm/lm/trie.hh
index ebe9910f0..06cc96ac4 100644
--- a/kenlm/lm/trie.hh
+++ b/kenlm/lm/trie.hh
@@ -1,7 +1,7 @@
#ifndef LM_TRIE__
#define LM_TRIE__
-#include <stdint.h>
+#include <inttypes.h>
#include <cstddef>
diff --git a/kenlm/lm/trie_sort.cc b/kenlm/lm/trie_sort.cc
index bb126f18c..9d1d5f27f 100644
--- a/kenlm/lm/trie_sort.cc
+++ b/kenlm/lm/trie_sort.cc
@@ -14,6 +14,7 @@
#include <algorithm>
#include <cstring>
#include <cstdio>
+#include <cstdlib>
#include <deque>
#include <limits>
#include <vector>
@@ -22,14 +23,6 @@ namespace lm {
namespace ngram {
namespace trie {
-const char *kContextSuffix = "_contexts";
-
-FILE *OpenOrThrow(const char *name, const char *mode) {
- FILE *ret = fopen(name, mode);
- if (!ret) UTIL_THROW(util::ErrnoException, "Could not open " << name << " for " << mode);
- return ret;
-}
-
void WriteOrThrow(FILE *to, const void *data, size_t size) {
assert(size);
if (1 != std::fwrite(data, size, 1, to)) UTIL_THROW(util::ErrnoException, "Short write; requested size " << size);
@@ -78,16 +71,13 @@ class PartialViewProxy {
typedef util::ProxyIterator<PartialViewProxy> PartialIter;
-std::string DiskFlush(const void *mem_begin, const void *mem_end, const std::string &file_prefix, std::size_t batch, unsigned char order) {
- std::stringstream assembled;
- assembled << file_prefix << static_cast<unsigned int>(order) << '_' << batch;
- std::string ret(assembled.str());
- util::scoped_fd out(util::CreateOrThrow(ret.c_str()));
- util::WriteOrThrow(out.get(), mem_begin, (uint8_t*)mem_end - (uint8_t*)mem_begin);
- return ret;
+FILE *DiskFlush(const void *mem_begin, const void *mem_end, const util::TempMaker &maker) {
+ util::scoped_fd file(maker.Make());
+ util::WriteOrThrow(file.get(), mem_begin, (uint8_t*)mem_end - (uint8_t*)mem_begin);
+ return util::FDOpenOrThrow(file);
}
-void WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &ngram_file_name, std::size_t entry_size, unsigned char order) {
+FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const util::TempMaker &maker, std::size_t entry_size, unsigned char order) {
const size_t context_size = sizeof(WordIndex) * (order - 1);
// Sort just the contexts using the same memory.
PartialIter context_begin(PartialViewProxy(begin + sizeof(WordIndex), entry_size, context_size));
@@ -95,11 +85,10 @@ void WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &ngram_fil
std::sort(context_begin, context_end, util::SizedCompare<EntryCompare, PartialViewProxy>(EntryCompare(order - 1)));
- std::string name(ngram_file_name + kContextSuffix);
- util::scoped_FILE out(OpenOrThrow(name.c_str(), "w"));
+ util::scoped_FILE out(maker.MakeFile());
// Write out to file and uniqueify at the same time. Could have used unique_copy if there was an appropriate OutputIterator.
- if (context_begin == context_end) return;
+ if (context_begin == context_end) return out.release();
PartialIter i(context_begin);
WriteOrThrow(out.get(), i->Data(), context_size);
const void *previous = i->Data();
@@ -110,6 +99,7 @@ void WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &ngram_fil
previous = i->Data();
}
}
+ return out.release();
}
struct ThrowCombine {
@@ -125,14 +115,12 @@ struct FirstCombine {
}
};
-template <class Combine> void MergeSortedFiles(const std::string &first_name, const std::string &second_name, const std::string &out, std::size_t weights_size, unsigned char order, const Combine &combine = ThrowCombine()) {
+template <class Combine> FILE *MergeSortedFiles(FILE *first_file, FILE *second_file, const util::TempMaker &maker, std::size_t weights_size, unsigned char order, const Combine &combine) {
std::size_t entry_size = sizeof(WordIndex) * order + weights_size;
RecordReader first, second;
- first.Init(first_name.c_str(), entry_size);
- util::RemoveOrThrow(first_name.c_str());
- second.Init(second_name.c_str(), entry_size);
- util::RemoveOrThrow(second_name.c_str());
- util::scoped_FILE out_file(OpenOrThrow(out.c_str(), "w"));
+ first.Init(first_file, entry_size);
+ second.Init(second_file, entry_size);
+ util::scoped_FILE out_file(maker.MakeFile());
EntryCompare less(order);
while (first && second) {
if (less(first.Data(), second.Data())) {
@@ -149,67 +137,14 @@ template <class Combine> void MergeSortedFiles(const std::string &first_name, co
for (RecordReader &remains = (first ? first : second); remains; ++remains) {
WriteOrThrow(out_file.get(), remains.Data(), entry_size);
}
-}
-
-void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, util::scoped_memory &mem, const std::string &file_prefix, unsigned char order, PositiveProbWarn &warn) {
- ReadNGramHeader(f, order);
- const size_t count = counts[order - 1];
- // Size of weights. Does it include backoff?
- const size_t words_size = sizeof(WordIndex) * order;
- const size_t weights_size = sizeof(float) + ((order == counts.size()) ? 0 : sizeof(float));
- const size_t entry_size = words_size + weights_size;
- const size_t batch_size = std::min(count, mem.size() / entry_size);
- uint8_t *const begin = reinterpret_cast<uint8_t*>(mem.get());
- std::deque<std::string> files;
- for (std::size_t batch = 0, done = 0; done < count; ++batch) {
- uint8_t *out = begin;
- uint8_t *out_end = out + std::min(count - done, batch_size) * entry_size;
- if (order == counts.size()) {
- for (; out != out_end; out += entry_size) {
- ReadNGram(f, order, vocab, reinterpret_cast<WordIndex*>(out), *reinterpret_cast<Prob*>(out + words_size), warn);
- }
- } else {
- for (; out != out_end; out += entry_size) {
- ReadNGram(f, order, vocab, reinterpret_cast<WordIndex*>(out), *reinterpret_cast<ProbBackoff*>(out + words_size), warn);
- }
- }
- // Sort full records by full n-gram.
- util::SizedProxy proxy_begin(begin, entry_size), proxy_end(out_end, entry_size);
- // parallel_sort uses too much RAM
- std::sort(NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare<EntryCompare>(EntryCompare(order)));
- files.push_back(DiskFlush(begin, out_end, file_prefix, batch, order));
- WriteContextFile(begin, out_end, files.back(), entry_size, order);
-
- done += (out_end - begin) / entry_size;
- }
-
- // All individual files created. Merge them.
-
- std::size_t merge_count = 0;
- while (files.size() > 1) {
- std::stringstream assembled;
- assembled << file_prefix << static_cast<unsigned int>(order) << "_merge_" << (merge_count++);
- files.push_back(assembled.str());
- MergeSortedFiles(files[0], files[1], files.back(), weights_size, order, ThrowCombine());
- MergeSortedFiles(files[0] + kContextSuffix, files[1] + kContextSuffix, files.back() + kContextSuffix, 0, order - 1, FirstCombine());
- files.pop_front();
- files.pop_front();
- }
- if (!files.empty()) {
- std::stringstream assembled;
- assembled << file_prefix << static_cast<unsigned int>(order) << "_merged";
- std::string merged_name(assembled.str());
- if (std::rename(files[0].c_str(), merged_name.c_str())) UTIL_THROW(util::ErrnoException, "Could not rename " << files[0].c_str() << " to " << merged_name.c_str());
- std::string context_name = files[0] + kContextSuffix;
- merged_name += kContextSuffix;
- if (std::rename(context_name.c_str(), merged_name.c_str())) UTIL_THROW(util::ErrnoException, "Could not rename " << context_name << " to " << merged_name.c_str());
- }
+ return out_file.release();
}
} // namespace
-void RecordReader::Init(const std::string &name, std::size_t entry_size) {
- file_.reset(OpenOrThrow(name.c_str(), "r+"));
+void RecordReader::Init(FILE *file, std::size_t entry_size) {
+ rewind(file);
+ file_ = file;
data_.reset(malloc(entry_size));
UTIL_THROW_IF(!data_.get(), util::ErrnoException, "Failed to malloc read buffer");
remains_ = true;
@@ -219,20 +154,26 @@ void RecordReader::Init(const std::string &name, std::size_t entry_size) {
void RecordReader::Overwrite(const void *start, std::size_t amount) {
long internal = (uint8_t*)start - (uint8_t*)data_.get();
- UTIL_THROW_IF(fseek(file_.get(), internal - entry_size_, SEEK_CUR), util::ErrnoException, "Couldn't seek backwards for revision");
- WriteOrThrow(file_.get(), start, amount);
+ UTIL_THROW_IF(fseek(file_, internal - entry_size_, SEEK_CUR), util::ErrnoException, "Couldn't seek backwards for revision");
+ WriteOrThrow(file_, start, amount);
long forward = entry_size_ - internal - amount;
- if (forward) UTIL_THROW_IF(fseek(file_.get(), forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision");
+ if (forward) UTIL_THROW_IF(fseek(file_, forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision");
}
-void ARPAToSortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) {
+void RecordReader::Rewind() {
+ rewind(file_);
+ remains_ = true;
+ ++*this;
+}
+
+SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) {
+ util::TempMaker maker(file_prefix);
PositiveProbWarn warn(config.positive_log_probability);
+ unigram_.reset(maker.Make());
{
- std::string unigram_name = file_prefix + "unigrams";
- util::scoped_fd unigram_file;
// In case <unk> appears.
- size_t file_out = (counts[0] + 1) * sizeof(ProbBackoff);
- util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_name.c_str(), file_out, unigram_file), file_out);
+ size_t size_out = (counts[0] + 1) * sizeof(ProbBackoff);
+ util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_.get(), size_out), size_out);
Read1Grams(f, counts[0], vocab, reinterpret_cast<ProbBackoff*>(unigram_mmap.get()), warn);
CheckSpecials(config, vocab);
if (!vocab.SawUnk()) ++counts[0];
@@ -246,16 +187,91 @@ void ARPAToSortedFiles(const Config &config, util::FilePiece &f, std::vector<uin
buffer_use = std::max<size_t>(buffer_use, static_cast<size_t>((sizeof(WordIndex) * counts.size() + sizeof(float)) * counts.back()));
buffer = std::min<size_t>(buffer, buffer_use);
- util::scoped_memory mem;
- mem.reset(malloc(buffer), buffer, util::scoped_memory::MALLOC_ALLOCATED);
+ util::scoped_malloc mem;
+ mem.reset(malloc(buffer));
if (!mem.get()) UTIL_THROW(util::ErrnoException, "malloc failed for sort buffer size " << buffer);
for (unsigned char order = 2; order <= counts.size(); ++order) {
- ConvertToSorted(f, vocab, counts, mem, file_prefix, order, warn);
+ ConvertToSorted(f, vocab, counts, maker, order, warn, mem.get(), buffer);
}
ReadEnd(f);
}
+namespace {
+class Closer {
+ public:
+ explicit Closer(std::deque<FILE*> &files) : files_(files) {}
+
+ ~Closer() {
+ for (std::deque<FILE*>::iterator i = files_.begin(); i != files_.end(); ++i) {
+ util::scoped_FILE deleter(*i);
+ }
+ }
+
+ void PopFront() {
+ util::scoped_FILE deleter(files_.front());
+ files_.pop_front();
+ }
+ private:
+ std::deque<FILE*> &files_;
+};
+} // namespace
+
+void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const util::TempMaker &maker, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size) {
+ ReadNGramHeader(f, order);
+ const size_t count = counts[order - 1];
+ // Size of weights. Does it include backoff?
+ const size_t words_size = sizeof(WordIndex) * order;
+ const size_t weights_size = sizeof(float) + ((order == counts.size()) ? 0 : sizeof(float));
+ const size_t entry_size = words_size + weights_size;
+ const size_t batch_size = std::min(count, mem_size / entry_size);
+ uint8_t *const begin = reinterpret_cast<uint8_t*>(mem);
+
+ std::deque<FILE*> files, contexts;
+ Closer files_closer(files), contexts_closer(contexts);
+
+ for (std::size_t batch = 0, done = 0; done < count; ++batch) {
+ uint8_t *out = begin;
+ uint8_t *out_end = out + std::min(count - done, batch_size) * entry_size;
+ if (order == counts.size()) {
+ for (; out != out_end; out += entry_size) {
+ ReadNGram(f, order, vocab, reinterpret_cast<WordIndex*>(out), *reinterpret_cast<Prob*>(out + words_size), warn);
+ }
+ } else {
+ for (; out != out_end; out += entry_size) {
+ ReadNGram(f, order, vocab, reinterpret_cast<WordIndex*>(out), *reinterpret_cast<ProbBackoff*>(out + words_size), warn);
+ }
+ }
+ // Sort full records by full n-gram.
+ util::SizedProxy proxy_begin(begin, entry_size), proxy_end(out_end, entry_size);
+ // parallel_sort uses too much RAM
+ std::sort(NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare<EntryCompare>(EntryCompare(order)));
+ files.push_back(DiskFlush(begin, out_end, maker));
+ contexts.push_back(WriteContextFile(begin, out_end, maker, entry_size, order));
+
+ done += (out_end - begin) / entry_size;
+ }
+
+ // All individual files created. Merge them.
+
+ while (files.size() > 1) {
+ files.push_back(MergeSortedFiles(files[0], files[1], maker, weights_size, order, ThrowCombine()));
+ files_closer.PopFront();
+ files_closer.PopFront();
+ contexts.push_back(MergeSortedFiles(contexts[0], contexts[1], maker, 0, order - 1, FirstCombine()));
+ contexts_closer.PopFront();
+ contexts_closer.PopFront();
+ }
+
+ if (!files.empty()) {
+ // Steal from closers.
+ full_[order - 2].reset(files.front());
+ files.pop_front();
+ context_[order - 2].reset(contexts.front());
+ contexts.pop_front();
+ }
+}
+
} // namespace trie
} // namespace ngram
} // namespace lm
diff --git a/kenlm/lm/trie_sort.hh b/kenlm/lm/trie_sort.hh
index c57b36186..a8e552e6e 100644
--- a/kenlm/lm/trie_sort.hh
+++ b/kenlm/lm/trie_sort.hh
@@ -1,6 +1,9 @@
+// Step of trie builder: create sorted files.
+
#ifndef LM_TRIE_SORT__
#define LM_TRIE_SORT__
+#include "lm/max_order.hh"
#include "lm/word_index.hh"
#include "util/file.hh"
@@ -11,20 +14,21 @@
#include <string>
#include <vector>
-#include <stdint.h>
+#include <inttypes.h>
-namespace util { class FilePiece; }
+namespace util {
+class FilePiece;
+class TempMaker;
+} // namespace util
-// Step of trie builder: create sorted files.
namespace lm {
+class PositiveProbWarn;
namespace ngram {
class SortedVocabulary;
class Config;
namespace trie {
-extern const char *kContextSuffix;
-FILE *OpenOrThrow(const char *name, const char *mode);
void WriteOrThrow(FILE *to, const void *data, size_t size);
class EntryCompare : public std::binary_function<const void*, const void*, bool> {
@@ -49,15 +53,15 @@ class RecordReader {
public:
RecordReader() : remains_(true) {}
- void Init(const std::string &name, std::size_t entry_size);
+ void Init(FILE *file, std::size_t entry_size);
void *Data() { return data_.get(); }
const void *Data() const { return data_.get(); }
RecordReader &operator++() {
- std::size_t ret = fread(data_.get(), entry_size_, 1, file_.get());
+ std::size_t ret = fread(data_.get(), entry_size_, 1, file_);
if (!ret) {
- UTIL_THROW_IF(!feof(file_.get()), util::ErrnoException, "Error reading temporary file");
+ UTIL_THROW_IF(!feof(file_), util::ErrnoException, "Error reading temporary file");
remains_ = false;
}
return *this;
@@ -65,27 +69,46 @@ class RecordReader {
operator bool() const { return remains_; }
- void Rewind() {
- rewind(file_.get());
- remains_ = true;
- ++*this;
- }
+ void Rewind();
std::size_t EntrySize() const { return entry_size_; }
void Overwrite(const void *start, std::size_t amount);
private:
+ FILE *file_;
+
util::scoped_malloc data_;
bool remains_;
std::size_t entry_size_;
-
- util::scoped_FILE file_;
};
-void ARPAToSortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab);
+class SortedFiles {
+ public:
+ // Build from ARPA
+ SortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, std::size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab);
+
+ int StealUnigram() {
+ return unigram_.release();
+ }
+
+ FILE *Full(unsigned char order) {
+ return full_[order - 2].get();
+ }
+
+ FILE *Context(unsigned char of_order) {
+ return context_[of_order - 2].get();
+ }
+
+ private:
+ void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const util::TempMaker &maker, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size);
+
+ util::scoped_fd unigram_;
+
+ util::scoped_FILE full_[kMaxOrder - 1], context_[kMaxOrder - 1];
+};
} // namespace trie
} // namespace ngram
diff --git a/kenlm/lm/vocab.cc b/kenlm/lm/vocab.cc
index 12e19f0ec..5ac828178 100644
--- a/kenlm/lm/vocab.cc
+++ b/kenlm/lm/vocab.cc
@@ -6,6 +6,7 @@
#include "lm/config.hh"
#include "lm/weights.hh"
#include "util/exception.hh"
+#include "util/file.hh"
#include "util/joint_sort.hh"
#include "util/murmur_hash.hh"
#include "util/probing_hash_table.hh"
@@ -29,7 +30,7 @@ const uint64_t kUnknownHash = detail::HashForVocab("<unk>", 5);
// Sadly some LMs have <UNK>.
const uint64_t kUnknownCapHash = detail::HashForVocab("<UNK>", 5);
-WordIndex ReadWords(FD fd, EnumerateVocab *enumerate) {
+WordIndex ReadWords(int fd, EnumerateVocab *enumerate) {
if (!enumerate) return std::numeric_limits<WordIndex>::max();
const std::size_t kInitialRead = 16384;
std::string buf;
@@ -37,23 +38,12 @@ WordIndex ReadWords(FD fd, EnumerateVocab *enumerate) {
buf.resize(kInitialRead);
WordIndex index = 0;
while (true) {
-#ifdef WIN32
- ssize_t got;
-#else
- ssize_t got = read(fd, &buf[0], kInitialRead);
-#endif
- UTIL_THROW_IF(got == -1, util::ErrnoException, "Reading vocabulary words");
+ std::size_t got = util::ReadOrEOF(fd, &buf[0], kInitialRead);
if (got == 0) return index;
buf.resize(got);
while (buf[buf.size() - 1]) {
char next_char;
-#ifdef WIN32
- ssize_t ret;
-#else
- ssize_t ret = read(fd, &next_char, 1);
-#endif
- UTIL_THROW_IF(ret == -1, util::ErrnoException, "Reading vocabulary words");
- UTIL_THROW_IF(ret == 0, FormatLoadException, "Missing null terminator on a vocab word.");
+ util::ReadOrThrow(fd, &next_char, 1);
buf.push_back(next_char);
}
// Ok now we have null terminated strings.
@@ -76,12 +66,8 @@ void WriteWordsWrapper::Add(WordIndex index, const StringPiece &str) {
buffer_.push_back(0);
}
-void WriteWordsWrapper::Write(FD fd) {
-#ifdef WIN32
-#else
- if ((off_t)-1 == lseek(fd, 0, SEEK_END))
- UTIL_THROW(util::ErrnoException, "Failed to seek in binary to vocab words");
-#endif
+void WriteWordsWrapper::Write(int fd) {
+ util::SeekEnd(fd);
util::WriteOrThrow(fd, buffer_.data(), buffer_.size());
}
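WriteWordsWrapper::Add appends each word followed by a NUL byte, and Write seeks to the end of the binary file before appending the whole buffer; ReadWords recovers indices on load by splitting the block back apart on those NULs. A small in-memory sketch of that round trip (file descriptors replaced by a std::string buffer):

// Sketch of the NUL-delimited word block that WriteWordsWrapper builds and
// ReadWords parses back; I/O is replaced by an in-memory buffer here.
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

int main() {
  // "Add": append each word followed by a 0 byte, in index order.
  std::vector<std::string> words = {"<unk>", "<s>", "</s>", "example"};
  std::string buffer;
  for (const std::string &w : words) {
    buffer.append(w);
    buffer.push_back('\0');
  }
  // "ReadWords": walk the block and recover (index, word) pairs.
  uint32_t index = 0;
  std::size_t start = 0;
  while (start < buffer.size()) {
    std::size_t end = buffer.find('\0', start);
    std::cout << index++ << " -> " << buffer.substr(start, end - start) << "\n";
    start = end + 1;
  }
  return 0;
}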
@@ -142,10 +128,11 @@ void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) {
bound_ = end_ - begin_ + 1;
}
-void SortedVocabulary::LoadedBinary(FD fd, EnumerateVocab *to) {
+void SortedVocabulary::LoadedBinary(int fd, EnumerateVocab *to) {
end_ = begin_ + *(reinterpret_cast<const uint64_t*>(begin_) - 1);
ReadWords(fd, to);
SetSpecial(Index("<s>"), Index("</s>"), 0);
+ bound_ = end_ - begin_ + 1;
}
namespace {
@@ -163,12 +150,12 @@ struct ProbingVocabularyHeader {
ProbingVocabulary::ProbingVocabulary() : enumerate_(NULL) {}
std::size_t ProbingVocabulary::Size(std::size_t entries, const Config &config) {
- return Align8(sizeof(detail::ProbingVocabularyHeader)) + Lookup::Size(entries, config.probing_multiplier);
+ return ALIGN8(sizeof(detail::ProbingVocabularyHeader)) + Lookup::Size(entries, config.probing_multiplier);
}
void ProbingVocabulary::SetupMemory(void *start, std::size_t allocated, std::size_t /*entries*/, const Config &/*config*/) {
header_ = static_cast<detail::ProbingVocabularyHeader*>(start);
- lookup_ = Lookup(static_cast<uint8_t*>(start) + Align8(sizeof(detail::ProbingVocabularyHeader)), allocated);
+ lookup_ = Lookup(static_cast<uint8_t*>(start) + ALIGN8(sizeof(detail::ProbingVocabularyHeader)), allocated);
bound_ = 1;
saw_unk_ = false;
}
@@ -200,7 +187,7 @@ void ProbingVocabulary::FinishedLoading(ProbBackoff * /*reorder_vocab*/) {
SetSpecial(Index("<s>"), Index("</s>"), 0);
}
-void ProbingVocabulary::LoadedBinary(FD fd, EnumerateVocab *to) {
+void ProbingVocabulary::LoadedBinary(int fd, EnumerateVocab *to) {
UTIL_THROW_IF(header_->version != kProbingVocabularyVersion, FormatLoadException, "The binary file has probing version " << header_->version << " but the code expects version " << kProbingVocabularyVersion << ". Please rerun build_binary using the same version of the code.");
lookup_.LoadedBinary();
ReadWords(fd, to);
diff --git a/kenlm/lm/vocab.hh b/kenlm/lm/vocab.hh
index 26de3647d..3c3414fb9 100644
--- a/kenlm/lm/vocab.hh
+++ b/kenlm/lm/vocab.hh
@@ -8,7 +8,6 @@
#include "util/probing_hash_table.hh"
#include "util/sorted_uniform.hh"
#include "util/string_piece.hh"
-#include "util/portability.hh"
#include <limits>
#include <string>
@@ -37,7 +36,7 @@ class WriteWordsWrapper : public EnumerateVocab {
void Add(WordIndex index, const StringPiece &str);
- void Write(FD fd);
+ void Write(int fd);
private:
EnumerateVocab *inner_;
@@ -67,7 +66,6 @@ class SortedVocabulary : public base::Vocabulary {
static size_t Size(std::size_t entries, const Config &config);
// Vocab words are [0, Bound()) Only valid after FinishedLoading/LoadedBinary.
- // While this number is correct, ProbingVocabulary::Bound might not be correct in some cases.
WordIndex Bound() const { return bound_; }
// Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
@@ -85,7 +83,7 @@ class SortedVocabulary : public base::Vocabulary {
bool SawUnk() const { return saw_unk_; }
- void LoadedBinary(FD fd, EnumerateVocab *to);
+ void LoadedBinary(int fd, EnumerateVocab *to);
private:
uint64_t *begin_, *end_;
@@ -128,7 +126,7 @@ class ProbingVocabulary : public base::Vocabulary {
bool SawUnk() const { return saw_unk_; }
- void LoadedBinary(FD fd, EnumerateVocab *to);
+ void LoadedBinary(int fd, EnumerateVocab *to);
private:
// std::identity is an SGI extension :-(
diff --git a/kenlm/util/bit_packing.hh b/kenlm/util/bit_packing.hh
index ba3b8529e..62b1f9ea2 100644
--- a/kenlm/util/bit_packing.hh
+++ b/kenlm/util/bit_packing.hh
@@ -1,35 +1,37 @@
#ifndef UTIL_BIT_PACKING__
#define UTIL_BIT_PACKING__
-/* Bit-level packing routines */
+/* Bit-level packing routines
+ *
+ * WARNING WARNING WARNING:
+ * The write functions assume that memory is zero initially. This makes them
+ * faster and is the appropriate case for mmapped language model construction.
+ * These routines assume that unaligned access to uint64_t is fast. This is
+ * the case on x86_64. I'm not sure how fast unaligned 64-bit access is on
+ * x86 but my target audience is large language models for which 64-bit is
+ * necessary.
+ *
+ * Call the BitPackingSanity function to sanity check. Calling once suffices,
+ * but it may be called multiple times when that's inconvenient.
+ *
+ * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at
+ * NICT.
+ */
#include <assert.h>
#ifdef __APPLE__
#include <architecture/byte_order.h>
#elif __linux__
#include <endian.h>
-#elif WIN32
- // TODO WIN32
-#else
+#elif !defined(_WIN32) && !defined(_WIN64)
#include <arpa/nameser_compat.h>
#endif
-#include <stdint.h>
-
-namespace util {
+#include <inttypes.h>
-/* WARNING WARNING WARNING:
- * The write functions assume that memory is zero initially. This makes them
- * faster and is the appropriate case for mmapped language model construction.
- * These routines assume that unaligned access to uint64_t is fast and that
- * storage is little endian. This is the case on x86_64. I'm not sure how
- * fast unaligned 64-bit access is on x86 but my target audience is large
- * language models for which 64-bit is necessary.
- *
- * Call the BitPackingSanity function to sanity check. Calling once suffices,
- * but it may be called multiple times when that's inconvenient.
- */
+#include <string.h>
+namespace util {
// Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct.
#if BYTE_ORDER == LITTLE_ENDIAN
@@ -59,8 +61,16 @@ inline uint64_t ReadInt57(const void *base, uint64_t bit_off, uint8_t length, ui
* Assumes the memory is zero initially.
*/
inline void WriteInt57(void *base, uint64_t bit_off, uint8_t length, uint64_t value) {
+#if defined(__arm) || defined(__arm__)
+ uint8_t *base_off = reinterpret_cast<uint8_t*>(base) + (bit_off >> 3);
+ uint64_t value64;
+ memcpy(&value64, base_off, sizeof(value64));
+ value64 |= (value << BitPackShift(bit_off & 7, length));
+ memcpy(base_off, &value64, sizeof(value64));
+#else
*reinterpret_cast<uint64_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |=
(value << BitPackShift(bit_off & 7, length));
+#endif
}
/* Same caveats as above, but for a 25 bit limit. */
@@ -69,8 +79,16 @@ inline uint32_t ReadInt25(const void *base, uint64_t bit_off, uint8_t length, ui
}
inline void WriteInt25(void *base, uint64_t bit_off, uint8_t length, uint32_t value) {
+#if defined(__arm) || defined(__arm__)
+ uint8_t *base_off = reinterpret_cast<uint8_t*>(base) + (bit_off >> 3);
+ uint32_t value32;
+ memcpy(&value32, base_off, sizeof(value32));
+ value32 |= (value << BitPackShift(bit_off & 7, length));
+ memcpy(base_off, &value32, sizeof(value32));
+#else
*reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |=
(value << BitPackShift(bit_off & 7, length));
+#endif
}
typedef union { float f; uint32_t i; } FloatEnc;
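The new __arm__ branches above replace a direct unaligned 64-bit (or 32-bit) store with a memcpy load/modify/store so strict-alignment targets don't fault. A minimal sketch contrasting the two forms on zero-initialized buffers; the byte offset and value are arbitrary and BitPackShift is not reproduced here:

// Sketch contrasting the direct unaligned store with the memcpy form used in
// the __arm__ branches; both operate on zero-initialized memory, as WriteInt57
// requires.
#include <cstdint>
#include <cstring>
#include <iostream>

int main() {
  uint64_t value = 0x1234567890ULL;
  std::size_t byte_off = 3;                 // deliberately unaligned

  // Direct form: fine on x86-64, but may fault or be slow on strict-alignment CPUs.
  unsigned char direct[16] = {0};
  *reinterpret_cast<uint64_t*>(direct + byte_off) |= value;

  // memcpy form: load the word, OR in the value, store it back.
  unsigned char safe[16] = {0};
  uint64_t word;
  std::memcpy(&word, safe + byte_off, sizeof(word));
  word |= value;
  std::memcpy(safe + byte_off, &word, sizeof(word));

  std::cout << (std::memcmp(direct, safe, sizeof(direct)) == 0 ? "same bytes\n" : "differ\n");
  return 0;
}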
diff --git a/kenlm/util/exception.cc b/kenlm/util/exception.cc
index ebe06e424..c4f8c04ce 100644
--- a/kenlm/util/exception.cc
+++ b/kenlm/util/exception.cc
@@ -1,5 +1,4 @@
#include "util/exception.hh"
-#include "util/portability.hh"
#ifdef __GXX_RTTI
#include <typeinfo>
@@ -67,11 +66,8 @@ const char *HandleStrerror(const char *ret, const char * /*buf*/) {
ErrnoException::ErrnoException() throw() : errno_(errno) {
char buf[200];
buf[0] = 0;
-#ifdef sun
+#if defined(sun) || defined(_WIN32) || defined(_WIN64)
const char *add = strerror(errno);
-#elif WIN32
- // TODO WIN32
- const char *add;
#else
const char *add = HandleStrerror(strerror_r(errno, buf, 200), buf);
#endif
diff --git a/kenlm/util/file.cc b/kenlm/util/file.cc
index 8da6fda62..18b7934ca 100644
--- a/kenlm/util/file.cc
+++ b/kenlm/util/file.cc
@@ -9,12 +9,17 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
-#include <stdint.h>
+#include <unistd.h>
+#include <inttypes.h>
+
+#if defined(_WIN32) || defined(_WIN64)
+#include <windows.h>
+#include <io.h>
+#endif
namespace util {
scoped_fd::~scoped_fd() {
- if (fd_ != kBadFD && close(fd_)) {
+ if (fd_ != -1 && close(fd_)) {
std::cerr << "Could not close file " << fd_ << std::endl;
std::abort();
}
@@ -27,69 +32,215 @@ scoped_FILE::~scoped_FILE() {
}
}
-FD OpenReadOrThrow(const char *name) {
- FD ret;
-#ifdef WIN32
-
+int OpenReadOrThrow(const char *name) {
+ int ret;
+#if defined(_WIN32) || defined(_WIN64)
+ UTIL_THROW_IF(-1 == (ret = _open(name, _O_BINARY | _O_RDONLY)), ErrnoException, "while opening " << name);
#else
UTIL_THROW_IF(-1 == (ret = open(name, O_RDONLY)), ErrnoException, "while opening " << name);
#endif
return ret;
}
-FD CreateOrThrow(const char *name) {
- FD ret;
-#ifdef WIN32
-
-#else
- UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR)), ErrnoException, "while creating " << name);
-#endif
- return ret;
-}
-
-off_t SizeFile(FD fd) {
-#ifdef WIN32
- return 0; // TODO WIN32
-
-#else
+uint64_t SizeFile(int fd) {
struct stat sb;
if (fstat(fd, &sb) == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize;
return sb.st_size;
-#endif
}
-void ReadOrThrow(FD fd, void *to_void, std::size_t amount) {
+void ReadOrThrow(int fd, void *to_void, std::size_t amount) {
uint8_t *to = static_cast<uint8_t*>(to_void);
while (amount) {
-
-#ifdef WIN32
- ssize_t ret; // TODO WIN32
-#else
ssize_t ret = read(fd, to, amount);
-#endif
- if (ret == -1) UTIL_THROW(ErrnoException, "Reading " << amount << " from fd " << fd << " failed.");
- if (ret == 0) UTIL_THROW(Exception, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read.");
+ UTIL_THROW_IF(ret == -1, ErrnoException, "Reading " << amount << " from fd " << fd << " failed.");
+ UTIL_THROW_IF(ret == 0, EndOfFileException, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read.");
amount -= ret;
to += ret;
}
}
-void WriteOrThrow(FD fd, const void *data_void, std::size_t size) {
+std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) {
+ uint8_t *to = static_cast<uint8_t*>(to_void);
+ std::size_t remaining = amount;
+ while (remaining) {
+ ssize_t ret = read(fd, to, remaining);
+ UTIL_THROW_IF(ret == -1, ErrnoException, "Reading " << remaining << " from fd " << fd << " failed.");
+ if (!ret) return amount - remaining;
+ remaining -= ret;
+ to += ret;
+ }
+ return amount;
+}
+
+void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
const uint8_t *data = static_cast<const uint8_t*>(data_void);
while (size) {
-#ifdef WIN32
- ssize_t ret; // TODO WIN32
-#else
ssize_t ret = write(fd, data, size);
-#endif
if (ret < 1) UTIL_THROW(util::ErrnoException, "Write failed");
data += ret;
size -= ret;
}
}
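ReadOrThrow, ReadOrEOF, and WriteOrThrow all share the same shape: POSIX read() and write() may transfer fewer bytes than requested, so the pointer advances and the remaining count shrinks until the transfer completes (or, for ReadOrEOF, until EOF). A POSIX-only sketch of that retry loop exercised over a pipe; the helper names are illustrative, not the util:: functions themselves:

// Minimal sketch of the retry loop behind ReadOrThrow/WriteOrThrow: read()
// and write() may move fewer bytes than asked, so loop until done.
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <unistd.h>

void WriteAll(int fd, const void *data_void, std::size_t size) {
  const uint8_t *data = static_cast<const uint8_t*>(data_void);
  while (size) {
    ssize_t ret = write(fd, data, size);
    if (ret < 1) throw std::runtime_error("write failed");
    data += ret;               // advance past what was accepted
    size -= ret;               // and retry with the remainder
  }
}

std::size_t ReadUpTo(int fd, void *to_void, std::size_t amount) {
  uint8_t *to = static_cast<uint8_t*>(to_void);
  std::size_t remaining = amount;
  while (remaining) {
    ssize_t ret = read(fd, to, remaining);
    if (ret == -1) throw std::runtime_error("read failed");
    if (ret == 0) break;       // EOF: report how much actually arrived
    to += ret;
    remaining -= ret;
  }
  return amount - remaining;
}

int main() {
  int fds[2];
  if (pipe(fds)) return 1;
  const char msg[] = "partial I/O demo";
  WriteAll(fds[1], msg, sizeof(msg));
  close(fds[1]);
  char buf[64];
  std::size_t got = ReadUpTo(fds[0], buf, sizeof(buf));  // stops at EOF, like ReadOrEOF
  close(fds[0]);
  return got == sizeof(msg) ? 0 : 1;
}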
-void RemoveOrThrow(const char *name) {
- UTIL_THROW_IF(std::remove(name), util::ErrnoException, "Could not remove " << name);
+namespace {
+void InternalSeek(int fd, off_t off, int whence) {
+ UTIL_THROW_IF((off_t)-1 == lseek(fd, off, whence), ErrnoException, "Seek failed");
+}
+} // namespace
+
+void SeekOrThrow(int fd, uint64_t off) {
+ InternalSeek(fd, off, SEEK_SET);
+}
+
+void AdvanceOrThrow(int fd, int64_t off) {
+ InternalSeek(fd, off, SEEK_CUR);
+}
+
+void SeekEnd(int fd) {
+ InternalSeek(fd, 0, SEEK_END);
+}
+
+std::FILE *FDOpenOrThrow(scoped_fd &file) {
+ std::FILE *ret = fdopen(file.get(), "r+b");
+ if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen");
+ file.release();
+ return ret;
+}
+
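FDOpenOrThrow is an ownership hand-off: once fdopen succeeds, the returned FILE* owns the descriptor, so the scoped_fd must release() it rather than close it in its destructor. A small standalone sketch of the same idea with a simplified RAII wrapper (not the real util::scoped_fd):

// Sketch of the fdopen ownership hand-off: after fdopen succeeds, the FILE*
// owns the descriptor, so the RAII wrapper releases it to avoid a double close.
#include <cstdio>
#include <stdexcept>
#include <fcntl.h>
#include <unistd.h>

struct ScopedFd {                       // stand-in for util::scoped_fd
  int fd_;
  explicit ScopedFd(int fd) : fd_(fd) {}
  ~ScopedFd() { if (fd_ != -1) close(fd_); }
  int get() const { return fd_; }
  int release() { int ret = fd_; fd_ = -1; return ret; }
};

std::FILE *ToStream(ScopedFd &file) {
  std::FILE *ret = fdopen(file.get(), "r+b");
  if (!ret) throw std::runtime_error("fdopen failed");
  file.release();                       // the FILE* now owns the fd
  return ret;
}

int main() {
  ScopedFd fd(open("/dev/null", O_RDWR));
  if (fd.get() == -1) return 1;
  std::FILE *f = ToStream(fd);
  std::fclose(f);                       // closes the underlying descriptor exactly once
  return 0;
}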
+TempMaker::TempMaker(const std::string &prefix) : base_(prefix) {
+ base_ += "XXXXXX";
+}
+
+// Sigh. Windows temporary file creation is full of race conditions.
+#if defined(_WIN32) || defined(_WIN64)
+/* mkstemp extracted from libc/sysdeps/posix/tempname.c. Copyright
+ (C) 1991-1999, 2000, 2001, 2006 Free Software Foundation, Inc.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version. */
+
+/* This has been modified from the original version to rename the function and
+ * set the Windows temporary flag. */
+
+static const char letters[] =
+"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+
+/* Generate a temporary file name based on TMPL. TMPL must match the
+ rules for mk[s]temp (i.e. end in "XXXXXX"). The name constructed
+ does not exist at the time of the call to mkstemp. TMPL is
+ overwritten with the result. */
+int
+mkstemp_and_unlink(char *tmpl)
+{
+ int len;
+ char *XXXXXX;
+ static unsigned long long value;
+ unsigned long long random_time_bits;
+ unsigned int count;
+ int fd = -1;
+ int save_errno = errno;
+
+ /* A lower bound on the number of temporary files to attempt to
+ generate. The maximum total number of temporary file names that
+ can exist for a given template is 62**6. It should never be
+ necessary to try all these combinations. Instead if a reasonable
+ number of names is tried (we define reasonable as 62**3) fail to
+ give the system administrator the chance to remove the problems. */
+#define ATTEMPTS_MIN (62 * 62 * 62)
+
+ /* The number of times to attempt to generate a temporary file. To
+ conform to POSIX, this must be no smaller than TMP_MAX. */
+#if ATTEMPTS_MIN < TMP_MAX
+ unsigned int attempts = TMP_MAX;
+#else
+ unsigned int attempts = ATTEMPTS_MIN;
+#endif
+
+ len = strlen (tmpl);
+ if (len < 6 || strcmp (&tmpl[len - 6], "XXXXXX"))
+ {
+ errno = EINVAL;
+ return -1;
+ }
+
+/* This is where the Xs start. */
+ XXXXXX = &tmpl[len - 6];
+
+ /* Get some more or less random data. */
+ {
+ SYSTEMTIME stNow;
+ FILETIME ftNow;
+
+ // get system time
+ GetSystemTime(&stNow);
+ stNow.wMilliseconds = 500;
+ if (!SystemTimeToFileTime(&stNow, &ftNow))
+ {
+ errno = -1;
+ return -1;
+ }
+
+ random_time_bits = (((unsigned long long)ftNow.dwHighDateTime << 32)
+ | (unsigned long long)ftNow.dwLowDateTime);
+ }
+ value += random_time_bits ^ (unsigned long long)GetCurrentThreadId ();
+
+ for (count = 0; count < attempts; value += 7777, ++count)
+ {
+ unsigned long long v = value;
+
+ /* Fill in the random bits. */
+ XXXXXX[0] = letters[v % 62];
+ v /= 62;
+ XXXXXX[1] = letters[v % 62];
+ v /= 62;
+ XXXXXX[2] = letters[v % 62];
+ v /= 62;
+ XXXXXX[3] = letters[v % 62];
+ v /= 62;
+ XXXXXX[4] = letters[v % 62];
+ v /= 62;
+ XXXXXX[5] = letters[v % 62];
+
+ /* Modified to unlink */
+// fd = open (tmpl, O_RDWR | O_CREAT | O_EXCL, _S_IREAD | _S_IWRITE);
+ fd = _open (tmpl, _O_RDWR | _O_CREAT | _O_TEMPORARY | _O_EXCL, _S_IREAD | _S_IWRITE);
+ if (fd >= 0)
+ {
+ errno = save_errno;
+ return fd;
+ }
+ else if (errno != EEXIST)
+ return -1;
+ }
+
+ /* We got out of the loop because we ran out of combinations to try. */
+ errno = EEXIST;
+ return -1;
+}
+#else
+int
+mkstemp_and_unlink(char *tmpl) {
+ int ret = mkstemp(tmpl);
+ if (ret == -1) return -1;
+ UTIL_THROW_IF(unlink(tmpl), util::ErrnoException, "Failed to delete " << tmpl);
+ return ret;
+}
+#endif
+
+int TempMaker::Make() const {
+ std::string copy(base_);
+ copy.push_back(0);
+ int ret;
+ UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(&copy[0])), util::ErrnoException, "Failed to make a temporary based on " << base_);
+ return ret;
+}
+
+std::FILE *TempMaker::MakeFile() const {
+ util::scoped_fd file(Make());
+ return FDOpenOrThrow(file);
}
} // namespace util
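TempMaker::Make appends XXXXXX to the prefix, lets mkstemp_and_unlink fill in the name, and returns a descriptor whose file has already been unlinked (or created with _O_TEMPORARY on Windows), so the scratch space disappears as soon as the descriptor closes. A POSIX-only sketch of that mkstemp-then-unlink idiom; the prefix and helper name are made up:

// POSIX-only sketch of the mkstemp-and-unlink idiom: after unlink() the file
// has no name, so its storage is reclaimed when the descriptor closes.
#include <stdexcept>
#include <string>
#include <stdlib.h>
#include <unistd.h>

int MakeAnonymousTemp(const std::string &prefix) {
  std::string name = prefix + "XXXXXX";        // template that mkstemp rewrites
  int fd = mkstemp(&name[0]);
  if (fd == -1) throw std::runtime_error("mkstemp failed for " + name);
  if (unlink(name.c_str())) {                  // drop the directory entry right away
    close(fd);
    throw std::runtime_error("unlink failed for " + name);
  }
  return fd;                                   // still usable through the descriptor
}

int main() {
  int fd = MakeAnonymousTemp("/tmp/sort_demo_");
  const char data[] = "scratch space";
  if (write(fd, data, sizeof(data)) != (ssize_t)sizeof(data)) { close(fd); return 1; }
  close(fd);                                   // storage released here
  return 0;
}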
diff --git a/kenlm/util/file.hh b/kenlm/util/file.hh
index 6adea6625..0dc9ea76b 100644
--- a/kenlm/util/file.hh
+++ b/kenlm/util/file.hh
@@ -1,38 +1,41 @@
#ifndef UTIL_FILE__
#define UTIL_FILE__
+#include <cstddef>
#include <cstdio>
-#include "util/portability.hh"
+#include <string>
+
+#include <inttypes.h>
namespace util {
class scoped_fd {
public:
- scoped_fd() : fd_(kBadFD) {}
+ scoped_fd() : fd_(-1) {}
- explicit scoped_fd(FD fd) : fd_(fd) {}
+ explicit scoped_fd(int fd) : fd_(fd) {}
~scoped_fd();
- void reset(FD to) {
+ void reset(int to) {
scoped_fd other(fd_);
fd_ = to;
}
- FD get() const { return fd_; }
+ int get() const { return fd_; }
- FD operator*() const { return fd_; }
+ int operator*() const { return fd_; }
- FD release() {
- FD ret = fd_;
- fd_ = kBadFD;
+ int release() {
+ int ret = fd_;
+ fd_ = -1;
return ret;
}
- operator bool() { return fd_ != kBadFD; }
+ operator bool() { return fd_ != -1; }
private:
- FD fd_;
+ int fd_;
scoped_fd(const scoped_fd &);
scoped_fd &operator=(const scoped_fd &);
@@ -52,22 +55,45 @@ class scoped_FILE {
file_ = to;
}
+ std::FILE *release() {
+ std::FILE *ret = file_;
+ file_ = NULL;
+ return ret;
+ }
+
private:
std::FILE *file_;
};
-FD OpenReadOrThrow(const char *name);
-
-FD CreateOrThrow(const char *name);
+int OpenReadOrThrow(const char *name);
// Return value for SizeFile when it can't size properly.
-const off_t kBadSize = -1;
-off_t SizeFile(FD fd);
+const uint64_t kBadSize = (uint64_t)-1;
+uint64_t SizeFile(int fd);
+
+void ReadOrThrow(int fd, void *to, std::size_t size);
+std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount);
+
+void WriteOrThrow(int fd, const void *data_void, std::size_t size);
-void ReadOrThrow(FD fd, void *to, std::size_t size);
-void WriteOrThrow(FD fd, const void *data_void, std::size_t size);
+// Seeking
+void SeekOrThrow(int fd, uint64_t off);
+void AdvanceOrThrow(int fd, int64_t off);
+void SeekEnd(int fd);
-void RemoveOrThrow(const char *name);
+std::FILE *FDOpenOrThrow(scoped_fd &file);
+
+class TempMaker {
+ public:
+ explicit TempMaker(const std::string &prefix);
+
+ int Make() const;
+
+ std::FILE *MakeFile() const;
+
+ private:
+ std::string base_;
+};
} // namespace util
diff --git a/kenlm/util/file_piece.cc b/kenlm/util/file_piece.cc
index bd3688690..d0101e129 100644
--- a/kenlm/util/file_piece.cc
+++ b/kenlm/util/file_piece.cc
@@ -2,6 +2,7 @@
#include "util/exception.hh"
#include "util/file.hh"
+#include "util/mmap.hh"
#include <iostream>
#include <string>
@@ -11,6 +12,9 @@
#include <ctype.h>
#include <fcntl.h>
#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
#ifdef HAVE_ZLIB
#include <zlib.h>
@@ -32,14 +36,14 @@ GZException::GZException(void *file) {
// Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale).
const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) :
- file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
+FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t min_buffer) :
+ file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(SizePage()),
progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
Initialize(name, show_progress, min_buffer);
}
-FilePiece::FilePiece(FD fd, const char *name, std::ostream *show_progress, off_t min_buffer) :
- file_(fd), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
+FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) :
+ file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()),
progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
Initialize(name, show_progress, min_buffer);
}
@@ -59,7 +63,7 @@ FilePiece::~FilePiece() {
}
StringPiece FilePiece::ReadLine(char delim) {
- size_t skip = 0;
+ std::size_t skip = 0;
while (true) {
for (const char *i = position_ + skip; i < position_end_; ++i) {
if (*i == delim) {
@@ -90,13 +94,13 @@ unsigned long int FilePiece::ReadULong() {
return ReadNumber<unsigned long int>();
}
-void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) {
+void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) {
#ifdef HAVE_ZLIB
gz_file_ = NULL;
#endif
file_name_ = name;
- default_map_size_ = page_ * std::max<off_t>((min_buffer / page_ + 1), 2);
+ default_map_size_ = page_ * std::max<std::size_t>((min_buffer / page_ + 1), 2);
position_ = NULL;
position_end_ = NULL;
mapped_offset_ = 0;
@@ -167,7 +171,7 @@ template <class T> T FilePiece::ReadNumber() {
}
const char *FilePiece::FindDelimiterOrEOF(const bool *delim) {
- size_t skip = 0;
+ std::size_t skip = 0;
while (true) {
for (const char *i = position_ + skip; i < position_end_; ++i) {
if (delim[static_cast<unsigned char>(*i)]) return i;
@@ -186,7 +190,7 @@ void FilePiece::Shift() {
progress_.Finished();
throw EndOfFileException();
}
- off_t desired_begin = position_ - data_.begin() + mapped_offset_;
+ uint64_t desired_begin = position_ - data_.begin() + mapped_offset_;
if (!fallback_to_read_) MMapShift(desired_begin);
// Notice an mmap failure might set the fallback.
@@ -197,18 +201,18 @@ void FilePiece::Shift() {
}
}
-void FilePiece::MMapShift(off_t desired_begin) {
+void FilePiece::MMapShift(uint64_t desired_begin) {
// Use mmap.
- off_t ignore = desired_begin % page_;
+ uint64_t ignore = desired_begin % page_;
// Duplicate request for Shift means give more data.
if (position_ == data_.begin() + ignore) {
default_map_size_ *= 2;
}
// Local version so that in case of failure it doesn't overwrite the class variable.
- off_t mapped_offset = desired_begin - ignore;
+ uint64_t mapped_offset = desired_begin - ignore;
- off_t mapped_size;
- if (default_map_size_ >= static_cast<size_t>(total_size_ - mapped_offset)) {
+ uint64_t mapped_size;
+ if (default_map_size_ >= static_cast<std::size_t>(total_size_ - mapped_offset)) {
at_end_ = true;
mapped_size = total_size_ - mapped_offset;
} else {
@@ -217,19 +221,11 @@ void FilePiece::MMapShift(off_t desired_begin) {
// Forcibly clear the existing mmap first.
data_.reset();
- data_.reset(mmap(NULL, mapped_size, PROT_READ, MAP_SHARED
- // Populate where available on linux
-#ifdef MAP_POPULATE
- | MAP_POPULATE
-#endif
- , *file_, mapped_offset), mapped_size, scoped_memory::MMAP_ALLOCATED);
- if (data_.get() == MAP_FAILED) {
+ try {
+ MapRead(POPULATE_OR_LAZY, *file_, mapped_offset, mapped_size, data_);
+ } catch (const util::ErrnoException &e) {
if (desired_begin) {
-#ifdef WIN32
-
-#else
- if (((off_t)-1) == lseek(*file_, desired_begin, SEEK_SET)) UTIL_THROW(ErrnoException, "mmap failed even though it worked before. lseek failed too, so using read isn't an option either.");
-#endif
+ SeekOrThrow(*file_, desired_begin);
}
// The mmap was scheduled to end the file, but now we're going to read it.
at_end_ = false;
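MMapShift has to map from a page-aligned offset, so it computes ignore = desired_begin % page_ and maps from desired_begin - ignore, then positions the read pointer ignore bytes into the window; a repeated Shift at the same position doubles default_map_size_. A tiny arithmetic sketch of that alignment with made-up numbers:

// Sketch of the page-alignment arithmetic in MMapShift: mmap offsets must be
// page-aligned, so map from the containing page and skip `ignore` bytes.
#include <cstdint>
#include <iostream>

int main() {
  const uint64_t page = 4096;                      // illustrative page size
  uint64_t desired_begin = 10000;                  // byte the parser wants next
  uint64_t ignore = desired_begin % page;          // 10000 % 4096 = 1808
  uint64_t mapped_offset = desired_begin - ignore; // 8192: page-aligned map start
  std::cout << "map at " << mapped_offset
            << ", position pointer starts " << ignore << " bytes in\n";
  return 0;
}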
@@ -254,15 +250,9 @@ void FilePiece::TransitionToRead() {
#ifdef HAVE_ZLIB
assert(!gz_file_);
-
-#ifdef WIN32
-
-#else
gz_file_ = gzdopen(file_.get(), "r");
UTIL_THROW_IF(!gz_file_, GZException, "zlib failed to open " << file_name_);
#endif
-
-#endif
}
void FilePiece::ReadShift() {
@@ -303,11 +293,7 @@ void FilePiece::ReadShift() {
if (read_return == -1) throw GZException(gz_file_);
if (total_size_ != kBadSize) {
// Just get the position, don't actually seek. Apparently this is how you do it. . .
-#ifdef WIN32
- off_t ret;
-#else
off_t ret = lseek(file_.get(), 0, SEEK_CUR);
-#endif
if (ret != -1) progress_.Set(ret);
}
#else
diff --git a/kenlm/util/file_piece.hh b/kenlm/util/file_piece.hh
index 2ddec643f..a8dc35523 100644
--- a/kenlm/util/file_piece.hh
+++ b/kenlm/util/file_piece.hh
@@ -7,11 +7,11 @@
#include "util/have.hh"
#include "util/mmap.hh"
#include "util/string_piece.hh"
-#include "util/portability.hh"
+#include <cstddef>
#include <string>
-#include <cstddef>
+#include <inttypes.h>
namespace util {
@@ -34,9 +34,9 @@ extern const bool kSpaces[256];
class FilePiece {
public:
// 32 MB default.
- explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432);
+ explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 33554432);
// Takes ownership of fd. name is used for messages.
- explicit FilePiece(FD fd, const char *name, std::ostream *show_progress = NULL, off_t min_buffer = 33554432);
+ explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 33554432);
~FilePiece();
@@ -71,14 +71,14 @@ class FilePiece {
}
}
- off_t Offset() const {
+ uint64_t Offset() const {
return position_ - data_.begin() + mapped_offset_;
}
const std::string &FileName() const { return file_name_; }
private:
- void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer);
+ void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer);
template <class T> T ReadNumber();
@@ -92,7 +92,7 @@ class FilePiece {
void Shift();
// Backends to Shift().
- void MMapShift(off_t desired_begin);
+ void MMapShift(uint64_t desired_begin);
void TransitionToRead();
void ReadShift();
@@ -100,11 +100,11 @@ class FilePiece {
const char *position_, *last_space_, *position_end_;
scoped_fd file_;
- const off_t total_size_;
- const off_t page_;
+ const uint64_t total_size_;
+ const uint64_t page_;
- size_t default_map_size_;
- off_t mapped_offset_;
+ std::size_t default_map_size_;
+ uint64_t mapped_offset_;
// Order matters: file_ should always be destroyed after this.
scoped_memory data_;
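Taken together, the declarations above (constructors from a file name or fd, ReadLine, Offset, and the EndOfFileException thrown by Shift) suggest a consumer loop like the following. This is a hedged usage sketch, not code from the repository, and it assumes util::EndOfFileException is reachable through these includes:

// Hedged usage sketch built from the declarations in this diff; the include
// path and exception type are as they appear above.
#include "util/file_piece.hh"

#include <iostream>

int main(int argc, char *argv[]) {
  if (argc != 2) { std::cerr << "Usage: " << argv[0] << " file\n"; return 1; }
  util::FilePiece in(argv[1], &std::cerr, 1 << 20);   // name, progress stream, 1 MB min buffer
  try {
    while (true) {
      StringPiece line = in.ReadLine('\n');           // mmap-backed, or read() after fallback
      std::cout << in.Offset() << ": " << line << '\n';
    }
  } catch (const util::EndOfFileException &) {
    // Shift() throws this once the input is exhausted.
  }
  return 0;
}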
diff --git a/kenlm/util/key_value_packing.hh b/kenlm/util/key_value_packing.hh
index 8339980b5..b84a5aadf 100644
--- a/kenlm/util/key_value_packing.hh
+++ b/kenlm/util/key_value_packing.hh
@@ -7,7 +7,7 @@
#include <cstddef>
#include <cstring>
-#include <stdint.h>
+#include <inttypes.h>
namespace util {
diff --git a/kenlm/util/mmap.cc b/kenlm/util/mmap.cc
index f73a3cf5c..3dfe0ab2c 100644
--- a/kenlm/util/mmap.cc
+++ b/kenlm/util/mmap.cc
@@ -1,23 +1,62 @@
+/* Memory mapping wrappers.
+ * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at
+ * NICT.
+ */
+#include "util/mmap.hh"
+
#include "util/exception.hh"
#include "util/file.hh"
-#include "util/mmap.hh"
-#include "util/portability.hh"
#include <iostream>
#include <assert.h>
#include <fcntl.h>
#include <sys/types.h>
+#include <sys/stat.h>
+#if defined(_WIN32) || defined(_WIN64)
+#include <windows.h>
+#include <io.h>
+#else
+#include <sys/mman.h>
+#endif
#include <stdlib.h>
-#include "util/portability.hh"
+#include <unistd.h>
namespace util {
+long SizePage() {
+#if defined(_WIN32) || defined(_WIN64)
+ SYSTEM_INFO si;
+ GetSystemInfo(&si);
+ return si.dwAllocationGranularity;
+#else
+ return sysconf(_SC_PAGE_SIZE);
+#endif
+}
+
+void SyncOrThrow(void *start, size_t length) {
+#if defined(_WIN32) || defined(_WIN64)
+ UTIL_THROW_IF(!::FlushViewOfFile(start, length), ErrnoException, "Failed to sync mmap");
+#else
+ UTIL_THROW_IF(msync(start, length, MS_SYNC), ErrnoException, "Failed to sync mmap");
+#endif
+}
+
+void UnmapOrThrow(void *start, size_t length) {
+#if defined(_WIN32) || defined(_WIN64)
+ UTIL_THROW_IF(!::UnmapViewOfFile(start), ErrnoException, "Failed to unmap a file");
+#else
+ UTIL_THROW_IF(munmap(start, length), ErrnoException, "munmap failed");
+#endif
+}
+
scoped_mmap::~scoped_mmap() {
if (data_ != (void*)-1) {
- // Thanks Denis Filimonov for pointing on NFS likes msync first.
- if (msync(data_, size_, MS_SYNC) || munmap(data_, size_)) {
- std::cerr << "msync or mmap failed for " << size_ << " bytes." << std::endl;
+ try {
+ // Thanks Denis Filimonov for pointing out NFS likes msync first.
+ SyncOrThrow(data_, size_);
+ UnmapOrThrow(data_, size_);
+ } catch (const util::ErrnoException &e) {
+ std::cerr << e.what();
abort();
}
}
@@ -52,32 +91,39 @@ void scoped_memory::call_realloc(std::size_t size) {
}
}
-void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, FD fd, off_t offset) {
+void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset) {
#ifdef MAP_POPULATE // Linux specific
if (prefault) {
flags |= MAP_POPULATE;
}
-#elif WIN32
- // TODO WIN32
-
#endif
+#if defined(_WIN32) || defined(_WIN64)
+ int protectC = for_write ? PAGE_READWRITE : PAGE_READONLY;
+ int protectM = for_write ? FILE_MAP_WRITE : FILE_MAP_READ;
+ HANDLE hMapping = CreateFileMapping((HANDLE)_get_osfhandle(fd), NULL, protectC, 0, size + offset, NULL);
+ UTIL_THROW_IF(!hMapping, ErrnoException, "CreateFileMapping failed");
+  LPVOID ret = MapViewOfFile(hMapping, protectM, 0, offset, size);
+ CloseHandle(hMapping);
+ UTIL_THROW_IF(!ret, ErrnoException, "MapViewOfFile failed");
+#else
int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ;
void *ret = mmap(NULL, size, protect, flags, fd, offset);
- if (ret == MAP_FAILED) {
- UTIL_THROW(ErrnoException, "mmap failed for size " << size << " at offset " << offset);
- }
+ UTIL_THROW_IF(ret == MAP_FAILED, ErrnoException, "mmap failed for size " << size << " at offset " << offset);
+#endif
return ret;
}
const int kFileFlags =
-#ifdef MAP_FILE
+#if defined(_WIN32) || defined(_WIN64)
+ 0 // MapOrThrow ignores flags on windows
+#elif defined(MAP_FILE)
MAP_FILE | MAP_SHARED
#else
MAP_SHARED
#endif
;
-void MapRead(LoadMethod method, FD fd, off_t offset, std::size_t size, scoped_memory &out) {
+void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out) {
switch (method) {
case LAZY:
out.reset(MapOrThrow(size, false, kFileFlags, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED);
@@ -94,11 +140,7 @@ void MapRead(LoadMethod method, FD fd, off_t offset, std::size_t size, scoped_me
case READ:
out.reset(malloc(size), size, scoped_memory::MALLOC_ALLOCATED);
if (!out.get()) UTIL_THROW(util::ErrnoException, "Allocating " << size << " bytes with malloc");
-#ifdef WIN32
-
-#else
- if (-1 == lseek(fd, offset, SEEK_SET)) UTIL_THROW(ErrnoException, "lseek to " << offset << " in fd " << fd << " failed.");
-#endif
+ SeekOrThrow(fd, offset);
ReadOrThrow(fd, out.get(), size);
break;
}
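MapRead dispatches between mmap-backed loading and a malloc-plus-read fallback, and the READ case seeks to the offset before filling the buffer. A POSIX-only sketch of that choice with plain system calls; LoadRegion and its bool switch are stand-ins for the LoadMethod enum, not the util API:

// POSIX-only sketch of the two loading strategies MapRead dispatches between:
// map the region, or allocate a buffer, seek, and read() it in.
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

void *LoadRegion(int fd, uint64_t offset, std::size_t size, bool use_mmap) {
  if (use_mmap) {
    void *ret = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, offset);
    if (ret == MAP_FAILED) throw std::runtime_error("mmap failed");
    return ret;                                     // caller munmaps
  }
  void *buf = malloc(size);
  if (!buf) throw std::runtime_error("malloc failed");
  if (lseek(fd, offset, SEEK_SET) == (off_t)-1) throw std::runtime_error("lseek failed");
  std::size_t done = 0;
  while (done < size) {                             // same retry loop as ReadOrThrow
    ssize_t got = read(fd, static_cast<char*>(buf) + done, size - done);
    if (got <= 0) throw std::runtime_error("short read");
    done += got;
  }
  return buf;                                       // caller frees
}

int main() {
  char name[] = "/tmp/mapread_demoXXXXXX";
  int fd = mkstemp(name);
  if (fd == -1) return 1;
  unlink(name);
  const char payload[] = "hello mapping";
  if (write(fd, payload, sizeof(payload)) != (ssize_t)sizeof(payload)) return 1;
  char *mapped = static_cast<char*>(LoadRegion(fd, 0, sizeof(payload), true));
  char *copied = static_cast<char*>(LoadRegion(fd, 0, sizeof(payload), false));
  int same = !std::memcmp(mapped, copied, sizeof(payload));
  munmap(mapped, sizeof(payload));
  free(copied);
  close(fd);
  return same ? 0 : 1;
}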
@@ -106,27 +148,40 @@ void MapRead(LoadMethod method, FD fd, off_t offset, std::size_t size, scoped_me
void *MapAnonymous(std::size_t size) {
return MapOrThrow(size, true,
-#ifdef MAP_ANONYMOUS
- MAP_ANONYMOUS // Linux
+#if defined(_WIN32) || defined(_WIN64)
+ 0 // MapOrThrow ignores the flags anyway.
+#elif defined(MAP_ANONYMOUS)
+ MAP_ANONYMOUS | MAP_PRIVATE // Linux
#else
- MAP_ANON // BSD
+ MAP_ANON | MAP_PRIVATE // BSD
#endif
- | MAP_PRIVATE, false, kBadFD, 0);
+ , false, -1, 0);
}
-void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) {
-#ifdef WIN32
+void *MapZeroedWrite(int fd, std::size_t size) {
+ UTIL_THROW_IF(-1 == ftruncate(fd, 0), ErrnoException, "ftruncate on fd " << fd << " to 0 failed");
+ UTIL_THROW_IF(-1 == ftruncate(fd, size), ErrnoException, "ftruncate on fd " << fd << " to " << size << " failed");
+ return MapOrThrow(size, true, kFileFlags, false, fd, 0);
+}
+namespace {
+
+int CreateOrThrow(const char *name) {
+ int ret;
+#if defined(_WIN32) || defined(_WIN64)
+ UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name);
#else
- file.reset(open(name, O_CREAT | O_RDWR | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH));
+ UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name);
#endif
+ return ret;
+}
+
+} // namespace
- if (kBadFD == file.get())
- UTIL_THROW(ErrnoException, "Failed to open " << name << " for writing");
- if (-1 == ftruncate(file.get(), size))
- UTIL_THROW(ErrnoException, "ftruncate on " << name << " to " << size << " failed");
+void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) {
+ file.reset(CreateOrThrow(name));
try {
- return MapOrThrow(size, true, kFileFlags, false, file.get(), 0);
+ return MapZeroedWrite(file.get(), size);
} catch (ErrnoException &e) {
e << " in file " << name;
throw;
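The new MapZeroedWrite(int, size_t) overload truncates the descriptor to zero and then to the requested size before mapping it read-write, so the mapping starts out all zeros, which is exactly the precondition the bit-packing writers rely on. A POSIX-only sketch of creating such a zero-filled writable mapping (MapZeroed and the path are illustrative):

// POSIX-only sketch of "truncate to the target size, then mmap read-write":
// the fresh pages read back as zeros, matching the zero-initialized
// assumption of the bit-packing writers.
#include <cstring>
#include <stdexcept>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

void *MapZeroed(int fd, std::size_t size) {
  if (ftruncate(fd, 0) == -1 || ftruncate(fd, size) == -1)
    throw std::runtime_error("ftruncate failed");
  void *ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if (ret == MAP_FAILED) throw std::runtime_error("mmap failed");
  return ret;
}

int main() {
  char name[] = "/tmp/zeroed_demoXXXXXX";
  int fd = mkstemp(name);
  if (fd == -1) return 1;
  unlink(name);
  const std::size_t size = 1 << 16;
  char *mem = static_cast<char*>(MapZeroed(fd, size));
  char zeros[64] = {0};
  int fresh_is_zero = !std::memcmp(mem, zeros, sizeof(zeros));   // new pages are zero
  std::strcpy(mem, "written through the mapping");
  munmap(mem, size);
  close(fd);
  return fresh_is_zero ? 0 : 1;
}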
diff --git a/kenlm/util/mmap.hh b/kenlm/util/mmap.hh
index 8333fed03..3183c6292 100644
--- a/kenlm/util/mmap.hh
+++ b/kenlm/util/mmap.hh
@@ -4,14 +4,15 @@
#include <cstddef>
-#include "util/portability.hh"
-#include <stdint.h>
+#include <inttypes.h>
#include <sys/types.h>
namespace util {
class scoped_fd;
+long SizePage();
+
// (void*)-1 is MAP_FAILED; this is done to avoid including the mmap header here.
class scoped_mmap {
public:
@@ -95,15 +96,19 @@ typedef enum {
extern const int kFileFlags;
// Wrapper around mmap to check it worked and hide some platform macros.
-void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, FD fd, off_t offset = 0);
+void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset = 0);
-void MapRead(LoadMethod method, FD fd, off_t offset, std::size_t size, scoped_memory &out);
+void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out);
void *MapAnonymous(std::size_t size);
// Open file name with mmap of size bytes, all of which are initially zero.
+void *MapZeroedWrite(int fd, std::size_t size);
void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file);
+// msync wrapper
+void SyncOrThrow(void *start, size_t length);
+
} // namespace util
#endif // UTIL_MMAP__
diff --git a/kenlm/util/murmur_hash.hh b/kenlm/util/murmur_hash.hh
index 638aaeb22..78fe583fc 100644
--- a/kenlm/util/murmur_hash.hh
+++ b/kenlm/util/murmur_hash.hh
@@ -1,7 +1,7 @@
#ifndef UTIL_MURMUR_HASH__
#define UTIL_MURMUR_HASH__
#include <cstddef>
-#include <stdint.h>
+#include <inttypes.h>
namespace util {
diff --git a/kenlm/util/portability.cc b/kenlm/util/portability.cc
deleted file mode 100644
index 2efd74cba..000000000
--- a/kenlm/util/portability.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-
-#include <stdlib.h>
-#include <errno.h>
-#include "util/portability.hh"
-
-#ifdef WIN32
-
-int RUSAGE_SELF = 0;
-
-int sysconf(int) { return 0; }
-int msync(void*, int, int) { return 0; }
-int munmap(void *, int) { return 0; }
-void *mmap(void*, int, int, int, FD, OFF_T) { return 0; }
-int write(int, const void *, int) {return 0; }
-
-//FILE *popen(const char*, const char*) { return 0; }
-//int pclose(FILE *) { return 0; }
-int close(FD fd) { return 0; }
-
-
-// to be implemented by boost
-int mkdtemp(const char*) { return 0; }
-
-// done
-long lrint(float x)
-{
- long ret = (long) x;
- return ret;
-}
-
-float strtof(const char *begin, char **end)
-{
- double ret = strtod(begin, end);
- return (float) ret;
-}
-
-
-int ftruncate (FD hfile, unsigned int size)
-{
- unsigned int curpos;
- /*
- HANDLE hfile;
-
- if (fd < 0)
- {
- errno = EBADF;
- return -1;
- }
-
- hfile = (HANDLE) _get_osfhandle (fd);
- */
- curpos = SetFilePointer (hfile, 0, NULL, FILE_CURRENT);
- if (curpos == ~0
- || SetFilePointer (hfile, size, NULL, FILE_BEGIN) == ~0
- || !SetEndOfFile (hfile))
- {
- int error = GetLastError ();
- switch (error)
- {
- case ERROR_INVALID_HANDLE:
- errno = EBADF;
- break;
- default:
- errno = EIO;
- break;
- }
- return -1;
- }
- return 0;
-}
-
-#endif
-
-
diff --git a/kenlm/util/portability.hh b/kenlm/util/portability.hh
deleted file mode 100644
index 7066f50f5..000000000
--- a/kenlm/util/portability.hh
+++ /dev/null
@@ -1,127 +0,0 @@
-
-#pragma once
-
-#include <assert.h>
-#include <stdint.h>
-
-#ifdef WIN32
-
-#include <windows.h>
-#include <direct.h>
-#include <io.h>
-#include <stdio.h>
-#include <string.h>
-#include <sys/stat.h>
-#include "util/getopt.hh"
-
-#undef max
-#undef min
-
-typedef HANDLE FD;
-
-const FD kBadFD = INVALID_HANDLE_VALUE;
-
-typedef int ssize_t;
-
-#define _SC_PAGE_SIZE 1
-#define MS_SYNC 1
-
-int sysconf(int);
-int msync(void*, int, int);
-int ftruncate(FD, unsigned int);
-
-long lrint(float);
-
-/*
-struct timeval
-{
- float tv_sec, tv_usec;
-};
-
-struct rusage
-{
- timeval ru_utime, ru_stime;
-};
-*/
-
-//inline int getrusage(int, struct rusage*) { return 0; }
-//extern int RUSAGE_SELF;
-
-typedef __int64 OFF_T;
-//#define OFF_T __int64
-
-#ifndef S_ISDIR
-#define S_ISDIR(mode) (((mode) & S_IFMT) == S_IFDIR)
-#endif
-
-#ifndef S_ISREG
-#define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG)
-#endif
-
-int mkdtemp(const char*);
-int munmap(void *, int);
-void *mmap(void*, int, int, int, FD, OFF_T);
-
-#define PROT_READ 1
-#define PROT_WRITE 1
-#define MAP_FAILED (void*) 0x1
-#define MAP_SHARED 1
-#define MAP_ANON 1
-#define MAP_PRIVATE 1
-#define S_IRUSR 1
-#define S_IROTH 1
-#define S_IRGRP 1
-
-int write(int, const void *, int);
-#define S_IRUSR 1
-#define S_IWUSR 1
-
-//const char *strerror_r(int, const char *buf, int);
-
-float strtof(const char *begin, char **end);
-//FILE *popen(const char*, const char*);
-//int pclose(FILE *);
-int close(FD fd);
-
-#define dup(x) _dup(x)
-#define rmdir(x) _rmdir(x)
-#define strerror_r(errNum, buffer, numberOfElements) strerror_s(buffer, numberOfElements);
-
-#else // assume UNIX OS
-
-#include <stdint.h>
-#include <sys/resource.h>
-#include <sys/time.h>
-#include <sys/types.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-typedef int FD;
-const FD kBadFD = -1;
-
-typedef off_t OFF_T;
-
-#endif
-
-#ifdef __GNUC__
-#define UTIL_FUNC_NAME __PRETTY_FUNCTION__
-#else
-#ifdef _WIN32
-#define UTIL_FUNC_NAME __FUNCTION__
-#else
-#define UTIL_FUNC_NAME NULL
-#endif
-#endif
-
-/* Bit-level packing routines */
-#ifdef __APPLE__
- #include <architecture/byte_order.h>
-#elif __linux__
- #include <endian.h>
-#elif WIN32
- // nothing
-#else
- #include <arpa/nameser_compat.h>
-#endif
-
diff --git a/kenlm/util/probing_hash_table.hh b/kenlm/util/probing_hash_table.hh
index 2ec342a66..8122d69c5 100644
--- a/kenlm/util/probing_hash_table.hh
+++ b/kenlm/util/probing_hash_table.hh
@@ -61,14 +61,14 @@ template <class PackingT, class HashT, class EqualT = std::equal_to<typename Pac
#endif
{}
- template <class T> void Insert(const T &t) {
+ template <class T> MutableIterator Insert(const T &t) {
if (++entries_ >= buckets_)
UTIL_THROW(ProbingSizeException, "Hash table with " << buckets_ << " buckets is full.");
#ifdef DEBUG
assert(initialized_);
#endif
for (MutableIterator i(begin_ + (hash_(t.GetKey()) % buckets_));;) {
- if (equal_(i->GetKey(), invalid_)) { *i = t; return; }
+ if (equal_(i->GetKey(), invalid_)) { *i = t; return i; }
if (++i == end_) { i = begin_; }
}
}
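Insert now returns the MutableIterator for the bucket it filled, which lets callers insert an entry and keep mutating it in place instead of searching for it again. A simplified sketch of a linear-probing insert that returns its slot; the Entry layout and empty-key convention are made up, and the real template machinery is omitted:

// Sketch of a linear-probing insert that hands back the occupied slot so the
// caller can continue to mutate the stored value, as the updated Insert does.
#include <cstdint>
#include <iostream>
#include <vector>

struct Entry { uint64_t key; float value; };   // key 0 marks an empty bucket here

Entry *Insert(std::vector<Entry> &table, const Entry &e) {
  std::size_t i = e.key % table.size();        // hash_(t.GetKey()) % buckets_
  while (table[i].key != 0) {                  // probe until an empty bucket
    if (++i == table.size()) i = 0;
  }
  table[i] = e;
  return &table[i];                            // like returning MutableIterator
}

int main() {
  std::vector<Entry> table(8, Entry{0, 0.0f});
  Entry *slot = Insert(table, Entry{42, 1.0f});
  slot->value = -0.301f;                       // adjust the entry after insertion
  std::cout << slot->key << " -> " << slot->value << "\n";
  return 0;
}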
diff --git a/kenlm/util/sized_iterator.hh b/kenlm/util/sized_iterator.hh
index aabcc5319..47dfc2454 100644
--- a/kenlm/util/sized_iterator.hh
+++ b/kenlm/util/sized_iterator.hh
@@ -6,7 +6,7 @@
#include <functional>
#include <string>
-#include <stdint.h>
+#include <inttypes.h>
#include <string.h>
namespace util {
diff --git a/kenlm/util/sorted_uniform.hh b/kenlm/util/sorted_uniform.hh
index 0391189f0..0d6ecbbd6 100644
--- a/kenlm/util/sorted_uniform.hh
+++ b/kenlm/util/sorted_uniform.hh
@@ -5,7 +5,7 @@
#include <cstddef>
#include <assert.h>
-#include <stdint.h>
+#include <inttypes.h>
namespace util {