github.com/moses-smt/mosesdecoder.git
author      Kenneth Heafield <github@kheafield.com>    2011-11-11 00:46:59 +0400
committer   Kenneth Heafield <github@kheafield.com>    2011-11-11 00:46:59 +0400
commit      d732f63ec2f8d54092da298fff289ee8bba1e419 (patch)
tree        2b9299039fe1841acae93d4bc8932d06e20e8907 /kenlm
parent      9903a239ea33d58484cdb625f89135d96869dd5c (diff)
KenLM update including progress on ARM and MinGW from NICT
Diffstat (limited to 'kenlm')
-rw-r--r--  kenlm/lm/bhiksha.cc | 13
-rw-r--r--  kenlm/lm/bhiksha.hh | 12
-rw-r--r--  kenlm/lm/binary_format.cc | 74
-rw-r--r--  kenlm/lm/binary_format.hh | 19
-rw-r--r--  kenlm/lm/blank.hh | 5
-rw-r--r--  kenlm/lm/build_binary.cc | 5
-rw-r--r--  kenlm/lm/model.cc | 11
-rw-r--r--  kenlm/lm/model.hh | 12
-rw-r--r--  kenlm/lm/ngram_query.cc | 11
-rw-r--r--  kenlm/lm/quantize.cc | 32
-rw-r--r--  kenlm/lm/quantize.hh | 6
-rw-r--r--  kenlm/lm/read_arpa.cc | 2
-rw-r--r--  kenlm/lm/return.hh | 2
-rw-r--r--  kenlm/lm/search_hashed.hh | 2
-rw-r--r--  kenlm/lm/search_trie.cc | 70
-rw-r--r--  kenlm/lm/search_trie.hh | 11
-rw-r--r--  kenlm/lm/sri.cc | 108
-rw-r--r--  kenlm/lm/sri.hh | 102
-rw-r--r--  kenlm/lm/trie.hh | 2
-rw-r--r--  kenlm/lm/trie_sort.cc | 202
-rw-r--r--  kenlm/lm/trie_sort.hh | 55
-rw-r--r--  kenlm/lm/vocab.cc | 35
-rw-r--r--  kenlm/lm/vocab.hh | 8
-rw-r--r--  kenlm/util/bit_packing.hh | 54
-rw-r--r--  kenlm/util/exception.cc | 6
-rw-r--r--  kenlm/util/file.cc | 225
-rw-r--r--  kenlm/util/file.hh | 64
-rw-r--r--  kenlm/util/file_piece.cc | 58
-rw-r--r--  kenlm/util/file_piece.hh | 22
-rw-r--r--  kenlm/util/key_value_packing.hh | 2
-rw-r--r--  kenlm/util/mmap.cc | 119
-rw-r--r--  kenlm/util/mmap.hh | 13
-rw-r--r--  kenlm/util/murmur_hash.hh | 2
-rw-r--r--  kenlm/util/portability.cc | 74
-rw-r--r--  kenlm/util/portability.hh | 127
-rw-r--r--  kenlm/util/probing_hash_table.hh | 4
-rw-r--r--  kenlm/util/sized_iterator.hh | 2
-rw-r--r--  kenlm/util/sorted_uniform.hh | 2
38 files changed, 678 insertions, 895 deletions
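
Editor's note: the recurring pattern in this commit is replacing raw POSIX calls guarded by WIN32 #ifdef blocks (read, lseek, msync, sysconf) with checked helpers from util/file.hh and util/mmap.hh (util::ReadOrThrow, util::SeekOrThrow, util::AdvanceOrThrow, util::SyncOrThrow, util::SizePage), along with deleting util/portability.{cc,hh} and the SRI wrapper. The hunks for kenlm/util/file.cc are not included below, so the following is only a sketch of what a ReadOrThrow-style helper does, modeled on the old ReadLoop removed from binary_format.cc; the real implementation throws util::ErrnoException and also covers the MinGW/Windows path.

// Minimal sketch of a ReadOrThrow-style helper (assumes POSIX read(2)).
// Illustrative only; not the actual kenlm/util/file.cc code.
#include <cstddef>
#include <stdexcept>
#include <stdint.h>
#include <unistd.h>

inline void ReadOrThrowSketch(int fd, void *to_void, std::size_t amount) {
  uint8_t *to = static_cast<uint8_t*>(to_void);
  while (amount) {
    ssize_t got = read(fd, to, amount);
    if (got == -1) throw std::runtime_error("read failed");           // errno-based exception in KenLM
    if (got == 0) throw std::runtime_error("unexpected end of file"); // file too short
    to += got;
    amount -= static_cast<std::size_t>(got);
  }
}
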
diff --git a/kenlm/lm/bhiksha.cc b/kenlm/lm/bhiksha.cc
index 0c187960d..cdeafb478 100644
--- a/kenlm/lm/bhiksha.cc
+++ b/kenlm/lm/bhiksha.cc
@@ -1,5 +1,6 @@
#include "lm/bhiksha.hh"
#include "lm/config.hh"
+#include "util/file.hh"
#include <limits>
@@ -12,16 +13,12 @@ DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_
const uint8_t kArrayBhikshaVersion = 0;
-void ArrayBhiksha::UpdateConfigFromBinary(FD fd, Config &config) {
+// TODO: put this in binary file header instead when I change the binary file format again.
+void ArrayBhiksha::UpdateConfigFromBinary(int fd, Config &config) {
uint8_t version;
uint8_t configured_bits;
-#ifdef WIN32
-#else
- if (read(fd, &version, 1) != 1 || read(fd, &configured_bits, 1) != 1) {
- UTIL_THROW(util::ErrnoException, "Could not read from binary file");
- }
-#endif
-
+ util::ReadOrThrow(fd, &version, 1);
+ util::ReadOrThrow(fd, &configured_bits, 1);
if (version != kArrayBhikshaVersion) UTIL_THROW(FormatLoadException, "This file has sorted array compression version " << (unsigned) version << " but the code expects version " << (unsigned)kArrayBhikshaVersion);
config.pointer_bhiksha_bits = configured_bits;
}
diff --git a/kenlm/lm/bhiksha.hh b/kenlm/lm/bhiksha.hh
index 9f615a477..3df43dda9 100644
--- a/kenlm/lm/bhiksha.hh
+++ b/kenlm/lm/bhiksha.hh
@@ -10,14 +10,16 @@
* Currently only used for next pointers.
*/
-#include <stdint.h>
+#ifndef LM_BHIKSHA__
+#define LM_BHIKSHA__
+
+#include <inttypes.h>
#include <assert.h>
#include "lm/model_type.hh"
#include "lm/trie.hh"
#include "util/bit_packing.hh"
#include "util/sorted_uniform.hh"
-#include "util/portability.hh"
namespace lm {
namespace ngram {
@@ -29,7 +31,7 @@ class DontBhiksha {
public:
static const ModelType kModelTypeAdd = static_cast<ModelType>(0);
- static void UpdateConfigFromBinary(FD /*fd*/, Config &/*config*/) {}
+ static void UpdateConfigFromBinary(int /*fd*/, Config &/*config*/) {}
static std::size_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; }
@@ -63,7 +65,7 @@ class ArrayBhiksha {
public:
static const ModelType kModelTypeAdd = kArrayAdd;
- static void UpdateConfigFromBinary(FD fd, Config &config);
+ static void UpdateConfigFromBinary(int fd, Config &config);
static std::size_t Size(uint64_t max_offset, uint64_t max_next, const Config &config);
@@ -109,3 +111,5 @@ class ArrayBhiksha {
} // namespace trie
} // namespace ngram
} // namespace lm
+
+#endif // LM_BHIKSHA__
diff --git a/kenlm/lm/binary_format.cc b/kenlm/lm/binary_format.cc
index 5fea35118..e7f9cd048 100644
--- a/kenlm/lm/binary_format.cc
+++ b/kenlm/lm/binary_format.cc
@@ -1,17 +1,15 @@
#include "lm/binary_format.hh"
#include "lm/lm_exception.hh"
+#include "util/file.hh"
#include "util/file_piece.hh"
+#include <cstddef>
+#include <cstring>
#include <limits>
#include <string>
-#include <fcntl.h>
-#include <errno.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
+#include <inttypes.h>
namespace lm {
namespace ngram {
@@ -30,6 +28,7 @@ struct Sanity {
uint64_t one_uint64;
void SetToReference() {
+ std::memset(this, 0, sizeof(Sanity));
std::memcpy(magic, kMagicBytes, sizeof(magic));
zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5;
one_word_index = 1;
@@ -41,28 +40,13 @@ struct Sanity {
const char *kModelNames[6] = {"hashed n-grams with probing", "hashed n-grams with sorted uniform find", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"};
std::size_t TotalHeaderSize(unsigned char order) {
- return Align8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order);
-}
-
-void ReadLoop(FD fd, void *to_void, std::size_t size) {
- uint8_t *to = static_cast<uint8_t*>(to_void);
- while (size) {
-#ifdef WIN32
- ssize_t ret;
-#else
- ssize_t ret = read(fd, to, size);
-#endif
- if (ret == -1) UTIL_THROW(util::ErrnoException, "Failed to read from binary file");
- if (ret == 0) UTIL_THROW(util::ErrnoException, "Binary file too short");
- to += ret;
- size -= ret;
- }
+ return ALIGN8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order);
}
void WriteHeader(void *to, const Parameters &params) {
Sanity header = Sanity();
header.SetToReference();
- memcpy(to, &header, sizeof(Sanity));
+ std::memcpy(to, &header, sizeof(Sanity));
char *out = reinterpret_cast<char*>(to) + sizeof(Sanity);
*reinterpret_cast<FixedWidthParameters*>(out) = params.fixed;
@@ -76,20 +60,6 @@ void WriteHeader(void *to, const Parameters &params) {
} // namespace
-void SeekOrThrow(FD fd, off_t off) {
-#ifdef WIN32
-#else
- if ((off_t)-1 == lseek(fd, off, SEEK_SET)) UTIL_THROW(util::ErrnoException, "Seek failed");
-#endif
-}
-
-void AdvanceOrThrow(FD fd, off_t off) {
-#ifdef WIN32
-#else
- if ((off_t)-1 == lseek(fd, off, SEEK_CUR)) UTIL_THROW(util::ErrnoException, "Seek failed");
-#endif
-}
-
uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing) {
if (config.write_mmap) {
std::size_t total = TotalHeaderSize(order) + memory_size;
@@ -110,8 +80,8 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
UTIL_THROW(util::ErrnoException, "ftruncate on " << config.write_mmap << " to " << (adjusted_vocab + memory_size) << " failed");
// We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down.
- off_t page_size = sysconf(_SC_PAGE_SIZE);
- off_t alignment_cruft = adjusted_vocab % page_size;
+ std::size_t page_size = util::SizePage();
+ std::size_t alignment_cruft = adjusted_vocab % page_size;
backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED);
return reinterpret_cast<uint8_t*>(backing.search.get()) + alignment_cruft;
@@ -123,8 +93,8 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, Backing &backing) {
if (config.write_mmap) {
- if (msync(backing.search.get(), backing.search.size(), MS_SYNC) || msync(backing.vocab.get(), backing.vocab.size(), MS_SYNC))
- UTIL_THROW(util::ErrnoException, "msync failed for " << config.write_mmap);
+ util::SyncOrThrow(backing.search.get(), backing.search.size());
+ util::SyncOrThrow(backing.vocab.get(), backing.vocab.size());
// header and vocab share the same mmap. The header is written here because we know the counts.
Parameters params;
params.counts = counts;
@@ -139,9 +109,9 @@ void FinishFile(const Config &config, ModelType model_type, unsigned int search_
namespace detail {
-bool IsBinaryFormat(FD fd) {
- const off_t size = util::SizeFile(fd);
- if (size == util::kBadSize || (size <= static_cast<off_t>(sizeof(Sanity)))) return false;
+bool IsBinaryFormat(int fd) {
+ const uint64_t size = util::SizeFile(fd);
+ if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false;
// Try reading the header.
util::scoped_memory memory;
try {
@@ -167,14 +137,14 @@ bool IsBinaryFormat(FD fd) {
return false;
}
-void ReadHeader(FD fd, Parameters &out) {
- SeekOrThrow(fd, sizeof(Sanity));
- ReadLoop(fd, &out.fixed, sizeof(out.fixed));
+void ReadHeader(int fd, Parameters &out) {
+ util::SeekOrThrow(fd, sizeof(Sanity));
+ util::ReadOrThrow(fd, &out.fixed, sizeof(out.fixed));
if (out.fixed.probing_multiplier < 1.0)
UTIL_THROW(FormatLoadException, "Binary format claims to have a probing multiplier of " << out.fixed.probing_multiplier << " which is < 1.0.");
out.counts.resize(static_cast<std::size_t>(out.fixed.order));
- ReadLoop(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order);
+ util::ReadOrThrow(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order);
}
void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters &params) {
@@ -186,12 +156,12 @@ void MatchCheck(ModelType model_type, unsigned int search_version, const Paramet
UTIL_THROW_IF(search_version != params.fixed.search_version, FormatLoadException, "The binary file has " << kModelNames[params.fixed.model_type] << " version " << params.fixed.search_version << " but this code expects " << kModelNames[params.fixed.model_type] << " version " << search_version);
}
-void SeekPastHeader(FD fd, const Parameters &params) {
- SeekOrThrow(fd, TotalHeaderSize(params.counts.size()));
+void SeekPastHeader(int fd, const Parameters &params) {
+ util::SeekOrThrow(fd, TotalHeaderSize(params.counts.size()));
}
uint8_t *SetupBinary(const Config &config, const Parameters &params, std::size_t memory_size, Backing &backing) {
- const off_t file_size = util::SizeFile(backing.file.get());
+ const uint64_t file_size = util::SizeFile(backing.file.get());
// The header is smaller than a page, so we have to map the whole header as well.
std::size_t total_map = TotalHeaderSize(params.counts.size()) + memory_size;
if (file_size != util::kBadSize && static_cast<uint64_t>(file_size) < total_map)
@@ -203,7 +173,7 @@ uint8_t *SetupBinary(const Config &config, const Parameters &params, std::size_t
UTIL_THROW(FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. You may need to rebuild the binary file with an updated version of build_binary.");
if (config.enumerate_vocab) {
- SeekOrThrow(backing.file.get(), total_map);
+ util::SeekOrThrow(backing.file.get(), total_map);
}
return reinterpret_cast<uint8_t*>(backing.search.get()) + TotalHeaderSize(params.counts.size());
}
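
Editor's note: the GrowForSearch hunk above rounds the vocab-end offset down to a page boundary because mmap wants page-aligned offsets, then maps the leftover bytes along with the search area. A small worked example of that arithmetic, using made-up sizes (the 4096-byte page is an assumption; util::SizePage() returns the real value):

// Worked example of the page-alignment arithmetic; nothing is actually mapped.
#include <cstddef>
#include <iostream>

int main() {
  const std::size_t page_size = 4096;        // stand-in for util::SizePage()
  const std::size_t adjusted_vocab = 10000;  // bytes of header + vocab already in the file
  const std::size_t memory_size = 50000;     // bytes needed for the search structure

  const std::size_t alignment_cruft = adjusted_vocab % page_size;  // 10000 % 4096 = 1808
  const std::size_t map_offset = adjusted_vocab - alignment_cruft; // 8192, a page boundary
  const std::size_t map_length = alignment_cruft + memory_size;    // 51808 bytes mapped

  // The search structure then begins alignment_cruft bytes into the new mapping.
  std::cout << "offset " << map_offset << ", length " << map_length
            << ", data begins at +" << alignment_cruft << "\n";
}
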
diff --git a/kenlm/lm/binary_format.hh b/kenlm/lm/binary_format.hh
index fc5995a96..8adb1ec48 100644
--- a/kenlm/lm/binary_format.hh
+++ b/kenlm/lm/binary_format.hh
@@ -8,12 +8,11 @@
#include "util/file_piece.hh"
#include "util/mmap.hh"
#include "util/scoped.hh"
-#include "util/portability.hh"
#include <cstddef>
#include <vector>
-#include <stdint.h>
+#include <inttypes.h>
namespace lm {
namespace ngram {
@@ -34,10 +33,8 @@ struct FixedWidthParameters {
unsigned int search_version;
};
-inline std::size_t Align8(std::size_t in) {
- std::size_t off = in % 8;
- return off ? (in + 8 - off) : in;
-}
+// This is a macro instead of an inline function so constants can be assigned using it.
+#define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8)
// Parameters stored in the header of a binary file.
struct Parameters {
@@ -54,10 +51,6 @@ struct Backing {
util::scoped_memory search;
};
-void SeekOrThrow(FD fd, off_t off);
-// Seek forward
-void AdvanceOrThrow(FD fd, off_t off);
-
// Create just enough of a binary file to write vocabulary to it.
uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing);
// Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin.
@@ -69,13 +62,13 @@ void FinishFile(const Config &config, ModelType model_type, unsigned int search_
namespace detail {
-bool IsBinaryFormat(FD fd);
+bool IsBinaryFormat(int fd);
-void ReadHeader(FD fd, Parameters &params);
+void ReadHeader(int fd, Parameters &params);
void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters &params);
-void SeekPastHeader(FD fd, const Parameters &params);
+void SeekPastHeader(int fd, const Parameters &params);
uint8_t *SetupBinary(const Config &config, const Parameters &params, std::size_t memory_size, Backing &backing);
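
Editor's note: the ALIGN8 macro introduced above rounds a byte count up to the next multiple of 8; being a macro rather than the old inline Align8(), it can appear where the compiler needs a constant. A quick self-contained check of the arithmetic (the macro is re-defined here only so the snippet stands alone):

#include <cassert>
#include <cstddef>

#define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8)

// As a macro it can be used where a constant is required, e.g. an array bound:
char padded_header[ALIGN8(13)];   // 13 rounds up to 16 bytes

int main() {
  assert(ALIGN8(1) == 8);
  assert(ALIGN8(8) == 8);         // multiples of 8 are unchanged
  assert(ALIGN8(9) == 16);
  assert(sizeof(padded_header) == 16);
  return 0;
}
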
diff --git a/kenlm/lm/blank.hh b/kenlm/lm/blank.hh
index 68a809a01..2fb64cd03 100644
--- a/kenlm/lm/blank.hh
+++ b/kenlm/lm/blank.hh
@@ -1,7 +1,10 @@
#ifndef LM_BLANK__
#define LM_BLANK__
-#include <stdint.h>
+#include <limits>
+
+#include <inttypes.h>
+#include <math.h>
namespace lm {
namespace ngram {
diff --git a/kenlm/lm/build_binary.cc b/kenlm/lm/build_binary.cc
index f4172f23c..5a0d98dc6 100644
--- a/kenlm/lm/build_binary.cc
+++ b/kenlm/lm/build_binary.cc
@@ -8,14 +8,13 @@
#include <math.h>
#include <stdlib.h>
-#include "util/portability.hh"
namespace lm {
namespace ngram {
namespace {
void Usage(const char *name) {
- std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-c bits] [type] input.arpa [output.mmap]\n\n"
+ std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n"
"-u sets the log10 probability for <unk> if the ARPA file does not have one.\n"
" Default is -100. The ARPA file will always take precedence.\n"
"-s allows models to be built even if they do not have <s> and </s>.\n"
@@ -87,7 +86,7 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) {
prefix = 'G';
divide = 1 << 30;
}
- long int length = std::max<long int>(2, lrint(ceil(log10((double) max_length / (double)divide))));
+ long int length = std::max<long int>(2, lrint(ceil(log10(max_length / divide))));
std::cout << "Memory estimate:\ntype ";
// right align bytes.
for (long int i = 0; i < length - 2; ++i) std::cout << ' ';
diff --git a/kenlm/lm/model.cc b/kenlm/lm/model.cc
index e11d36148..042955efd 100644
--- a/kenlm/lm/model.cc
+++ b/kenlm/lm/model.cc
@@ -44,7 +44,7 @@ template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::Ge
P::Init(begin_sentence, null_context, vocab_, search_.MiddleEnd() - search_.MiddleBegin() + 2);
}
-template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromBinary(void *start, const Parameters &params, const Config &config, FD fd) {
+template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromBinary(void *start, const Parameters &params, const Config &config, int fd) {
SetupMemory(start, params.counts, config);
vocab_.LoadedBinary(fd, config.enumerate_vocab);
search_.LoadedBinary();
@@ -89,10 +89,15 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
}
}
+template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config) {
+ util::AdvanceOrThrow(fd, VocabularyT::Size(counts[0], config));
+ Search::UpdateConfigFromBinary(fd, counts, config);
+}
+
template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const {
FullScoreReturn ret = ScoreExceptBackoff(in_state.words, in_state.words + in_state.length, new_word, out_state);
- if (ret.ngram_length - 1 < in_state.length) {
- ret.prob = std::accumulate(in_state.backoff + ret.ngram_length - 1, in_state.backoff + in_state.length, ret.prob);
+ for (const float *i = in_state.backoff + ret.ngram_length - 1; i < in_state.backoff + in_state.length; ++i) {
+ ret.prob += *i;
}
return ret;
}
diff --git a/kenlm/lm/model.hh b/kenlm/lm/model.hh
index df9541fc7..1196a0c48 100644
--- a/kenlm/lm/model.hh
+++ b/kenlm/lm/model.hh
@@ -13,7 +13,6 @@
#include "lm/weights.hh"
#include "util/murmur_hash.hh"
-#include "util/portability.hh"
#include <algorithm>
#include <vector>
@@ -138,21 +137,16 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
unsigned char &next_use) const;
private:
- friend void lm::ngram::LoadLM<>(const char *file, const Config &config, GenericModel<Search, VocabularyT> &to);
+ friend void LoadLM<>(const char *file, const Config &config, GenericModel<Search, VocabularyT> &to);
- static void UpdateConfigFromBinary(FD fd, const std::vector<uint64_t> &counts, Config &config) {
- AdvanceOrThrow(fd, VocabularyT::Size(counts[0], config));
- Search::UpdateConfigFromBinary(fd, counts, config);
- }
-
- float SlowBackoffLookup(const WordIndex *const context_rbegin, const WordIndex *const context_rend, unsigned char start) const;
+ static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config);
FullScoreReturn ScoreExceptBackoff(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;
// Appears after Size in the cc file.
void SetupMemory(void *start, const std::vector<uint64_t> &counts, const Config &config);
- void InitializeFromBinary(void *start, const Parameters &params, const Config &config, FD fd);
+ void InitializeFromBinary(void *start, const Parameters &params, const Config &config, int fd);
void InitializeFromARPA(const char *file, const Config &config);
diff --git a/kenlm/lm/ngram_query.cc b/kenlm/lm/ngram_query.cc
index 2f5bd0725..50ceef5c8 100644
--- a/kenlm/lm/ngram_query.cc
+++ b/kenlm/lm/ngram_query.cc
@@ -7,16 +7,17 @@
#include <string>
#include <ctype.h>
-
-#include "util/portability.hh"
+#if !defined(_WIN32) && !defined(_WIN64)
+#include <sys/resource.h>
+#endif
+#include <sys/time.h>
float FloatSec(const struct timeval &tv) {
return static_cast<float>(tv.tv_sec) + (static_cast<float>(tv.tv_usec) / 1000000000.0);
}
void PrintUsage(const char *message) {
-#ifdef WIN32
-#else
+#if !defined(_WIN32) && !defined(_WIN64)
struct rusage usage;
if (getrusage(RUSAGE_SELF, &usage)) {
perror("getrusage");
@@ -24,7 +25,6 @@ void PrintUsage(const char *message) {
}
std::cerr << message;
std::cerr << "user\t" << FloatSec(usage.ru_utime) << "\nsys\t" << FloatSec(usage.ru_stime) << '\n';
-#endif
// Linux doesn't set memory usage :-(.
std::ifstream status("/proc/self/status", std::ios::in);
@@ -35,6 +35,7 @@ void PrintUsage(const char *message) {
break;
}
}
+#endif
}
template <class Model> void Query(const Model &model, bool sentence_context) {
diff --git a/kenlm/lm/quantize.cc b/kenlm/lm/quantize.cc
index 3a9eccf3a..8de37e827 100644
--- a/kenlm/lm/quantize.cc
+++ b/kenlm/lm/quantize.cc
@@ -1,25 +1,23 @@
+/* Quantize into bins of equal size as described in
+ * M. Federico and N. Bertoldi. 2006. How many bits are needed
+ * to store probabilities for phrase-based translation? In Proc.
+ * of the Workshop on Statistical Machine Translation, pages
+ * 94–101, New York City, June. Association for Computa-
+ * tional Linguistics.
+ */
+
#include "lm/quantize.hh"
#include "lm/binary_format.hh"
#include "lm/lm_exception.hh"
+#include "util/file.hh"
#include <algorithm>
#include <numeric>
-#include <limits>
-
-#include "util/portability.hh"
namespace lm {
namespace ngram {
-/* Quantize into bins of equal size as described in
- * M. Federico and N. Bertoldi. 2006. How many bits are needed
- * to store probabilities for phrase-based translation? In Proc.
- * of the Workshop on Statistical Machine Translation, pages
- * 94–101, New York City, June. Association for Computa-
- * tional Linguistics.
- */
-
namespace {
void MakeBins(float *values, float *values_end, float *centers, uint32_t bins) {
@@ -40,15 +38,13 @@ const char kSeparatelyQuantizeVersion = 2;
} // namespace
-void SeparatelyQuantize::UpdateConfigFromBinary(FD fd, const std::vector<uint64_t> &/*counts*/, Config &config) {
+void SeparatelyQuantize::UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &/*counts*/, Config &config) {
char version;
-#ifdef WIN32
-#else
- if (read(fd, &version, 1) != 1 || read(fd, &config.prob_bits, 1) != 1 || read(fd, &config.backoff_bits, 1) != 1)
- UTIL_THROW(util::ErrnoException, "Failed to read header for quantization.");
-#endif
+ util::ReadOrThrow(fd, &version, 1);
+ util::ReadOrThrow(fd, &config.prob_bits, 1);
+ util::ReadOrThrow(fd, &config.backoff_bits, 1);
if (version != kSeparatelyQuantizeVersion) UTIL_THROW(FormatLoadException, "This file has quantization version " << (unsigned)version << " but the code expects version " << (unsigned)kSeparatelyQuantizeVersion);
- AdvanceOrThrow(fd, -3);
+ util::AdvanceOrThrow(fd, -3);
}
void SeparatelyQuantize::SetupMemory(void *start, const Config &config) {
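
Editor's note: the comment moved to the top of quantize.cc cites Federico and Bertoldi's "bins of equal size" scheme, and the hunk references MakeBins(values, values_end, centers, bins), whose body is not shown above. The sketch below only illustrates the equal-population idea under that reading; it is not the actual KenLM MakeBins.

// Sketch of equal-population binning: sort the values, split them into `bins`
// groups of roughly equal count, and use each group's mean as its center.
// Remainder and tie handling in the real MakeBins() may differ.
#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<float> EqualSizeBins(std::vector<float> values, std::size_t bins) {
  std::sort(values.begin(), values.end());
  std::vector<float> centers(bins, 0.0f);
  const std::size_t per_bin = values.size() / bins;   // assumes values.size() >= bins
  for (std::size_t b = 0; b < bins; ++b) {
    const std::size_t begin = b * per_bin;
    const std::size_t end = (b + 1 == bins) ? values.size() : begin + per_bin;
    double sum = 0.0;
    for (std::size_t i = begin; i < end; ++i) sum += values[i];
    centers[b] = static_cast<float>(sum / double(end - begin));
  }
  return centers;
}
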
diff --git a/kenlm/lm/quantize.hh b/kenlm/lm/quantize.hh
index 855c8ba66..4cf4236eb 100644
--- a/kenlm/lm/quantize.hh
+++ b/kenlm/lm/quantize.hh
@@ -9,7 +9,7 @@
#include <algorithm>
#include <vector>
-#include <stdint.h>
+#include <inttypes.h>
#include <iostream>
@@ -22,7 +22,7 @@ class Config;
class DontQuantize {
public:
static const ModelType kModelTypeAdd = static_cast<ModelType>(0);
- static void UpdateConfigFromBinary(FD, const std::vector<uint64_t> &, Config &) {}
+ static void UpdateConfigFromBinary(int, const std::vector<uint64_t> &, Config &) {}
static std::size_t Size(uint8_t /*order*/, const Config &/*config*/) { return 0; }
static uint8_t MiddleBits(const Config &/*config*/) { return 63; }
static uint8_t LongestBits(const Config &/*config*/) { return 31; }
@@ -113,7 +113,7 @@ class SeparatelyQuantize {
public:
static const ModelType kModelTypeAdd = kQuantAdd;
- static void UpdateConfigFromBinary(FD fd, const std::vector<uint64_t> &counts, Config &config);
+ static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config);
static std::size_t Size(uint8_t order, const Config &config) {
size_t longest_table = (static_cast<size_t>(1) << static_cast<size_t>(config.prob_bits)) * sizeof(float);
diff --git a/kenlm/lm/read_arpa.cc b/kenlm/lm/read_arpa.cc
index 05f761be6..dce73f771 100644
--- a/kenlm/lm/read_arpa.cc
+++ b/kenlm/lm/read_arpa.cc
@@ -8,7 +8,7 @@
#include <ctype.h>
#include <string.h>
-#include <stdint.h>
+#include <inttypes.h>
namespace lm {
diff --git a/kenlm/lm/return.hh b/kenlm/lm/return.hh
index 1b55091b2..155719605 100644
--- a/kenlm/lm/return.hh
+++ b/kenlm/lm/return.hh
@@ -1,7 +1,7 @@
#ifndef LM_RETURN__
#define LM_RETURN__
-#include <stdint.h>
+#include <inttypes.h>
namespace lm {
/* Structure returned by scoring routines. */
diff --git a/kenlm/lm/search_hashed.hh b/kenlm/lm/search_hashed.hh
index 30a86fb72..e289fd114 100644
--- a/kenlm/lm/search_hashed.hh
+++ b/kenlm/lm/search_hashed.hh
@@ -78,7 +78,7 @@ template <class MiddleT, class LongestT> class TemplateHashedSearch : public Has
static const unsigned int kVersion = 0;
// TODO: move probing_multiplier here with next binary file format update.
- static void UpdateConfigFromBinary(FD, const std::vector<uint64_t> &, Config &) {}
+ static void UpdateConfigFromBinary(int, const std::vector<uint64_t> &, Config &) {}
static std::size_t Size(const std::vector<uint64_t> &counts, const Config &config) {
std::size_t ret = Unigram::Size(counts[0]);
diff --git a/kenlm/lm/search_trie.cc b/kenlm/lm/search_trie.cc
index 633bcdf45..8cb6984b0 100644
--- a/kenlm/lm/search_trie.cc
+++ b/kenlm/lm/search_trie.cc
@@ -13,6 +13,7 @@
#include "lm/weights.hh"
#include "lm/word_index.hh"
#include "util/ersatz_progress.hh"
+#include "util/mmap.hh"
#include "util/proxy_iterator.hh"
#include "util/scoped.hh"
#include "util/sized_iterator.hh"
@@ -20,12 +21,15 @@
#include <algorithm>
#include <cstring>
#include <cstdio>
+#include <cstdlib>
#include <queue>
#include <limits>
#include <numeric>
#include <vector>
-#include "util/portability.hh"
+#if defined(_WIN32) || defined(_WIN64)
+#include <windows.h>
+#endif
namespace lm {
namespace ngram {
@@ -269,7 +273,7 @@ template <class Quant, class Bhiksha> class WriteEntries {
contexts_(contexts),
unigrams_(unigrams),
middle_(middle),
- longest_(longest),
+ longest_(longest),
bigram_pack_((order == 2) ? static_cast<BitPacked&>(longest_) : static_cast<BitPacked&>(*middle_)),
order_(order),
sri_(sri) {}
@@ -332,7 +336,6 @@ template <class Doing> class BlankManager {
void Visit(const WordIndex *to, unsigned char length, float prob) {
basis_[length - 1] = prob;
- // Try to match everything except the last word, which is expected to be different.
unsigned char overlap = std::min<unsigned char>(length - 1, been_length_);
const WordIndex *cur;
WordIndex *pre;
@@ -349,9 +352,9 @@ template <class Doing> class BlankManager {
UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context.");
const float *lower_basis;
for (lower_basis = basis_ + blank - 2; *lower_basis == kBadProb; --lower_basis) {}
- assert(*lower_basis != kBadProb);
unsigned char based_on = lower_basis - basis_ + 1;
for (; cur != to + length - 1; ++blank, ++cur, ++pre) {
+ assert(*lower_basis != kBadProb);
doing_.MiddleBlank(blank, to, based_on, *lower_basis);
*pre = *cur;
// Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram.
@@ -460,42 +463,32 @@ void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &c
} // namespace
-template <class Quant, class Bhiksha> void BuildTrie(const std::string &file_prefix, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) {
+template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) {
RecordReader inputs[kMaxOrder - 1];
RecordReader contexts[kMaxOrder - 1];
for (unsigned char i = 2; i <= counts.size(); ++i) {
- std::stringstream assembled;
- assembled << file_prefix << static_cast<unsigned int>(i) << "_merged";
- inputs[i-2].Init(assembled.str(), i * sizeof(WordIndex) + (i == counts.size() ? sizeof(Prob) : sizeof(ProbBackoff)));
- util::RemoveOrThrow(assembled.str().c_str());
- assembled << kContextSuffix;
- contexts[i-2].Init(assembled.str(), (i-1) * sizeof(WordIndex));
- util::RemoveOrThrow(assembled.str().c_str());
+ inputs[i-2].Init(files.Full(i), i * sizeof(WordIndex) + (i == counts.size() ? sizeof(Prob) : sizeof(ProbBackoff)));
+ contexts[i-2].Init(files.Context(i), (i-1) * sizeof(WordIndex));
}
SRISucks sri;
std::vector<uint64_t> fixed_counts(counts.size());
+ util::scoped_FILE unigram_file;
+ util::scoped_fd unigram_fd(files.StealUnigram());
{
- std::string temp(file_prefix); temp += "unigrams";
- util::scoped_fd unigram_file(util::OpenReadOrThrow(temp.c_str()));
util::scoped_memory unigrams;
- MapRead(util::POPULATE_OR_READ, unigram_file.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams);
+ MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams);
FindBlanks finder(&*fixed_counts.begin(), counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri);
RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Identifying n-grams omitted by SRI", finder);
}
+ unigram_file.reset(util::FDOpenOrThrow(unigram_fd));
for (const RecordReader *i = inputs; i != inputs + counts.size() - 2; ++i) {
if (*i) UTIL_THROW(FormatLoadException, "There's a bug in the trie implementation: the " << (i - inputs + 2) << "-gram table did not complete reading");
}
SanityCheckCounts(counts, fixed_counts);
counts = fixed_counts;
- util::scoped_FILE unigram_file;
- {
- std::string name(file_prefix + "unigrams");
- unigram_file.reset(OpenOrThrow(name.c_str(), "r+"));
- util::RemoveOrThrow(name.c_str());
- }
sri.ObtainBackoffs(counts.size(), unigram_file.get(), inputs);
out.SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), TrieSearch<Quant, Bhiksha>::Size(fixed_counts, config), backing), fixed_counts, config);
@@ -586,42 +579,19 @@ template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::LoadedBin
longest.LoadedBinary();
}
-namespace {
-bool IsDirectory(const char *path) {
- struct stat info;
- if (0 != stat(path, &info)) return false;
- return S_ISDIR(info.st_mode);
-}
-} // namespace
-
template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, Backing &backing) {
- std::string temporary_directory;
+ std::string temporary_prefix;
if (config.temporary_directory_prefix) {
- temporary_directory = config.temporary_directory_prefix;
- if (!temporary_directory.empty() && temporary_directory[temporary_directory.size() - 1] != '/' && IsDirectory(temporary_directory.c_str()))
- temporary_directory += '/';
+ temporary_prefix = config.temporary_directory_prefix;
} else if (config.write_mmap) {
- temporary_directory = config.write_mmap;
+ temporary_prefix = config.write_mmap;
} else {
- temporary_directory = file;
+ temporary_prefix = file;
}
- // Null on end is kludge to ensure null termination.
- temporary_directory += "_trie_tmp_XXXXXX";
- temporary_directory += '\0';
- if (!mkdtemp(&temporary_directory[0])) {
- UTIL_THROW(util::ErrnoException, "Failed to make a temporary directory based on the name " << temporary_directory.c_str());
- }
- // Chop off null kludge.
- temporary_directory.resize(strlen(temporary_directory.c_str()));
- // Add directory delimiter. Assumes a real operating system.
- temporary_directory += '/';
// At least 1MB sorting memory.
- ARPAToSortedFiles(config, f, counts, std::max<size_t>(config.building_memory, 1048576), temporary_directory.c_str(), vocab);
+ SortedFiles sorted(config, f, counts, std::max<size_t>(config.building_memory, 1048576), temporary_prefix, vocab);
- BuildTrie(temporary_directory, counts, config, *this, quant_, vocab, backing);
- if (rmdir(temporary_directory.c_str()) && config.messages) {
- *config.messages << "Failed to delete " << temporary_directory << std::endl;
- }
+ BuildTrie(sorted, counts, config, *this, quant_, vocab, backing);
}
template class TrieSearch<DontQuantize, DontBhiksha>;
diff --git a/kenlm/lm/search_trie.hh b/kenlm/lm/search_trie.hh
index 4a9fab648..caa7a05e2 100644
--- a/kenlm/lm/search_trie.hh
+++ b/kenlm/lm/search_trie.hh
@@ -7,8 +7,8 @@
#include "lm/trie.hh"
#include "lm/weights.hh"
+#include "util/file.hh"
#include "util/file_piece.hh"
-#include "util/portability.hh"
#include <vector>
@@ -21,7 +21,8 @@ class SortedVocabulary;
namespace trie {
template <class Quant, class Bhiksha> class TrieSearch;
-template <class Quant, class Bhiksha> void BuildTrie(const std::string &file_prefix, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing);
+class SortedFiles;
+template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing);
template <class Quant, class Bhiksha> class TrieSearch {
public:
@@ -39,9 +40,9 @@ template <class Quant, class Bhiksha> class TrieSearch {
static const unsigned int kVersion = 1;
- static void UpdateConfigFromBinary(FD fd, const std::vector<uint64_t> &counts, Config &config) {
+ static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config) {
Quant::UpdateConfigFromBinary(fd, counts, config);
- AdvanceOrThrow(fd, Quant::Size(counts.size(), config) + Unigram::Size(counts[0]));
+ util::AdvanceOrThrow(fd, Quant::Size(counts.size(), config) + Unigram::Size(counts[0]));
Bhiksha::UpdateConfigFromBinary(fd, config);
}
@@ -109,7 +110,7 @@ template <class Quant, class Bhiksha> class TrieSearch {
}
private:
- friend void BuildTrie<Quant, Bhiksha>(const std::string &file_prefix, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing);
+ friend void BuildTrie<Quant, Bhiksha>(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing);
// Middles are managed manually so we can delay construction and they don't have to be copyable.
void FreeMiddles() {
diff --git a/kenlm/lm/sri.cc b/kenlm/lm/sri.cc
deleted file mode 100644
index 825f699bc..000000000
--- a/kenlm/lm/sri.cc
+++ /dev/null
@@ -1,108 +0,0 @@
-#include "lm/lm_exception.hh"
-#include "lm/sri.hh"
-
-#include <Ngram.h>
-#include <Vocab.h>
-
-#include <errno.h>
-
-namespace lm {
-namespace sri {
-
-Vocabulary::Vocabulary() : sri_(new Vocab) {}
-
-Vocabulary::~Vocabulary() {}
-
-WordIndex Vocabulary::Index(const char *str) const {
- WordIndex ret = sri_->getIndex(str);
- // NGram wants the index of Vocab_Unknown for unknown words, but for some reason SRI returns Vocab_None here :-(.
- if (ret == Vocab_None) {
- return not_found_;
- } else {
- return ret;
- }
-}
-
-const char *Vocabulary::Word(WordIndex index) const {
- return sri_->getWord(index);
-}
-
-void Vocabulary::FinishedLoading() {
- SetSpecial(
- sri_->ssIndex(),
- sri_->seIndex(),
- sri_->unkIndex());
-}
-
-namespace {
-Ngram *MakeSRIModel(const char *file_name, unsigned int ngram_length, Vocab &sri_vocab) {
- sri_vocab.unkIsWord() = true;
- std::auto_ptr<Ngram> ret(new Ngram(sri_vocab, ngram_length));
- File file(file_name, "r");
- errno = 0;
- if (!ret->read(file)) {
- UTIL_THROW(FormatLoadException, "reading file " << file_name << " with SRI failed.");
- }
- return ret.release();
-}
-} // namespace
-
-Model::Model(const char *file_name, unsigned int ngram_length) : sri_(MakeSRIModel(file_name, ngram_length, *vocab_.sri_)) {
- if (!sri_->setorder()) {
- UTIL_THROW(FormatLoadException, "Can't have an SRI model with order 0.");
- }
- vocab_.FinishedLoading();
- State begin_state = State();
- begin_state.valid_length_ = 1;
- if (kMaxOrder > 1) {
- begin_state.history_[0] = vocab_.BeginSentence();
- if (kMaxOrder > 2) begin_state.history_[1] = Vocab_None;
- }
- State null_state = State();
- null_state.valid_length_ = 0;
- if (kMaxOrder > 1) null_state.history_[0] = Vocab_None;
- Init(begin_state, null_state, vocab_, sri_->setorder());
- not_found_ = vocab_.NotFound();
-}
-
-Model::~Model() {}
-
-namespace {
-
-/* Argh SRI's wordProb knows the ngram length but doesn't return it. One more
- * reason you should use my model. */
-// TODO(stolcke): fix SRILM so I don't have to do this.
-unsigned int MatchedLength(Ngram &model, const WordIndex new_word, const SRIVocabIndex *const_history) {
- unsigned int out_length = 0;
- // This gets the length of context used, which is ngram_length - 1 unless new_word is OOV in which case it is 0.
- model.contextID(new_word, const_history, out_length);
- return out_length + 1;
-}
-
-} // namespace
-
-FullScoreReturn Model::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const {
- // If you get a compiler in this function, change SRIVocabIndex in sri.hh to match the one found in SRI's Vocab.h.
- const SRIVocabIndex *const_history;
- SRIVocabIndex local_history[Order()];
- if (in_state.valid_length_ < kMaxOrder - 1) {
- const_history = in_state.history_;
- } else {
- std::copy(in_state.history_, in_state.history_ + in_state.valid_length_, local_history);
- local_history[in_state.valid_length_] = Vocab_None;
- const_history = local_history;
- }
- FullScoreReturn ret;
- ret.ngram_length = MatchedLength(*sri_, new_word, const_history);
- out_state.history_[0] = new_word;
- out_state.valid_length_ = std::min<unsigned char>(ret.ngram_length, Order() - 1);
- std::copy(const_history, const_history + out_state.valid_length_ - 1, out_state.history_ + 1);
- if (out_state.valid_length_ < kMaxOrder - 1) {
- out_state.history_[out_state.valid_length_] = Vocab_None;
- }
- ret.prob = sri_->wordProb(new_word, const_history);
- return ret;
-}
-
-} // namespace sri
-} // namespace lm
diff --git a/kenlm/lm/sri.hh b/kenlm/lm/sri.hh
deleted file mode 100644
index b57e9b73a..000000000
--- a/kenlm/lm/sri.hh
+++ /dev/null
@@ -1,102 +0,0 @@
-#ifndef LM_SRI__
-#define LM_SRI__
-
-#include "lm/facade.hh"
-#include "util/murmur_hash.hh"
-
-#include <cmath>
-#include <exception>
-#include <memory>
-
-class Ngram;
-class Vocab;
-
-/* The ngram length reported uses some random API I found and may be wrong.
- *
- * See ngram, which should return equivalent results.
- */
-
-namespace lm {
-namespace sri {
-
-static const unsigned int kMaxOrder = 6;
-
-/* This should match VocabIndex found in SRI's Vocab.h
- * The reason I define this here independently is that SRI's headers
- * pollute and increase compile time.
- * It's difficult to extract this from their header and anyway would
- * break packaging.
- * If these differ there will be a compiler error in ActuallyCall.
- */
-typedef unsigned int SRIVocabIndex;
-
-class State {
- public:
- // You shouldn't need to touch these, but they're public so State will be a POD.
- // If valid_length_ < kMaxOrder - 1 then history_[valid_length_] == Vocab_None.
- SRIVocabIndex history_[kMaxOrder - 1];
- unsigned char valid_length_;
-};
-
-inline bool operator==(const State &left, const State &right) {
- if (left.valid_length_ != right.valid_length_) {
- return false;
- }
- for (const SRIVocabIndex *l = left.history_, *r = right.history_;
- l != left.history_ + left.valid_length_;
- ++l, ++r) {
- if (*l != *r) return false;
- }
- return true;
-}
-
-inline size_t hash_value(const State &state) {
- return util::MurmurHashNative(&state.history_, sizeof(SRIVocabIndex) * state.valid_length_);
-}
-
-class Vocabulary : public base::Vocabulary {
- public:
- Vocabulary();
-
- ~Vocabulary();
-
- WordIndex Index(const StringPiece &str) const {
- std::string temp(str.data(), str.length());
- return Index(temp.c_str());
- }
- WordIndex Index(const std::string &str) const {
- return Index(str.c_str());
- }
- WordIndex Index(const char *str) const;
-
- const char *Word(WordIndex index) const;
-
- private:
- friend class Model;
- void FinishedLoading();
-
- // The parent class isn't copyable so auto_ptr is the same as scoped_ptr
- // but without the boost dependence.
- mutable std::auto_ptr<Vocab> sri_;
-};
-
-class Model : public base::ModelFacade<Model, State, Vocabulary> {
- public:
- Model(const char *file_name, unsigned int ngram_length);
-
- ~Model();
-
- FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const;
-
- private:
- Vocabulary vocab_;
-
- mutable std::auto_ptr<Ngram> sri_;
-
- WordIndex not_found_;
-};
-
-} // namespace sri
-} // namespace lm
-
-#endif // LM_SRI__
diff --git a/kenlm/lm/trie.hh b/kenlm/lm/trie.hh
index ebe9910f0..06cc96ac4 100644
--- a/kenlm/lm/trie.hh
+++ b/kenlm/lm/trie.hh
@@ -1,7 +1,7 @@
#ifndef LM_TRIE__
#define LM_TRIE__
-#include <stdint.h>
+#include <inttypes.h>
#include <cstddef>
diff --git a/kenlm/lm/trie_sort.cc b/kenlm/lm/trie_sort.cc
index bb126f18c..9d1d5f27f 100644
--- a/kenlm/lm/trie_sort.cc
+++ b/kenlm/lm/trie_sort.cc
@@ -14,6 +14,7 @@
#include <algorithm>
#include <cstring>
#include <cstdio>
+#include <cstdlib>
#include <deque>
#include <limits>
#include <vector>
@@ -22,14 +23,6 @@ namespace lm {
namespace ngram {
namespace trie {
-const char *kContextSuffix = "_contexts";
-
-FILE *OpenOrThrow(const char *name, const char *mode) {
- FILE *ret = fopen(name, mode);
- if (!ret) UTIL_THROW(util::ErrnoException, "Could not open " << name << " for " << mode);
- return ret;
-}
-
void WriteOrThrow(FILE *to, const void *data, size_t size) {
assert(size);
if (1 != std::fwrite(data, size, 1, to)) UTIL_THROW(util::ErrnoException, "Short write; requested size " << size);
@@ -78,16 +71,13 @@ class PartialViewProxy {
typedef util::ProxyIterator<PartialViewProxy> PartialIter;
-std::string DiskFlush(const void *mem_begin, const void *mem_end, const std::string &file_prefix, std::size_t batch, unsigned char order) {
- std::stringstream assembled;
- assembled << file_prefix << static_cast<unsigned int>(order) << '_' << batch;
- std::string ret(assembled.str());
- util::scoped_fd out(util::CreateOrThrow(ret.c_str()));
- util::WriteOrThrow(out.get(), mem_begin, (uint8_t*)mem_end - (uint8_t*)mem_begin);
- return ret;
+FILE *DiskFlush(const void *mem_begin, const void *mem_end, const util::TempMaker &maker) {
+ util::scoped_fd file(maker.Make());
+ util::WriteOrThrow(file.get(), mem_begin, (uint8_t*)mem_end - (uint8_t*)mem_begin);
+ return util::FDOpenOrThrow(file);
}
-void WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &ngram_file_name, std::size_t entry_size, unsigned char order) {
+FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const util::TempMaker &maker, std::size_t entry_size, unsigned char order) {
const size_t context_size = sizeof(WordIndex) * (order - 1);
// Sort just the contexts using the same memory.
PartialIter context_begin(PartialViewProxy(begin + sizeof(WordIndex), entry_size, context_size));
@@ -95,11 +85,10 @@ void WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &ngram_fil
std::sort(context_begin, context_end, util::SizedCompare<EntryCompare, PartialViewProxy>(EntryCompare(order - 1)));
- std::string name(ngram_file_name + kContextSuffix);
- util::scoped_FILE out(OpenOrThrow(name.c_str(), "w"));
+ util::scoped_FILE out(maker.MakeFile());
// Write out to file and uniqueify at the same time. Could have used unique_copy if there was an appropriate OutputIterator.
- if (context_begin == context_end) return;
+ if (context_begin == context_end) return out.release();
PartialIter i(context_begin);
WriteOrThrow(out.get(), i->Data(), context_size);
const void *previous = i->Data();
@@ -110,6 +99,7 @@ void WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &ngram_fil
previous = i->Data();
}
}
+ return out.release();
}
struct ThrowCombine {
@@ -125,14 +115,12 @@ struct FirstCombine {
}
};
-template <class Combine> void MergeSortedFiles(const std::string &first_name, const std::string &second_name, const std::string &out, std::size_t weights_size, unsigned char order, const Combine &combine = ThrowCombine()) {
+template <class Combine> FILE *MergeSortedFiles(FILE *first_file, FILE *second_file, const util::TempMaker &maker, std::size_t weights_size, unsigned char order, const Combine &combine) {
std::size_t entry_size = sizeof(WordIndex) * order + weights_size;
RecordReader first, second;
- first.Init(first_name.c_str(), entry_size);
- util::RemoveOrThrow(first_name.c_str());
- second.Init(second_name.c_str(), entry_size);
- util::RemoveOrThrow(second_name.c_str());
- util::scoped_FILE out_file(OpenOrThrow(out.c_str(), "w"));
+ first.Init(first_file, entry_size);
+ second.Init(second_file, entry_size);
+ util::scoped_FILE out_file(maker.MakeFile());
EntryCompare less(order);
while (first && second) {
if (less(first.Data(), second.Data())) {
@@ -149,67 +137,14 @@ template <class Combine> void MergeSortedFiles(const std::string &first_name, co
for (RecordReader &remains = (first ? first : second); remains; ++remains) {
WriteOrThrow(out_file.get(), remains.Data(), entry_size);
}
-}
-
-void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, util::scoped_memory &mem, const std::string &file_prefix, unsigned char order, PositiveProbWarn &warn) {
- ReadNGramHeader(f, order);
- const size_t count = counts[order - 1];
- // Size of weights. Does it include backoff?
- const size_t words_size = sizeof(WordIndex) * order;
- const size_t weights_size = sizeof(float) + ((order == counts.size()) ? 0 : sizeof(float));
- const size_t entry_size = words_size + weights_size;
- const size_t batch_size = std::min(count, mem.size() / entry_size);
- uint8_t *const begin = reinterpret_cast<uint8_t*>(mem.get());
- std::deque<std::string> files;
- for (std::size_t batch = 0, done = 0; done < count; ++batch) {
- uint8_t *out = begin;
- uint8_t *out_end = out + std::min(count - done, batch_size) * entry_size;
- if (order == counts.size()) {
- for (; out != out_end; out += entry_size) {
- ReadNGram(f, order, vocab, reinterpret_cast<WordIndex*>(out), *reinterpret_cast<Prob*>(out + words_size), warn);
- }
- } else {
- for (; out != out_end; out += entry_size) {
- ReadNGram(f, order, vocab, reinterpret_cast<WordIndex*>(out), *reinterpret_cast<ProbBackoff*>(out + words_size), warn);
- }
- }
- // Sort full records by full n-gram.
- util::SizedProxy proxy_begin(begin, entry_size), proxy_end(out_end, entry_size);
- // parallel_sort uses too much RAM
- std::sort(NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare<EntryCompare>(EntryCompare(order)));
- files.push_back(DiskFlush(begin, out_end, file_prefix, batch, order));
- WriteContextFile(begin, out_end, files.back(), entry_size, order);
-
- done += (out_end - begin) / entry_size;
- }
-
- // All individual files created. Merge them.
-
- std::size_t merge_count = 0;
- while (files.size() > 1) {
- std::stringstream assembled;
- assembled << file_prefix << static_cast<unsigned int>(order) << "_merge_" << (merge_count++);
- files.push_back(assembled.str());
- MergeSortedFiles(files[0], files[1], files.back(), weights_size, order, ThrowCombine());
- MergeSortedFiles(files[0] + kContextSuffix, files[1] + kContextSuffix, files.back() + kContextSuffix, 0, order - 1, FirstCombine());
- files.pop_front();
- files.pop_front();
- }
- if (!files.empty()) {
- std::stringstream assembled;
- assembled << file_prefix << static_cast<unsigned int>(order) << "_merged";
- std::string merged_name(assembled.str());
- if (std::rename(files[0].c_str(), merged_name.c_str())) UTIL_THROW(util::ErrnoException, "Could not rename " << files[0].c_str() << " to " << merged_name.c_str());
- std::string context_name = files[0] + kContextSuffix;
- merged_name += kContextSuffix;
- if (std::rename(context_name.c_str(), merged_name.c_str())) UTIL_THROW(util::ErrnoException, "Could not rename " << context_name << " to " << merged_name.c_str());
- }
+ return out_file.release();
}
} // namespace
-void RecordReader::Init(const std::string &name, std::size_t entry_size) {
- file_.reset(OpenOrThrow(name.c_str(), "r+"));
+void RecordReader::Init(FILE *file, std::size_t entry_size) {
+ rewind(file);
+ file_ = file;
data_.reset(malloc(entry_size));
UTIL_THROW_IF(!data_.get(), util::ErrnoException, "Failed to malloc read buffer");
remains_ = true;
@@ -219,20 +154,26 @@ void RecordReader::Init(const std::string &name, std::size_t entry_size) {
void RecordReader::Overwrite(const void *start, std::size_t amount) {
long internal = (uint8_t*)start - (uint8_t*)data_.get();
- UTIL_THROW_IF(fseek(file_.get(), internal - entry_size_, SEEK_CUR), util::ErrnoException, "Couldn't seek backwards for revision");
- WriteOrThrow(file_.get(), start, amount);
+ UTIL_THROW_IF(fseek(file_, internal - entry_size_, SEEK_CUR), util::ErrnoException, "Couldn't seek backwards for revision");
+ WriteOrThrow(file_, start, amount);
long forward = entry_size_ - internal - amount;
- if (forward) UTIL_THROW_IF(fseek(file_.get(), forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision");
+ if (forward) UTIL_THROW_IF(fseek(file_, forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision");
}
-void ARPAToSortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) {
+void RecordReader::Rewind() {
+ rewind(file_);
+ remains_ = true;
+ ++*this;
+}
+
+SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) {
+ util::TempMaker maker(file_prefix);
PositiveProbWarn warn(config.positive_log_probability);
+ unigram_.reset(maker.Make());
{
- std::string unigram_name = file_prefix + "unigrams";
- util::scoped_fd unigram_file;
// In case <unk> appears.
- size_t file_out = (counts[0] + 1) * sizeof(ProbBackoff);
- util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_name.c_str(), file_out, unigram_file), file_out);
+ size_t size_out = (counts[0] + 1) * sizeof(ProbBackoff);
+ util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_.get(), size_out), size_out);
Read1Grams(f, counts[0], vocab, reinterpret_cast<ProbBackoff*>(unigram_mmap.get()), warn);
CheckSpecials(config, vocab);
if (!vocab.SawUnk()) ++counts[0];
@@ -246,16 +187,91 @@ void ARPAToSortedFiles(const Config &config, util::FilePiece &f, std::vector<uin
buffer_use = std::max<size_t>(buffer_use, static_cast<size_t>((sizeof(WordIndex) * counts.size() + sizeof(float)) * counts.back()));
buffer = std::min<size_t>(buffer, buffer_use);
- util::scoped_memory mem;
- mem.reset(malloc(buffer), buffer, util::scoped_memory::MALLOC_ALLOCATED);
+ util::scoped_malloc mem;
+ mem.reset(malloc(buffer));
if (!mem.get()) UTIL_THROW(util::ErrnoException, "malloc failed for sort buffer size " << buffer);
for (unsigned char order = 2; order <= counts.size(); ++order) {
- ConvertToSorted(f, vocab, counts, mem, file_prefix, order, warn);
+ ConvertToSorted(f, vocab, counts, maker, order, warn, mem.get(), buffer);
}
ReadEnd(f);
}
+namespace {
+class Closer {
+ public:
+ explicit Closer(std::deque<FILE*> &files) : files_(files) {}
+
+ ~Closer() {
+ for (std::deque<FILE*>::iterator i = files_.begin(); i != files_.end(); ++i) {
+ util::scoped_FILE deleter(*i);
+ }
+ }
+
+ void PopFront() {
+ util::scoped_FILE deleter(files_.front());
+ files_.pop_front();
+ }
+ private:
+ std::deque<FILE*> &files_;
+};
+} // namespace
+
+void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const util::TempMaker &maker, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size) {
+ ReadNGramHeader(f, order);
+ const size_t count = counts[order - 1];
+ // Size of weights. Does it include backoff?
+ const size_t words_size = sizeof(WordIndex) * order;
+ const size_t weights_size = sizeof(float) + ((order == counts.size()) ? 0 : sizeof(float));
+ const size_t entry_size = words_size + weights_size;
+ const size_t batch_size = std::min(count, mem_size / entry_size);
+ uint8_t *const begin = reinterpret_cast<uint8_t*>(mem);
+
+ std::deque<FILE*> files, contexts;
+ Closer files_closer(files), contexts_closer(contexts);
+
+ for (std::size_t batch = 0, done = 0; done < count; ++batch) {
+ uint8_t *out = begin;
+ uint8_t *out_end = out + std::min(count - done, batch_size) * entry_size;
+ if (order == counts.size()) {
+ for (; out != out_end; out += entry_size) {
+ ReadNGram(f, order, vocab, reinterpret_cast<WordIndex*>(out), *reinterpret_cast<Prob*>(out + words_size), warn);
+ }
+ } else {
+ for (; out != out_end; out += entry_size) {
+ ReadNGram(f, order, vocab, reinterpret_cast<WordIndex*>(out), *reinterpret_cast<ProbBackoff*>(out + words_size), warn);
+ }
+ }
+ // Sort full records by full n-gram.
+ util::SizedProxy proxy_begin(begin, entry_size), proxy_end(out_end, entry_size);
+ // parallel_sort uses too much RAM
+ std::sort(NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare<EntryCompare>(EntryCompare(order)));
+ files.push_back(DiskFlush(begin, out_end, maker));
+ contexts.push_back(WriteContextFile(begin, out_end, maker, entry_size, order));
+
+ done += (out_end - begin) / entry_size;
+ }
+
+ // All individual files created. Merge them.
+
+ while (files.size() > 1) {
+ files.push_back(MergeSortedFiles(files[0], files[1], maker, weights_size, order, ThrowCombine()));
+ files_closer.PopFront();
+ files_closer.PopFront();
+ contexts.push_back(MergeSortedFiles(contexts[0], contexts[1], maker, 0, order - 1, FirstCombine()));
+ contexts_closer.PopFront();
+ contexts_closer.PopFront();
+ }
+
+ if (!files.empty()) {
+ // Steal from closers.
+ full_[order - 2].reset(files.front());
+ files.pop_front();
+ context_[order - 2].reset(contexts.front());
+ contexts.pop_front();
+ }
+}
+
} // namespace trie
} // namespace ngram
} // namespace lm
diff --git a/kenlm/lm/trie_sort.hh b/kenlm/lm/trie_sort.hh
index c57b36186..a8e552e6e 100644
--- a/kenlm/lm/trie_sort.hh
+++ b/kenlm/lm/trie_sort.hh
@@ -1,6 +1,9 @@
+// Step of trie builder: create sorted files.
+
#ifndef LM_TRIE_SORT__
#define LM_TRIE_SORT__
+#include "lm/max_order.hh"
#include "lm/word_index.hh"
#include "util/file.hh"
@@ -11,20 +14,21 @@
#include <string>
#include <vector>
-#include <stdint.h>
+#include <inttypes.h>
-namespace util { class FilePiece; }
+namespace util {
+class FilePiece;
+class TempMaker;
+} // namespace util
-// Step of trie builder: create sorted files.
namespace lm {
+class PositiveProbWarn;
namespace ngram {
class SortedVocabulary;
class Config;
namespace trie {
-extern const char *kContextSuffix;
-FILE *OpenOrThrow(const char *name, const char *mode);
void WriteOrThrow(FILE *to, const void *data, size_t size);
class EntryCompare : public std::binary_function<const void*, const void*, bool> {
@@ -49,15 +53,15 @@ class RecordReader {
public:
RecordReader() : remains_(true) {}
- void Init(const std::string &name, std::size_t entry_size);
+ void Init(FILE *file, std::size_t entry_size);
void *Data() { return data_.get(); }
const void *Data() const { return data_.get(); }
RecordReader &operator++() {
- std::size_t ret = fread(data_.get(), entry_size_, 1, file_.get());
+ std::size_t ret = fread(data_.get(), entry_size_, 1, file_);
if (!ret) {
- UTIL_THROW_IF(!feof(file_.get()), util::ErrnoException, "Error reading temporary file");
+ UTIL_THROW_IF(!feof(file_), util::ErrnoException, "Error reading temporary file");
remains_ = false;
}
return *this;
@@ -65,27 +69,46 @@ class RecordReader {
operator bool() const { return remains_; }
- void Rewind() {
- rewind(file_.get());
- remains_ = true;
- ++*this;
- }
+ void Rewind();
std::size_t EntrySize() const { return entry_size_; }
void Overwrite(const void *start, std::size_t amount);
private:
+ FILE *file_;
+
util::scoped_malloc data_;
bool remains_;
std::size_t entry_size_;
-
- util::scoped_FILE file_;
};
-void ARPAToSortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab);
+class SortedFiles {
+ public:
+ // Build from ARPA
+ SortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, std::size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab);
+
+ int StealUnigram() {
+ return unigram_.release();
+ }
+
+ FILE *Full(unsigned char order) {
+ return full_[order - 2].get();
+ }
+
+ FILE *Context(unsigned char of_order) {
+ return context_[of_order - 2].get();
+ }
+
+ private:
+ void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const util::TempMaker &maker, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size);
+
+ util::scoped_fd unigram_;
+
+ util::scoped_FILE full_[kMaxOrder - 1], context_[kMaxOrder - 1];
+};
} // namespace trie
} // namespace ngram
diff --git a/kenlm/lm/vocab.cc b/kenlm/lm/vocab.cc
index 12e19f0ec..5ac828178 100644
--- a/kenlm/lm/vocab.cc
+++ b/kenlm/lm/vocab.cc
@@ -6,6 +6,7 @@
#include "lm/config.hh"
#include "lm/weights.hh"
#include "util/exception.hh"
+#include "util/file.hh"
#include "util/joint_sort.hh"
#include "util/murmur_hash.hh"
#include "util/probing_hash_table.hh"
@@ -29,7 +30,7 @@ const uint64_t kUnknownHash = detail::HashForVocab("<unk>", 5);
// Sadly some LMs have <UNK>.
const uint64_t kUnknownCapHash = detail::HashForVocab("<UNK>", 5);
-WordIndex ReadWords(FD fd, EnumerateVocab *enumerate) {
+WordIndex ReadWords(int fd, EnumerateVocab *enumerate) {
if (!enumerate) return std::numeric_limits<WordIndex>::max();
const std::size_t kInitialRead = 16384;
std::string buf;
@@ -37,23 +38,12 @@ WordIndex ReadWords(FD fd, EnumerateVocab *enumerate) {
buf.resize(kInitialRead);
WordIndex index = 0;
while (true) {
-#ifdef WIN32
- ssize_t got;
-#else
- ssize_t got = read(fd, &buf[0], kInitialRead);
-#endif
- UTIL_THROW_IF(got == -1, util::ErrnoException, "Reading vocabulary words");
+ std::size_t got = util::ReadOrEOF(fd, &buf[0], kInitialRead);
if (got == 0) return index;
buf.resize(got);
while (buf[buf.size() - 1]) {
char next_char;
-#ifdef WIN32
- ssize_t ret;
-#else
- ssize_t ret = read(fd, &next_char, 1);
-#endif
- UTIL_THROW_IF(ret == -1, util::ErrnoException, "Reading vocabulary words");
- UTIL_THROW_IF(ret == 0, FormatLoadException, "Missing null terminator on a vocab word.");
+ util::ReadOrThrow(fd, &next_char, 1);
buf.push_back(next_char);
}
// Ok now we have null terminated strings.
@@ -76,12 +66,8 @@ void WriteWordsWrapper::Add(WordIndex index, const StringPiece &str) {
buffer_.push_back(0);
}
-void WriteWordsWrapper::Write(FD fd) {
-#ifdef WIN32
-#else
- if ((off_t)-1 == lseek(fd, 0, SEEK_END))
- UTIL_THROW(util::ErrnoException, "Failed to seek in binary to vocab words");
-#endif
+void WriteWordsWrapper::Write(int fd) {
+ util::SeekEnd(fd);
util::WriteOrThrow(fd, buffer_.data(), buffer_.size());
}
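WriteWordsWrapper::Add appends each word followed by a NUL byte, and Write seeks to the end of the binary file before appending the whole buffer; ReadWords recovers indices on load by splitting the block back apart on those NULs. A small in-memory sketch of that round trip (file descriptors replaced by a std::string buffer):

// Sketch of the NUL-delimited word block that WriteWordsWrapper builds and
// ReadWords parses back; I/O is replaced by an in-memory buffer here.
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

int main() {
  // "Add": append each word followed by a 0 byte, in index order.
  std::vector<std::string> words = {"<unk>", "<s>", "</s>", "example"};
  std::string buffer;
  for (const std::string &w : words) {
    buffer.append(w);
    buffer.push_back('\0');
  }
  // "ReadWords": walk the block and recover (index, word) pairs.
  uint32_t index = 0;
  std::size_t start = 0;
  while (start < buffer.size()) {
    std::size_t end = buffer.find('\0', start);
    std::cout << index++ << " -> " << buffer.substr(start, end - start) << "\n";
    start = end + 1;
  }
  return 0;
}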
@@ -142,10 +128,11 @@ void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) {
bound_ = end_ - begin_ + 1;
}
-void SortedVocabulary::LoadedBinary(FD fd, EnumerateVocab *to) {
+void SortedVocabulary::LoadedBinary(int fd, EnumerateVocab *to) {
end_ = begin_ + *(reinterpret_cast<const uint64_t*>(begin_) - 1);
ReadWords(fd, to);
SetSpecial(Index("<s>"), Index("</s>"), 0);
+ bound_ = end_ - begin_ + 1;
}
namespace {
@@ -163,12 +150,12 @@ struct ProbingVocabularyHeader {
ProbingVocabulary::ProbingVocabulary() : enumerate_(NULL) {}
std::size_t ProbingVocabulary::Size(std::size_t entries, const Config &config) {
- return Align8(sizeof(detail::ProbingVocabularyHeader)) + Lookup::Size(entries, config.probing_multiplier);
+ return ALIGN8(sizeof(detail::ProbingVocabularyHeader)) + Lookup::Size(entries, config.probing_multiplier);
}
void ProbingVocabulary::SetupMemory(void *start, std::size_t allocated, std::size_t /*entries*/, const Config &/*config*/) {
header_ = static_cast<detail::ProbingVocabularyHeader*>(start);
- lookup_ = Lookup(static_cast<uint8_t*>(start) + Align8(sizeof(detail::ProbingVocabularyHeader)), allocated);
+ lookup_ = Lookup(static_cast<uint8_t*>(start) + ALIGN8(sizeof(detail::ProbingVocabularyHeader)), allocated);
bound_ = 1;
saw_unk_ = false;
}
@@ -200,7 +187,7 @@ void ProbingVocabulary::FinishedLoading(ProbBackoff * /*reorder_vocab*/) {
SetSpecial(Index("<s>"), Index("</s>"), 0);
}
-void ProbingVocabulary::LoadedBinary(FD fd, EnumerateVocab *to) {
+void ProbingVocabulary::LoadedBinary(int fd, EnumerateVocab *to) {
UTIL_THROW_IF(header_->version != kProbingVocabularyVersion, FormatLoadException, "The binary file has probing version " << header_->version << " but the code expects version " << kProbingVocabularyVersion << ". Please rerun build_binary using the same version of the code.");
lookup_.LoadedBinary();
ReadWords(fd, to);
diff --git a/kenlm/lm/vocab.hh b/kenlm/lm/vocab.hh
index 26de3647d..3c3414fb9 100644
--- a/kenlm/lm/vocab.hh
+++ b/kenlm/lm/vocab.hh
@@ -8,7 +8,6 @@
#include "util/probing_hash_table.hh"
#include "util/sorted_uniform.hh"
#include "util/string_piece.hh"
-#include "util/portability.hh"
#include <limits>
#include <string>
@@ -37,7 +36,7 @@ class WriteWordsWrapper : public EnumerateVocab {
void Add(WordIndex index, const StringPiece &str);
- void Write(FD fd);
+ void Write(int fd);
private:
EnumerateVocab *inner_;
@@ -67,7 +66,6 @@ class SortedVocabulary : public base::Vocabulary {
static size_t Size(std::size_t entries, const Config &config);
// Vocab words are [0, Bound()) Only valid after FinishedLoading/LoadedBinary.
- // While this number is correct, ProbingVocabulary::Bound might not be correct in some cases.
WordIndex Bound() const { return bound_; }
// Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
@@ -85,7 +83,7 @@ class SortedVocabulary : public base::Vocabulary {
bool SawUnk() const { return saw_unk_; }
- void LoadedBinary(FD fd, EnumerateVocab *to);
+ void LoadedBinary(int fd, EnumerateVocab *to);
private:
uint64_t *begin_, *end_;
@@ -128,7 +126,7 @@ class ProbingVocabulary : public base::Vocabulary {
bool SawUnk() const { return saw_unk_; }
- void LoadedBinary(FD fd, EnumerateVocab *to);
+ void LoadedBinary(int fd, EnumerateVocab *to);
private:
// std::identity is an SGI extension :-(
diff --git a/kenlm/util/bit_packing.hh b/kenlm/util/bit_packing.hh
index ba3b8529e..62b1f9ea2 100644
--- a/kenlm/util/bit_packing.hh
+++ b/kenlm/util/bit_packing.hh
@@ -1,35 +1,37 @@
#ifndef UTIL_BIT_PACKING__
#define UTIL_BIT_PACKING__
-/* Bit-level packing routines */
+/* Bit-level packing routines
+ *
+ * WARNING WARNING WARNING:
+ * The write functions assume that memory is zero initially. This makes them
+ * faster and is the appropriate case for mmapped language model construction.
+ * These routines assume that unaligned access to uint64_t is fast. This is
+ * the case on x86_64. I'm not sure how fast unaligned 64-bit access is on
+ * x86 but my target audience is large language models for which 64-bit is
+ * necessary.
+ *
+ * Call the BitPackingSanity function to sanity check. Calling once suffices,
+ * but it may be called multiple times when that's inconvenient.
+ *
+ * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at
+ * NICT.
+ */
#include <assert.h>
#ifdef __APPLE__
#include <architecture/byte_order.h>
#elif __linux__
#include <endian.h>
-#elif WIN32
- // TODO WIN32
-#else
+#elif !defined(_WIN32) && !defined(_WIN64)
#include <arpa/nameser_compat.h>
#endif
-#include <stdint.h>
-
-namespace util {
+#include <inttypes.h>
-/* WARNING WARNING WARNING:
- * The write functions assume that memory is zero initially. This makes them
- * faster and is the appropriate case for mmapped language model construction.
- * These routines assume that unaligned access to uint64_t is fast and that
- * storage is little endian. This is the case on x86_64. I'm not sure how
- * fast unaligned 64-bit access is on x86 but my target audience is large
- * language models for which 64-bit is necessary.
- *
- * Call the BitPackingSanity function to sanity check. Calling once suffices,
- * but it may be called multiple times when that's inconvenient.
- */
+#include <string.h>
+namespace util {
// Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct.
#if BYTE_ORDER == LITTLE_ENDIAN
@@ -59,8 +61,16 @@ inline uint64_t ReadInt57(const void *base, uint64_t bit_off, uint8_t length, ui
* Assumes the memory is zero initially.
*/
inline void WriteInt57(void *base, uint64_t bit_off, uint8_t length, uint64_t value) {
+#if defined(__arm) || defined(__arm__)
+ uint8_t *base_off = reinterpret_cast<uint8_t*>(base) + (bit_off >> 3);
+ uint64_t value64;
+ memcpy(&value64, base_off, sizeof(value64));
+ value64 |= (value << BitPackShift(bit_off & 7, length));
+ memcpy(base_off, &value64, sizeof(value64));
+#else
*reinterpret_cast<uint64_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |=
(value << BitPackShift(bit_off & 7, length));
+#endif
}
/* Same caveats as above, but for a 25 bit limit. */
@@ -69,8 +79,16 @@ inline uint32_t ReadInt25(const void *base, uint64_t bit_off, uint8_t length, ui
}
inline void WriteInt25(void *base, uint64_t bit_off, uint8_t length, uint32_t value) {
+#if defined(__arm) || defined(__arm__)
+ uint8_t *base_off = reinterpret_cast<uint8_t*>(base) + (bit_off >> 3);
+ uint32_t value32;
+ memcpy(&value32, base_off, sizeof(value32));
+ value32 |= (value << BitPackShift(bit_off & 7, length));
+ memcpy(base_off, &value32, sizeof(value32));
+#else
*reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |=
(value << BitPackShift(bit_off & 7, length));
+#endif
}
typedef union { float f; uint32_t i; } FloatEnc;
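The new __arm__ branches above replace a direct unaligned 64-bit (or 32-bit) store with a memcpy load/modify/store so strict-alignment targets don't fault. A minimal sketch contrasting the two forms on zero-initialized buffers; the byte offset and value are arbitrary and BitPackShift is not reproduced here:

// Sketch contrasting the direct unaligned store with the memcpy form used in
// the __arm__ branches; both operate on zero-initialized memory, as WriteInt57
// requires.
#include <cstdint>
#include <cstring>
#include <iostream>

int main() {
  uint64_t value = 0x1234567890ULL;
  std::size_t byte_off = 3;                 // deliberately unaligned

  // Direct form: fine on x86-64, but may fault or be slow on strict-alignment CPUs.
  unsigned char direct[16] = {0};
  *reinterpret_cast<uint64_t*>(direct + byte_off) |= value;

  // memcpy form: load the word, OR in the value, store it back.
  unsigned char safe[16] = {0};
  uint64_t word;
  std::memcpy(&word, safe + byte_off, sizeof(word));
  word |= value;
  std::memcpy(safe + byte_off, &word, sizeof(word));

  std::cout << (std::memcmp(direct, safe, sizeof(direct)) == 0 ? "same bytes\n" : "differ\n");
  return 0;
}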
diff --git a/kenlm/util/exception.cc b/kenlm/util/exception.cc
index ebe06e424..c4f8c04ce 100644
--- a/kenlm/util/exception.cc
+++ b/kenlm/util/exception.cc
@@ -1,5 +1,4 @@
#include "util/exception.hh"
-#include "util/portability.hh"
#ifdef __GXX_RTTI
#include <typeinfo>
@@ -67,11 +66,8 @@ const char *HandleStrerror(const char *ret, const char * /*buf*/) {
ErrnoException::ErrnoException() throw() : errno_(errno) {
char buf[200];
buf[0] = 0;
-#ifdef sun
+#if defined(sun) || defined(_WIN32) || defined(_WIN64)
const char *add = strerror(errno);
-#elif WIN32
- // TODO WIN32
- const char *add;
#else
const char *add = HandleStrerror(strerror_r(errno, buf, 200), buf);
#endif
diff --git a/kenlm/util/file.cc b/kenlm/util/file.cc
index 8da6fda62..18b7934ca 100644
--- a/kenlm/util/file.cc
+++ b/kenlm/util/file.cc
@@ -9,12 +9,17 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
-#include <stdint.h>
+#include <unistd.h>
+#include <inttypes.h>
+
+#if defined(_WIN32) || defined(_WIN64)
+#include <windows.h>
+#include <io.h>
+#endif
namespace util {
scoped_fd::~scoped_fd() {
- if (fd_ != kBadFD && close(fd_)) {
+ if (fd_ != -1 && close(fd_)) {
std::cerr << "Could not close file " << fd_ << std::endl;
std::abort();
}
@@ -27,69 +32,215 @@ scoped_FILE::~scoped_FILE() {
}
}
-FD OpenReadOrThrow(const char *name) {
- FD ret;
-#ifdef WIN32
-
+int OpenReadOrThrow(const char *name) {
+ int ret;
+#if defined(_WIN32) || defined(_WIN64)
+ UTIL_THROW_IF(-1 == (ret = _open(name, _O_BINARY | _O_RDONLY)), ErrnoException, "while opening " << name);
#else
UTIL_THROW_IF(-1 == (ret = open(name, O_RDONLY)), ErrnoException, "while opening " << name);
#endif
return ret;
}
-FD CreateOrThrow(const char *name) {
- FD ret;
-#ifdef WIN32
-
-#else
- UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR)), ErrnoException, "while creating " << name);
-#endif
- return ret;
-}
-
-off_t SizeFile(FD fd) {
-#ifdef WIN32
- return 0; // TODO WIN32
-
-#else
+uint64_t SizeFile(int fd) {
struct stat sb;
if (fstat(fd, &sb) == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize;
return sb.st_size;
-#endif
}
-void ReadOrThrow(FD fd, void *to_void, std::size_t amount) {
+void ReadOrThrow(int fd, void *to_void, std::size_t amount) {
uint8_t *to = static_cast<uint8_t*>(to_void);
while (amount) {
-
-#ifdef WIN32
- ssize_t ret; // TODO WIN32
-#else
ssize_t ret = read(fd, to, amount);
-#endif
- if (ret == -1) UTIL_THROW(ErrnoException, "Reading " << amount << " from fd " << fd << " failed.");
- if (ret == 0) UTIL_THROW(Exception, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read.");
+ UTIL_THROW_IF(ret == -1, ErrnoException, "Reading " << amount << " from fd " << fd << " failed.");
+ UTIL_THROW_IF(ret == 0, EndOfFileException, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read.");
amount -= ret;
to += ret;
}
}
-void WriteOrThrow(FD fd, const void *data_void, std::size_t size) {
+std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) {
+ uint8_t *to = static_cast<uint8_t*>(to_void);
+ std::size_t remaining = amount;
+ while (remaining) {
+ ssize_t ret = read(fd, to, remaining);
+ UTIL_THROW_IF(ret == -1, ErrnoException, "Reading " << remaining << " from fd " << fd << " failed.");
+ if (!ret) return amount - remaining;
+ remaining -= ret;
+ to += ret;
+ }
+ return amount;
+}
+
+void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
const uint8_t *data = static_cast<const uint8_t*>(data_void);
while (size) {
-#ifdef WIN32
- ssize_t ret; // TODO WIN32
-#else
ssize_t ret = write(fd, data, size);
-#endif
if (ret < 1) UTIL_THROW(util::ErrnoException, "Write failed");
data += ret;
size -= ret;
}
}
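ReadOrThrow, ReadOrEOF, and WriteOrThrow all share the same shape: POSIX read() and write() may transfer fewer bytes than requested, so the pointer advances and the remaining count shrinks until the transfer completes (or, for ReadOrEOF, until EOF). A POSIX-only sketch of that retry loop exercised over a pipe; the helper names are illustrative, not the util:: functions themselves:

// Minimal sketch of the retry loop behind ReadOrThrow/WriteOrThrow: read()
// and write() may move fewer bytes than asked, so loop until done.
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <unistd.h>

void WriteAll(int fd, const void *data_void, std::size_t size) {
  const uint8_t *data = static_cast<const uint8_t*>(data_void);
  while (size) {
    ssize_t ret = write(fd, data, size);
    if (ret < 1) throw std::runtime_error("write failed");
    data += ret;               // advance past what was accepted
    size -= ret;               // and retry with the remainder
  }
}

std::size_t ReadUpTo(int fd, void *to_void, std::size_t amount) {
  uint8_t *to = static_cast<uint8_t*>(to_void);
  std::size_t remaining = amount;
  while (remaining) {
    ssize_t ret = read(fd, to, remaining);
    if (ret == -1) throw std::runtime_error("read failed");
    if (ret == 0) break;       // EOF: report how much actually arrived
    to += ret;
    remaining -= ret;
  }
  return amount - remaining;
}

int main() {
  int fds[2];
  if (pipe(fds)) return 1;
  const char msg[] = "partial I/O demo";
  WriteAll(fds[1], msg, sizeof(msg));
  close(fds[1]);
  char buf[64];
  std::size_t got = ReadUpTo(fds[0], buf, sizeof(buf));  // stops at EOF, like ReadOrEOF
  close(fds[0]);
  return got == sizeof(msg) ? 0 : 1;
}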
-void RemoveOrThrow(const char *name) {
- UTIL_THROW_IF(std::remove(name), util::ErrnoException, "Could not remove " << name);
+namespace {
+void InternalSeek(int fd, off_t off, int whence) {
+ UTIL_THROW_IF((off_t)-1 == lseek(fd, off, whence), ErrnoException, "Seek failed");
+}
+} // namespace
+
+void SeekOrThrow(int fd, uint64_t off) {
+ InternalSeek(fd, off, SEEK_SET);
+}
+
+void AdvanceOrThrow(int fd, int64_t off) {
+ InternalSeek(fd, off, SEEK_CUR);
+}
+
+void SeekEnd(int fd) {
+ InternalSeek(fd, 0, SEEK_END);
+}
+
+std::FILE *FDOpenOrThrow(scoped_fd &file) {
+ std::FILE *ret = fdopen(file.get(), "r+b");
+ if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen");
+ file.release();
+ return ret;
+}
+
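FDOpenOrThrow is an ownership hand-off: once fdopen succeeds, the returned FILE* owns the descriptor, so the scoped_fd must release() it rather than close it in its destructor. A small standalone sketch of the same idea with a simplified RAII wrapper (not the real util::scoped_fd):

// Sketch of the fdopen ownership hand-off: after fdopen succeeds, the FILE*
// owns the descriptor, so the RAII wrapper releases it to avoid a double close.
#include <cstdio>
#include <stdexcept>
#include <fcntl.h>
#include <unistd.h>

struct ScopedFd {                       // stand-in for util::scoped_fd
  int fd_;
  explicit ScopedFd(int fd) : fd_(fd) {}
  ~ScopedFd() { if (fd_ != -1) close(fd_); }
  int get() const { return fd_; }
  int release() { int ret = fd_; fd_ = -1; return ret; }
};

std::FILE *ToStream(ScopedFd &file) {
  std::FILE *ret = fdopen(file.get(), "r+b");
  if (!ret) throw std::runtime_error("fdopen failed");
  file.release();                       // the FILE* now owns the fd
  return ret;
}

int main() {
  ScopedFd fd(open("/dev/null", O_RDWR));
  if (fd.get() == -1) return 1;
  std::FILE *f = ToStream(fd);
  std::fclose(f);                       // closes the underlying descriptor exactly once
  return 0;
}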
+TempMaker::TempMaker(const std::string &prefix) : base_(prefix) {
+ base_ += "XXXXXX";
+}
+
+// Sigh. Windows temporary file creation is full of race conditions.
+#if defined(_WIN32) || defined(_WIN64)
+/* mkstemp extracted from libc/sysdeps/posix/tempname.c. Copyright
+ (C) 1991-1999, 2000, 2001, 2006 Free Software Foundation, Inc.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version. */
+
+/* This has been modified from the original version to rename the function and
+ * set the Windows temporary flag. */
+
+static const char letters[] =
+"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+
+/* Generate a temporary file name based on TMPL. TMPL must match the
+ rules for mk[s]temp (i.e. end in "XXXXXX"). The name constructed
+ does not exist at the time of the call to mkstemp. TMPL is
+ overwritten with the result. */
+int
+mkstemp_and_unlink(char *tmpl)
+{
+ int len;
+ char *XXXXXX;
+ static unsigned long long value;
+ unsigned long long random_time_bits;
+ unsigned int count;
+ int fd = -1;
+ int save_errno = errno;
+
+ /* A lower bound on the number of temporary files to attempt to
+ generate. The maximum total number of temporary file names that
+ can exist for a given template is 62**6. It should never be
+ necessary to try all these combinations. Instead if a reasonable
+ number of names is tried (we define reasonable as 62**3) fail to
+ give the system administrator the chance to remove the problems. */
+#define ATTEMPTS_MIN (62 * 62 * 62)
+
+ /* The number of times to attempt to generate a temporary file. To
+ conform to POSIX, this must be no smaller than TMP_MAX. */
+#if ATTEMPTS_MIN < TMP_MAX
+ unsigned int attempts = TMP_MAX;
+#else
+ unsigned int attempts = ATTEMPTS_MIN;
+#endif
+
+ len = strlen (tmpl);
+ if (len < 6 || strcmp (&tmpl[len - 6], "XXXXXX"))
+ {
+ errno = EINVAL;
+ return -1;
+ }
+
+/* This is where the Xs start. */
+ XXXXXX = &tmpl[len - 6];
+
+ /* Get some more or less random data. */
+ {
+ SYSTEMTIME stNow;
+ FILETIME ftNow;
+
+ // get system time
+ GetSystemTime(&stNow);
+ stNow.wMilliseconds = 500;
+ if (!SystemTimeToFileTime(&stNow, &ftNow))
+ {
+ errno = -1;
+ return -1;
+ }
+
+ random_time_bits = (((unsigned long long)ftNow.dwHighDateTime << 32)
+ | (unsigned long long)ftNow.dwLowDateTime);
+ }
+ value += random_time_bits ^ (unsigned long long)GetCurrentThreadId ();
+
+ for (count = 0; count < attempts; value += 7777, ++count)
+ {
+ unsigned long long v = value;
+
+ /* Fill in the random bits. */
+ XXXXXX[0] = letters[v % 62];
+ v /= 62;
+ XXXXXX[1] = letters[v % 62];
+ v /= 62;
+ XXXXXX[2] = letters[v % 62];
+ v /= 62;
+ XXXXXX[3] = letters[v % 62];
+ v /= 62;
+ XXXXXX[4] = letters[v % 62];
+ v /= 62;
+ XXXXXX[5] = letters[v % 62];
+
+ /* Modified to unlink */
+// fd = open (tmpl, O_RDWR | O_CREAT | O_EXCL, _S_IREAD | _S_IWRITE);
+ fd = _open (tmpl, _O_RDWR | _O_CREAT | _O_TEMPORARY | _O_EXCL, _S_IREAD | _S_IWRITE);
+ if (fd >= 0)
+ {
+ errno = save_errno;
+ return fd;
+ }
+ else if (errno != EEXIST)
+ return -1;
+ }
+
+ /* We got out of the loop because we ran out of combinations to try. */
+ errno = EEXIST;
+ return -1;
+}
+#else
+int
+mkstemp_and_unlink(char *tmpl) {
+ int ret = mkstemp(tmpl);
+ if (ret == -1) return -1;
+ UTIL_THROW_IF(unlink(tmpl), util::ErrnoException, "Failed to delete " << tmpl);
+ return ret;
+}
+#endif
+
+int TempMaker::Make() const {
+ std::string copy(base_);
+ copy.push_back(0);
+ int ret;
+ UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(&copy[0])), util::ErrnoException, "Failed to make a temporary based on " << base_);
+ return ret;
+}
+
+std::FILE *TempMaker::MakeFile() const {
+ util::scoped_fd file(Make());
+ return FDOpenOrThrow(file);
}
} // namespace util
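TempMaker::Make appends XXXXXX to the prefix, lets mkstemp_and_unlink fill in the name, and returns a descriptor whose file has already been unlinked (or created with _O_TEMPORARY on Windows), so the scratch space disappears as soon as the descriptor closes. A POSIX-only sketch of that mkstemp-then-unlink idiom; the prefix and helper name are made up:

// POSIX-only sketch of the mkstemp-and-unlink idiom: after unlink() the file
// has no name, so its storage is reclaimed when the descriptor closes.
#include <stdexcept>
#include <string>
#include <stdlib.h>
#include <unistd.h>

int MakeAnonymousTemp(const std::string &prefix) {
  std::string name = prefix + "XXXXXX";        // template that mkstemp rewrites
  int fd = mkstemp(&name[0]);
  if (fd == -1) throw std::runtime_error("mkstemp failed for " + name);
  if (unlink(name.c_str())) {                  // drop the directory entry right away
    close(fd);
    throw std::runtime_error("unlink failed for " + name);
  }
  return fd;                                   // still usable through the descriptor
}

int main() {
  int fd = MakeAnonymousTemp("/tmp/sort_demo_");
  const char data[] = "scratch space";
  if (write(fd, data, sizeof(data)) != (ssize_t)sizeof(data)) { close(fd); return 1; }
  close(fd);                                   // storage released here
  return 0;
}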
diff --git a/kenlm/util/file.hh b/kenlm/util/file.hh
index 6adea6625..0dc9ea76b 100644
--- a/kenlm/util/file.hh
+++ b/kenlm/util/file.hh
@@ -1,38 +1,41 @@
#ifndef UTIL_FILE__
#define UTIL_FILE__
+#include <cstddef>
#include <cstdio>
-#include "util/portability.hh"
+#include <string>
+
+#include <inttypes.h>
namespace util {
class scoped_fd {
public:
- scoped_fd() : fd_(kBadFD) {}
+ scoped_fd() : fd_(-1) {}
- explicit scoped_fd(FD fd) : fd_(fd) {}
+ explicit scoped_fd(int fd) : fd_(fd) {}
~scoped_fd();
- void reset(FD to) {
+ void reset(int to) {
scoped_fd other(fd_);
fd_ = to;
}
- FD get() const { return fd_; }
+ int get() const { return fd_; }
- FD operator*() const { return fd_; }
+ int operator*() const { return fd_; }
- FD release() {
- FD ret = fd_;
- fd_ = kBadFD;
+ int release() {
+ int ret = fd_;
+ fd_ = -1;
return ret;
}
- operator bool() { return fd_ != kBadFD; }
+ operator bool() { return fd_ != -1; }
private:
- FD fd_;
+ int fd_;
scoped_fd(const scoped_fd &);
scoped_fd &operator=(const scoped_fd &);
@@ -52,22 +55,45 @@ class scoped_FILE {
file_ = to;
}
+ std::FILE *release() {
+ std::FILE *ret = file_;
+ file_ = NULL;
+ return ret;
+ }
+
private:
std::FILE *file_;
};
-FD OpenReadOrThrow(const char *name);
-
-FD CreateOrThrow(const char *name);
+int OpenReadOrThrow(const char *name);
// Return value for SizeFile when it can't size properly.
-const off_t kBadSize = -1;
-off_t SizeFile(FD fd);
+const uint64_t kBadSize = (uint64_t)-1;
+uint64_t SizeFile(int fd);
+
+void ReadOrThrow(int fd, void *to, std::size_t size);
+std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount);
+
+void WriteOrThrow(int fd, const void *data_void, std::size_t size);
-void ReadOrThrow(FD fd, void *to, std::size_t size);
-void WriteOrThrow(FD fd, const void *data_void, std::size_t size);
+// Seeking
+void SeekOrThrow(int fd, uint64_t off);
+void AdvanceOrThrow(int fd, int64_t off);
+void SeekEnd(int fd);
-void RemoveOrThrow(const char *name);
+std::FILE *FDOpenOrThrow(scoped_fd &file);
+
+class TempMaker {
+ public:
+ explicit TempMaker(const std::string &prefix);
+
+ int Make() const;
+
+ std::FILE *MakeFile() const;
+
+ private:
+ std::string base_;
+};
} // namespace util
diff --git a/kenlm/util/file_piece.cc b/kenlm/util/file_piece.cc
index bd3688690..d0101e129 100644
--- a/kenlm/util/file_piece.cc
+++ b/kenlm/util/file_piece.cc
@@ -2,6 +2,7 @@
#include "util/exception.hh"
#include "util/file.hh"
+#include "util/mmap.hh"
#include <iostream>
#include <string>
@@ -11,6 +12,9 @@
#include <ctype.h>
#include <fcntl.h>
#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
#ifdef HAVE_ZLIB
#include <zlib.h>
@@ -32,14 +36,14 @@ GZException::GZException(void *file) {
// Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale).
const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) :
- file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
+FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t min_buffer) :
+ file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(SizePage()),
progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
Initialize(name, show_progress, min_buffer);
}
-FilePiece::FilePiece(FD fd, const char *name, std::ostream *show_progress, off_t min_buffer) :
- file_(fd), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
+FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) :
+ file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()),
progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
Initialize(name, show_progress, min_buffer);
}
@@ -59,7 +63,7 @@ FilePiece::~FilePiece() {
}
StringPiece FilePiece::ReadLine(char delim) {
- size_t skip = 0;
+ std::size_t skip = 0;
while (true) {
for (const char *i = position_ + skip; i < position_end_; ++i) {
if (*i == delim) {
@@ -90,13 +94,13 @@ unsigned long int FilePiece::ReadULong() {
return ReadNumber<unsigned long int>();
}
-void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) {
+void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) {
#ifdef HAVE_ZLIB
gz_file_ = NULL;
#endif
file_name_ = name;
- default_map_size_ = page_ * std::max<off_t>((min_buffer / page_ + 1), 2);
+ default_map_size_ = page_ * std::max<std::size_t>((min_buffer / page_ + 1), 2);
position_ = NULL;
position_end_ = NULL;
mapped_offset_ = 0;
@@ -167,7 +171,7 @@ template <class T> T FilePiece::ReadNumber() {
}
const char *FilePiece::FindDelimiterOrEOF(const bool *delim) {
- size_t skip = 0;
+ std::size_t skip = 0;
while (true) {
for (const char *i = position_ + skip; i < position_end_; ++i) {
if (delim[static_cast<unsigned char>(*i)]) return i;
@@ -186,7 +190,7 @@ void FilePiece::Shift() {
progress_.Finished();
throw EndOfFileException();
}
- off_t desired_begin = position_ - data_.begin() + mapped_offset_;
+ uint64_t desired_begin = position_ - data_.begin() + mapped_offset_;
if (!fallback_to_read_) MMapShift(desired_begin);
// Notice an mmap failure might set the fallback.
@@ -197,18 +201,18 @@ void FilePiece::Shift() {
}
}
-void FilePiece::MMapShift(off_t desired_begin) {
+void FilePiece::MMapShift(uint64_t desired_begin) {
// Use mmap.
- off_t ignore = desired_begin % page_;
+ uint64_t ignore = desired_begin % page_;
// Duplicate request for Shift means give more data.
if (position_ == data_.begin() + ignore) {
default_map_size_ *= 2;
}
// Local version so that in case of failure it doesn't overwrite the class variable.
- off_t mapped_offset = desired_begin - ignore;
+ uint64_t mapped_offset = desired_begin - ignore;
- off_t mapped_size;
- if (default_map_size_ >= static_cast<size_t>(total_size_ - mapped_offset)) {
+ uint64_t mapped_size;
+ if (default_map_size_ >= static_cast<std::size_t>(total_size_ - mapped_offset)) {
at_end_ = true;
mapped_size = total_size_ - mapped_offset;
} else {
@@ -217,19 +221,11 @@ void FilePiece::MMapShift(off_t desired_begin) {
// Forcibly clear the existing mmap first.
data_.reset();
- data_.reset(mmap(NULL, mapped_size, PROT_READ, MAP_SHARED
- // Populate where available on linux
-#ifdef MAP_POPULATE
- | MAP_POPULATE
-#endif
- , *file_, mapped_offset), mapped_size, scoped_memory::MMAP_ALLOCATED);
- if (data_.get() == MAP_FAILED) {
+ try {
+ MapRead(POPULATE_OR_LAZY, *file_, mapped_offset, mapped_size, data_);
+ } catch (const util::ErrnoException &e) {
if (desired_begin) {
-#ifdef WIN32
-
-#else
- if (((off_t)-1) == lseek(*file_, desired_begin, SEEK_SET)) UTIL_THROW(ErrnoException, "mmap failed even though it worked before. lseek failed too, so using read isn't an option either.");
-#endif
+ SeekOrThrow(*file_, desired_begin);
}
// The mmap was scheduled to end the file, but now we're going to read it.
at_end_ = false;
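MMapShift has to map from a page-aligned offset, so it computes ignore = desired_begin % page_ and maps from desired_begin - ignore, then positions the read pointer ignore bytes into the window; a repeated Shift at the same position doubles default_map_size_. A tiny arithmetic sketch of that alignment with made-up numbers:

// Sketch of the page-alignment arithmetic in MMapShift: mmap offsets must be
// page-aligned, so map from the containing page and skip `ignore` bytes.
#include <cstdint>
#include <iostream>

int main() {
  const uint64_t page = 4096;                      // illustrative page size
  uint64_t desired_begin = 10000;                  // byte the parser wants next
  uint64_t ignore = desired_begin % page;          // 10000 % 4096 = 1808
  uint64_t mapped_offset = desired_begin - ignore; // 8192: page-aligned map start
  std::cout << "map at " << mapped_offset
            << ", position pointer starts " << ignore << " bytes in\n";
  return 0;
}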
@@ -254,15 +250,9 @@ void FilePiece::TransitionToRead() {
#ifdef HAVE_ZLIB
assert(!gz_file_);
-
-#ifdef WIN32
-
-#else
gz_file_ = gzdopen(file_.get(), "r");
UTIL_THROW_IF(!gz_file_, GZException, "zlib failed to open " << file_name_);
#endif
-
-#endif
}
void FilePiece::ReadShift() {
@@ -303,11 +293,7 @@ void FilePiece::ReadShift() {
if (read_return == -1) throw GZException(gz_file_);
if (total_size_ != kBadSize) {
// Just get the position, don't actually seek. Apparently this is how you do it. . .
-#ifdef WIN32
- off_t ret;
-#else
off_t ret = lseek(file_.get(), 0, SEEK_CUR);
-#endif
if (ret != -1) progress_.Set(ret);
}
#else
diff --git a/kenlm/util/file_piece.hh b/kenlm/util/file_piece.hh
index 2ddec643f..a8dc35523 100644
--- a/kenlm/util/file_piece.hh
+++ b/kenlm/util/file_piece.hh
@@ -7,11 +7,11 @@
#include "util/have.hh"
#include "util/mmap.hh"
#include "util/string_piece.hh"
-#include "util/portability.hh"
+#include <cstddef>
#include <string>
-#include <cstddef>
+#include <inttypes.h>
namespace util {
@@ -34,9 +34,9 @@ extern const bool kSpaces[256];
class FilePiece {
public:
// 32 MB default.
- explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432);
+ explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 33554432);
// Takes ownership of fd. name is used for messages.
- explicit FilePiece(FD fd, const char *name, std::ostream *show_progress = NULL, off_t min_buffer = 33554432);
+ explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 33554432);
~FilePiece();
@@ -71,14 +71,14 @@ class FilePiece {
}
}
- off_t Offset() const {
+ uint64_t Offset() const {
return position_ - data_.begin() + mapped_offset_;
}
const std::string &FileName() const { return file_name_; }
private:
- void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer);
+ void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer);
template <class T> T ReadNumber();
@@ -92,7 +92,7 @@ class FilePiece {
void Shift();
// Backends to Shift().
- void MMapShift(off_t desired_begin);
+ void MMapShift(uint64_t desired_begin);
void TransitionToRead();
void ReadShift();
@@ -100,11 +100,11 @@ class FilePiece {
const char *position_, *last_space_, *position_end_;
scoped_fd file_;
- const off_t total_size_;
- const off_t page_;
+ const uint64_t total_size_;
+ const uint64_t page_;
- size_t default_map_size_;
- off_t mapped_offset_;
+ std::size_t default_map_size_;
+ uint64_t mapped_offset_;
// Order matters: file_ should always be destroyed after this.
scoped_memory data_;
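Taken together, the declarations above (constructors from a file name or fd, ReadLine, Offset, and the EndOfFileException thrown by Shift) suggest a consumer loop like the following. This is a hedged usage sketch, not code from the repository, and it assumes util::EndOfFileException is reachable through these includes:

// Hedged usage sketch built from the declarations in this diff; the include
// path and exception type are as they appear above.
#include "util/file_piece.hh"

#include <iostream>

int main(int argc, char *argv[]) {
  if (argc != 2) { std::cerr << "Usage: " << argv[0] << " file\n"; return 1; }
  util::FilePiece in(argv[1], &std::cerr, 1 << 20);   // name, progress stream, 1 MB min buffer
  try {
    while (true) {
      StringPiece line = in.ReadLine('\n');           // mmap-backed, or read() after fallback
      std::cout << in.Offset() << ": " << line << '\n';
    }
  } catch (const util::EndOfFileException &) {
    // Shift() throws this once the input is exhausted.
  }
  return 0;
}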
diff --git a/kenlm/util/key_value_packing.hh b/kenlm/util/key_value_packing.hh
index 8339980b5..b84a5aadf 100644
--- a/kenlm/util/key_value_packing.hh
+++ b/kenlm/util/key_value_packing.hh
@@ -7,7 +7,7 @@
#include <cstddef>
#include <cstring>
-#include <stdint.h>
+#include <inttypes.h>
namespace util {
diff --git a/kenlm/util/mmap.cc b/kenlm/util/mmap.cc
index f73a3cf5c..3dfe0ab2c 100644
--- a/kenlm/util/mmap.cc
+++ b/kenlm/util/mmap.cc
@@ -1,23 +1,62 @@
+/* Memory mapping wrappers.
+ * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at
+ * NICT.
+ */
+#include "util/mmap.hh"
+
#include "util/exception.hh"
#include "util/file.hh"
-#include "util/mmap.hh"
-#include "util/portability.hh"
#include <iostream>
#include <assert.h>
#include <fcntl.h>
#include <sys/types.h>
+#include <sys/stat.h>
+#if defined(_WIN32) || defined(_WIN64)
+#include <windows.h>
+#include <io.h>
+#else
+#include <sys/mman.h>
+#endif
#include <stdlib.h>
-#include "util/portability.hh"
+#include <unistd.h>
namespace util {
+long SizePage() {
+#if defined(_WIN32) || defined(_WIN64)
+ SYSTEM_INFO si;
+ GetSystemInfo(&si);
+ return si.dwAllocationGranularity;
+#else
+ return sysconf(_SC_PAGE_SIZE);
+#endif
+}
+
+void SyncOrThrow(void *start, size_t length) {
+#if defined(_WIN32) || defined(_WIN64)
+ UTIL_THROW_IF(!::FlushViewOfFile(start, length), ErrnoException, "Failed to sync mmap");
+#else
+ UTIL_THROW_IF(msync(start, length, MS_SYNC), ErrnoException, "Failed to sync mmap");
+#endif
+}
+
+void UnmapOrThrow(void *start, size_t length) {
+#if defined(_WIN32) || defined(_WIN64)
+ UTIL_THROW_IF(!::UnmapViewOfFile(start), ErrnoException, "Failed to unmap a file");
+#else
+ UTIL_THROW_IF(munmap(start, length), ErrnoException, "munmap failed");
+#endif
+}
+
scoped_mmap::~scoped_mmap() {
if (data_ != (void*)-1) {
- // Thanks Denis Filimonov for pointing on NFS likes msync first.
- if (msync(data_, size_, MS_SYNC) || munmap(data_, size_)) {
- std::cerr << "msync or mmap failed for " << size_ << " bytes." << std::endl;
+ try {
+ // Thanks Denis Filimonov for pointing out NFS likes msync first.
+ SyncOrThrow(data_, size_);
+ UnmapOrThrow(data_, size_);
+ } catch (const util::ErrnoException &e) {
+ std::cerr << e.what();
abort();
}
}
@@ -52,32 +91,39 @@ void scoped_memory::call_realloc(std::size_t size) {
}
}
-void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, FD fd, off_t offset) {
+void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset) {
#ifdef MAP_POPULATE // Linux specific
if (prefault) {
flags |= MAP_POPULATE;
}
-#elif WIN32
- // TODO WIN32
-
#endif
+#if defined(_WIN32) || defined(_WIN64)
+ int protectC = for_write ? PAGE_READWRITE : PAGE_READONLY;
+ int protectM = for_write ? FILE_MAP_WRITE : FILE_MAP_READ;
+ HANDLE hMapping = CreateFileMapping((HANDLE)_get_osfhandle(fd), NULL, protectC, 0, size + offset, NULL);
+ UTIL_THROW_IF(!hMapping, ErrnoException, "CreateFileMapping failed");
+  LPVOID ret = MapViewOfFile(hMapping, protectM, 0, offset, size);
+ CloseHandle(hMapping);
+ UTIL_THROW_IF(!ret, ErrnoException, "MapViewOfFile failed");
+#else
int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ;
void *ret = mmap(NULL, size, protect, flags, fd, offset);
- if (ret == MAP_FAILED) {
- UTIL_THROW(ErrnoException, "mmap failed for size " << size << " at offset " << offset);
- }
+ UTIL_THROW_IF(ret == MAP_FAILED, ErrnoException, "mmap failed for size " << size << " at offset " << offset);
+#endif
return ret;
}
const int kFileFlags =
-#ifdef MAP_FILE
+#if defined(_WIN32) || defined(_WIN64)
+ 0 // MapOrThrow ignores flags on windows
+#elif defined(MAP_FILE)
MAP_FILE | MAP_SHARED
#else
MAP_SHARED
#endif
;
-void MapRead(LoadMethod method, FD fd, off_t offset, std::size_t size, scoped_memory &out) {
+void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out) {
switch (method) {
case LAZY:
out.reset(MapOrThrow(size, false, kFileFlags, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED);
@@ -94,11 +140,7 @@ void MapRead(LoadMethod method, FD fd, off_t offset, std::size_t size, scoped_me
case READ:
out.reset(malloc(size), size, scoped_memory::MALLOC_ALLOCATED);
if (!out.get()) UTIL_THROW(util::ErrnoException, "Allocating " << size << " bytes with malloc");
-#ifdef WIN32
-
-#else
- if (-1 == lseek(fd, offset, SEEK_SET)) UTIL_THROW(ErrnoException, "lseek to " << offset << " in fd " << fd << " failed.");
-#endif
+ SeekOrThrow(fd, offset);
ReadOrThrow(fd, out.get(), size);
break;
}
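MapRead dispatches between mmap-backed loading and a malloc-plus-read fallback, and the READ case seeks to the offset before filling the buffer. A POSIX-only sketch of that choice with plain system calls; LoadRegion and its bool switch are stand-ins for the LoadMethod enum, not the util API:

// POSIX-only sketch of the two loading strategies MapRead dispatches between:
// map the region, or allocate a buffer, seek, and read() it in.
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

void *LoadRegion(int fd, uint64_t offset, std::size_t size, bool use_mmap) {
  if (use_mmap) {
    void *ret = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, offset);
    if (ret == MAP_FAILED) throw std::runtime_error("mmap failed");
    return ret;                                     // caller munmaps
  }
  void *buf = malloc(size);
  if (!buf) throw std::runtime_error("malloc failed");
  if (lseek(fd, offset, SEEK_SET) == (off_t)-1) throw std::runtime_error("lseek failed");
  std::size_t done = 0;
  while (done < size) {                             // same retry loop as ReadOrThrow
    ssize_t got = read(fd, static_cast<char*>(buf) + done, size - done);
    if (got <= 0) throw std::runtime_error("short read");
    done += got;
  }
  return buf;                                       // caller frees
}

int main() {
  char name[] = "/tmp/mapread_demoXXXXXX";
  int fd = mkstemp(name);
  if (fd == -1) return 1;
  unlink(name);
  const char payload[] = "hello mapping";
  if (write(fd, payload, sizeof(payload)) != (ssize_t)sizeof(payload)) return 1;
  char *mapped = static_cast<char*>(LoadRegion(fd, 0, sizeof(payload), true));
  char *copied = static_cast<char*>(LoadRegion(fd, 0, sizeof(payload), false));
  int same = !std::memcmp(mapped, copied, sizeof(payload));
  munmap(mapped, sizeof(payload));
  free(copied);
  close(fd);
  return same ? 0 : 1;
}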
@@ -106,27 +148,40 @@ void MapRead(LoadMethod method, FD fd, off_t offset, std::size_t size, scoped_me
void *MapAnonymous(std::size_t size) {
return MapOrThrow(size, true,
-#ifdef MAP_ANONYMOUS
- MAP_ANONYMOUS // Linux
+#if defined(_WIN32) || defined(_WIN64)
+ 0 // MapOrThrow ignores the flags anyway.
+#elif defined(MAP_ANONYMOUS)
+ MAP_ANONYMOUS | MAP_PRIVATE // Linux
#else
- MAP_ANON // BSD
+ MAP_ANON | MAP_PRIVATE // BSD
#endif
- | MAP_PRIVATE, false, kBadFD, 0);
+ , false, -1, 0);
}
-void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) {
-#ifdef WIN32
+void *MapZeroedWrite(int fd, std::size_t size) {
+ UTIL_THROW_IF(-1 == ftruncate(fd, 0), ErrnoException, "ftruncate on fd " << fd << " to 0 failed");
+ UTIL_THROW_IF(-1 == ftruncate(fd, size), ErrnoException, "ftruncate on fd " << fd << " to " << size << " failed");
+ return MapOrThrow(size, true, kFileFlags, false, fd, 0);
+}
+namespace {
+
+int CreateOrThrow(const char *name) {
+ int ret;
+#if defined(_WIN32) || defined(_WIN64)
+ UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name);
#else
- file.reset(open(name, O_CREAT | O_RDWR | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH));
+ UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name);
#endif
+ return ret;
+}
+
+} // namespace
- if (kBadFD == file.get())
- UTIL_THROW(ErrnoException, "Failed to open " << name << " for writing");
- if (-1 == ftruncate(file.get(), size))
- UTIL_THROW(ErrnoException, "ftruncate on " << name << " to " << size << " failed");
+void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) {
+ file.reset(CreateOrThrow(name));
try {
- return MapOrThrow(size, true, kFileFlags, false, file.get(), 0);
+ return MapZeroedWrite(file.get(), size);
} catch (ErrnoException &e) {
e << " in file " << name;
throw;
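The new MapZeroedWrite(int, size_t) overload truncates the descriptor to zero and then to the requested size before mapping it read-write, so the mapping starts out all zeros, which is exactly the precondition the bit-packing writers rely on. A POSIX-only sketch of creating such a zero-filled writable mapping (MapZeroed and the path are illustrative):

// POSIX-only sketch of "truncate to the target size, then mmap read-write":
// the fresh pages read back as zeros, matching the zero-initialized
// assumption of the bit-packing writers.
#include <cstring>
#include <stdexcept>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

void *MapZeroed(int fd, std::size_t size) {
  if (ftruncate(fd, 0) == -1 || ftruncate(fd, size) == -1)
    throw std::runtime_error("ftruncate failed");
  void *ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if (ret == MAP_FAILED) throw std::runtime_error("mmap failed");
  return ret;
}

int main() {
  char name[] = "/tmp/zeroed_demoXXXXXX";
  int fd = mkstemp(name);
  if (fd == -1) return 1;
  unlink(name);
  const std::size_t size = 1 << 16;
  char *mem = static_cast<char*>(MapZeroed(fd, size));
  char zeros[64] = {0};
  int fresh_is_zero = !std::memcmp(mem, zeros, sizeof(zeros));   // new pages are zero
  std::strcpy(mem, "written through the mapping");
  munmap(mem, size);
  close(fd);
  return fresh_is_zero ? 0 : 1;
}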
diff --git a/kenlm/util/mmap.hh b/kenlm/util/mmap.hh
index 8333fed03..3183c6292 100644
--- a/kenlm/util/mmap.hh
+++ b/kenlm/util/mmap.hh
@@ -4,14 +4,15 @@
#include <cstddef>
-#include "util/portability.hh"
-#include <stdint.h>
+#include <inttypes.h>
#include <sys/types.h>
namespace util {
class scoped_fd;
+long SizePage();
+
// (void*)-1 is MAP_FAILED; this is done to avoid including the mmap header here.
class scoped_mmap {
public:
@@ -95,15 +96,19 @@ typedef enum {
extern const int kFileFlags;
// Wrapper around mmap to check it worked and hide some platform macros.
-void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, FD fd, off_t offset = 0);
+void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset = 0);
-void MapRead(LoadMethod method, FD fd, off_t offset, std::size_t size, scoped_memory &out);
+void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out);
void *MapAnonymous(std::size_t size);
// Open file name with mmap of size bytes, all of which are initially zero.
+void *MapZeroedWrite(int fd, std::size_t size);
void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file);
+// msync wrapper
+void SyncOrThrow(void *start, size_t length);
+
} // namespace util
#endif // UTIL_MMAP__
diff --git a/kenlm/util/murmur_hash.hh b/kenlm/util/murmur_hash.hh
index 638aaeb22..78fe583fc 100644
--- a/kenlm/util/murmur_hash.hh
+++ b/kenlm/util/murmur_hash.hh
@@ -1,7 +1,7 @@
#ifndef UTIL_MURMUR_HASH__
#define UTIL_MURMUR_HASH__
#include <cstddef>
-#include <stdint.h>
+#include <inttypes.h>
namespace util {
diff --git a/kenlm/util/portability.cc b/kenlm/util/portability.cc
deleted file mode 100644
index 2efd74cba..000000000
--- a/kenlm/util/portability.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-
-#include <stdlib.h>
-#include <errno.h>
-#include "util/portability.hh"
-
-#ifdef WIN32
-
-int RUSAGE_SELF = 0;
-
-int sysconf(int) { return 0; }
-int msync(void*, int, int) { return 0; }
-int munmap(void *, int) { return 0; }
-void *mmap(void*, int, int, int, FD, OFF_T) { return 0; }
-int write(int, const void *, int) {return 0; }
-
-//FILE *popen(const char*, const char*) { return 0; }
-//int pclose(FILE *) { return 0; }
-int close(FD fd) { return 0; }
-
-
-// to be implemented by boost
-int mkdtemp(const char*) { return 0; }
-
-// done
-long lrint(float x)
-{
- long ret = (long) x;
- return ret;
-}
-
-float strtof(const char *begin, char **end)
-{
- double ret = strtod(begin, end);
- return (float) ret;
-}
-
-
-int ftruncate (FD hfile, unsigned int size)
-{
- unsigned int curpos;
- /*
- HANDLE hfile;
-
- if (fd < 0)
- {
- errno = EBADF;
- return -1;
- }
-
- hfile = (HANDLE) _get_osfhandle (fd);
- */
- curpos = SetFilePointer (hfile, 0, NULL, FILE_CURRENT);
- if (curpos == ~0
- || SetFilePointer (hfile, size, NULL, FILE_BEGIN) == ~0
- || !SetEndOfFile (hfile))
- {
- int error = GetLastError ();
- switch (error)
- {
- case ERROR_INVALID_HANDLE:
- errno = EBADF;
- break;
- default:
- errno = EIO;
- break;
- }
- return -1;
- }
- return 0;
-}
-
-#endif
-
-
diff --git a/kenlm/util/portability.hh b/kenlm/util/portability.hh
deleted file mode 100644
index 7066f50f5..000000000
--- a/kenlm/util/portability.hh
+++ /dev/null
@@ -1,127 +0,0 @@
-
-#pragma once
-
-#include <assert.h>
-#include <stdint.h>
-
-#ifdef WIN32
-
-#include <windows.h>
-#include <direct.h>
-#include <io.h>
-#include <stdio.h>
-#include <string.h>
-#include <sys/stat.h>
-#include "util/getopt.hh"
-
-#undef max
-#undef min
-
-typedef HANDLE FD;
-
-const FD kBadFD = INVALID_HANDLE_VALUE;
-
-typedef int ssize_t;
-
-#define _SC_PAGE_SIZE 1
-#define MS_SYNC 1
-
-int sysconf(int);
-int msync(void*, int, int);
-int ftruncate(FD, unsigned int);
-
-long lrint(float);
-
-/*
-struct timeval
-{
- float tv_sec, tv_usec;
-};
-
-struct rusage
-{
- timeval ru_utime, ru_stime;
-};
-*/
-
-//inline int getrusage(int, struct rusage*) { return 0; }
-//extern int RUSAGE_SELF;
-
-typedef __int64 OFF_T;
-//#define OFF_T __int64
-
-#ifndef S_ISDIR
-#define S_ISDIR(mode) (((mode) & S_IFMT) == S_IFDIR)
-#endif
-
-#ifndef S_ISREG
-#define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG)
-#endif
-
-int mkdtemp(const char*);
-int munmap(void *, int);
-void *mmap(void*, int, int, int, FD, OFF_T);
-
-#define PROT_READ 1
-#define PROT_WRITE 1
-#define MAP_FAILED (void*) 0x1
-#define MAP_SHARED 1
-#define MAP_ANON 1
-#define MAP_PRIVATE 1
-#define S_IRUSR 1
-#define S_IROTH 1
-#define S_IRGRP 1
-
-int write(int, const void *, int);
-#define S_IRUSR 1
-#define S_IWUSR 1
-
-//const char *strerror_r(int, const char *buf, int);
-
-float strtof(const char *begin, char **end);
-//FILE *popen(const char*, const char*);
-//int pclose(FILE *);
-int close(FD fd);
-
-#define dup(x) _dup(x)
-#define rmdir(x) _rmdir(x)
-#define strerror_r(errNum, buffer, numberOfElements) strerror_s(buffer, numberOfElements);
-
-#else // assume UNIX OS
-
-#include <stdint.h>
-#include <sys/resource.h>
-#include <sys/time.h>
-#include <sys/types.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-typedef int FD;
-const FD kBadFD = -1;
-
-typedef off_t OFF_T;
-
-#endif
-
-#ifdef __GNUC__
-#define UTIL_FUNC_NAME __PRETTY_FUNCTION__
-#else
-#ifdef _WIN32
-#define UTIL_FUNC_NAME __FUNCTION__
-#else
-#define UTIL_FUNC_NAME NULL
-#endif
-#endif
-
-/* Bit-level packing routines */
-#ifdef __APPLE__
- #include <architecture/byte_order.h>
-#elif __linux__
- #include <endian.h>
-#elif WIN32
- // nothing
-#else
- #include <arpa/nameser_compat.h>
-#endif
-
diff --git a/kenlm/util/probing_hash_table.hh b/kenlm/util/probing_hash_table.hh
index 2ec342a66..8122d69c5 100644
--- a/kenlm/util/probing_hash_table.hh
+++ b/kenlm/util/probing_hash_table.hh
@@ -61,14 +61,14 @@ template <class PackingT, class HashT, class EqualT = std::equal_to<typename Pac
#endif
{}
- template <class T> void Insert(const T &t) {
+ template <class T> MutableIterator Insert(const T &t) {
if (++entries_ >= buckets_)
UTIL_THROW(ProbingSizeException, "Hash table with " << buckets_ << " buckets is full.");
#ifdef DEBUG
assert(initialized_);
#endif
for (MutableIterator i(begin_ + (hash_(t.GetKey()) % buckets_));;) {
- if (equal_(i->GetKey(), invalid_)) { *i = t; return; }
+ if (equal_(i->GetKey(), invalid_)) { *i = t; return i; }
if (++i == end_) { i = begin_; }
}
}
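Insert now returns the MutableIterator for the bucket it filled, which lets callers insert an entry and keep mutating it in place instead of searching for it again. A simplified sketch of a linear-probing insert that returns its slot; the Entry layout and empty-key convention are made up, and the real template machinery is omitted:

// Sketch of a linear-probing insert that hands back the occupied slot so the
// caller can continue to mutate the stored value, as the updated Insert does.
#include <cstdint>
#include <iostream>
#include <vector>

struct Entry { uint64_t key; float value; };   // key 0 marks an empty bucket here

Entry *Insert(std::vector<Entry> &table, const Entry &e) {
  std::size_t i = e.key % table.size();        // hash_(t.GetKey()) % buckets_
  while (table[i].key != 0) {                  // probe until an empty bucket
    if (++i == table.size()) i = 0;
  }
  table[i] = e;
  return &table[i];                            // like returning MutableIterator
}

int main() {
  std::vector<Entry> table(8, Entry{0, 0.0f});
  Entry *slot = Insert(table, Entry{42, 1.0f});
  slot->value = -0.301f;                       // adjust the entry after insertion
  std::cout << slot->key << " -> " << slot->value << "\n";
  return 0;
}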
diff --git a/kenlm/util/sized_iterator.hh b/kenlm/util/sized_iterator.hh
index aabcc5319..47dfc2454 100644
--- a/kenlm/util/sized_iterator.hh
+++ b/kenlm/util/sized_iterator.hh
@@ -6,7 +6,7 @@
#include <functional>
#include <string>
-#include <stdint.h>
+#include <inttypes.h>
#include <string.h>
namespace util {
diff --git a/kenlm/util/sorted_uniform.hh b/kenlm/util/sorted_uniform.hh
index 0391189f0..0d6ecbbd6 100644
--- a/kenlm/util/sorted_uniform.hh
+++ b/kenlm/util/sorted_uniform.hh
@@ -5,7 +5,7 @@
#include <cstddef>
#include <assert.h>
-#include <stdint.h>
+#include <inttypes.h>
namespace util {