moses windows build, with some TODO list

author: jiejiang <mail.jie.jiang@gmail.com> 2013-12-19 00:15:39 +0400
committer: jiejiang <mail.jie.jiang@gmail.com> 2013-12-19 00:15:39 +0400
commit: 744376b3fbebc41c4a270bf549826d5eb9219ae0 (patch)
tree: 27b324b13dacf16e021cde88b9edb594d71f09dc /lm
parent: 1a8a8fbb2d8eb503c38ba03da796e16ed08fd07a (diff)
7 files changed, 48 insertions, 21 deletions
diff --git a/lm/builder/corpus_count.cc b/lm/builder/corpus_count.cc
index aea93ad10..3edd3216a 100644
--- a/lm/builder/corpus_count.cc
+++ b/lm/builder/corpus_count.cc
@@ -87,7 +87,7 @@ class VocabHandout {
     Table table_;
 
     std::size_t double_cutoff_;
-    
+
     util::FakeOFStream word_list_;
 };
 
@@ -98,7 +98,7 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
     std::size_t operator()(const WordIndex *start) const {
       return util::MurmurHashNative(start, size_);
     }
-    
+
   private:
     const std::size_t size_;
 };
@@ -106,11 +106,11 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
 class DedupeEquals : public std::binary_function<const WordIndex *, const WordIndex *, bool> {
   public:
     explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {}
-    
+
     bool operator()(const WordIndex *first, const WordIndex *second) const {
       return !memcmp(first, second, size_);
-    } 
-    
+    }
+
   private:
     const std::size_t size_;
 };
@@ -131,7 +131,7 @@ typedef util::ProbingHashTable<DedupeEntry, DedupeHash, DedupeEquals> Dedupe;
 
 class Writer {
   public:
-    Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size) 
+    Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
       : block_(position), gram_(block_->Get(), order),
         dedupe_invalid_(order, std::numeric_limits<WordIndex>::max()),
         dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)),
@@ -140,7 +140,7 @@ class Writer {
       dedupe_.Clear();
       assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size);
       if (order == 1) {
-        // Add special words.  AdjustCounts is responsible if order != 1.    
+        // Add special words.  AdjustCounts is responsible if order != 1.
         AddUnigramWord(kUNK);
         AddUnigramWord(kBOS);
       }
@@ -170,16 +170,16 @@ class Writer {
         memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1));
         return;
       }
-      // Complete the write.  
+      // Complete the write.
       gram_.Count() = 1;
-      // Prepare the next n-gram.  
+      // Prepare the next n-gram.
       if (reinterpret_cast<uint8_t*>(gram_.begin()) + gram_.TotalSize() != static_cast<uint8_t*>(block_->Get()) + block_size_) {
         NGram last(gram_);
         gram_.NextInMemory();
         std::copy(last.begin() + 1, last.end(), gram_.begin());
         return;
       }
-      // Block end.  Need to store the context in a temporary buffer.  
+      // Block end.  Need to store the context in a temporary buffer.
       std::copy(gram_.begin() + 1, gram_.end(), buffer_.get());
       dedupe_.Clear();
       block_->SetValidSize(block_size_);
@@ -207,7 +207,7 @@ class Writer {
     // Hash table combiner implementation.
     Dedupe dedupe_;
 
-    // Small buffer to hold existing ngrams when shifting across a block boundary.  
+    // Small buffer to hold existing ngrams when shifting across a block boundary.
     boost::scoped_array<WordIndex> buffer_;
 
     const std::size_t block_size_;
@@ -223,7 +223,7 @@ std::size_t CorpusCount::VocabUsage(std::size_t vocab_estimate) {
   return VocabHandout::MemUsage(vocab_estimate);
 }
 
-CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block) 
+CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block)
   : from_(from), vocab_write_(vocab_write), token_count_(token_count), type_count_(type_count),
     dedupe_mem_size_(Dedupe::Size(entries_per_block, kProbingMultiplier)),
     dedupe_mem_(util::MallocOrThrow(dedupe_mem_size_)) {
diff --git a/lm/builder/interpolate.cc b/lm/builder/interpolate.cc
index 500268069..52e69f02e 100644
--- a/lm/builder/interpolate.cc
+++ b/lm/builder/interpolate.cc
@@ -33,12 +33,12 @@ class Callback {
       pay.complete.prob = pay.uninterp.prob + pay.uninterp.gamma * probs_[order_minus_1];
       probs_[order_minus_1 + 1] = pay.complete.prob;
       pay.complete.prob = log10(pay.complete.prob);
-      // TODO: this is a hack to skip n-grams that don't appear as context.  Pruning will require some different handling.  
-      if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS) {
+      // TODO: this is a hack to skip n-grams that don't appear as context.  Pruning will require some different handling.
+      if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS && backoffs_[order_minus_1].Get()) { // check valid pointer at the end
         pay.complete.backoff = log10(*static_cast<const float*>(backoffs_[order_minus_1].Get()));
         ++backoffs_[order_minus_1];
       } else {
-        // Not a context.  
+        // Not a context.
         pay.complete.backoff = 0.0;
       }
     }
@@ -52,7 +52,7 @@ class Callback {
 };
 } // namespace
 
-Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs) 
+Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs)
   : uniform_prob_(1.0 / static_cast<float>(unigram_count - 1)), backoffs_(backoffs) {}
 
 // perform order-wise interpolation
diff --git a/lm/config.cc b/lm/config.cc
index 9520c41c8..dc3365319 100644
--- a/lm/config.cc
+++ b/lm/config.cc
@@ -11,7 +11,11 @@ Config::Config() :
   enumerate_vocab(NULL),
   unknown_missing(COMPLAIN),
   sentence_marker_missing(THROW_UP),
+#if defined(_WIN32) || defined(_WIN64)
+  positive_log_probability(SILENT),
+#else
   positive_log_probability(THROW_UP),
+#endif
   unknown_missing_logprob(-100.0),
   probing_multiplier(1.5),
   building_memory(1073741824ULL), // 1 GB
diff --git a/lm/filter/arpa_io.hh b/lm/filter/arpa_io.hh
index 5b31620b5..08e658666 100644
--- a/lm/filter/arpa_io.hh
+++ b/lm/filter/arpa_io.hh
@@ -14,7 +14,10 @@
 #include <string>
 #include <vector>
 
+#if !defined __MINGW32__
 #include <err.h>
+#endif
+
 #include <string.h>
 #include <stdint.h>
 
diff --git a/lm/filter/count_io.hh b/lm/filter/count_io.hh
index 97c0fa25e..740b8d50e 100644
--- a/lm/filter/count_io.hh
+++ b/lm/filter/count_io.hh
@@ -5,7 +5,9 @@
 #include <iostream>
 #include <string>
 
+#if !defined __MINGW32__
 #include <err.h>
+#endif
 
 #include "util/file_piece.hh"
 
@@ -17,7 +19,12 @@ class CountOutput : boost::noncopyable {
 
     void AddNGram(const StringPiece &line) {
       if (!(file_ << line << '\n')) {
+#if defined __MINGW32__
+        std::cerr<<"Writing counts file failed"<<std::endl;
+        exit(3);
+#else
         err(3, "Writing counts file failed");
+#endif
       }
     }
 
@@ -35,7 +42,7 @@ class CountOutput : boost::noncopyable {
 
 class CountBatch {
   public:
-    explicit CountBatch(std::streamsize initial_read) 
+    explicit CountBatch(std::streamsize initial_read)
       : initial_read_(initial_read) {
       buffer_.reserve(initial_read);
     }
@@ -68,7 +75,7 @@ class CountBatch {
   private:
     std::streamsize initial_read_;
 
-    // This could have been a std::string but that's less happy with raw writes.  
+    // This could have been a std::string but that's less happy with raw writes.
     std::vector<char> buffer_;
 };
 
diff --git a/lm/filter/filter_main.cc b/lm/filter/filter_main.cc
index 1736bc405..f89ac4df3 100644
--- a/lm/filter/filter_main.cc
+++ b/lm/filter/filter_main.cc
@@ -57,7 +57,7 @@ typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} Fil
 typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format;
 
 struct Config {
-  Config() : 
+  Config() :
 #ifndef NTHREAD
   batch_size(25000),
   threads(boost::thread::hardware_concurrency()),
@@ -202,7 +202,7 @@ int main(int argc, char *argv[]) {
       return 1;
     }
   }
-  
+
   if (config.mode == lm::MODE_UNSET) {
     lm::DisplayHelp(argv[0]);
     return 1;
@@ -221,7 +221,12 @@ int main(int argc, char *argv[]) {
   } else if (!strncmp(cmd_input, "model:", 6)) {
     cmd_input += 6;
   } else if (strchr(cmd_input, ':')) {
+#if defined __MINGW32__
+    std::cerr << "Specify vocab: or model: before the input file name, not " << cmd_input << std::endl;
+    exit(1);
+#else
     errx(1, "Specify vocab: or model: before the input file name, not \"%s\"", cmd_input);
+#endif // defined
   } else {
     std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl;
   }
@@ -232,7 +237,12 @@ int main(int argc, char *argv[]) {
   } else {
     cmd_file.open(cmd_input, std::ios::in);
     if (!cmd_file) {
+#if defined __MINGW32__
+      std::cerr << "Could not open input file " << cmd_input << std::endl;
+      exit(2);
+#else
       err(2, "Could not open input file %s", cmd_input);
+#endif // defined
     }
     vocab = &cmd_file;
   }
diff --git a/lm/filter/vocab.cc b/lm/filter/vocab.cc
index 7ee4e84ba..7ed5d92fb 100644
--- a/lm/filter/vocab.cc
+++ b/lm/filter/vocab.cc
@@ -4,7 +4,10 @@
 #include <iostream>
 
 #include <ctype.h>
+
+#if !defined __MINGW32__
 #include <err.h>
+#endif
 
 namespace lm {
 namespace vocab {
@@ -31,7 +34,7 @@ bool IsLineEnd(std::istream &in) {
 }// namespace
 
 // Read space separated words in enter separated lines.  These lines can be
-// very long, so don't read an entire line at a time.  
+// very long, so don't read an entire line at a time.
 unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out) {
   in.exceptions(std::istream::badbit);
   unsigned int sentence = 0;
author	jiejiang <mail.jie.jiang@gmail.com>	2013-12-19 00:15:39 +0400
committer	jiejiang <mail.jie.jiang@gmail.com>	2013-12-19 00:15:39 +0400
commit	744376b3fbebc41c4a270bf549826d5eb9219ae0 (patch)
tree	27b324b13dacf16e021cde88b9edb594d71f09dc /lm
parent	1a8a8fbb2d8eb503c38ba03da796e16ed08fd07a (diff)