From 744376b3fbebc41c4a270bf549826d5eb9219ae0 Mon Sep 17 00:00:00 2001
From: jiejiang
Date: Wed, 18 Dec 2013 20:15:39 +0000
Subject: moses windows build, with some TODO list

---
 lm/builder/corpus_count.cc | 24 ++++++++++++------------
 lm/builder/interpolate.cc  |  8 ++++----
 lm/config.cc               |  4 ++++
 lm/filter/arpa_io.hh       |  3 +++
 lm/filter/count_io.hh      | 11 +++++++++--
 lm/filter/filter_main.cc   | 14 ++++++++++++--
 lm/filter/vocab.cc         |  5 ++++-
 7 files changed, 48 insertions(+), 21 deletions(-)

(limited to 'lm')

diff --git a/lm/builder/corpus_count.cc b/lm/builder/corpus_count.cc
index aea93ad10..3edd3216a 100644
--- a/lm/builder/corpus_count.cc
+++ b/lm/builder/corpus_count.cc
@@ -87,7 +87,7 @@ class VocabHandout {
     Table table_;
 
     std::size_t double_cutoff_;
-    
+
     util::FakeOFStream word_list_;
 };
@@ -98,7 +98,7 @@ class DedupeHash : public std::unary_function {
     std::size_t operator()(const WordIndex *start) const {
       return util::MurmurHashNative(start, size_);
     }
-    
+
   private:
     const std::size_t size_;
 };
@@ -106,11 +106,11 @@ class DedupeEquals : public std::binary_function {
   public:
     explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {}
-    
+
     bool operator()(const WordIndex *first, const WordIndex *second) const {
      return !memcmp(first, second, size_);
-    } 
-    
+    }
+
   private:
     const std::size_t size_;
 };
@@ -131,7 +131,7 @@ typedef util::ProbingHashTable Dedupe;
 
 class Writer {
   public:
-    Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size) 
+    Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
       : block_(position), gram_(block_->Get(), order),
         dedupe_invalid_(order, std::numeric_limits::max()),
         dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)),
@@ -140,7 +140,7 @@ class Writer {
       dedupe_.Clear();
       assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size);
       if (order == 1) {
-        // Add special words. AdjustCounts is responsible if order != 1. 
+        // Add special words. AdjustCounts is responsible if order != 1.
         AddUnigramWord(kUNK);
         AddUnigramWord(kBOS);
       }
@@ -170,16 +170,16 @@ class Writer {
         memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1));
         return;
       }
-      // Complete the write. 
+      // Complete the write.
       gram_.Count() = 1;
-      // Prepare the next n-gram. 
+      // Prepare the next n-gram.
       if (reinterpret_cast(gram_.begin()) + gram_.TotalSize() != static_cast(block_->Get()) + block_size_) {
         NGram last(gram_);
         gram_.NextInMemory();
         std::copy(last.begin() + 1, last.end(), gram_.begin());
         return;
       }
-      // Block end. Need to store the context in a temporary buffer. 
+      // Block end. Need to store the context in a temporary buffer.
       std::copy(gram_.begin() + 1, gram_.end(), buffer_.get());
       dedupe_.Clear();
       block_->SetValidSize(block_size_);
@@ -207,7 +207,7 @@ class Writer {
     // Hash table combiner implementation.
     Dedupe dedupe_;
 
-    // Small buffer to hold existing ngrams when shifting across a block boundary. 
+    // Small buffer to hold existing ngrams when shifting across a block boundary.
     boost::scoped_array buffer_;
 
     const std::size_t block_size_;
@@ -223,7 +223,7 @@ std::size_t CorpusCount::VocabUsage(std::size_t vocab_estimate) {
   return VocabHandout::MemUsage(vocab_estimate);
 }
 
-CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block) 
+CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block)
   : from_(from), vocab_write_(vocab_write), token_count_(token_count), type_count_(type_count),
     dedupe_mem_size_(Dedupe::Size(entries_per_block, kProbingMultiplier)),
     dedupe_mem_(util::MallocOrThrow(dedupe_mem_size_)) {
diff --git a/lm/builder/interpolate.cc b/lm/builder/interpolate.cc
index 500268069..52e69f02e 100644
--- a/lm/builder/interpolate.cc
+++ b/lm/builder/interpolate.cc
@@ -33,12 +33,12 @@ class Callback {
       pay.complete.prob = pay.uninterp.prob + pay.uninterp.gamma * probs_[order_minus_1];
       probs_[order_minus_1 + 1] = pay.complete.prob;
       pay.complete.prob = log10(pay.complete.prob);
-      // TODO: this is a hack to skip n-grams that don't appear as context. Pruning will require some different handling. 
-      if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS) {
+      // TODO: this is a hack to skip n-grams that don't appear as context. Pruning will require some different handling.
+      if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS && backoffs_[order_minus_1].Get()) { // check valid pointer at the end
         pay.complete.backoff = log10(*static_cast(backoffs_[order_minus_1].Get()));
         ++backoffs_[order_minus_1];
       } else {
-        // Not a context. 
+        // Not a context.
         pay.complete.backoff = 0.0;
       }
     }
@@ -52,7 +52,7 @@ class Callback {
 };
 } // namespace
 
-Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs) 
+Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs)
   : uniform_prob_(1.0 / static_cast(unigram_count - 1)), backoffs_(backoffs) {}
 
 // perform order-wise interpolation
diff --git a/lm/config.cc b/lm/config.cc
index 9520c41c8..dc3365319 100644
--- a/lm/config.cc
+++ b/lm/config.cc
@@ -11,7 +11,11 @@ Config::Config() :
   enumerate_vocab(NULL),
   unknown_missing(COMPLAIN),
   sentence_marker_missing(THROW_UP),
+#if defined(_WIN32) || defined(_WIN64)
+  positive_log_probability(SILENT),
+#else
   positive_log_probability(THROW_UP),
+#endif
   unknown_missing_logprob(-100.0),
   probing_multiplier(1.5),
   building_memory(1073741824ULL), // 1 GB
diff --git a/lm/filter/arpa_io.hh b/lm/filter/arpa_io.hh
index 5b31620b5..08e658666 100644
--- a/lm/filter/arpa_io.hh
+++ b/lm/filter/arpa_io.hh
@@ -14,7 +14,10 @@
 #include 
 #include 
 
+#if !defined __MINGW32__
 #include <err.h>
+#endif
+
 #include 
 #include 
diff --git a/lm/filter/count_io.hh b/lm/filter/count_io.hh
index 97c0fa25e..740b8d50e 100644
--- a/lm/filter/count_io.hh
+++ b/lm/filter/count_io.hh
@@ -5,7 +5,9 @@
 #include 
 #include 
 
+#if !defined __MINGW32__
 #include <err.h>
+#endif
 
 #include "util/file_piece.hh"
@@ -17,7 +19,12 @@ class CountOutput : boost::noncopyable {
   void AddNGram(const StringPiece &line) {
     if (!(file_ << line << '\n')) {
+#if defined __MINGW32__
+      std::cerr<<"Writing counts file failed"<<std::endl;
+      exit(3);
+#else
       err(3, "Writing counts file failed");
+#endif
     }
   }
       buffer_;
 };
diff --git a/lm/filter/filter_main.cc b/lm/filter/filter_main.cc
index 1736bc405..f89ac4df3 100644
--- a/lm/filter/filter_main.cc
+++ b/lm/filter/filter_main.cc
@@ -57,7 +57,7 @@ typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} Fil
 typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format;
 
 struct Config {
-  Config() : 
+  Config() :
 #ifndef NTHREAD
     batch_size(25000),
     threads(boost::thread::hardware_concurrency()),
@@ -202,7 +202,7 @@ int main(int argc, char *argv[]) {
         return 1;
       }
     }
-  
+
   if (config.mode == lm::MODE_UNSET) {
     lm::DisplayHelp(argv[0]);
     return 1;
@@ -221,7 +221,12 @@
   } else if (!strncmp(cmd_input, "model:", 6)) {
     cmd_input += 6;
   } else if (strchr(cmd_input, ':')) {
+#if defined __MINGW32__
+    std::cerr << "Specify vocab: or model: before the input file name, not " << cmd_input << std::endl;
+    exit(1);
+#else
     errx(1, "Specify vocab: or model: before the input file name, not \"%s\"", cmd_input);
+#endif // defined
   } else {
     std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl;
   }
@@ -232,7 +237,12 @@
   } else {
     cmd_file.open(cmd_input, std::ios::in);
     if (!cmd_file) {
+#if defined __MINGW32__
+      std::cerr << "Could not open input file " << cmd_input << std::endl;
+      exit(2);
+#else
       err(2, "Could not open input file %s", cmd_input);
+#endif // defined
     }
     vocab = &cmd_file;
   }
diff --git a/lm/filter/vocab.cc b/lm/filter/vocab.cc
index 7ee4e84ba..7ed5d92fb 100644
--- a/lm/filter/vocab.cc
+++ b/lm/filter/vocab.cc
@@ -4,7 +4,10 @@
 #include 
 #include 
+
+#if !defined __MINGW32__
 #include <err.h>
+#endif
 
 namespace lm {
 namespace vocab {
@@ -31,7 +34,7 @@ bool IsLineEnd(std::istream &in) {
 }// namespace
 
 // Read space separated words in enter separated lines. These lines can be
-// very long, so don't read an entire line at a time. 
+// very long, so don't read an entire line at a time.
 unsigned int ReadMultiple(std::istream &in, boost::unordered_map > &out) {
   in.exceptions(std::istream::badbit);
   unsigned int sentence = 0;
--
cgit v1.2.3
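The common thread in the lm/filter changes above is working around the missing <err.h> on MinGW: the include is guarded with #if !defined __MINGW32__, and each err()/errx() call gains a fallback branch that prints to std::cerr and calls exit() with the same status code. The sketch below is only an illustration of that pattern collected into one helper; the name portable_errx and the example file name are hypothetical and do not appear in the commit, which instead repeats the #if defined __MINGW32__ block at every call site.

#include <cstdlib>
#include <iostream>
#include <string>

#if !defined __MINGW32__
#include <err.h>  // BSD/glibc err() and errx() declarations
#endif

// Print a message to stderr and terminate with the given exit status.
inline void portable_errx(int status, const std::string &message) {
#if defined __MINGW32__
  // MinGW has no <err.h>; mimic errx() with iostreams and std::exit().
  std::cerr << message << std::endl;
  std::exit(status);
#else
  // errx() prefixes the program name, writes the message to stderr and exits.
  errx(status, "%s", message.c_str());
#endif
}

int main() {
  // Hypothetical usage mirroring the err(2, "Could not open input file %s", ...) branch.
  portable_errx(2, "Could not open input file example.arpa");
}

Keeping the exit status identical to the one passed to err()/errx() preserves the behaviour callers and scripts may rely on, which is what the inline #if blocks in filter_main.cc and count_io.hh do as well.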