diff options
author | jiejiang <mail.jie.jiang@gmail.com> | 2013-12-19 00:15:39 +0400 |
---|---|---|
committer | jiejiang <mail.jie.jiang@gmail.com> | 2013-12-19 00:15:39 +0400 |
commit | 744376b3fbebc41c4a270bf549826d5eb9219ae0 (patch) | |
tree | 27b324b13dacf16e021cde88b9edb594d71f09dc /lm | |
parent | 1a8a8fbb2d8eb503c38ba03da796e16ed08fd07a (diff) |
moses windows build, with some TODO list
Diffstat (limited to 'lm')
-rw-r--r-- | lm/builder/corpus_count.cc | 24 | ||||
-rw-r--r-- | lm/builder/interpolate.cc | 8 | ||||
-rw-r--r-- | lm/config.cc | 4 | ||||
-rw-r--r-- | lm/filter/arpa_io.hh | 3 | ||||
-rw-r--r-- | lm/filter/count_io.hh | 11 | ||||
-rw-r--r-- | lm/filter/filter_main.cc | 14 | ||||
-rw-r--r-- | lm/filter/vocab.cc | 5 |
7 files changed, 48 insertions, 21 deletions
diff --git a/lm/builder/corpus_count.cc b/lm/builder/corpus_count.cc index aea93ad10..3edd3216a 100644 --- a/lm/builder/corpus_count.cc +++ b/lm/builder/corpus_count.cc @@ -87,7 +87,7 @@ class VocabHandout { Table table_; std::size_t double_cutoff_; - + util::FakeOFStream word_list_; }; @@ -98,7 +98,7 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> { std::size_t operator()(const WordIndex *start) const { return util::MurmurHashNative(start, size_); } - + private: const std::size_t size_; }; @@ -106,11 +106,11 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> { class DedupeEquals : public std::binary_function<const WordIndex *, const WordIndex *, bool> { public: explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {} - + bool operator()(const WordIndex *first, const WordIndex *second) const { return !memcmp(first, second, size_); - } - + } + private: const std::size_t size_; }; @@ -131,7 +131,7 @@ typedef util::ProbingHashTable<DedupeEntry, DedupeHash, DedupeEquals> Dedupe; class Writer { public: - Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size) + Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size) : block_(position), gram_(block_->Get(), order), dedupe_invalid_(order, std::numeric_limits<WordIndex>::max()), dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)), @@ -140,7 +140,7 @@ class Writer { dedupe_.Clear(); assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size); if (order == 1) { - // Add special words. AdjustCounts is responsible if order != 1. + // Add special words. AdjustCounts is responsible if order != 1. AddUnigramWord(kUNK); AddUnigramWord(kBOS); } @@ -170,16 +170,16 @@ class Writer { memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1)); return; } - // Complete the write. + // Complete the write. gram_.Count() = 1; - // Prepare the next n-gram. + // Prepare the next n-gram. if (reinterpret_cast<uint8_t*>(gram_.begin()) + gram_.TotalSize() != static_cast<uint8_t*>(block_->Get()) + block_size_) { NGram last(gram_); gram_.NextInMemory(); std::copy(last.begin() + 1, last.end(), gram_.begin()); return; } - // Block end. Need to store the context in a temporary buffer. + // Block end. Need to store the context in a temporary buffer. std::copy(gram_.begin() + 1, gram_.end(), buffer_.get()); dedupe_.Clear(); block_->SetValidSize(block_size_); @@ -207,7 +207,7 @@ class Writer { // Hash table combiner implementation. Dedupe dedupe_; - // Small buffer to hold existing ngrams when shifting across a block boundary. + // Small buffer to hold existing ngrams when shifting across a block boundary. boost::scoped_array<WordIndex> buffer_; const std::size_t block_size_; @@ -223,7 +223,7 @@ std::size_t CorpusCount::VocabUsage(std::size_t vocab_estimate) { return VocabHandout::MemUsage(vocab_estimate); } -CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block) +CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block) : from_(from), vocab_write_(vocab_write), token_count_(token_count), type_count_(type_count), dedupe_mem_size_(Dedupe::Size(entries_per_block, kProbingMultiplier)), dedupe_mem_(util::MallocOrThrow(dedupe_mem_size_)) { diff --git a/lm/builder/interpolate.cc b/lm/builder/interpolate.cc index 500268069..52e69f02e 100644 --- a/lm/builder/interpolate.cc +++ b/lm/builder/interpolate.cc @@ -33,12 +33,12 @@ class Callback { pay.complete.prob = pay.uninterp.prob + pay.uninterp.gamma * probs_[order_minus_1]; probs_[order_minus_1 + 1] = pay.complete.prob; pay.complete.prob = log10(pay.complete.prob); - // TODO: this is a hack to skip n-grams that don't appear as context. Pruning will require some different handling. - if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS) { + // TODO: this is a hack to skip n-grams that don't appear as context. Pruning will require some different handling. + if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS && backoffs_[order_minus_1].Get()) { // check valid pointer at tht end pay.complete.backoff = log10(*static_cast<const float*>(backoffs_[order_minus_1].Get())); ++backoffs_[order_minus_1]; } else { - // Not a context. + // Not a context. pay.complete.backoff = 0.0; } } @@ -52,7 +52,7 @@ class Callback { }; } // namespace -Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs) +Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs) : uniform_prob_(1.0 / static_cast<float>(unigram_count - 1)), backoffs_(backoffs) {} // perform order-wise interpolation diff --git a/lm/config.cc b/lm/config.cc index 9520c41c8..dc3365319 100644 --- a/lm/config.cc +++ b/lm/config.cc @@ -11,7 +11,11 @@ Config::Config() : enumerate_vocab(NULL), unknown_missing(COMPLAIN), sentence_marker_missing(THROW_UP), +#if defined(_WIN32) || defined(_WIN64) + positive_log_probability(SILENT), +#else positive_log_probability(THROW_UP), +#endif unknown_missing_logprob(-100.0), probing_multiplier(1.5), building_memory(1073741824ULL), // 1 GB diff --git a/lm/filter/arpa_io.hh b/lm/filter/arpa_io.hh index 5b31620b5..08e658666 100644 --- a/lm/filter/arpa_io.hh +++ b/lm/filter/arpa_io.hh @@ -14,7 +14,10 @@ #include <string> #include <vector> +#if !defined __MINGW32__ #include <err.h> +#endif + #include <string.h> #include <stdint.h> diff --git a/lm/filter/count_io.hh b/lm/filter/count_io.hh index 97c0fa25e..740b8d50e 100644 --- a/lm/filter/count_io.hh +++ b/lm/filter/count_io.hh @@ -5,7 +5,9 @@ #include <iostream> #include <string> +#if !defined __MINGW32__ #include <err.h> +#endif #include "util/file_piece.hh" @@ -17,7 +19,12 @@ class CountOutput : boost::noncopyable { void AddNGram(const StringPiece &line) { if (!(file_ << line << '\n')) { +#if defined __MINGW32__ + std::cerr<<"Writing counts file failed"<<std::endl; + exit(3); +#else err(3, "Writing counts file failed"); +#endif } } @@ -35,7 +42,7 @@ class CountOutput : boost::noncopyable { class CountBatch { public: - explicit CountBatch(std::streamsize initial_read) + explicit CountBatch(std::streamsize initial_read) : initial_read_(initial_read) { buffer_.reserve(initial_read); } @@ -68,7 +75,7 @@ class CountBatch { private: std::streamsize initial_read_; - // This could have been a std::string but that's less happy with raw writes. + // This could have been a std::string but that's less happy with raw writes. std::vector<char> buffer_; }; diff --git a/lm/filter/filter_main.cc b/lm/filter/filter_main.cc index 1736bc405..f89ac4df3 100644 --- a/lm/filter/filter_main.cc +++ b/lm/filter/filter_main.cc @@ -57,7 +57,7 @@ typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} Fil typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format; struct Config { - Config() : + Config() : #ifndef NTHREAD batch_size(25000), threads(boost::thread::hardware_concurrency()), @@ -202,7 +202,7 @@ int main(int argc, char *argv[]) { return 1; } } - + if (config.mode == lm::MODE_UNSET) { lm::DisplayHelp(argv[0]); return 1; @@ -221,7 +221,12 @@ int main(int argc, char *argv[]) { } else if (!strncmp(cmd_input, "model:", 6)) { cmd_input += 6; } else if (strchr(cmd_input, ':')) { +#if defined __MINGW32__ + std::cerr << "Specify vocab: or model: before the input file name, not " << cmd_input << std::endl; + exit(1); +#else errx(1, "Specify vocab: or model: before the input file name, not \"%s\"", cmd_input); +#endif // defined } else { std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl; } @@ -232,7 +237,12 @@ int main(int argc, char *argv[]) { } else { cmd_file.open(cmd_input, std::ios::in); if (!cmd_file) { +#if defined __MINGW32__ + std::cerr << "Could not open input file " << cmd_input << std::endl; + exit(2); +#else err(2, "Could not open input file %s", cmd_input); +#endif // defined } vocab = &cmd_file; } diff --git a/lm/filter/vocab.cc b/lm/filter/vocab.cc index 7ee4e84ba..7ed5d92fb 100644 --- a/lm/filter/vocab.cc +++ b/lm/filter/vocab.cc @@ -4,7 +4,10 @@ #include <iostream> #include <ctype.h> + +#if !defined __MINGW32__ #include <err.h> +#endif namespace lm { namespace vocab { @@ -31,7 +34,7 @@ bool IsLineEnd(std::istream &in) { }// namespace // Read space separated words in enter separated lines. These lines can be -// very long, so don't read an entire line at a time. +// very long, so don't read an entire line at a time. unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out) { in.exceptions(std::istream::badbit); unsigned int sentence = 0; |