github.com/moses-smt/mosesdecoder.git
author    jiejiang <mail.jie.jiang@gmail.com>  2013-12-19 00:15:39 +0400
committer jiejiang <mail.jie.jiang@gmail.com>  2013-12-19 00:15:39 +0400
commit    744376b3fbebc41c4a270bf549826d5eb9219ae0 (patch)
tree      27b324b13dacf16e021cde88b9edb594d71f09dc /lm
parent    1a8a8fbb2d8eb503c38ba03da796e16ed08fd07a (diff)
moses windows build, with some TODO list
Diffstat (limited to 'lm')
 lm/builder/corpus_count.cc | 24
 lm/builder/interpolate.cc  |  8
 lm/config.cc               |  4
 lm/filter/arpa_io.hh       |  3
 lm/filter/count_io.hh      | 11
 lm/filter/filter_main.cc   | 14
 lm/filter/vocab.cc         |  5
 7 files changed, 48 insertions(+), 21 deletions(-)
diff --git a/lm/builder/corpus_count.cc b/lm/builder/corpus_count.cc
index aea93ad10..3edd3216a 100644
--- a/lm/builder/corpus_count.cc
+++ b/lm/builder/corpus_count.cc
@@ -87,7 +87,7 @@ class VocabHandout {
Table table_;
std::size_t double_cutoff_;
-
+
util::FakeOFStream word_list_;
};
@@ -98,7 +98,7 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
std::size_t operator()(const WordIndex *start) const {
return util::MurmurHashNative(start, size_);
}
-
+
private:
const std::size_t size_;
};
@@ -106,11 +106,11 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
class DedupeEquals : public std::binary_function<const WordIndex *, const WordIndex *, bool> {
public:
explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {}
-
+
bool operator()(const WordIndex *first, const WordIndex *second) const {
return !memcmp(first, second, size_);
- }
-
+ }
+
private:
const std::size_t size_;
};
@@ -131,7 +131,7 @@ typedef util::ProbingHashTable<DedupeEntry, DedupeHash, DedupeEquals> Dedupe;
class Writer {
public:
- Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
+ Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
: block_(position), gram_(block_->Get(), order),
dedupe_invalid_(order, std::numeric_limits<WordIndex>::max()),
dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)),
@@ -140,7 +140,7 @@ class Writer {
dedupe_.Clear();
assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size);
if (order == 1) {
- // Add special words. AdjustCounts is responsible if order != 1.
+ // Add special words. AdjustCounts is responsible if order != 1.
AddUnigramWord(kUNK);
AddUnigramWord(kBOS);
}
@@ -170,16 +170,16 @@ class Writer {
memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1));
return;
}
- // Complete the write.
+ // Complete the write.
gram_.Count() = 1;
- // Prepare the next n-gram.
+ // Prepare the next n-gram.
if (reinterpret_cast<uint8_t*>(gram_.begin()) + gram_.TotalSize() != static_cast<uint8_t*>(block_->Get()) + block_size_) {
NGram last(gram_);
gram_.NextInMemory();
std::copy(last.begin() + 1, last.end(), gram_.begin());
return;
}
- // Block end. Need to store the context in a temporary buffer.
+ // Block end. Need to store the context in a temporary buffer.
std::copy(gram_.begin() + 1, gram_.end(), buffer_.get());
dedupe_.Clear();
block_->SetValidSize(block_size_);
@@ -207,7 +207,7 @@ class Writer {
// Hash table combiner implementation.
Dedupe dedupe_;
- // Small buffer to hold existing ngrams when shifting across a block boundary.
+ // Small buffer to hold existing ngrams when shifting across a block boundary.
boost::scoped_array<WordIndex> buffer_;
const std::size_t block_size_;
@@ -223,7 +223,7 @@ std::size_t CorpusCount::VocabUsage(std::size_t vocab_estimate) {
return VocabHandout::MemUsage(vocab_estimate);
}
-CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block)
+CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block)
: from_(from), vocab_write_(vocab_write), token_count_(token_count), type_count_(type_count),
dedupe_mem_size_(Dedupe::Size(entries_per_block, kProbingMultiplier)),
dedupe_mem_(util::MallocOrThrow(dedupe_mem_size_)) {
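
The corpus_count.cc hunks are whitespace-only, but the DedupeHash/DedupeEquals pair visible in context is worth noting: an n-gram is treated as a fixed-width key of order * sizeof(WordIndex) raw bytes, hashed with util::MurmurHashNative and compared with memcmp. Below is a minimal self-contained sketch of that fixed-width-key pattern; std::unordered_set and an FNV-1a hash stand in for KenLM's probing table and MurmurHash, and all names here are illustrative, not from the commit.

#include <cstdint>
#include <cstring>
#include <iostream>
#include <unordered_set>

typedef std::uint32_t WordIndex;

struct NGramHash {
  explicit NGramHash(std::size_t order) : size_(order * sizeof(WordIndex)) {}
  std::size_t operator()(const WordIndex *start) const {
    // FNV-1a over the key bytes; stand-in for util::MurmurHashNative(start, size_).
    std::uint64_t h = 14695981039346656037ull;
    const unsigned char *p = reinterpret_cast<const unsigned char*>(start);
    for (std::size_t i = 0; i < size_; ++i) h = (h ^ p[i]) * 1099511628211ull;
    return static_cast<std::size_t>(h);
  }
  std::size_t size_;
};

struct NGramEquals {
  explicit NGramEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {}
  bool operator()(const WordIndex *a, const WordIndex *b) const {
    return !std::memcmp(a, b, size_);  // byte-wise equality, as in DedupeEquals
  }
  std::size_t size_;
};

int main() {
  const std::size_t order = 3;
  WordIndex a[] = {1, 2, 3}, b[] = {1, 2, 3}, c[] = {1, 2, 4};
  std::unordered_set<const WordIndex*, NGramHash, NGramEquals>
      seen(8, NGramHash(order), NGramEquals(order));
  seen.insert(a);
  std::cout << seen.count(b) << ' ' << seen.count(c) << '\n';  // prints: 1 0
}
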
diff --git a/lm/builder/interpolate.cc b/lm/builder/interpolate.cc
index 500268069..52e69f02e 100644
--- a/lm/builder/interpolate.cc
+++ b/lm/builder/interpolate.cc
@@ -33,12 +33,12 @@ class Callback {
pay.complete.prob = pay.uninterp.prob + pay.uninterp.gamma * probs_[order_minus_1];
probs_[order_minus_1 + 1] = pay.complete.prob;
pay.complete.prob = log10(pay.complete.prob);
- // TODO: this is a hack to skip n-grams that don't appear as context. Pruning will require some different handling.
- if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS) {
+ // TODO: this is a hack to skip n-grams that don't appear as context. Pruning will require some different handling.
+        if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS && backoffs_[order_minus_1].Get()) { // check for a valid pointer at the end
pay.complete.backoff = log10(*static_cast<const float*>(backoffs_[order_minus_1].Get()));
++backoffs_[order_minus_1];
} else {
- // Not a context.
+ // Not a context.
pay.complete.backoff = 0.0;
}
}
@@ -52,7 +52,7 @@ class Callback {
};
} // namespace
-Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs)
+Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs)
: uniform_prob_(1.0 / static_cast<float>(unigram_count - 1)), backoffs_(backoffs) {}
// perform order-wise interpolation
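
The only functional change in interpolate.cc is the added backoffs_[order_minus_1].Get() conjunct: on the Windows build the backoff stream can apparently hand back a null payload, so the code now falls through to a backoff of 0.0 instead of dereferencing it. A minimal sketch of the guard follows; Stream is a hypothetical stand-in for KenLM's util::stream machinery.

#include <cmath>
#include <iostream>

// Hypothetical stand-in for the stream type whose Get() may return NULL.
struct Stream {
  explicit Stream(const void *data) : data_(data) {}
  const void *Get() const { return data_; }
  const void *data_;
};

// Mirrors the guarded read in the diff: dereference only when Get() is
// non-null, otherwise treat the n-gram as "not a context" (backoff 0.0).
float BackoffLog10(const Stream &s) {
  if (s.Get())
    return std::log10(*static_cast<const float*>(s.Get()));
  return 0.0f;
}

int main() {
  float raw = 0.1f;
  Stream live(&raw), dead(0);
  std::cout << BackoffLog10(live) << ' ' << BackoffLog10(dead) << '\n';  // -1 0
}
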
diff --git a/lm/config.cc b/lm/config.cc
index 9520c41c8..dc3365319 100644
--- a/lm/config.cc
+++ b/lm/config.cc
@@ -11,7 +11,11 @@ Config::Config() :
enumerate_vocab(NULL),
unknown_missing(COMPLAIN),
sentence_marker_missing(THROW_UP),
+#if defined(_WIN32) || defined(_WIN64)
+ positive_log_probability(SILENT),
+#else
positive_log_probability(THROW_UP),
+#endif
unknown_missing_logprob(-100.0),
probing_multiplier(1.5),
building_memory(1073741824ULL), // 1 GB
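
The config.cc change silences the positive-log-probability check at compile time on Windows, presumably one of the commit's TODO items rather than a principled fix. The pattern itself, selecting a constructor default by target platform, is easy to isolate. A sketch using the WarningAction values visible in the surrounding context (the struct here is simplified, not KenLM's full Config):

#include <iostream>

// The three actions visible in the context of lm/config.cc.
enum WarningAction { THROW_UP, COMPLAIN, SILENT };

// Simplified Config: one member's default is chosen per-platform at
// compile time, exactly the shape of the change above.
struct Config {
  WarningAction positive_log_probability;
  Config() :
#if defined(_WIN32) || defined(_WIN64)
      positive_log_probability(SILENT)
#else
      positive_log_probability(THROW_UP)
#endif
  {}
};

int main() {
  Config c;
  std::cout << (c.positive_log_probability == SILENT ? "silent" : "throw") << '\n';
}
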
diff --git a/lm/filter/arpa_io.hh b/lm/filter/arpa_io.hh
index 5b31620b5..08e658666 100644
--- a/lm/filter/arpa_io.hh
+++ b/lm/filter/arpa_io.hh
@@ -14,7 +14,10 @@
#include <string>
#include <vector>
+#if !defined __MINGW32__
#include <err.h>
+#endif
+
#include <string.h>
#include <stdint.h>
diff --git a/lm/filter/count_io.hh b/lm/filter/count_io.hh
index 97c0fa25e..740b8d50e 100644
--- a/lm/filter/count_io.hh
+++ b/lm/filter/count_io.hh
@@ -5,7 +5,9 @@
#include <iostream>
#include <string>
+#if !defined __MINGW32__
#include <err.h>
+#endif
#include "util/file_piece.hh"
@@ -17,7 +19,12 @@ class CountOutput : boost::noncopyable {
void AddNGram(const StringPiece &line) {
if (!(file_ << line << '\n')) {
+#if defined __MINGW32__
+      std::cerr << "Writing counts file failed" << std::endl;
+ exit(3);
+#else
err(3, "Writing counts file failed");
+#endif
}
}
@@ -35,7 +42,7 @@ class CountOutput : boost::noncopyable {
class CountBatch {
public:
- explicit CountBatch(std::streamsize initial_read)
+ explicit CountBatch(std::streamsize initial_read)
: initial_read_(initial_read) {
buffer_.reserve(initial_read);
}
@@ -68,7 +75,7 @@ class CountBatch {
private:
std::streamsize initial_read_;
- // This could have been a std::string but that's less happy with raw writes.
+ // This could have been a std::string but that's less happy with raw writes.
std::vector<char> buffer_;
};
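
arpa_io.hh, count_io.hh, and vocab.cc all gain the same two-part workaround: MinGW ships no <err.h>, so the include is guarded, and each err() call site grows an #if block that logs to std::cerr and exits with the same code. One way to avoid repeating the block at every call site would be a small shared helper; a sketch follows (FatalError is a hypothetical name, not part of this commit):

#include <cstdlib>
#include <iostream>
#if !defined __MINGW32__
#include <err.h>
#endif

// Hypothetical helper centralizing the MinGW fallback repeated in this diff.
inline void FatalError(int code, const char *message) {
#if defined __MINGW32__
  // MinGW lacks <err.h>: report to stderr and exit with the same code.
  std::cerr << message << std::endl;
  std::exit(code);
#else
  err(code, "%s", message);  // BSD err() also appends strerror(errno)
#endif
}

int main() {
  // In count_io.hh the call site would shrink to:
  //   if (!(file_ << line << '\n')) FatalError(3, "Writing counts file failed");
  FatalError(3, "Writing counts file failed");
}
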
diff --git a/lm/filter/filter_main.cc b/lm/filter/filter_main.cc
index 1736bc405..f89ac4df3 100644
--- a/lm/filter/filter_main.cc
+++ b/lm/filter/filter_main.cc
@@ -57,7 +57,7 @@ typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} Fil
typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format;
struct Config {
- Config() :
+ Config() :
#ifndef NTHREAD
batch_size(25000),
threads(boost::thread::hardware_concurrency()),
@@ -202,7 +202,7 @@ int main(int argc, char *argv[]) {
return 1;
}
}
-
+
if (config.mode == lm::MODE_UNSET) {
lm::DisplayHelp(argv[0]);
return 1;
@@ -221,7 +221,12 @@ int main(int argc, char *argv[]) {
} else if (!strncmp(cmd_input, "model:", 6)) {
cmd_input += 6;
} else if (strchr(cmd_input, ':')) {
+#if defined __MINGW32__
+ std::cerr << "Specify vocab: or model: before the input file name, not " << cmd_input << std::endl;
+ exit(1);
+#else
errx(1, "Specify vocab: or model: before the input file name, not \"%s\"", cmd_input);
+#endif // defined __MINGW32__
} else {
std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl;
}
@@ -232,7 +237,12 @@ int main(int argc, char *argv[]) {
} else {
cmd_file.open(cmd_input, std::ios::in);
if (!cmd_file) {
+#if defined __MINGW32__
+ std::cerr << "Could not open input file " << cmd_input << std::endl;
+ exit(2);
+#else
err(2, "Could not open input file %s", cmd_input);
+#endif // defined __MINGW32__
}
vocab = &cmd_file;
}
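
One subtlety in the filter_main.cc fallbacks: errx() prints only the message, while err() also appends strerror(errno), so the MinGW branch for the failed open loses the reason the open failed. A fallback that preserves it could look like this (hedged sketch; ReportOpenFailure is illustrative, not from the commit):

#include <cerrno>
#include <cstdlib>
#include <cstring>
#include <iostream>

// Illustrative errno-preserving stand-in for
// err(2, "Could not open input file %s", cmd_input) where <err.h> is missing.
inline void ReportOpenFailure(int code, const char *name) {
  std::cerr << "Could not open input file " << name << ": "
            << std::strerror(errno) << std::endl;
  std::exit(code);
}

int main() {
  errno = ENOENT;  // simulate a failed open for the demonstration
  ReportOpenFailure(2, "missing.txt");
}
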
diff --git a/lm/filter/vocab.cc b/lm/filter/vocab.cc
index 7ee4e84ba..7ed5d92fb 100644
--- a/lm/filter/vocab.cc
+++ b/lm/filter/vocab.cc
@@ -4,7 +4,10 @@
#include <iostream>
#include <ctype.h>
+
+#if !defined __MINGW32__
#include <err.h>
+#endif
namespace lm {
namespace vocab {
@@ -31,7 +34,7 @@ bool IsLineEnd(std::istream &in) {
}// namespace
// Read space separated words in enter separated lines. These lines can be
-// very long, so don't read an entire line at a time.
+// very long, so don't read an entire line at a time.
unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out) {
in.exceptions(std::istream::badbit);
unsigned int sentence = 0;