
github.com/moses-smt/mosesdecoder.git
author    Kenneth Heafield <github@kheafield.com>  2014-08-28 07:23:39 +0400
committer Kenneth Heafield <github@kheafield.com>  2014-08-28 07:23:39 +0400
commit    02ab8f5102122ce6368b939161d7bc53de1fa78d (patch)
tree      9ccdb0e71f6ca9d8753826b9d54e963ee66b4fd6 /lm
parent    1c45d780d4f4ed1f3ac5f260b108cebb823a4afd (diff)
Bugfix / Stephan Peitz and more paranoid error checking
Diffstat (limited to 'lm')
-rw-r--r--  lm/model_test.cc    2
-rw-r--r--  lm/read_arpa.hh    30
-rw-r--r--  lm/test.arpa        2
-rw-r--r--  lm/test_nounk.arpa  2
-rw-r--r--  lm/trie_sort.cc    14
5 files changed, 28 insertions(+), 22 deletions(-)
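
The substantive changes are in lm/read_arpa.hh and lm/trie_sort.cc: the two Read1Gram overloads are folded into a single function that inserts the token into the vocabulary and writes the weights straight into the unigram array, ReadNGram gains a check that every word of a higher-order n-gram was listed in the unigrams, and duplicate n-gram errors now report the offending vocab ids. A minimal standalone sketch of the unified unigram path, using std::istringstream and std::map in place of KenLM's util::FilePiece and vocabulary classes (SketchVocab, SketchWeights, and ReadOneUnigram are illustrative names, not KenLM API):

// Standalone sketch of the unified Read1Gram flow in the read_arpa.hh hunk below.
// SketchVocab, SketchWeights and ReadOneUnigram are illustrative stand-ins, not KenLM API.
#include <map>
#include <sstream>
#include <string>
#include <vector>

typedef unsigned int WordIndex;

struct SketchWeights { float prob; float backoff; };

struct SketchVocab {
  // Ids are assigned in insertion order here; KenLM's real vocabulary reserves 0 for <unk>.
  WordIndex Insert(const std::string &word) {
    std::map<std::string, WordIndex>::const_iterator it = ids.find(word);
    if (it != ids.end()) return it->second;
    WordIndex id = static_cast<WordIndex>(ids.size());
    ids[word] = id;
    return id;
  }
  std::map<std::string, WordIndex> ids;
};

// Parse one "prob word [backoff]" unigram line and store the weights in the slot
// owned by the vocabulary id, as the rewritten Read1Gram does; the real code reads
// from util::FilePiece and requires a tab after the probability.
void ReadOneUnigram(const std::string &line, SketchVocab &vocab,
                    std::vector<SketchWeights> &unigrams) {
  std::istringstream in(line);
  float prob = 0.0f, backoff = 0.0f;
  std::string word;
  in >> prob >> word;
  in >> backoff;  // optional third field; stays 0.0 if absent
  if (prob > 0.0f) prob = 0.0f;  // clamp, mirroring the PositiveProbWarn path
  WordIndex id = vocab.Insert(word);
  if (unigrams.size() <= id) unigrams.resize(id + 1);  // the real array is preallocated
  unigrams[id].prob = prob;
  unigrams[id].backoff = backoff;
}

In the real code the clamp is accompanied by a warning through PositiveProbWarn, and parse failures are rethrown with the byte offset in the ARPA file appended.
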
diff --git a/lm/model_test.cc b/lm/model_test.cc
index 7005b05ea..0f54724bb 100644
--- a/lm/model_test.cc
+++ b/lm/model_test.cc
@@ -176,7 +176,7 @@ template <class M> void MinimalState(const M &model) {
AppendTest("to", 1, -1.687872, false);
AppendTest("look", 2, -0.2922095, true);
BOOST_CHECK_EQUAL(2, state.length);
- AppendTest("good", 3, -7, true);
+ AppendTest("a", 3, -7, true);
}
template <class M> void ExtendLeftTest(const M &model) {
diff --git a/lm/read_arpa.hh b/lm/read_arpa.hh
index 213fe1caa..64eeef306 100644
--- a/lm/read_arpa.hh
+++ b/lm/read_arpa.hh
@@ -41,29 +41,24 @@ class PositiveProbWarn {
WarningAction action_;
};
-template <class Weights> StringPiece Read1Gram(util::FilePiece &f, Weights &weights, PositiveProbWarn &warn) {
+template <class Voc, class Weights> void Read1Gram(util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
try {
- weights.prob = f.ReadFloat();
- if (weights.prob > 0.0) {
- warn.Warn(weights.prob);
- weights.prob = 0.0;
+ float prob = f.ReadFloat();
+ if (prob > 0.0) {
+ warn.Warn(prob);
+ prob = 0.0;
}
UTIL_THROW_IF(f.get() != '\t', FormatLoadException, "Expected tab after probability");
- StringPiece ret(f.ReadDelimited(kARPASpaces));
- ReadBackoff(f, weights);
- return ret;
+ WordIndex word = vocab.Insert(f.ReadDelimited(kARPASpaces));
+ Weights &w = unigrams[word];
+ w.prob = prob;
+ ReadBackoff(f, w);
} catch(util::Exception &e) {
e << " in the 1-gram at byte " << f.Offset();
throw;
}
}
-template <class Voc, class Weights> void Read1Gram(util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
- Weights temp;
- WordIndex word = vocab.Insert(Read1Gram(f, temp, warn));
- unigrams[word] = temp;
-}
-
template <class Voc, class Weights> void Read1Grams(util::FilePiece &f, std::size_t count, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
ReadNGramHeader(f, 1);
for (std::size_t i = 0; i < count; ++i) {
@@ -81,7 +76,12 @@ template <class Voc, class Weights, class Iterator> void ReadNGram(util::FilePie
weights.prob = 0.0;
}
for (unsigned char i = 0; i < n; ++i, ++indices_out) {
- *indices_out = vocab.Index(f.ReadDelimited(kARPASpaces));
+ StringPiece word(f.ReadDelimited(kARPASpaces));
+ WordIndex index = vocab.Index(word);
+ *indices_out = index;
+ // Check for words mapped to <unk> that are not the string <unk>.
+ UTIL_THROW_IF(index == 0 /* mapped to <unk> */ && (word != StringPiece("<unk>", 5)) && (word != StringPiece("<UNK>", 5)),
+ FormatLoadException, "Word " << word << " was not seen in the unigrams (which are supposed to list the entire vocabulary) but appears");
}
ReadBackoff(f, weights);
} catch(util::Exception &e) {
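
The ReadNGram hunk above adds what the commit message calls more paranoid error checking: any word of a 2-gram or higher that maps to vocab id 0 (the <unk> slot) without literally being <unk> or <UNK> is treated as a format error, since the unigram section is supposed to list the entire vocabulary. A minimal standalone sketch of that check, with std::string and std::unordered_map standing in for StringPiece and the vocabulary class (SketchVocab and CheckWordSeen are illustrative names, not KenLM API):

// Standalone sketch of the "word not seen in the unigrams" check added above.
// SketchVocab and CheckWordSeen are illustrative stand-ins, not KenLM API.
#include <stdexcept>
#include <string>
#include <unordered_map>

typedef unsigned int WordIndex;

struct SketchVocab {
  // Words absent from the unigram section map to index 0, KenLM's <unk> slot.
  WordIndex Index(const std::string &word) const {
    std::unordered_map<std::string, WordIndex>::const_iterator it = seen.find(word);
    return it == seen.end() ? 0 : it->second;
  }
  std::unordered_map<std::string, WordIndex> seen;
};

WordIndex CheckWordSeen(const SketchVocab &vocab, const std::string &word) {
  WordIndex index = vocab.Index(word);
  // A token that falls into the <unk> slot must actually be <unk> (or <UNK>);
  // anything else was missing from the unigrams and the ARPA file is malformed.
  if (index == 0 && word != "<unk>" && word != "<UNK>") {
    throw std::runtime_error("Word " + word +
        " was not seen in the unigrams (which are supposed to list the entire vocabulary) but appears");
  }
  return index;
}

The real code raises the same condition as a FormatLoadException via UTIL_THROW_IF, and the surrounding catch block appends the byte offset of the offending n-gram; presumably this is also why the test ARPA files below and the test in model_test.cc swap "good" for "a".
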
diff --git a/lm/test.arpa b/lm/test.arpa
index ef214eae3..c4d2e6df5 100644
--- a/lm/test.arpa
+++ b/lm/test.arpa
@@ -105,7 +105,7 @@ ngram 5=4
-0.04835128 looking on a -0.4771212
-3 also would consider -7
-6 <unk> however <unk> -12
--7 to look good
+-7 to look a
\4-grams:
-0.009249173 looking on a little -0.4771212
diff --git a/lm/test_nounk.arpa b/lm/test_nounk.arpa
index 060733d98..e38fc8547 100644
--- a/lm/test_nounk.arpa
+++ b/lm/test_nounk.arpa
@@ -101,7 +101,7 @@ ngram 5=4
-0.1892331 little more loin
-0.04835128 looking on a -0.4771212
-3 also would consider -7
--7 to look good
+-7 to look a
\4-grams:
-0.009249173 looking on a little -0.4771212
diff --git a/lm/trie_sort.cc b/lm/trie_sort.cc
index dc24e5b75..c3f468746 100644
--- a/lm/trie_sort.cc
+++ b/lm/trie_sort.cc
@@ -107,14 +107,20 @@ FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &temp_pre
}
struct ThrowCombine {
- void operator()(std::size_t /*entry_size*/, const void * /*first*/, const void * /*second*/, FILE * /*out*/) const {
- UTIL_THROW(FormatLoadException, "Duplicate n-gram detected.");
+ void operator()(std::size_t entry_size, unsigned char order, const void *first, const void *second, FILE * /*out*/) const {
+ const WordIndex *base = reinterpret_cast<const WordIndex*>(first);
+ FormatLoadException e;
+ e << "Duplicate n-gram detected with vocab ids";
+ for (const WordIndex *i = base; i != base + order; ++i) {
+ e << ' ' << *i;
+ }
+ throw e;
}
};
// Useful for context files that just contain records with no value.
struct FirstCombine {
- void operator()(std::size_t entry_size, const void *first, const void * /*second*/, FILE *out) const {
+ void operator()(std::size_t entry_size, unsigned char /*order*/, const void *first, const void * /*second*/, FILE *out) const {
util::WriteOrThrow(out, first, entry_size);
}
};
@@ -134,7 +140,7 @@ template <class Combine> FILE *MergeSortedFiles(FILE *first_file, FILE *second_f
util::WriteOrThrow(out_file.get(), second.Data(), entry_size);
++second;
} else {
- combine(entry_size, first.Data(), second.Data(), out_file.get());
+ combine(entry_size, order, first.Data(), second.Data(), out_file.get());
++first; ++second;
}
}
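
The trie_sort.cc change above threads the n-gram order through the combine callbacks so that ThrowCombine can list the vocab ids of a duplicate n-gram instead of failing with a bare "Duplicate n-gram detected." A minimal standalone sketch of that message construction, using std::ostringstream and std::runtime_error in place of FormatLoadException (ReportDuplicate is an illustrative name, not KenLM API):

// Standalone sketch of the duplicate-n-gram report added above.
// ReportDuplicate is an illustrative stand-in, not KenLM API.
#include <sstream>
#include <stdexcept>

typedef unsigned int WordIndex;

void ReportDuplicate(unsigned char order, const void *record) {
  // Each sorted record begins with `order` vocab ids, one per word of the n-gram.
  const WordIndex *base = reinterpret_cast<const WordIndex*>(record);
  std::ostringstream message;
  message << "Duplicate n-gram detected with vocab ids";
  for (const WordIndex *i = base; i != base + order; ++i) {
    message << ' ' << *i;
  }
  throw std::runtime_error(message.str());
}

FirstCombine ignores the new order argument; it is only needed on the error path, where the offending record has to be described.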