From 02ab8f5102122ce6368b939161d7bc53de1fa78d Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Wed, 27 Aug 2014 23:23:39 -0400 Subject: Bugfix / Stephan Peitz and more paranoid error checking --- lm/model_test.cc | 2 +- lm/read_arpa.hh | 30 +++++++++++++++--------------- lm/test.arpa | 2 +- lm/test_nounk.arpa | 2 +- lm/trie_sort.cc | 14 ++++++++++---- 5 files changed, 28 insertions(+), 22 deletions(-) (limited to 'lm') diff --git a/lm/model_test.cc b/lm/model_test.cc index 7005b05ea..0f54724bb 100644 --- a/lm/model_test.cc +++ b/lm/model_test.cc @@ -176,7 +176,7 @@ template <class M> void MinimalState(const M &model) { AppendTest("to", 1, -1.687872, false); AppendTest("look", 2, -0.2922095, true); BOOST_CHECK_EQUAL(2, state.length); - AppendTest("good", 3, -7, true); + AppendTest("a", 3, -7, true); } template <class M> void ExtendLeftTest(const M &model) { diff --git a/lm/read_arpa.hh b/lm/read_arpa.hh index 213fe1caa..64eeef306 100644 --- a/lm/read_arpa.hh +++ b/lm/read_arpa.hh @@ -41,29 +41,24 @@ class PositiveProbWarn { WarningAction action_; }; -template <class Weights> StringPiece Read1Gram(util::FilePiece &f, Weights &weights, PositiveProbWarn &warn) { +template <class Voc, class Weights> void Read1Gram(util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) { try { - weights.prob = f.ReadFloat(); - if (weights.prob > 0.0) { - warn.Warn(weights.prob); - weights.prob = 0.0; + float prob = f.ReadFloat(); + if (prob > 0.0) { + warn.Warn(prob); + prob = 0.0; } UTIL_THROW_IF(f.get() != '\t', FormatLoadException, "Expected tab after probability"); - StringPiece ret(f.ReadDelimited(kARPASpaces)); - ReadBackoff(f, weights); - return ret; + WordIndex word = vocab.Insert(f.ReadDelimited(kARPASpaces)); + Weights &w = unigrams[word]; + w.prob = prob; + ReadBackoff(f, w); } catch(util::Exception &e) { e << " in the 1-gram at byte " << f.Offset(); throw; } } -template <class Voc, class Weights> void Read1Gram(util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) { - Weights temp; - WordIndex word = 
vocab.Insert(Read1Gram(f, temp, warn)); unigrams[word] = temp; } - template <class Voc, class Weights> void Read1Grams(util::FilePiece &f, std::size_t count, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) { ReadNGramHeader(f, 1); for (std::size_t i = 0; i < count; ++i) { @@ -81,7 +76,12 @@ template <class Voc, class Weights> void ReadNGram(util::FilePie weights.prob = 0.0; } for (unsigned char i = 0; i < n; ++i, ++indices_out) { - *indices_out = vocab.Index(f.ReadDelimited(kARPASpaces)); + StringPiece word(f.ReadDelimited(kARPASpaces)); + WordIndex index = vocab.Index(word); + *indices_out = index; + // Check for words mapped to <unk> that are not the string <unk>. + UTIL_THROW_IF(index == 0 /* mapped to <unk> */ && (word != StringPiece("<unk>", 5)) && (word != StringPiece("<UNK>", 5)), + FormatLoadException, "Word " << word << " was not seen in the unigrams (which are supposed to list the entire vocabulary) but appears"); } ReadBackoff(f, weights); } catch(util::Exception &e) { diff --git a/lm/test.arpa b/lm/test.arpa index ef214eae3..c4d2e6df5 100644 --- a/lm/test.arpa +++ b/lm/test.arpa @@ -105,7 +105,7 @@ ngram 5=4 -0.04835128 looking on a -0.4771212 -3 also would consider -7 -6 <unk> however <unk> -12 --7 to look good +-7 to look a \4-grams: -0.009249173 looking on a little -0.4771212 diff --git a/lm/test_nounk.arpa b/lm/test_nounk.arpa index 060733d98..e38fc8547 100644 --- a/lm/test_nounk.arpa +++ b/lm/test_nounk.arpa @@ -101,7 +101,7 @@ ngram 5=4 -0.1892331 little more loin -0.04835128 looking on a -0.4771212 -3 also would consider -7 --7 to look good +-7 to look a \4-grams: -0.009249173 looking on a little -0.4771212 diff --git a/lm/trie_sort.cc b/lm/trie_sort.cc index dc24e5b75..c3f468746 100644 --- a/lm/trie_sort.cc +++ b/lm/trie_sort.cc @@ -107,14 +107,20 @@ FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &temp_pre } struct ThrowCombine { - void operator()(std::size_t /*entry_size*/, const void * /*first*/, const void * /*second*/, FILE * /*out*/) const { - UTIL_THROW(FormatLoadException, "Duplicate n-gram 
detected."); + void operator()(std::size_t entry_size, unsigned char order, const void *first, const void *second, FILE * /*out*/) const { + const WordIndex *base = reinterpret_cast<const WordIndex*>(first); + FormatLoadException e; + e << "Duplicate n-gram detected with vocab ids"; + for (const WordIndex *i = base; i != base + order; ++i) { + e << ' ' << *i; + } + throw e; } }; // Useful for context files that just contain records with no value. struct FirstCombine { - void operator()(std::size_t entry_size, const void *first, const void * /*second*/, FILE *out) const { + void operator()(std::size_t entry_size, unsigned char /*order*/, const void *first, const void * /*second*/, FILE *out) const { util::WriteOrThrow(out, first, entry_size); } }; @@ -134,7 +140,7 @@ template <class Combine> FILE *MergeSortedFiles(FILE *first_file, FILE *second_f util::WriteOrThrow(out_file.get(), second.Data(), entry_size); ++second; } else { - combine(entry_size, order, first.Data(), second.Data(), out_file.get()); ++first; ++second; } } -- cgit v1.2.3