From 02ab8f5102122ce6368b939161d7bc53de1fa78d Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Wed, 27 Aug 2014 23:23:39 -0400 Subject: Bugfix / Stephan Peitz and more paranoid error checking --- lm/model_test.cc | 2 +- lm/read_arpa.hh | 30 +++++++++++++++--------------- lm/test.arpa | 2 +- lm/test_nounk.arpa | 2 +- lm/trie_sort.cc | 14 ++++++++++---- 5 files changed, 28 insertions(+), 22 deletions(-) (limited to 'lm') diff --git a/lm/model_test.cc b/lm/model_test.cc index 7005b05ea..0f54724bb 100644 --- a/lm/model_test.cc +++ b/lm/model_test.cc @@ -176,7 +176,7 @@ template <class M> void MinimalState(const M &model) { AppendTest("to", 1, -1.687872, false); AppendTest("look", 2, -0.2922095, true); BOOST_CHECK_EQUAL(2, state.length); - AppendTest("good", 3, -7, true); + AppendTest("a", 3, -7, true); } template <class M> void ExtendLeftTest(const M &model) { diff --git a/lm/read_arpa.hh b/lm/read_arpa.hh index 213fe1caa..64eeef306 100644 --- a/lm/read_arpa.hh +++ b/lm/read_arpa.hh @@ -41,29 +41,24 @@ class PositiveProbWarn { WarningAction action_; }; -template <class Weights> StringPiece Read1Gram(util::FilePiece &f, Weights &weights, PositiveProbWarn &warn) { +template <class Voc, class Weights> void Read1Gram(util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) { try { - weights.prob = f.ReadFloat(); - if (weights.prob > 0.0) { - warn.Warn(weights.prob); - weights.prob = 0.0; + float prob = f.ReadFloat(); + if (prob > 0.0) { + warn.Warn(prob); + prob = 0.0; } UTIL_THROW_IF(f.get() != '\t', FormatLoadException, "Expected tab after probability"); - StringPiece ret(f.ReadDelimited(kARPASpaces)); - ReadBackoff(f, weights); - return ret; + WordIndex word = vocab.Insert(f.ReadDelimited(kARPASpaces)); + Weights &w = unigrams[word]; + w.prob = prob; + ReadBackoff(f, w); } catch(util::Exception &e) { e << " in the 1-gram at byte " << f.Offset(); throw; } } -template <class Voc, class Weights> void Read1Gram(util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) { - Weights temp; - WordIndex word = 
vocab.Insert(Read1Gram(f, temp, warn)); unigrams[word] = temp; } - template <class Voc, class Weights> void Read1Grams(util::FilePiece &f, std::size_t count, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) { ReadNGramHeader(f, 1); for (std::size_t i = 0; i < count; ++i) { @@ -81,7 +76,12 @@ template <class Voc, class Weights> void ReadNGram(util::FilePie weights.prob = 0.0; } for (unsigned char i = 0; i < n; ++i, ++indices_out) { - *indices_out = vocab.Index(f.ReadDelimited(kARPASpaces)); + StringPiece word(f.ReadDelimited(kARPASpaces)); + WordIndex index = vocab.Index(word); + *indices_out = index; + // Check for words mapped to <unk> that are not the string <unk>. + UTIL_THROW_IF(index == 0 /* mapped to <unk> */ && (word != StringPiece("<unk>", 5)) && (word != StringPiece("<UNK>", 5)), + FormatLoadException, "Word " << word << " was not seen in the unigrams (which are supposed to list the entire vocabulary) but appears"); } ReadBackoff(f, weights); } catch(util::Exception &e) { diff --git a/lm/test.arpa b/lm/test.arpa index ef214eae3..c4d2e6df5 100644 --- a/lm/test.arpa +++ b/lm/test.arpa @@ -105,7 +105,7 @@ ngram 5=4 -0.04835128 looking on a -0.4771212 -3 also would consider -7 -6 <unk> however <unk> -12 --7 to look good +-7 to look a \4-grams: -0.009249173 looking on a little -0.4771212 diff --git a/lm/test_nounk.arpa b/lm/test_nounk.arpa index 060733d98..e38fc8547 100644 --- a/lm/test_nounk.arpa +++ b/lm/test_nounk.arpa @@ -101,7 +101,7 @@ ngram 5=4 -0.1892331 little more loin -0.04835128 looking on a -0.4771212 -3 also would consider -7 --7 to look good +-7 to look a \4-grams: -0.009249173 looking on a little -0.4771212 diff --git a/lm/trie_sort.cc b/lm/trie_sort.cc index dc24e5b75..c3f468746 100644 --- a/lm/trie_sort.cc +++ b/lm/trie_sort.cc @@ -107,14 +107,20 @@ FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &temp_pre } struct ThrowCombine { - void operator()(std::size_t /*entry_size*/, const void * /*first*/, const void * /*second*/, FILE * /*out*/) const { - UTIL_THROW(FormatLoadException, "Duplicate n-gram 
detected."); + void operator()(std::size_t entry_size, unsigned char order, const void *first, const void *second, FILE * /*out*/) const { + const WordIndex *base = reinterpret_cast<const WordIndex*>(first); + FormatLoadException e; + e << "Duplicate n-gram detected with vocab ids"; + for (const WordIndex *i = base; i != base + order; ++i) { + e << ' ' << *i; + } + throw e; } }; // Useful for context files that just contain records with no value. struct FirstCombine { - void operator()(std::size_t entry_size, const void *first, const void * /*second*/, FILE *out) const { + void operator()(std::size_t entry_size, unsigned char /*order*/, const void *first, const void * /*second*/, FILE *out) const { util::WriteOrThrow(out, first, entry_size); } }; @@ -134,7 +140,7 @@ template <class Combine> FILE *MergeSortedFiles(FILE *first_file, FILE *second_f util::WriteOrThrow(out_file.get(), second.Data(), entry_size); ++second; } else { - combine(entry_size, order, first.Data(), second.Data(), out_file.get()); ++first; ++second; } } -- cgit v1.2.3