KenLM c1dba12

- Reject NaNs
- Fix ChartState hashing (unused in Moses)
- Expose CreateOrThrow
- Minor portability improvement in getopt
author: Kenneth Heafield <kenlm@kheafield.com> 2012-03-11 21:47:38 +0400
committer: Kenneth Heafield <kenlm@kheafield.com> 2012-03-11 21:47:38 +0400
commit: 175b7aaf495963a8dd08525094073db06686adf8 (patch)
tree: c279eee1e5a3d3d7d417ec747091e184c418b7bf /lm
parent: 0fc56ef7b602134b387b264c4b1ffe13e7ac40f8 (diff)
3 files changed, 23 insertions, 14 deletions
diff --git a/lm/left.hh b/lm/left.hh
index 41f71f849..a07f98038 100644
--- a/lm/left.hh
+++ b/lm/left.hh
@@ -112,7 +112,7 @@ inline size_t hash_value(const ChartState &state) {
   size_t hashes[2];
   hashes[0] = hash_value(state.left);
   hashes[1] = hash_value(state.right);
-  return util::MurmurHashNative(hashes, sizeof(size_t), state.full);
+  return util::MurmurHashNative(hashes, sizeof(size_t) * 2, state.full);
 }
 
 template <class M> class RuleScore {
diff --git a/lm/read_arpa.cc b/lm/read_arpa.cc
index 05f761be6..be6565992 100644
--- a/lm/read_arpa.cc
+++ b/lm/read_arpa.cc
@@ -7,6 +7,7 @@
 #include <vector>
 
 #include <ctype.h>
+#include <math.h>
 #include <string.h>
 #include <stdint.h>
 
@@ -93,7 +94,11 @@ void ReadBackoff(util::FilePiece &in, ProbBackoff &weights) {
     case '\t':
       weights.backoff = in.ReadFloat();
       if (weights.backoff == ngram::kExtensionBackoff) weights.backoff = ngram::kNoExtensionBackoff;
-      if ((in.get() != '\n')) UTIL_THROW(FormatLoadException, "Expected newline after backoff");
+      {
+        int float_class = fpclassify(weights.backoff);
+        UTIL_THROW_IF(float_class == FP_NAN || float_class == FP_INFINITE, FormatLoadException, "Bad backoff " << weights.backoff);
+      }
+      UTIL_THROW_IF((in.get() != '\n'), FormatLoadException, "Expected newline after backoff");
       break;
     case '\n':
       weights.backoff = ngram::kNoExtensionBackoff;
diff --git a/lm/read_arpa.hh b/lm/read_arpa.hh
index ab996bde7..25648d3fb 100644
--- a/lm/read_arpa.hh
+++ b/lm/read_arpa.hh
@@ -10,6 +10,8 @@
 #include <iosfwd>
 #include <vector>
 
+#include <math.h>
+
 namespace lm {
 
 void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number);
@@ -29,20 +31,26 @@ class PositiveProbWarn {
 
     explicit PositiveProbWarn(WarningAction action) : action_(action) {}
 
-    void Warn(float prob);
+    float ReadProb(util::FilePiece &f) {
+      float prob = f.ReadFloat();
+      UTIL_THROW_IF(f.get() != '\t', FormatLoadException, "Expected tab after probability");
+      UTIL_THROW_IF(isnan(prob), FormatLoadException, "NaN probability");
+      if (prob > 0.0) {
+        Warn(prob);
+        prob = 0.0;
+      }
+      return prob;
+    }
 
   private:
+    void Warn(float prob);
+
     WarningAction action_;
 };
 
 template <class Voc> void Read1Gram(util::FilePiece &f, Voc &vocab, ProbBackoff *unigrams, PositiveProbWarn &warn) {
   try {
-    float prob = f.ReadFloat();
-    if (prob > 0.0) {
-      warn.Warn(prob);
-      prob = 0.0;
-    }
-    if (f.get() != '\t') UTIL_THROW(FormatLoadException, "Expected tab after probability");
+    float prob = warn.ReadProb(f);
     ProbBackoff &value = unigrams[vocab.Insert(f.ReadDelimited(kARPASpaces))];
     value.prob = prob;
     ReadBackoff(f, value);
@@ -64,11 +72,7 @@ template <class Voc> void Read1Grams(util::FilePiece &f, std::size_t count, Voc
 // Return true if a positive log probability came out.
 template <class Voc, class Weights> void ReadNGram(util::FilePiece &f, const unsigned char n, const Voc &vocab, WordIndex *const reverse_indices, Weights &weights, PositiveProbWarn &warn) {
   try {
-    weights.prob = f.ReadFloat();
-    if (weights.prob > 0.0) {
-      warn.Warn(weights.prob);
-      weights.prob = 0.0;
-    }
+    weights.prob = warn.ReadProb(f);
     for (WordIndex *vocab_out = reverse_indices + n - 1; vocab_out >= reverse_indices; --vocab_out) {
       *vocab_out = vocab.Index(f.ReadDelimited(kARPASpaces));
     }
author	Kenneth Heafield <kenlm@kheafield.com>	2012-03-11 21:47:38 +0400
committer	Kenneth Heafield <kenlm@kheafield.com>	2012-03-11 21:47:38 +0400
commit	175b7aaf495963a8dd08525094073db06686adf8 (patch)
tree	c279eee1e5a3d3d7d417ec747091e184c418b7bf /lm
parent	0fc56ef7b602134b387b264c4b1ffe13e7ac40f8 (diff)