vocabulary no-copy find_string pair-char-ptrs

author: graehl <graehl@gmail.com> 2015-06-25 10:45:09 +0300
committer: graehl <graehl@gmail.com> 2015-06-25 11:31:16 +0300
commit: a3dd9155bab174643fe358c1041f881fd98474e8 (patch)
tree: f8781d8cb1191b03ef7f0bee0c98bd376ce08005
parent: 589a79fa094a1d36dfa110d2bbb58be078c7a05b (diff)
7 files changed, 299 insertions, 170 deletions
diff --git a/.gitignore b/.gitignore
index 23c4020..12fab12 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@ src/prepareNeuralTM
 src/testNeuralLM
 src/testNeuralNetwork
 src/trainNeuralNetwork
+.history
diff --git a/src/find_string.hpp b/src/find_string.hpp
new file mode 100644
index 0000000..fd612fd
--- /dev/null
+++ b/src/find_string.hpp
@@ -0,0 +1,77 @@
+/** \file \author Jonathan Graehl <graehl@gmail.com>
+
+    find_string(boost::unordered_map<std::string, ...>, pair<char const*, char
+    const*>): the pair is a [begin, end) key. Equivalent to
+    map.find(std::string(key.first, key.second)), but read-only, since
+    unordered_map doesn't support lazy construction of a string from a pair key.
+
+    To the extent possible under law, the author(s) have dedicated all copyright
+    and related and neighboring rights to this software to the public domain
+    worldwide. This software is distributed without any warranty.
+*/
+
+#ifndef FIND_STRING_GRAEHL_2015_06_24_HPP
+#define FIND_STRING_GRAEHL_2015_06_24_HPP
+#pragma once
+
+#include <utility>
+#include <algorithm>
+#include <cstddef>
+#include <boost/functional/hash.hpp>
+
+namespace std {
+/// we do not change standard semantics of any supported comparison e.g. pair vs
+/// pair, but simply allow string to be compared against a [begin, end) pair of char pointers.
+inline bool operator==(std::string const& str, std::pair<char const*, char const*> slice) {
+  return str.size() == static_cast<std::size_t>(slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin());  // explicit cast: no signed/unsigned compare; assumes first <= second
+}
+inline bool operator==(std::pair<char const*, char const*> slice, std::string const& str) {
+  return str.size() == static_cast<std::size_t>(slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin());
+}
+inline bool operator==(std::string const& str, std::pair<char*, char*> slice) {
+  return str.size() == static_cast<std::size_t>(slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin());
+}
+inline bool operator==(std::pair<char*, char*> slice, std::string const& str) {
+  return str.size() == static_cast<std::size_t>(slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin());
+}
+/// technically not allowed (adds overloads to namespace std) but easiest route to ADL. we could rename these instead.
+inline std::size_t hash_value(std::pair<char const*, char const*> slice) {
+  return boost::hash_range(slice.first, slice.second);  // hashes the characters in [first, second)
+}
+inline std::size_t hash_value(std::pair<char*, char*> slice) {
+  return boost::hash_range(slice.first, slice.second);
+}
+inline std::size_t hash_value(std::string const& str) {
+  return boost::hash_range(str.begin(), str.end());  // NOTE(review): presumably must agree with the map's own string hasher — confirm
+}
+}
+
+struct slice_or_string_eq {  // equality functor: forwards to operator== for any pair of operand types
+  typedef bool result_type;
+  template <class A, class B>
+  bool operator()(A const& a, B const& b) const {
+    return a == b;  // string-vs-slice operands resolve to the std::operator== overloads declared above
+  }
+};
+
+struct slice_or_string_hash {  // hash functor: forwards to an unqualified hash_value call
+  typedef std::size_t result_type;
+  template <class Slice>
+  std::size_t operator()(Slice const& slice) const {
+    return hash_value(slice);  // unqualified on purpose: ADL selects the hash_value overloads in namespace std
+  }
+};
+
+/// \return map.find(std::string(key.first, key.second)) but faster (const map: yields a const_iterator)
+template <class UnorderedMap, class Slice>
+typename UnorderedMap::const_iterator find_string(UnorderedMap const& map, Slice const& key) {
+  return map.find(key, slice_or_string_hash(), slice_or_string_eq());  // lookup by key with the hash and equality functors above
+}
+
+/// \return map.find(std::string(key.first, key.second)) but faster (mutable map: yields an iterator)
+template <class UnorderedMap, class Slice>
+typename UnorderedMap::iterator find_string(UnorderedMap& map, Slice const& key) {
+  return map.find(key, slice_or_string_hash(), slice_or_string_eq());  // lookup by key with the hash and equality functors above
+}
+
+#endif
diff --git a/src/neuralLM.h b/src/neuralLM.h
index c18485f..5febaeb 100644
--- a/src/neuralLM.h
+++ b/src/neuralLM.h
@@ -11,6 +11,7 @@
 #include "util.h"
 #include "vocabulary.h"
 #include "neuralNetwork.h"
+#include "replace_digits.hpp"
 
 /*
   To do:
@@ -20,17 +21,16 @@
 namespace nplm
 {
 
-class neuralLM : public neuralNetwork
+class neuralLM : public neuralNetwork, graehl::replace_digits
 {
-  char map_digits;
   boost::shared_ptr<vocabulary> vocab;
   int start, null;
 
  public:
   neuralLM()
       : neuralNetwork(),
-        vocab(new vocabulary()),
-        map_digits(0)
+        graehl::replace_digits(0),
+        vocab(new vocabulary())
   {
   }
 
@@ -45,25 +45,35 @@ class neuralLM : public neuralNetwork
 
   const vocabulary &get_vocabulary() const { return *(this->vocab); }
 
+
   int lookup_word(const std::string &word) const
   {
     if (map_digits)
-      for (int i=0; i<word.length(); i++)
-        if (isdigit(word[i]))
-        {
+      for (int i=0, n=word.size(); i<n; ++i)
+        if (graehl::ascii_digit(word[i])) {
           std::string mapped_word(word);
-          for (; i<word.length(); i++)
-            if (isdigit(word[i]))
-              mapped_word[i] = map_digits;
+          replace(mapped_word, i);
           return vocab->lookup_word(mapped_word);
         }
     return vocab->lookup_word(word);
   }
 
+  int lookup_word(std::pair<char const*, char const*> slice) const  // word id for the [first, second) char range
+  {
+    if (map_digits)  // digit mapping enabled (non-zero replacement char)
+      for (char const* i = slice.first; i != slice.second; ++i)
+        if (graehl::ascii_digit(*i)) {  // first digit found: only now copy the slice into a string
+          std::string mapped_word(slice.first, slice.second);
+          replace(mapped_word, i - slice.first);  // rewrite digits from the first digit's offset onward
+          return vocab->lookup_word(mapped_word);
+        }
+    return vocab->lookup_word(slice);  // no digit found (or mapping off): look up the slice directly
+  }
+
   double lookup_ngram(const int *ngram_a, int n)
   {
     Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size);
-    for (int i=0; i<m->ngram_size; i++)
+    for (int i=0; i<m->ngram_size; ++i)
     {
       if (i-m->ngram_size+n < 0)
       {
@@ -114,7 +124,7 @@ void addStartStop(std::vector<T> &input, std::vector<T> &output, int ngram_size,
 {
   output.clear();
   output.resize(input.size()+ngram_size);
-  for (int i=0; i<ngram_size-1; i++)
+  for (int i=0; i<ngram_size-1; ++i)
     output[i] = start;
   std::copy(input.begin(), input.end(), output.begin()+ngram_size-1);
   output[output.size()-1] = stop;
@@ -168,7 +178,7 @@ inline void preprocessWords(const std::vector<std::string> &words,
   else {
     if (nums.size() != ngram_size)
     {
-      std::cerr << "error: wrong number of fields in line" << std::endl;
+      std::cerr << "error: wrong number of fields in line\n";
       std::exit(1);
     }
     ngrams.push_back(nums);
diff --git a/src/neuralTM.h b/src/neuralTM.h
index 4ad6752..4c3db51 100644
--- a/src/neuralTM.h
+++ b/src/neuralTM.h
@@ -11,120 +11,134 @@
 #include "util.h"
 #include "vocabulary.h"
 #include "neuralNetwork.h"
+#include "replace_digits.hpp"
 
 namespace nplm
 {
 
-class neuralTM : public neuralNetwork
+class neuralTM : public neuralNetwork, graehl::replace_digits
 {
-    char map_digits;
-    boost::shared_ptr<vocabulary> input_vocab, output_vocab;
-    int start, null;
+  boost::shared_ptr<vocabulary> input_vocab, output_vocab;
+  int start, null;
 
-public:
-    neuralTM() 
+ public:
+  neuralTM()
       : neuralNetwork(),
-        map_digits(0),
+        graehl::replace_digits(0),
         input_vocab(new vocabulary()),
         output_vocab(new vocabulary())
-    { 
-    }
-
-    void set_map_digits(char value) { map_digits = value; }
-
-    void set_input_vocabulary(const vocabulary &vocab)
-    {
-        *(this->input_vocab) = vocab;
-        start = vocab.lookup_word("<s>");
-        null = vocab.lookup_word("<null>");
-    }
-
-    void set_output_vocabulary(const vocabulary &vocab)
-    {
-        *(this->output_vocab) = vocab;
-    }
-
-    const vocabulary &get_input_vocabulary() const { return *(this->input_vocab); }
-    const vocabulary &get_output_vocabulary() const { return *(this->output_vocab); }
-
-    int lookup_input_word(const std::string &word) const
-    {
-        if (map_digits)
-	    for (int i=0; i<word.length(); i++)
-	        if (isdigit(word[i]))
-		{
-		    std::string mapped_word(word);
-		    for (; i<word.length(); i++)
-		        if (isdigit(word[i]))
-			    mapped_word[i] = map_digits;
-		    return input_vocab->lookup_word(mapped_word);
-		}
-        return input_vocab->lookup_word(word);
-    }
-
-    int lookup_output_word(const std::string &word) const
-    {
-        if (map_digits)
-	    for (int i=0; i<word.length(); i++)
-	        if (isdigit(word[i]))
-		{
-		    std::string mapped_word(word);
-		    for (; i<word.length(); i++)
-		        if (isdigit(word[i]))
-			    mapped_word[i] = map_digits;
-		    return output_vocab->lookup_word(mapped_word);
-		}
-	return output_vocab->lookup_word(word);
-    }
-
-    double lookup_ngram(const int *ngram_a, int n)
-    {
-        Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size);
-	for (int i=0; i<m->ngram_size; i++)
-	{
-	    if (i-m->ngram_size+n < 0)
-	    {
-		if (ngram_a[0] == start)
-		    ngram(i) = start;
-		else
-		    ngram(i) = null;
-	    }
-	    else
-	    {
-	        ngram(i) = ngram_a[i-m->ngram_size+n];
-	    }
-	}
-	return neuralNetwork::lookup_ngram(ngram);
-    }
-
-    double lookup_ngram(const std::vector<int> &ngram_v)
-    {
-        return lookup_ngram(ngram_v.data(), ngram_v.size());
-    }
-
-    template <typename Derived>
-    double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
-    {
-        return neuralNetwork::lookup_ngram(ngram);
-    }
-    
-    template <typename DerivedA, typename DerivedB>
-    void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const)
-    {
-        return neuralNetwork::lookup_ngram(ngram, log_probs_const);
-    }
-
-    void read(const std::string &filename)
+  {
+  }
+
+  void set_map_digits(char value) { map_digits = value; }
+
+  void set_input_vocabulary(const vocabulary &vocab)
+  {
+    *(this->input_vocab) = vocab;
+    start = vocab.lookup_word("<s>");
+    null = vocab.lookup_word("<null>");
+  }
+
+  void set_output_vocabulary(const vocabulary &vocab)
+  {
+    *(this->output_vocab) = vocab;
+  }
+
+  const vocabulary &get_input_vocabulary() const { return *(this->input_vocab); }
+  const vocabulary &get_output_vocabulary() const { return *(this->output_vocab); }
+
+  int lookup_word(const std::string &word, vocabulary const& vocab) const  // word id in vocab, after optional digit mapping
+  {
+    if (map_digits)  // digit mapping enabled (non-zero replacement char)
+      for (int i=0, n=word.size(); i<n; ++i)
+        if (graehl::ascii_digit(word[i])) {  // copy the word only once a digit is actually found
+          std::string mapped_word(word);
+          replace(mapped_word, i);  // rewrite digits from the first digit's offset onward
+          return vocab.lookup_word(mapped_word);  // vocab is a reference, so member access uses '.', not '->'
+        }
+    return vocab.lookup_word(word);
+  }
+
+  int lookup_word(std::pair<char const*, char const*> slice, vocabulary const& vocab) const  // word id in vocab for the [first, second) char range
+  {
+    if (map_digits)  // digit mapping enabled (non-zero replacement char)
+      for (char const* i = slice.first; i != slice.second; ++i)
+        if (graehl::ascii_digit(*i)) {  // first digit found: only now copy the slice into a string
+          std::string mapped_word(slice.first, slice.second);
+          replace(mapped_word, i - slice.first);  // rewrite digits from the first digit's offset onward
+          return vocab.lookup_word(mapped_word);  // vocab is a reference, so member access uses '.', not '->'
+        }
+    return vocab.lookup_word(slice);  // no digit found (or mapping off): look up the slice directly
+  }
+
+  int lookup_input_word(const std::string &word) const  // forwards to lookup_word with the input vocabulary
+  {
+    return lookup_word(word, *input_vocab);
+  }
+
+  int lookup_output_word(const std::string &word) const  // forwards to lookup_word with the output vocabulary
+  {
+    return lookup_word(word, *output_vocab);
+  }
+
+  int lookup_input_word(std::pair<char const*, char const*> word) const  // [first, second) slice form; forwards with the input vocabulary
+  {
+    return lookup_word(word, *input_vocab);
+  }
+
+  int lookup_output_word(std::pair<char const*, char const*> word) const  // [first, second) slice form; forwards with the output vocabulary
+  {
+    return lookup_word(word, *output_vocab);
+  }
+
+  double lookup_ngram(const int *ngram_a, int n)
+  {
+    Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size);
+    for (int i=0; i<m->ngram_size; i++)
     {
-        std::vector<std::string> input_words;
-        std::vector<std::string> output_words;
-        m->read(filename, input_words, output_words);
-        set_input_vocabulary(vocabulary(input_words));
-        set_output_vocabulary(vocabulary(output_words));
-        resize();
-	// this is faster but takes more memory
-        //m->premultiply();
+      if (i-m->ngram_size+n < 0)
+      {
+        if (ngram_a[0] == start)
+          ngram(i) = start;
+        else
+          ngram(i) = null;
+      }
+      else
+      {
+        ngram(i) = ngram_a[i-m->ngram_size+n];
+      }
     }
+    return neuralNetwork::lookup_ngram(ngram);
+  }
+
+  double lookup_ngram(const std::vector<int> &ngram_v)
+  {
+    return lookup_ngram(ngram_v.data(), ngram_v.size());
+  }
+
+  template <typename Derived>
+  double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
+  {
+    return neuralNetwork::lookup_ngram(ngram);
+  }
+
+  template <typename DerivedA, typename DerivedB>
+  void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const)
+  {
+    return neuralNetwork::lookup_ngram(ngram, log_probs_const);
+  }
+
+  void read(const std::string &filename)
+  {
+    std::vector<std::string> input_words;
+    std::vector<std::string> output_words;
+    m->read(filename, input_words, output_words);
+    set_input_vocabulary(vocabulary(input_words));
+    set_output_vocabulary(vocabulary(output_words));
+    resize();
+    // this is faster but takes more memory
+    //m->premultiply();
+  }
 
 };
 
diff --git a/src/replace_digits.hpp b/src/replace_digits.hpp
new file mode 100644
index 0000000..e8ac957
--- /dev/null
+++ b/src/replace_digits.hpp
@@ -0,0 +1,62 @@
+/** \file \author Jonathan Graehl <graehl@gmail.com>
+
+    replace 0-9 ascii chars with another ascii replacement
+
+    To the extent possible under law, the author(s) have dedicated all copyright
+    and related and neighboring rights to this software to the public domain
+    worldwide. This software is distributed without any warranty.
+*/
+
+#ifndef REPLACEDIGITS_GRAEHL_2015_06_25_H
+#define REPLACEDIGITS_GRAEHL_2015_06_25_H
+#pragma once
+
+#include <string>
+#include <utility>
+
+namespace graehl {
+
+inline bool ascii_digit(char c) {  // true only for the ASCII characters '0'..'9'
+  return c >= '0' && c <= '9';
+}
+
+struct replace_digits {  // rewrites ASCII digits in a character range to one replacement character
+  char map_digits;  // replacement character; 0 turns the maybe_* variants into no-ops
+  replace_digits(char map_digits = '@') : map_digits(map_digits) {}
+
+  /// \return whether anything was replaced
+  bool replaced(char* i, char* end) const {
+    for (; i != end; ++i)
+      if (ascii_digit(*i)) {
+        *i = map_digits;
+        while (++i != end)  // first digit already handled; finish the rest of the range here
+          if (ascii_digit(*i)) *i = map_digits;
+        return true;
+      }
+    return false;
+  }
+  /// maybe: only if non-0 map_digits, do the thing
+  bool maybe_replaced(char* i, char* end) const { return map_digits && replaced(i, end); }
+
+  void replace(char* i, char* end) const {  // in-place replacement over [i, end)
+    for (; i != end; ++i)
+      if (ascii_digit(*i)) *i = map_digits;
+  }
+  void maybe_replace(char* i, char* end) const {
+    if (map_digits) replace(i, end);
+  }
+
+  void replace(std::string& str, std::string::size_type i = 0) const {  // replace digits in str from offset i onward
+    std::string::size_type n = str.size();
+    if (i >= n) return;  // nothing at or past offset i; also stops an out-of-range i from scanning past the buffer
+    replace(&str[0] + i, &str[0] + n);  // str is non-empty here; &str[0] is already mutable, so no const-cast (contiguity official since C++11)
+  }
+  void maybe_replace(std::string& str, std::string::size_type i = 0) const {
+    if (map_digits) replace(str, i);
+  }
+};
+
+
+}
+
+#endif
diff --git a/src/types.hpp b/src/types.hpp
deleted file mode 100644
index 08b010f..0000000
--- a/src/types.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef TYPES_HPP
-#define TYPES_HPP
-
-#include <cmath>
-#include <string>
-#include <vector>
-#include <boost/cstdint.hpp>
-#include <limits>
-
-namespace biglm{
-
-typedef double weight_type;
-const weight_type IMPOSSIBLE = -HUGE_VAL;
-
-typedef unsigned long block_type;
-const size_t bits_per_block = (std::numeric_limits<block_type>::digits);
-  //typedef std::size_t size_type;
-typedef boost::uint64_t size_type;
-typedef unsigned char byte_type;
-
-template<typename T>
-struct bytes {
-  static const byte_type *data(const T& key) { return reinterpret_cast<const byte_type *>(&key); }
-  static size_type size(const T& key) { return sizeof(T); }
-};
-
-template<>
-struct bytes<std::string> {
-  static const byte_type *data(const std::string& key) { return reinterpret_cast<const byte_type *>(key.data()); }
-  static size_type size(const std::string& key) { return key.size(); }
-};
-
-template<typename U>
-struct bytes<std::vector<U> > {
-  static const byte_type *data(const std::vector<U>& key) { return reinterpret_cast<const byte_type *>(&key[0]); }
-  static size_type size(const std::vector<U>& key) { return key.size() * sizeof(U); }
-};
-
-} //namespace nplm
-
-#endif
diff --git a/src/vocabulary.h b/src/vocabulary.h
index fe08d86..c8cd518 100644
--- a/src/vocabulary.h
+++ b/src/vocabulary.h
@@ -5,6 +5,9 @@
 #include <string>
 #include <queue>
 #include <boost/unordered_map.hpp>
+#include "find_string.hpp"
+
+#define NPLM_HAVE_FIND_STRING_PIECE 1
 
 namespace nplm
 {
@@ -17,7 +20,8 @@ struct compare_second
 
 class vocabulary {
   std::vector<std::string> m_words;
-  boost::unordered_map<std::string, int> m_index;
+  typedef boost::unordered_map<std::string, int> WordId;
+  WordId m_index;
   int unk;
 
  public:
@@ -37,21 +41,24 @@ class vocabulary {
 
   int lookup_word(const std::string &word) const
   {
-    boost::unordered_map<std::string, int>::const_iterator pos = m_index.find(word);
-    if (pos != m_index.end())
-      return pos->second;
-    else
-      return unk;
+    return lookup_word(word, unk);
   }
 
   // lookup word using custom unknown-word id
-  int lookup_word(const std::string &word, int unk) const
+  int lookup_word(const std::string &word, int unkid) const  // id stored for word, or unkid when word is not in m_index
+  {
+    WordId::const_iterator pos = m_index.find(word);
+    return pos == m_index.end() ? unkid : pos->second;
+  }
+
+  int lookup_word(std::pair<char const*, char const*> slice) const {  // slice lookup using the vocabulary's own unk id as the fallback
+    return lookup_word(slice, unk);
+  }
+
+  int lookup_word(std::pair<char const*, char const*> slice, int unkid) const
   {
-    boost::unordered_map<std::string, int>::const_iterator pos = m_index.find(word);
-    if (pos != m_index.end())
-      return pos->second;
-    else
-      return unk;
+    WordId::const_iterator pos = find_string(m_index, slice);
+    return pos == m_index.end() ? unkid : pos->second;
   }
 
   int insert_word(const std::string &word)
@@ -89,7 +96,6 @@ class vocabulary {
 
   const std::vector<std::string> &words() const { return m_words; }
 
-  const boost::unordered_map<std::string, int>& get_idmap() const { return m_index; }
 };
 
 } // namespace nplm
author	graehl <graehl@gmail.com>	2015-06-25 10:45:09 +0300
committer	graehl <graehl@gmail.com>	2015-06-25 11:31:16 +0300
commit	a3dd9155bab174643fe358c1041f881fd98474e8 (patch)
tree	f8781d8cb1191b03ef7f0bee0c98bd376ce08005
parent	589a79fa094a1d36dfa110d2bbb58be078c7a05b (diff)