moses/src/DynSAInclude/RandLMCache.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197

// Copyright 2008 Abby Levenberg, David Talbot
//
// This file is part of RandLM
//
// RandLM is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// RandLM is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with RandLM.  If not, see <http://www.gnu.org/licenses/>.
#ifndef INC_RANDLM_CACHE_H
#define INC_RANDLM_CACHE_H

#include <iterator>
#include <map>
#include <ctime>
#include <iostream>

namespace randlm {
    
  template<typename T>
  class CacheNode {
  public:
    typedef std::map<wordID_t, CacheNode<T>* > childMap;
    // initialise value to 'unknown' (i.e. not yet queried or cached).
    CacheNode(T unknown_value) : value_(unknown_value) {}
    childMap childs_;  // child pointers
    T value_;  // value stored
    const void* state_;  // state pointer
  };
  
  template<typename T>
  class Cache {
  public:
    typedef typename std::map<wordID_t, CacheNode<T>* >::iterator childPtr;
    // unknown_value is used to indicate the ngram was not queried (yet)
    // null_value_ indicates it was queried but not found in model
    // space usage is handled by client.
    Cache(T unknown_value, T null_value)  : 
      cur_nodes_(0), unknown_value_(unknown_value), null_value_(null_value) {
      root_ = newNode();
    }
    ~Cache() {
      if(clear()) {
	delete root_;
	root_ = NULL;
      }  else {
	std::cerr << "Error freeing cache memory.\n";
      }
    }
    bool setCacheNgram(const wordID_t* ngram, int len, T value, const void* state) {
      // inserts full ngram into cache 
      CacheNode<T>* node = root_;
      for (int i = len - 1; i > -1; --i) {
	childPtr child = node->childs_.find(ngram[i]);
	if( child != node->childs_.end() ) {
	  // current node is already prefix. Go to child node
	  node = node->childs_[ngram[i]]; 
	}  else {
	  // no child for prefix. set new child link in current node
	  CacheNode<T> * newChild = newNode(node);
	  node->childs_[ngram[i]] = newChild;
	  // go to new node
	  node = newChild;
	}
      }
      node->value_ = value;
      node->state_ = state;
      return true; 
    }
    bool checkCacheNgram(const wordID_t* ngram, int len, T* value, const void** state) {
      // finds value for this full ngram only (returns false if full ngram not in cache)
      CacheNode<T> * node = root_;      
      for(int i = len - 1; i > -1; --i) {
	// go to deepest level node of ngram in cache 
	childPtr child = node->childs_.find(ngram[i]);
	if( child != node->childs_.end() ) {
	  // switch to child node
	  node = node->childs_[ngram[i]]; 
	} else {
	  // not cached
	  return false;
	}
      }
      *value = node->value_;
      if(state) *state = node->state_;
      return *value != null_value_ && *value != unknown_value_;
    }
    int getCache2(const wordID_t* ngram, int len, T** values, int* found) {
      // set values array to point to cache value nodes
      CacheNode<T> * node = root_;
      *found = 0;
      //values[0] = &node->value_; // pointer to root node's value
      bool all_found = true;
      for(int i = len - 1; i > -1; --i) {
	// go to deepest level node of ngram in cache 
	childPtr child = node->childs_.find(ngram[i]);
	if( child != node->childs_.end() ) {
	  // switch to child node
	  node = node->childs_[ngram[i]]; 
	  // get pointer to value (index by length - 1) 
	  values[i] = &node->value_;
	  // if null_value then assume all extensions impossible
	  if (node->value_ == null_value_) {
	    return len - 1 - i; // max length posible
          }
	  all_found = all_found && (node->value_ != unknown_value_);
	  if (all_found)
	    ++(*found);
	} else {
	  // initialise uncached values
	  CacheNode<T> * newChild = newNode(node);
	  node->childs_[ngram[i]] = newChild;
	  // go to new node
	  node = newChild;
	  values[i] = &node->value_;
	}
      }
      return len; // all possible
    }
    int getCache(const wordID_t* ngram, int len, T** values, int* found) {
      // get pointers to values for ngram and constituents.
      // returns upper bound on longest subngram in model.
      // 'found' stores longest non-null and known value found.
      CacheNode<T> * node = root_;
      *found = 0;
      values[0] = &node->value_; // pointer to root node's value
      bool all_found = true;
      for(int i = len - 1; i > -1; --i) {
	// go to deepest level node of ngram in cache 
	childPtr child = node->childs_.find(ngram[i]);
	if( child != node->childs_.end() ) {
	  // switch to child node
	  node = node->childs_[ngram[i]]; 
	  // get pointer to value (index by length - 1) 
	  values[len - i] = &node->value_;
	  // if null_value then assume all extensions impossible
	  if (node->value_ == null_value_)
	    return len - 1 - i; // max length posible
	  all_found = all_found && (node->value_ != unknown_value_);
	  if (all_found)
	    ++(*found);
	} else {
	  // initialise uncached values
	  CacheNode<T> * newChild = newNode(node);
	  node->childs_[ngram[i]] = newChild;
	  // go to new node
	  node = newChild;
	  values[len - i] = &node->value_;
	}
      }
      return len; // all possible
    }
    bool clear() {
      std::cerr << "Clearing cache with " << static_cast<float>(cur_nodes_ * nodeSize()) 
	 / static_cast<float>(1ull << 20) << "MB" << std::endl;
      return clearNodes(root_);
    }
    int nodes() {
      // returns number of nodes
      return cur_nodes_;
    }
    int nodeSize() {
      return sizeof(CacheNode<T>) + sizeof(root_->childs_);
    }
    private:
    CacheNode<T> * root_;
    count_t cur_nodes_; 
    T unknown_value_; // Used to initialise data at each node
    T null_value_; // Indicates cached something not in model
    CacheNode<T>* newNode(CacheNode<T> * node = 0) {
      ++cur_nodes_;
      return new CacheNode<T>(unknown_value_);
    }
    bool clearNodes(CacheNode<T> * node) {
      //delete children from this node
      if(!node->childs_.empty()) {
	iterate(node->childs_, itr) {
	  if(!clearNodes(itr->second)) 
	    std::cerr << "Error emptying cache\n";
	  delete itr->second;
	  --cur_nodes_;
	}
	node->childs_.clear();
      }
      return true;
    }

  };
} //end namespace
#endif //INC_RANDLM_CACHE_H