/* NO OVERLAY VALUES STORED IN SEPERATE FILTER */ #ifndef INC_PERFECTHASH_H #define INC_PERFECTHASH_H #include #include #include "hash.h" #include "RandLMFilter.h" #include "quantizer.h" /* * PerfectHash handles setting up hash functions and storage * for LM data. */ using randlm::Filter; using randlm::BitFilter; typedef std::map hpDict_t; typedef hpDict_t::iterator hpdEntry_t; static count_t collisions_ = 0; /* Based on Mortenson et. al. 2006 */ template class PerfectHash { public: PerfectHash(uint16_t MBs, int width, int bucketRange, float qBase); PerfectHash(FileHandler* fin) { CHECK(fin != 0); } virtual ~PerfectHash(); void analyze(); count_t hpDictMemUse(); count_t bucketsMemUse(); protected: Filter* filter_; Filter* values_; hpDict_t dict_; uint64_t cells_; count_t hitMask_; int totBuckets_; uint8_t bucketRange_; uint8_t* idxTracker_; uint64_t insert(const wordID_t* IDs, const int len, const count_t value); bool update(const wordID_t* IDs, const int len, const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx); bool update2(const wordID_t* IDs, const int len, const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx); int query(const wordID_t* IDs, const int len, hpdEntry_t& hpdAddr, uint64_t& filterIdx); virtual void remove(const wordID_t* IDs, const int len); void remove(uint64_t index); void save(FileHandler* fout); void load(FileHandler* fin); virtual void markQueried(const uint64_t&)=0; //pointer to a specific entry in a hpDict_t virtual void markQueried(hpdEntry_t&)=0; private: T nonZeroSignature(const wordID_t* IDs, const int len, count_t bucket); string hpDictKeyValue(const wordID_t* IDs, const int len); uint64_t memBound_; // total memory bound in bytes uint16_t cellWidth_; // in bits UnivHash_linear* bucketHash_; UnivHash_linear* fingerHash_; LogQtizer* qtizer_; }; template PerfectHash::PerfectHash(uint16_t MBs, int width, int bucketRange, float qBase): hitMask_(1 << 31), memBound_(MBs * (1ULL << 20)), cellWidth_(width) { bucketRange_ = static_cast(bucketRange); if(bucketRange > 255) { cerr << "ERROR: Max bucket range is > 2^8\n"; exit(1); } qtizer_ = new LogQtizer(qBase); int valBits = (int)ceil(log2((float)qtizer_->maxcode())); cerr << "BITS FOR VALUES ARRAY = " << valBits << endl; uint64_t totalBits = memBound_ << 3; cells_ = (uint64_t) ceil((float)totalBits / (float)(cellWidth_ + valBits)); // upper bound on cells cells_ += (cells_ % bucketRange_); // make cells multiple of bucket range totBuckets_ = (cells_ / bucketRange_) - 1; // minus 1 so totBuckets * bucksize + bucksize = cells filter_ = new Filter(cells_, cellWidth_); values_ = new Filter(cells_, valBits); idxTracker_ = new uint8_t[totBuckets_]; for(int i=0; i < totBuckets_; ++i) idxTracker_[i] = 0; // initialize ranges for each hash function bucketHash_ = new UnivHash_linear(totBuckets_, 1, PRIME); fingerHash_ = new UnivHash_linear(pow(2.0f, cellWidth_), MAX_HASH_FUNCS, PRIME); } template PerfectHash::~PerfectHash() { delete[] idxTracker_; delete filter_; filter_ = NULL; delete fingerHash_; delete bucketHash_; delete qtizer_; delete values_; } template uint64_t PerfectHash::insert(const wordID_t* IDs, const int len, const count_t value) { count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len)); if(idxTracker_[bucket] < (int)bucketRange_) { // if empty rows // restriction on fprint value is non-zero T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); uint64_t emptyidx = cells_ + 1; uint64_t index = bucket * bucketRange_, // starting bucket row lastrow = index + bucketRange_; // ending row while(index < lastrow) { // unique so check each row for "matching" signature T filterVal = filter_->read(index); if((filterVal == 0) && (emptyidx == cells_ + 1)) { // record first empty row emptyidx = index; } else if(filterVal == fp) { ++collisions_; dict_[hpDictKeyValue(IDs, len)] = value; // store exact in hpd return cells_ + 1; // finished } ++index; } CHECK((emptyidx < index) && (filter_->read(emptyidx) == 0)); // should have found empty index if it gets here T code = (T)qtizer_->code(value); filter_->write(emptyidx, fp); // insert the fprint values_->write(emptyidx, code); ++idxTracker_[bucket]; // keep track of bucket size return emptyidx; } else { // bucket is full dict_[hpDictKeyValue(IDs, len)] = value; // add to hpd return cells_ + 1; } } template bool PerfectHash::update(const wordID_t* IDs, const int len, const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx) { // check if key is in high perf. dictionary filterIdx = cells_ + 1; string skey = hpDictKeyValue(IDs, len); if((hpdAddr = dict_.find(skey)) != dict_.end()) { hpdAddr->second = value; return true; } // else hash ngram //count_t bucket = bucketHash_->hash(IDs, len); count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len)); // restriction on fprint value is non-zero T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); uint64_t index = bucket * bucketRange_, // starting bucket row lastrow = index + bucketRange_; while(index < lastrow) { // must check each row for matching fp event T filterVal = filter_->read(index); if(filterVal == fp) { // found event w.h.p. values_->write(index, (T)qtizer_->code(value)); filterIdx = index; return true; } ++index; } // could add if it gets here. return false; } template int PerfectHash::query(const wordID_t* IDs, const int len, hpdEntry_t& hpdAddr, uint64_t& filterIdx) { // check if key is in high perf. dictionary string skey = hpDictKeyValue(IDs, len); if((hpdAddr = dict_.find(skey)) != dict_.end()) { filterIdx = cells_ + 1; return(hpdAddr->second); // returns copy of value } else { // check if key is in filter // get bucket //count_t bucket = bucketHash_->hash(IDs, len); count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len)); // restriction on fprint value is non-zero T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); // return value if ngram is in filter uint64_t index = bucket * bucketRange_, lastrow = index + bucketRange_; for(; index < lastrow; ++index) { if(filter_->read(index) == fp) { //cout << "fp = " << fp << "\tbucket = " << bucket << "\tfilter =" << //filter_->read(index) << "\tcode = " << code << endl; filterIdx = index; hpdAddr = dict_.end(); return (int)qtizer_->value(values_->read(index)); } } } return -1; } template void PerfectHash::remove(const wordID_t* IDs, const int len) { // delete key if in high perf. dictionary string skey = hpDictKeyValue(IDs, len); if(dict_.find(skey) != dict_.end()) dict_.erase(skey); else { // check if key is in filter // get small representation for ngrams //count_t bucket = bucketHash_->hash(IDs, len); count_t bucket = (bucketHash_->size() > 1? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len)); // retrieve non zero fingerprint for ngram T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); // return value if ngram is in filter uint64_t index = bucket * bucketRange_, lastrow = index + bucketRange_; for(; index < lastrow; ++index) { if(filter_->read(index) == fp) { filter_->write(index, 0); values_->write(index, 0); --idxTracker_[bucket]; // track bucket size reduction break; } } } } template // clear filter index void PerfectHash::remove(uint64_t index) { CHECK(index < cells_); CHECK(filter_->read(index) != 0); // slow filter_->write(index, 0); values_->write(index, 0); //reduce bucket size count_t bucket = index / bucketRange_; --idxTracker_[bucket]; } template T PerfectHash::nonZeroSignature(const wordID_t* IDs, const int len, count_t bucket) { count_t h = bucket; T fingerprint(0); do { fingerprint = fingerHash_->hash(IDs, len, h); h += (h < fingerHash_->size() - 1 ? 1 : -h); // wrap around } while((fingerprint == 0) && (h != bucket)); if(fingerprint == 0) cerr << "WARNING: Unable to find non-zero signature for ngram\n" << endl; return fingerprint; } template string PerfectHash::hpDictKeyValue(const wordID_t* IDs, const int len) { string skey(" "); for(int i = 0; i < len; ++i) skey += Utils::IntToStr(IDs[i]) + "¬"; Utils::trim(skey); return skey; } template count_t PerfectHash::hpDictMemUse() { // return hpDict memory usage in MBs return (count_t) sizeof(hpDict_t::value_type)* dict_.size() >> 20; } template count_t PerfectHash::bucketsMemUse() { // return bucket memory usage in MBs return (count_t) (filter_->size() + values_->size()); } template void PerfectHash::save(FileHandler* fout) { CHECK(fout != 0); cerr << "\tSaving perfect hash parameters...\n"; fout->write((char*)&hitMask_, sizeof(hitMask_)); fout->write((char*)&memBound_, sizeof(memBound_)); fout->write((char*)&cellWidth_, sizeof(cellWidth_)); fout->write((char*)&cells_, sizeof(cells_)); fout->write((char*)&totBuckets_, sizeof(totBuckets_)); fout->write((char*)&bucketRange_, sizeof(bucketRange_)); fout->write((char*)idxTracker_, totBuckets_ * sizeof(idxTracker_[0])); qtizer_->save(fout); cerr << "\tSaving hash functions...\n"; fingerHash_->save(fout); bucketHash_->save(fout); cerr << "\tSaving bit filter...\n"; filter_->save(fout); values_->save(fout); cerr << "\tSaving high performance dictionary...\n"; count_t size = dict_.size(); fout->write((char*)&size, sizeof(count_t)); *fout << endl; iterate(dict_, t) *fout << t->first << "\t" << t->second << "\n"; } template void PerfectHash::load(FileHandler* fin) { CHECK(fin != 0); cerr << "\tLoading perfect hash parameters...\n"; fin->read((char*)&hitMask_, sizeof(hitMask_)); fin->read((char*)&memBound_, sizeof(memBound_)); fin->read((char*)&cellWidth_, sizeof(cellWidth_)); fin->read((char*)&cells_, sizeof(cells_)); fin->read((char*)&totBuckets_, sizeof(totBuckets_)); fin->read((char*)&bucketRange_, sizeof(bucketRange_)); idxTracker_ = new uint8_t[totBuckets_]; fin->read((char*)idxTracker_, totBuckets_ * sizeof(idxTracker_[0])); qtizer_ = new LogQtizer(fin); cerr << "\tLoading hash functions...\n"; fingerHash_ = new UnivHash_linear(fin); bucketHash_ = new UnivHash_linear(fin); cerr << "\tLoading bit filter...\n"; filter_ = new Filter(fin); values_ = new Filter(fin); cerr << "\tLoading HPD...\n"; count_t size = 0; fin->read((char*)&size, sizeof(count_t)); fin->ignore(256, '\n'); string line; hpDict_t::key_type key; hpDict_t::mapped_type val; for(count_t i=0; i < size; ++i) { getline(*fin, line); Utils::trim(line); std::istringstream ss(line.c_str()); ss >> key, ss >> val; dict_[key] = val; } cerr << "\tHPD size=" << dict_.size() << endl; cerr << "Finished loading ORLM." << endl; } template void PerfectHash::analyze() { cerr << "Analyzing Dynamic Bloomier Filter...\n"; // see how many items in each bucket uint8_t* bucketCnt = new uint8_t[totBuckets_]; unsigned largestBucket = 0, totalCellsSet = 0, smallestBucket = bucketRange_, totalZeroes = 0; int curBucket = -1, fullBuckets(0); for(int i = 0; i < totBuckets_; ++i) bucketCnt[i] = 0; for(uint64_t i =0; i < cells_; ++i) { if(i % bucketRange_ == 0) ++curBucket; if(filter_->read(i) != 0) { ++bucketCnt[curBucket]; ++totalCellsSet; } else ++totalZeroes; } count_t bi = 0, si = 0; for(int i = 0; i < totBuckets_; ++i) { if(bucketCnt[i] > largestBucket) { largestBucket = bucketCnt[i]; bi = i; } else if(bucketCnt[i] < smallestBucket) { smallestBucket = bucketCnt[i]; si = i; } } count_t trackerCells(0); for(int i = 0; i < totBuckets_; i++) { trackerCells += idxTracker_[i]; if(idxTracker_[i] == bucketRange_) ++fullBuckets; } for(int i = 0; i < totBuckets_; ++i) { if(bucketCnt[i] != idxTracker_[i]) cerr << "bucketCnt[" << i << "] = " << (int)bucketCnt[i] << "\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << endl; } cerr << "total cells= " << cells_ << endl; cerr << "total buckets= " << totBuckets_ << endl; cerr << "bucket range= " << (int)bucketRange_ << endl; cerr << "fingerprint bits= " << cellWidth_ << endl; cerr << "total cells set= " << totalCellsSet; cerr << " (idxTracker set = " << trackerCells << ")" << endl; cerr << "total zeroes=" << totalZeroes; cerr << " (idxTracker zeros = " << cells_ - trackerCells << ")" << endl; cerr << "largest bucket (" << bi << ") size= " << largestBucket << endl; cerr << "smallest bucket (" << si << ") size= " << smallestBucket << endl; cerr << "last bucket size= " << (int)bucketCnt[totBuckets_ - 1] << " (idxTracker last bucket size = " << (int)idxTracker_[totBuckets_ - 1] << ")" << endl; cerr << "total buckets full = " << fullBuckets << endl; cerr << "total collision errors= " << collisions_ << endl; cerr << "high performance dictionary size= " << dict_.size() << endl; cerr << "high performance dictionary MBs= " << hpDictMemUse() << endl; cerr << "filter MBs= " << filter_->size() << endl; cerr << "values MBs= " << values_->size() << endl; delete[] bucketCnt; } template bool PerfectHash::update2(const wordID_t* IDs, const int len, const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx) { // check if key is in high perf. dictionary filterIdx = cells_ + 1; string skey = hpDictKeyValue(IDs, len); if((hpdAddr = dict_.find(skey)) != dict_.end()) { hpdAddr->second += value; return true; } // else hash ngram //count_t bucket = bucketHash_->hash(IDs, len); count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len)); // restriction on fprint value is non-zero T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); uint64_t index = bucket * bucketRange_, // starting bucket row lastrow = index + bucketRange_; while(index < lastrow) { // must check each row for matching fp event T filterVal = filter_->read(index); if(filterVal == fp) { // found event w.h.p. int oldval = (int)qtizer_->value(values_->read(index)); values_->write(index, (T)qtizer_->code(oldval + value)); filterIdx = index; return true; } ++index; } // add if it gets here. insert(IDs, len, value); return false; } #endif