diff options
Diffstat (limited to 'contrib/moses2/TranslationModel/CompactPT/BlockHashIndex.h')
-rw-r--r-- | contrib/moses2/TranslationModel/CompactPT/BlockHashIndex.h | 199 |
1 files changed, 199 insertions, 0 deletions
diff --git a/contrib/moses2/TranslationModel/CompactPT/BlockHashIndex.h b/contrib/moses2/TranslationModel/CompactPT/BlockHashIndex.h new file mode 100644 index 000000000..5706fca09 --- /dev/null +++ b/contrib/moses2/TranslationModel/CompactPT/BlockHashIndex.h @@ -0,0 +1,199 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#ifndef moses_BlockHashIndex_h +#define moses_BlockHashIndex_h + +#include <iostream> +#include <sstream> +#include <string> +#include <vector> +#include <queue> +#include <cstring> +#include <cstdio> + +#include "MurmurHash3.h" +#include "StringVector.h" +#include "PackedArray.h" +#include "util/exception.hh" +#include "util/string_stream.hh" + +#ifdef WITH_THREADS +#include "../../legacy/ThreadPool.h" +#else +#include <ctime> +#endif + +#include <boost/shared_ptr.hpp> + +namespace Moses2 +{ + +class BlockHashIndex +{ +private: + std::priority_queue<int> m_queue; + + size_t m_orderBits; + size_t m_fingerPrintBits; + + std::FILE* m_fileHandle; + size_t m_fileHandleStart; + + StringVector<unsigned char, unsigned long> m_landmarks; + + std::vector<void*> m_hashes; + std::vector<clock_t> m_clocks; + std::vector<PairedPackedArray<>*> m_arrays; + + std::vector<size_t> m_seekIndex; + + size_t m_size; + int m_lastSaved; + int m_lastDropped; + size_t m_numLoadedRanges; + +#ifdef WITH_THREADS + ThreadPool m_threadPool; + boost::mutex m_mutex; + + template<typename Keys> + class HashTask: public Task + { + public: + HashTask(int id, BlockHashIndex& hash, Keys& keys) : + m_id(id), m_hash(hash), m_keys(new Keys(keys)) + { + } + + virtual void Run() + { + m_hash.CalcHash(m_id, *m_keys); + } + + virtual ~HashTask() + { + delete m_keys; + } + + private: + int m_id; + BlockHashIndex& m_hash; + Keys* m_keys; + }; +#endif + + size_t GetFprint(const char* key) const; + size_t GetHash(size_t i, const char* key); + +public: +#ifdef WITH_THREADS + BlockHashIndex(size_t orderBits, size_t fingerPrintBits, + size_t threadsNum = 2); +#else + BlockHashIndex(size_t orderBits, size_t fingerPrintBits); +#endif + + ~BlockHashIndex(); + + size_t GetHash(const char* key); + size_t GetHash(std::string key); + + size_t operator[](std::string key); + size_t operator[](char* key); + + void BeginSave(std::FILE* mphf); + void SaveRange(size_t i); + void SaveLastRange(); + size_t FinalizeSave(); + +#ifdef WITH_THREADS + void WaitAll(); +#endif + + void DropRange(size_t i); + void DropLastRange(); + + size_t LoadIndex(std::FILE* mphf); + void LoadRange(size_t i); + + size_t Save(std::string filename); + size_t Save(std::FILE * mphf); + + size_t Load(std::string filename); + size_t Load(std::FILE * mphf); + + size_t GetSize() const; + + void KeepNLastRanges(float ratio = 0.1, float tolerance = 0.1); + + template<typename Keys> + void AddRange(Keys &keys) + { + size_t current = m_landmarks.size(); + + if (m_landmarks.size() && m_landmarks.back().str() >= keys[0]) { + util::StringStream strme; + strme + << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort\n"; + strme << "1: " << m_landmarks.back().str() << "\n"; + strme << "2: " << keys[0] << "\n"; + UTIL_THROW2(strme.str()); + } + + m_landmarks.push_back(keys[0]); + m_size += keys.size(); + + if (keys.size() == 1) { + // add dummy key to avoid null hash + keys.push_back("###DUMMY_KEY###"); + } + +#ifdef WITH_THREADS + + boost::shared_ptr<HashTask<Keys> > ht( + new HashTask<Keys>(current, *this, keys)); + m_threadPool.Submit(ht); +#else + CalcHash(current, keys); +#endif + } + + template<typename Keys> + void CalcHash(size_t current, Keys &keys) + { +#ifdef HAVE_CMPH + void* source = vectorAdapter(keys); + CalcHash(current, source); +#endif + } + + void CalcHash(size_t current, void* source); + +#ifdef HAVE_CMPH + void* vectorAdapter(std::vector<std::string>& v); + void* vectorAdapter(StringVector<unsigned, size_t, std::allocator>& sv); + void* vectorAdapter(StringVector<unsigned, size_t, MmapAllocator>& sv); +#endif +}; + +} +#endif |