/** * ISS (Indexed Strings Storage) - memory efficient storage for permanent strings. * * Implementation note: use #define USE_HASHSET to switch between implementation * using __gnu_cxx::hash_set and implementation using std::set. * * (C) Ceslav Przywara, UFAL MFF UK, 2011 * * $Id$ */ #ifndef _ISS_H #define _ISS_H #include #include #include // Use hashset instead of std::set for string-to-number indexing? #ifdef USE_HASHSET #include #else #include #endif #include #ifdef USE_HASHSET // Forward declaration of comparator functor. template class StringsEqualComparator; template class Hasher; #else // Forward declaration of comparator functor. template class StringsLessComparator; #endif /** */ template class IndexedStringsStorage { public: typedef IndType index_type; #ifdef USE_HASHSET typedef StringsEqualComparator equality_comparator_t; typedef Hasher hasher_t; /** @typedef Hash set used as lookup table (string -> numeric index). */ typedef __gnu_cxx::hash_set index_t; #else typedef StringsLessComparator less_comparator_t; /** @typedef Set used as lookup table (string -> numeric index). */ typedef std::set index_t; #endif /** @typedef Container of pointers to stored C-strings. Acts as * conversion table: numeric index -> string. */ typedef std::vector table_t; private: /** @var memory pool used to store C-strings */ boost::pool<> _storage; /** @var index-to-string conversion table */ table_t _table; /** @var index lookup table */ index_t _index; public: /** Default constructor. */ IndexedStringsStorage(void); /** @return True, if the indices are exhausted (new strings cannot be stored). */ inline bool is_full(void) const { return _table.size() == std::numeric_limits::max(); } /** Retrieves pointer to C-string instance represented by given index. * Note: No range checks are performed! * @param index Index of C-string to retrieve. * @return Pointer to stored C-string instance. */ inline const char* get(IndType index) const { return _table[index]; } /** Stores the string and returns its numeric index. * @param str Pointer to C-string to store. * @return Index of stored copy of str. * @throw std::bad_alloc When insertion of new string would cause * overflow of indices datatype. */ IndType put(const char* str); /** @return Number of unique strings stored so far. */ inline table_t::size_type size(void) const { return _table.size(); } }; /** Functor designed for less than comparison of C-strings stored within StringStore. * @param IndType Type of numerical indices of strings within given StringStore. */ #ifdef USE_HASHSET template class StringsEqualComparator: public std::binary_function { #else template class StringsLessComparator: public std::binary_function { #endif /** @var conversion table: index -> string (necessary for indices comparison) */ const typename IndexedStringsStorage::table_t& _table; public: #ifdef USE_HASHSET StringsEqualComparator(const typename IndexedStringsStorage::table_t& table): _table(table) {} #else StringsLessComparator(const typename IndexedStringsStorage::table_t& table): _table(table) {} #endif /** Comparison of two pointers to C-strings. * @param lhs Pointer to 1st C-string. * @param rhs Pointer to 2nd C-string. * @return True, if 1st argument is equal/less than 2nd argument. */ inline bool operator()(IndType lhs, IndType rhs) const { #ifdef USE_HASHSET return strcmp(_table[lhs], _table[rhs]) == 0; #else return strcmp(_table[lhs], _table[rhs]) < 0; #endif } }; #ifdef USE_HASHSET /** Functor... TODO. */ template class Hasher: public std::unary_function { __gnu_cxx::hash _hash; /** @var conversion table: index -> string (necessary for indices comparison) */ const typename IndexedStringsStorage::table_t& _table; public: /** */ Hasher(const typename IndexedStringsStorage::table_t& table): _hash(), _table(table) {} /** Hashing function. * @param index * @return Counted hash. */ inline size_t operator()(const IndType index) const { return _hash(_table[index]); } }; #endif template #ifdef USE_HASHSET IndexedStringsStorage::IndexedStringsStorage(void): _storage(sizeof(char)), _table(), _index(100, hasher_t(_table), equality_comparator_t(_table)) {} #else IndexedStringsStorage::IndexedStringsStorage(void): _storage(sizeof(char)), _table(), _index(less_comparator_t(_table)) {} #endif template IndType IndexedStringsStorage::put(const char* str) { if ( this->is_full() ) { // What a pity, not a single index left to spend. throw std::bad_alloc(); } // To use the index for lookup we first have to store passed string // in conversion table (cause during lookup we compare the strings indirectly // by using their indices). // Note: thread unsafe! TODO: Redesing. IndType index = static_cast(_table.size()); _table.push_back(str); #ifdef USE_HASHSET // typename index_t::iterator iIndex = _index.find(index); #else // A lower_bound() search enables us to use "found" iterator as a hint for // eventual insertion. typename index_t::iterator iIndex = _index.lower_bound(index); #endif if ( (iIndex != _index.end()) #ifndef USE_HASHSET // In case of lower_bound() search we have to also compare found item // with passed string. && (strcmp(_table[*iIndex], str) == 0) #endif ) { // String is already present in storage! // Pop back temporary stored pointer... _table.pop_back(); // ...and return numeric index to already stored copy of `str`. return static_cast(*iIndex); } // String not found within storage. // Allocate memory required for string storage... char* mem = static_cast(_storage.ordered_malloc(strlen(str) + 1)); // ...and fill it with copy of passed string. strcpy(mem, str); // Overwrite temporary stored pointer to `str` with pointer to freshly // saved copy. _table[index] = mem; #ifdef USE_HASHSET // Insert the index into lookup table. _index.insert(index); #else // Insert the index into lookup table (use previously retrieved iterator // as a hint). _index.insert(iIndex, index); #endif // Finally. return index; } #endif