diff options
author | Kenneth Heafield <github@kheafield.com> | 2011-11-17 23:12:19 +0400 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2011-11-17 23:12:19 +0400 |
commit | 974a708dddab2b4c6836a176d95f8455d0ed5f51 (patch) | |
tree | 0916139ae18b032250dc84a6703dabbedc7dc8ac /util | |
parent | 17cec851dfcf5712c66e6304308273b62dc532e8 (diff) |
Updated kenlm 96ef3f2c11.
Invalidates old gcc and 32-bit formats, replacing these with one consistent format: 64-bit new gcc. Backwards compatible with these files.
Diffstat (limited to 'util')
-rw-r--r-- | util/file.cc | 19 | ||||
-rw-r--r-- | util/file.hh | 2 | ||||
-rw-r--r-- | util/file_piece.cc | 10 | ||||
-rw-r--r-- | util/file_piece_test.cc | 41 | ||||
-rw-r--r-- | util/getopt.c | 77 | ||||
-rw-r--r--[-rwxr-xr-x] | util/getopt.hh | 191 | ||||
-rw-r--r-- | util/have.hh | 18 | ||||
-rw-r--r-- | util/mmap.cc | 11 | ||||
-rw-r--r-- | util/portability.cc | 74 | ||||
-rw-r--r-- | util/portability.hh | 115 | ||||
-rw-r--r-- | util/probing_hash_table.hh | 33 | ||||
-rw-r--r-- | util/probing_hash_table_test.cc | 27 | ||||
-rw-r--r-- | util/sorted_uniform.hh | 93 | ||||
-rw-r--r-- | util/sorted_uniform_test.cc | 84 | ||||
-rw-r--r-- | util/tokenize_piece.hh | 64 | ||||
-rw-r--r-- | util/tokenize_piece_test.cc | 50 |
16 files changed, 278 insertions, 631 deletions
diff --git a/util/file.cc b/util/file.cc index 81d1d490c..77922cfad 100644 --- a/util/file.cc +++ b/util/file.cc @@ -1,7 +1,6 @@ #include "util/file.hh" #include "util/exception.hh" -#include "util/portability.hh" #include <cstdlib> #include <cstdio> @@ -14,6 +13,7 @@ #if defined(_WIN32) || defined(_WIN64) #include <windows.h> +#include <io.h> #endif namespace util { @@ -43,11 +43,28 @@ int OpenReadOrThrow(const char *name) { } uint64_t SizeFile(int fd) { +#if defined(_WIN32) || defined(_WIN64) + __int64 ret = _filelengthi64(fd); + return (ret == -1) ? kBadSize : ret; +#else struct stat sb; if (fstat(fd, &sb) == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize; return sb.st_size; +#endif +} + +void ResizeOrThrow(int fd, uint64_t to) { +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(_chsize_s(fd, to), ErrnoException, "Resizing to " << to << " bytes failed"); +#else + UTIL_THROW_IF(ftruncate(fd, to), ErrnoException, "Resizing to " << to << " bytes failed"); +#endif } +#ifdef WIN32 +typedef int ssize_t; +#endif + void ReadOrThrow(int fd, void *to_void, std::size_t amount) { uint8_t *to = static_cast<uint8_t*>(to_void); while (amount) { diff --git a/util/file.hh b/util/file.hh index b820ba351..04023dec0 100644 --- a/util/file.hh +++ b/util/file.hh @@ -71,6 +71,8 @@ int OpenReadOrThrow(const char *name); const uint64_t kBadSize = (uint64_t)-1; uint64_t SizeFile(int fd); +void ResizeOrThrow(int fd, uint64_t to); + void ReadOrThrow(int fd, void *to, std::size_t size); std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount); diff --git a/util/file_piece.cc b/util/file_piece.cc index 43b578b65..f0d49d555 100644 --- a/util/file_piece.cc +++ b/util/file_piece.cc @@ -3,7 +3,9 @@ #include "util/exception.hh" #include "util/file.hh" #include "util/mmap.hh" -#include "util/portability.hh" +#ifdef WIN32 +#include <io.h> +#endif // WIN32 #include <iostream> #include <string> @@ -130,7 +132,7 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::s namespace { void ParseNumber(const char *begin, char *&end, float &out) { -#ifdef sun +#if defined(sun) || defined(WIN32) out = static_cast<float>(strtod(begin, &end)); #else out = strtof(begin, &end); @@ -255,6 +257,10 @@ void FilePiece::TransitionToRead() { #endif } +#ifdef WIN32 +typedef int ssize_t; +#endif + void FilePiece::ReadShift() { assert(fallback_to_read_); // Bytes [data_.begin(), position_) have been consumed. diff --git a/util/file_piece_test.cc b/util/file_piece_test.cc index dc9ec7e7c..f912e18af 100644 --- a/util/file_piece_test.cc +++ b/util/file_piece_test.cc @@ -1,3 +1,4 @@ +// Tests might fail if you have creative characters in your path. Sue me. #include "util/file_piece.hh" #include "util/scoped.hh" @@ -14,10 +15,18 @@ namespace util { namespace { +std::string FileLocation() { + if (boost::unit_test::framework::master_test_suite().argc < 2) { + return "file_piece.cc"; + } + std::string ret(boost::unit_test::framework::master_test_suite().argv[1]); + return ret; +} + /* mmap implementation */ BOOST_AUTO_TEST_CASE(MMapReadLine) { - std::fstream ref("file_piece.cc", std::ios::in); - FilePiece test("file_piece.cc", NULL, 1); + std::fstream ref(FileLocation().c_str(), std::ios::in); + FilePiece test(FileLocation().c_str(), NULL, 1); std::string ref_line; while (getline(ref, ref_line)) { StringPiece test_line(test.ReadLine()); @@ -35,9 +44,13 @@ BOOST_AUTO_TEST_CASE(MMapReadLine) { */ /* read() implementation */ BOOST_AUTO_TEST_CASE(StreamReadLine) { - std::fstream ref("file_piece.cc", std::ios::in); + std::fstream ref(FileLocation().c_str(), std::ios::in); + + std::string popen_args = "cat \""; + popen_args += FileLocation(); + popen_args += '"'; - FILE *catter = popen("cat file_piece.cc", "r"); + FILE *catter = popen(popen_args.c_str(), "r"); BOOST_REQUIRE(catter); FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1); @@ -58,10 +71,15 @@ BOOST_AUTO_TEST_CASE(StreamReadLine) { // gzip file BOOST_AUTO_TEST_CASE(PlainZipReadLine) { - std::fstream ref("file_piece.cc", std::ios::in); + std::string location(FileLocation()); + std::fstream ref(location.c_str(), std::ios::in); - BOOST_REQUIRE_EQUAL(0, system("gzip <file_piece.cc >file_piece.cc.gz")); - FilePiece test("file_piece.cc.gz", NULL, 1); + std::string command("gzip <\""); + command += location + "\" >\"" + location + "\".gz"; + + BOOST_REQUIRE_EQUAL(0, system(command.c_str())); + FilePiece test((location + ".gz").c_str(), NULL, 1); + unlink((location + ".gz").c_str()); std::string ref_line; while (getline(ref, ref_line)) { StringPiece test_line(test.ReadLine()); @@ -77,12 +95,15 @@ BOOST_AUTO_TEST_CASE(PlainZipReadLine) { // the test. #ifndef __APPLE__ BOOST_AUTO_TEST_CASE(StreamZipReadLine) { - std::fstream ref("file_piece.cc", std::ios::in); + std::fstream ref(FileLocation().c_str(), std::ios::in); + + std::string command("gzip <\""); + command += FileLocation() + "\""; - FILE * catter = popen("gzip <file_piece.cc", "r"); + FILE * catter = popen(command.c_str(), "r"); BOOST_REQUIRE(catter); - FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1); + FilePiece test(dup(fileno(catter)), "file_piece.cc.gz", NULL, 1); std::string ref_line; while (getline(ref, ref_line)) { StringPiece test_line(test.ReadLine()); diff --git a/util/getopt.c b/util/getopt.c new file mode 100644 index 000000000..5dfe545de --- /dev/null +++ b/util/getopt.c @@ -0,0 +1,77 @@ +/* +POSIX getopt for Windows + +AT&T Public License + +Code given out at the 1985 UNIFORUM conference in Dallas. +*/ + +#ifndef __GNUC__ + +#include "getopt.hh" +#include <stdio.h> + +#define NULL 0 +#define EOF (-1) +#define ERR(s, c) if(opterr){\ + char errbuf[2];\ + errbuf[0] = c; errbuf[1] = '\n';\ + fputs(argv[0], stderr);\ + fputs(s, stderr);\ + fputc(c, stderr);} + //(void) write(2, argv[0], (unsigned)strlen(argv[0]));\ + //(void) write(2, s, (unsigned)strlen(s));\ + //(void) write(2, errbuf, 2);} + +int opterr = 1; +int optind = 1; +int optopt; +char *optarg; + +int +getopt(argc, argv, opts) +int argc; +char **argv, *opts; +{ + static int sp = 1; + register int c; + register char *cp; + + if(sp == 1) + if(optind >= argc || + argv[optind][0] != '-' || argv[optind][1] == '\0') + return(EOF); + else if(strcmp(argv[optind], "--") == NULL) { + optind++; + return(EOF); + } + optopt = c = argv[optind][sp]; + if(c == ':' || (cp=strchr(opts, c)) == NULL) { + ERR(": illegal option -- ", c); + if(argv[optind][++sp] == '\0') { + optind++; + sp = 1; + } + return('?'); + } + if(*++cp == ':') { + if(argv[optind][sp+1] != '\0') + optarg = &argv[optind++][sp+1]; + else if(++optind >= argc) { + ERR(": option requires an argument -- ", c); + sp = 1; + return('?'); + } else + optarg = argv[optind++]; + sp = 1; + } else { + if(argv[optind][++sp] == '\0') { + sp = 1; + optind++; + } + optarg = NULL; + } + return(c); +} + +#endif /* __GNUC__ */
\ No newline at end of file diff --git a/util/getopt.hh b/util/getopt.hh index 33a706833..6ad977324 100755..100644 --- a/util/getopt.hh +++ b/util/getopt.hh @@ -1,190 +1,33 @@ +/* +POSIX getopt for Windows -/* getopt.h */ -/* Declarations for getopt. - Copyright (C) 1989-1994, 1996-1999, 2001 Free Software - Foundation, Inc. This file is part of the GNU C Library. +AT&T Public License - The GNU C Library is free software; you can redistribute - it and/or modify it under the terms of the GNU Lesser - General Public License as published by the Free Software - Foundation; either version 2.1 of the License, or - (at your option) any later version. +Code given out at the 1985 UNIFORUM conference in Dallas. +*/ - The GNU C Library is distributed in the hope that it will - be useful, but WITHOUT ANY WARRANTY; without even the - implied warranty of MERCHANTABILITY or FITNESS FOR A - PARTICULAR PURPOSE. See the GNU Lesser General Public - License for more details. - - You should have received a copy of the GNU Lesser General - Public License along with the GNU C Library; if not, write - to the Free Software Foundation, Inc., 59 Temple Place, - Suite 330, Boston, MA 02111-1307 USA. */ - - - - -#ifndef _GETOPT_H - -#ifndef __need_getopt -# define _GETOPT_H 1 +#ifdef __GNUC__ +#include <getopt.h> #endif +#ifndef __GNUC__ -/* If __GNU_LIBRARY__ is not already defined, either we are being used - standalone, or this is the first header included in the source file. - If we are being used with glibc, we need to include <features.h>, but - that does not exist if we are standalone. So: if __GNU_LIBRARY__ is - not defined, include <ctype.h>, which will pull in <features.h> for us - if it's from glibc. (Why ctype.h? It's guaranteed to exist and it - doesn't flood the namespace with stuff the way some other headers do.) */ -#if !defined __GNU_LIBRARY__ -# include <ctype.h> -#endif +#ifndef _WINGETOPT_H_ +#define _WINGETOPT_H_ -#ifdef __cplusplus +#ifdef __cplusplus extern "C" { #endif -int getopt (int argc, char *const *argv, const char *optstring); - -/* For communication from `getopt' to the caller. - When `getopt' finds an option that takes an argument, - the argument value is returned here. - Also, when `ordering' is RETURN_IN_ORDER, - each non-option ARGV-element is returned here. */ - -extern char *optarg; - -/* Index in ARGV of the next element to be scanned. - This is used for communication to and from the caller - and for communication between successive calls to `getopt'. - - On entry to `getopt', zero means this is the first call; initialize. - - When `getopt' returns -1, this is the index of the first of the - non-option elements that the caller should itself scan. - - Otherwise, `optind' communicates from one call to the next - how much of ARGV has been scanned so far. */ - -extern int optind; - -/* Callers store zero here to inhibit the error message `getopt' prints - for unrecognized options. */ - extern int opterr; - -/* Set to an option character which was unrecognized. */ - +extern int optind; extern int optopt; +extern char *optarg; +extern int getopt(int argc, char **argv, char *opts); -#ifndef __need_getopt -/* Describe the long-named options requested by the application. - The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector - of `struct option' terminated by an element containing a name which is - zero. - - The field `has_arg' is: - no_argument (or 0) if the option does not take an argument, - required_argument (or 1) if the option requires an argument, - optional_argument (or 2) if the option takes an optional argument. - - If the field `flag' is not NULL, it points to a variable that is set - to the value given in the field `val' when the option is found, but - left unchanged if the option is not found. - - To have a long-named option do something other than set an `int' to - a compiled-in constant, such as set a value from `optarg', set the - option's `flag' field to zero and its `val' field to a nonzero - value (the equivalent single-letter option character, if there is - one). For long options that have a zero `flag' field, `getopt' - returns the contents of the `val' field. */ - -struct option -{ -# if (defined __STDC__ && __STDC__) || defined __cplusplus - const char *name; -# else - char *name; -# endif - /* has_arg can't be an enum because some compilers complain about - type mismatches in all the code that assumes it is an int. */ - int has_arg; - int *flag; - int val; -}; - -/* Names for the values of the `has_arg' field of `struct option'. */ - -# define no_argument 0 -# define required_argument 1 -# define optional_argument 2 -#endif /* need getopt */ - - -/* Get definitions and prototypes for functions to process the - arguments in ARGV (ARGC of them, minus the program name) for - options given in OPTS. - - Return the option character from OPTS just read. Return -1 when - there are no more options. For unrecognized options, or options - missing arguments, `optopt' is set to the option letter, and '?' is - returned. - - The OPTS string is a list of characters which are recognized option - letters, optionally followed by colons, specifying that that letter - takes an argument, to be placed in `optarg'. - - If a letter in OPTS is followed by two colons, its argument is - optional. This behavior is specific to the GNU `getopt'. - - The argument `--' causes premature termination of argument - scanning, explicitly telling `getopt' that there are no more - options. - - If OPTS begins with `--', then non-option arguments are treated as - arguments to the option '\0'. This behavior is specific to the GNU - `getopt'. */ - -#if (defined __STDC__ && __STDC__) || defined __cplusplus -# ifdef __GNU_LIBRARY__ -/* Many other libraries have conflicting prototypes for getopt, with - differences in the consts, in stdlib.h. To avoid compilation - errors, only prototype getopt for the GNU C library. */ -extern int getopt (int ___argc, char *const *___argv, const char *__shortopts); -# else /* not __GNU_LIBRARY__ */ -//extern int getopt (); -# endif /* __GNU_LIBRARY__ */ - -# ifndef __need_getopt -extern int getopt_long (int ___argc, char *const *___argv, - const char *__shortopts, - const struct option *__longopts, int *__longind); -extern int getopt_long_only (int ___argc, char *const *___argv, - const char *__shortopts, - const struct option *__longopts, int *__longind); - -/* Internal only. Users should not call this directly. */ -extern int _getopt_internal (int ___argc, char *const *___argv, - const char *__shortopts, - const struct option *__longopts, int *__longind, - int __long_only); -# endif -#else /* not __STDC__ */ -extern int getopt (); -# ifndef __need_getopt -extern int getopt_long (); -extern int getopt_long_only (); - -extern int _getopt_internal (); -# endif -#endif /* __STDC__ */ - -#ifdef __cplusplus +#ifdef __cplusplus } #endif -/* Make sure we later can get all the definitions and declarations. */ -#undef __need_getopt +#endif /* _GETOPT_H_ */ +#endif /* __GNUC__ */ -#endif /* getopt.h */ diff --git a/util/have.hh b/util/have.hh index 9e1fc20cd..aca8c6264 100644 --- a/util/have.hh +++ b/util/have.hh @@ -1,9 +1,21 @@ -/* This ties kenlm's config into Moses's build system. If you are using kenlm - * outside Moses, see http://kheafield.com/code/kenlm/developers/ . - */ +/* Optional packages. You might want to integrate this with your build system e.g. config.h from ./configure. */ #ifndef UTIL_HAVE__ #define UTIL_HAVE__ +#ifndef HAVE_ZLIB #define HAVE_ZLIB +#endif + +#ifndef HAVE_ICU +//#define HAVE_ICU +#endif + +#ifndef HAVE_BOOST +//#define HAVE_BOOST +#endif + +#ifndef HAVE_THREADS +//#define HAVE_THREADS +#endif #endif // UTIL_HAVE__ diff --git a/util/mmap.cc b/util/mmap.cc index 3dfe0ab2c..d3a2526fa 100644 --- a/util/mmap.cc +++ b/util/mmap.cc @@ -13,13 +13,14 @@ #include <fcntl.h> #include <sys/types.h> #include <sys/stat.h> +#include <stdlib.h> + #if defined(_WIN32) || defined(_WIN64) #include <windows.h> +#include <io.h> #else #include <sys/mman.h> #endif -#include <stdlib.h> -#include <unistd.h> namespace util { @@ -102,7 +103,7 @@ void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int int protectM = for_write ? FILE_MAP_WRITE : FILE_MAP_READ; HANDLE hMapping = CreateFileMapping((HANDLE)_get_osfhandle(fd), NULL, protectC, 0, size + offset, NULL); UTIL_THROW_IF(!hMapping, ErrnoException, "CreateFileMapping failed"); - ret = MapViewOfFile(hMapping, protectM, 0, offset, size); + LPVOID ret = MapViewOfFile(hMapping, protectM, 0, offset, size); CloseHandle(hMapping); UTIL_THROW_IF(!ret, ErrnoException, "MapViewOfFile failed"); #else @@ -159,8 +160,8 @@ void *MapAnonymous(std::size_t size) { } void *MapZeroedWrite(int fd, std::size_t size) { - UTIL_THROW_IF(-1 == ftruncate(fd, 0), ErrnoException, "ftruncate on fd " << fd << " to 0 failed"); - UTIL_THROW_IF(-1 == ftruncate(fd, size), ErrnoException, "ftruncate on fd " << fd << " to " << size << " failed"); + ResizeOrThrow(fd, 0); + ResizeOrThrow(fd, size); return MapOrThrow(size, true, kFileFlags, false, fd, 0); } diff --git a/util/portability.cc b/util/portability.cc deleted file mode 100644 index 2efd74cba..000000000 --- a/util/portability.cc +++ /dev/null @@ -1,74 +0,0 @@ - -#include <stdlib.h> -#include <errno.h> -#include "util/portability.hh" - -#ifdef WIN32 - -int RUSAGE_SELF = 0; - -int sysconf(int) { return 0; } -int msync(void*, int, int) { return 0; } -int munmap(void *, int) { return 0; } -void *mmap(void*, int, int, int, FD, OFF_T) { return 0; } -int write(int, const void *, int) {return 0; } - -//FILE *popen(const char*, const char*) { return 0; } -//int pclose(FILE *) { return 0; } -int close(FD fd) { return 0; } - - -// to be implemented by boost -int mkdtemp(const char*) { return 0; } - -// done -long lrint(float x) -{ - long ret = (long) x; - return ret; -} - -float strtof(const char *begin, char **end) -{ - double ret = strtod(begin, end); - return (float) ret; -} - - -int ftruncate (FD hfile, unsigned int size) -{ - unsigned int curpos; - /* - HANDLE hfile; - - if (fd < 0) - { - errno = EBADF; - return -1; - } - - hfile = (HANDLE) _get_osfhandle (fd); - */ - curpos = SetFilePointer (hfile, 0, NULL, FILE_CURRENT); - if (curpos == ~0 - || SetFilePointer (hfile, size, NULL, FILE_BEGIN) == ~0 - || !SetEndOfFile (hfile)) - { - int error = GetLastError (); - switch (error) - { - case ERROR_INVALID_HANDLE: - errno = EBADF; - break; - default: - errno = EIO; - break; - } - return -1; - } - return 0; -} - -#endif - - diff --git a/util/portability.hh b/util/portability.hh deleted file mode 100644 index acbc01922..000000000 --- a/util/portability.hh +++ /dev/null @@ -1,115 +0,0 @@ - -#pragma once - -#include <assert.h> -#include <stdint.h> - -#ifdef WIN32 - -#include <windows.h> -#include <direct.h> -#include <io.h> -#include <stdio.h> -#include <string.h> -#include <sys/stat.h> -#include "util/getopt.hh" - -#undef max -#undef min - -typedef HANDLE FD; - -const FD kBadFD = INVALID_HANDLE_VALUE; - -typedef int ssize_t; - -#define _SC_PAGE_SIZE 1 -#define MS_SYNC 1 - -int sysconf(int); -int msync(void*, int, int); -int ftruncate(FD, unsigned int); - -long lrint(float); - -//inline int getrusage(int, struct rusage*) { return 0; } -//extern int RUSAGE_SELF; - -typedef __int64 OFF_T; -//#define OFF_T __int64 - -#ifndef S_ISDIR -#define S_ISDIR(mode) (((mode) & S_IFMT) == S_IFDIR) -#endif - -#ifndef S_ISREG -#define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG) -#endif - -int mkdtemp(const char*); -int munmap(void *, int); -void *mmap(void*, int, int, int, FD, OFF_T); - -#define PROT_READ 1 -#define PROT_WRITE 1 -#define MAP_FAILED (void*) 0x1 -#define MAP_SHARED 1 -#define MAP_ANON 1 -#define MAP_PRIVATE 1 -#define S_IRUSR 1 -#define S_IROTH 1 -#define S_IRGRP 1 - -int write(int, const void *, int); -#define S_IRUSR 1 -#define S_IWUSR 1 - -//const char *strerror_r(int, const char *buf, int); - -float strtof(const char *begin, char **end); -//FILE *popen(const char*, const char*); -//int pclose(FILE *); -int close(FD fd); - -#define dup(x) _dup(x) -#define rmdir(x) _rmdir(x) -#define strerror_r(errNum, buffer, numberOfElements) strerror_s(buffer, numberOfElements); - -#else // assume UNIX OS - -#include <stdint.h> -#include <sys/resource.h> -#include <sys/time.h> -#include <sys/types.h> -#include <sys/mman.h> -#include <sys/stat.h> -#include <unistd.h> - -typedef int FD; -const FD kBadFD = -1; - -typedef off_t OFF_T; - -#endif - -#ifdef __GNUC__ -#define UTIL_FUNC_NAME __PRETTY_FUNCTION__ -#else -#ifdef _WIN32 -#define UTIL_FUNC_NAME __FUNCTION__ -#else -#define UTIL_FUNC_NAME NULL -#endif -#endif - -/* Bit-level packing routines */ -#ifdef __APPLE__ - #include <architecture/byte_order.h> -#elif __linux__ - #include <endian.h> -#elif WIN32 - // nothing -#else - #include <arpa/nameser_compat.h> -#endif - diff --git a/util/probing_hash_table.hh b/util/probing_hash_table.hh index 8122d69c5..f466cebc9 100644 --- a/util/probing_hash_table.hh +++ b/util/probing_hash_table.hh @@ -18,27 +18,33 @@ class ProbingSizeException : public Exception { ~ProbingSizeException() throw() {} }; +// std::identity is an SGI extension :-( +struct IdentityHash { + template <class T> T operator()(T arg) const { return arg; } +}; + /* Non-standard hash table * Buckets must be set at the beginning and must be greater than maximum number - * of elements, else an infinite loop happens. + * of elements, else it throws ProbingSizeException. * Memory management and initialization is externalized to make it easier to * serialize these to disk and load them quickly. * Uses linear probing to find value. * Only insert and lookup operations. */ -template <class PackingT, class HashT, class EqualT = std::equal_to<typename PackingT::Key> > class ProbingHashTable { +template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key> > class ProbingHashTable { public: - typedef PackingT Packing; - typedef typename Packing::Key Key; - typedef typename Packing::MutableIterator MutableIterator; - typedef typename Packing::ConstIterator ConstIterator; - + typedef EntryT Entry; + typedef typename Entry::Key Key; + typedef const Entry *ConstIterator; + typedef Entry *MutableIterator; typedef HashT Hash; typedef EqualT Equal; + public: static std::size_t Size(std::size_t entries, float multiplier) { - return std::max(entries + 1, static_cast<std::size_t>(multiplier * static_cast<float>(entries))) * Packing::kBytes; + std::size_t buckets = std::max(entries + 1, static_cast<std::size_t>(multiplier * static_cast<float>(entries))); + return buckets * sizeof(Entry); } // Must be assigned to later. @@ -49,9 +55,9 @@ template <class PackingT, class HashT, class EqualT = std::equal_to<typename Pac {} ProbingHashTable(void *start, std::size_t allocated, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) - : begin_(Packing::FromVoid(start)), - buckets_(allocated / Packing::kBytes), - end_(begin_ + (allocated / Packing::kBytes)), + : begin_(reinterpret_cast<MutableIterator>(start)), + buckets_(allocated / sizeof(Entry)), + end_(begin_ + buckets_), invalid_(invalid), hash_(hash_func), equal_(equal_func), @@ -62,11 +68,10 @@ template <class PackingT, class HashT, class EqualT = std::equal_to<typename Pac {} template <class T> MutableIterator Insert(const T &t) { - if (++entries_ >= buckets_) - UTIL_THROW(ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); #ifdef DEBUG assert(initialized_); #endif + UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); for (MutableIterator i(begin_ + (hash_(t.GetKey()) % buckets_));;) { if (equal_(i->GetKey(), invalid_)) { *i = t; return i; } if (++i == end_) { i = begin_; } @@ -84,7 +89,7 @@ template <class PackingT, class HashT, class EqualT = std::equal_to<typename Pac if (equal_(got, key)) { out = i; return true; } if (equal_(got, invalid_)) return false; if (++i == end_) i = begin_; - } + } } template <class Key> bool Find(const Key key, ConstIterator &out) const { diff --git a/util/probing_hash_table_test.cc b/util/probing_hash_table_test.cc index ff2f5af31..ef68e5f22 100644 --- a/util/probing_hash_table_test.cc +++ b/util/probing_hash_table_test.cc @@ -1,6 +1,6 @@ #include "util/probing_hash_table.hh" -#include "util/key_value_packing.hh" +#include <stdint.h> #define BOOST_TEST_MODULE ProbingHashTableTest #include <boost/test/unit_test.hpp> @@ -9,17 +9,34 @@ namespace util { namespace { -typedef AlignedPacking<char, uint64_t> Packing; -typedef ProbingHashTable<Packing, boost::hash<char> > Table; +struct Entry { + unsigned char key; + typedef unsigned char Key; + + unsigned char GetKey() const { + return key; + } + + uint64_t GetValue() const { + return value; + } + + uint64_t value; +}; + +typedef ProbingHashTable<Entry, boost::hash<unsigned char> > Table; BOOST_AUTO_TEST_CASE(simple) { char mem[Table::Size(10, 1.2)]; memset(mem, 0, sizeof(mem)); Table table(mem, sizeof(mem)); - Packing::ConstIterator i = Packing::ConstIterator(); + const Entry *i = NULL; BOOST_CHECK(!table.Find(2, i)); - table.Insert(Packing::Make(3, 328920)); + Entry to_ins; + to_ins.key = 3; + to_ins.value = 328920; + table.Insert(to_ins); BOOST_REQUIRE(table.Find(3, i)); BOOST_CHECK_EQUAL(3, i->GetKey()); BOOST_CHECK_EQUAL(static_cast<uint64_t>(328920), i->GetValue()); diff --git a/util/sorted_uniform.hh b/util/sorted_uniform.hh index 0391189f0..7700d9e64 100644 --- a/util/sorted_uniform.hh +++ b/util/sorted_uniform.hh @@ -122,99 +122,6 @@ template <class Iterator, class Accessor> Iterator BinaryBelow( return begin - 1; } -// To use this template, you need to define a Pivot function to match Key. -template <class PackingT> class SortedUniformMap { - public: - typedef PackingT Packing; - typedef typename Packing::ConstIterator ConstIterator; - typedef typename Packing::MutableIterator MutableIterator; - - struct Accessor { - public: - typedef typename Packing::Key Key; - const Key &operator()(const ConstIterator &i) const { return i->GetKey(); } - Key &operator()(const MutableIterator &i) const { return i->GetKey(); } - }; - - // Offer consistent API with probing hash. - static std::size_t Size(std::size_t entries, float /*ignore*/ = 0.0) { - return sizeof(uint64_t) + entries * Packing::kBytes; - } - - SortedUniformMap() -#ifdef DEBUG - : initialized_(false), loaded_(false) -#endif - {} - - SortedUniformMap(void *start, std::size_t /*allocated*/) : - begin_(Packing::FromVoid(reinterpret_cast<uint64_t*>(start) + 1)), - end_(begin_), size_ptr_(reinterpret_cast<uint64_t*>(start)) -#ifdef DEBUG - , initialized_(true), loaded_(false) -#endif - {} - - void LoadedBinary() { -#ifdef DEBUG - assert(initialized_); - assert(!loaded_); - loaded_ = true; -#endif - // Restore the size. - end_ = begin_ + *size_ptr_; - } - - // Caller responsible for not exceeding specified size. Do not call after FinishedInserting. - template <class T> void Insert(const T &t) { -#ifdef DEBUG - assert(initialized_); - assert(!loaded_); -#endif - *end_ = t; - ++end_; - } - - void FinishedInserting() { -#ifdef DEBUG - assert(initialized_); - assert(!loaded_); - loaded_ = true; -#endif - std::sort(begin_, end_); - *size_ptr_ = (end_ - begin_); - } - - // Don't use this to change the key. - template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) { -#ifdef DEBUG - assert(initialized_); - assert(loaded_); -#endif - return SortedUniformFind<MutableIterator, Accessor, Pivot64>(begin_, end_, key, out); - } - - // Do not call before FinishedInserting. - template <class Key> bool Find(const Key key, ConstIterator &out) const { -#ifdef DEBUG - assert(initialized_); - assert(loaded_); -#endif - return SortedUniformFind<ConstIterator, Accessor, Pivot64>(Accessor(), ConstIterator(begin_), ConstIterator(end_), key, out); - } - - ConstIterator begin() const { return begin_; } - ConstIterator end() const { return end_; } - - private: - typename Packing::MutableIterator begin_, end_; - uint64_t *size_ptr_; -#ifdef DEBUG - bool initialized_; - bool loaded_; -#endif -}; - } // namespace util #endif // UTIL_SORTED_UNIFORM__ diff --git a/util/sorted_uniform_test.cc b/util/sorted_uniform_test.cc index 4aa4c8aad..ac7a0bfc5 100644 --- a/util/sorted_uniform_test.cc +++ b/util/sorted_uniform_test.cc @@ -1,7 +1,5 @@ #include "util/sorted_uniform.hh" -#include "util/key_value_packing.hh" - #include <boost/random/mersenne_twister.hpp> #include <boost/random/uniform_int.hpp> #include <boost/random/variate_generator.hpp> @@ -17,74 +15,86 @@ namespace util { namespace { -template <class Map, class Key, class Value> void Check(const Map &map, const boost::unordered_map<Key, Value> &reference, const Key key) { +template <class KeyT, class ValueT> struct Entry { + typedef KeyT Key; + typedef ValueT Value; + + Key key; + Value value; + + Key GetKey() const { + return key; + } + + Value GetValue() const { + return value; + } + + bool operator<(const Entry<Key,Value> &other) const { + return key < other.key; + } +}; + +template <class KeyT> struct Accessor { + typedef KeyT Key; + template <class Value> Key operator()(const Entry<Key, Value> *entry) const { + return entry->GetKey(); + } +}; + +template <class Key, class Value> void Check(const Entry<Key, Value> *begin, const Entry<Key, Value> *end, const boost::unordered_map<Key, Value> &reference, const Key key) { typename boost::unordered_map<Key, Value>::const_iterator ref = reference.find(key); - typename Map::ConstIterator i = typename Map::ConstIterator(); + typedef const Entry<Key, Value> *It; + // g++ can't tell that require will crash and burn. + It i = NULL; + bool ret = SortedUniformFind<It, Accessor<Key>, Pivot64>(Accessor<Key>(), begin, end, key, i); if (ref == reference.end()) { - BOOST_CHECK(!map.Find(key, i)); + BOOST_CHECK(!ret); } else { - // g++ can't tell that require will crash and burn. - BOOST_REQUIRE(map.Find(key, i)); + BOOST_REQUIRE(ret); BOOST_CHECK_EQUAL(ref->second, i->GetValue()); } } -typedef SortedUniformMap<AlignedPacking<uint64_t, uint32_t> > TestMap; - BOOST_AUTO_TEST_CASE(empty) { - char buf[TestMap::Size(0)]; - TestMap map(buf, TestMap::Size(0)); - map.FinishedInserting(); - TestMap::ConstIterator i; - BOOST_CHECK(!map.Find(42, i)); -} - -BOOST_AUTO_TEST_CASE(one) { - char buf[TestMap::Size(1)]; - TestMap map(buf, sizeof(buf)); - Entry<uint64_t, uint32_t> e; - e.Set(42,2); - map.Insert(e); - map.FinishedInserting(); - TestMap::ConstIterator i = TestMap::ConstIterator(); - BOOST_REQUIRE(map.Find(42, i)); - BOOST_CHECK(i == map.begin()); - BOOST_CHECK(!map.Find(43, i)); - BOOST_CHECK(!map.Find(41, i)); + typedef const Entry<uint64_t, float> T; + const T *i; + bool ret = SortedUniformFind<const T*, Accessor<uint64_t>, Pivot64>(Accessor<uint64_t>(), (const T*)NULL, (const T*)NULL, (uint64_t)10, i); + BOOST_CHECK(!ret); } template <class Key> void RandomTest(Key upper, size_t entries, size_t queries) { typedef unsigned char Value; - typedef SortedUniformMap<AlignedPacking<Key, unsigned char> > Map; - boost::scoped_array<char> buffer(new char[Map::Size(entries)]); - Map map(buffer.get(), entries); boost::mt19937 rng; boost::uniform_int<Key> range_key(0, upper); boost::uniform_int<Value> range_value(0, 255); boost::variate_generator<boost::mt19937&, boost::uniform_int<Key> > gen_key(rng, range_key); boost::variate_generator<boost::mt19937&, boost::uniform_int<unsigned char> > gen_value(rng, range_value); + typedef Entry<Key, Value> Ent; + std::vector<Ent> backing; boost::unordered_map<Key, unsigned char> reference; - Entry<Key, unsigned char> ent; + Ent ent; for (size_t i = 0; i < entries; ++i) { Key key = gen_key(); unsigned char value = gen_value(); if (reference.insert(std::make_pair(key, value)).second) { - ent.Set(key, value); - map.Insert(Entry<Key, unsigned char>(ent)); + ent.key = key; + ent.value = value; + backing.push_back(ent); } } - map.FinishedInserting(); + std::sort(backing.begin(), backing.end()); // Random queries. for (size_t i = 0; i < queries; ++i) { const Key key = gen_key(); - Check<Map, Key, unsigned char>(map, reference, key); + Check<Key, unsigned char>(&*backing.begin(), &*backing.end(), reference, key); } typename boost::unordered_map<Key, unsigned char>::const_iterator it = reference.begin(); for (size_t i = 0; (i < queries) && (it != reference.end()); ++i, ++it) { - Check<Map, Key, unsigned char>(map, reference, it->second); + Check<Key, unsigned char>(&*backing.begin(), &*backing.end(), reference, it->second); } } diff --git a/util/tokenize_piece.hh b/util/tokenize_piece.hh index 413bda0b9..c7e1c8633 100644 --- a/util/tokenize_piece.hh +++ b/util/tokenize_piece.hh @@ -1,6 +1,7 @@ #ifndef UTIL_TOKENIZE_PIECE__ #define UTIL_TOKENIZE_PIECE__ +#include "util/exception.hh" #include "util/string_piece.hh" #include <boost/iterator/iterator_facade.hpp> @@ -8,63 +9,25 @@ #include <algorithm> #include <iostream> -/* Usage: - * - * for (PieceIterator<' '> i(" foo \r\n bar "); i; ++i) { - * std::cout << *i << "\n"; - * } - * - */ - namespace util { -// Tokenize a StringPiece using an iterator interface. boost::tokenizer doesn't work with StringPiece. -template <char d> class PieceIterator : public boost::iterator_facade<PieceIterator<d>, const StringPiece, boost::forward_traversal_tag> { +// Thrown on dereference when out of tokens to parse +class OutOfTokens : public Exception { public: - // Default construct is end, which is also accessed by kEndPieceIterator; - PieceIterator() {} - - explicit PieceIterator(const StringPiece &str) - : after_(str) { - increment(); - } + OutOfTokens() throw() {} + ~OutOfTokens() throw() {} +}; - bool operator!() const { - return after_.data() == 0; - } - operator bool() const { - return after_.data() != 0; - } +class SingleCharacter { + public: + explicit SingleCharacter(char delim) : delim_(delim) {} - static PieceIterator<d> end() { - return PieceIterator<d>(); + StringPiece Find(const StringPiece &in) const { + return StringPiece(std::find(in.data(), in.data() + in.size(), delim_), 1); } private: - friend class boost::iterator_core_access; - - void increment() { - const char *start = after_.data(); - for (; (start != after_.data() + after_.size()) && (d == *start); ++start) {} - if (start == after_.data() + after_.size()) { - // End condition. - after_.clear(); - return; - } - const char *finish = start; - for (; (finish != after_.data() + after_.size()) && (d != *finish); ++finish) {} - current_ = StringPiece(start, finish - start); - after_ = StringPiece(finish, after_.data() + after_.size() - finish); - } - - bool equal(const PieceIterator &other) const { - return after_.data() == other.after_.data(); - } - - const StringPiece &dereference() const { return current_; } - - StringPiece current_; - StringPiece after_; + char delim_; }; class MultiCharacter { @@ -95,7 +58,7 @@ template <class Find, bool SkipEmpty = false> class TokenIter : public boost::it public: TokenIter() {} - TokenIter(const StringPiece &str, const Find &finder) : after_(str), finder_(finder) { + template <class Construct> TokenIter(const StringPiece &str, const Construct &construct) : after_(str), finder_(construct) { increment(); } @@ -130,6 +93,7 @@ template <class Find, bool SkipEmpty = false> class TokenIter : public boost::it } const StringPiece &dereference() const { + UTIL_THROW_IF(!current_.data(), OutOfTokens, "Ran out of tokens"); return current_; } diff --git a/util/tokenize_piece_test.cc b/util/tokenize_piece_test.cc index e07ebcf5e..d856018fb 100644 --- a/util/tokenize_piece_test.cc +++ b/util/tokenize_piece_test.cc @@ -9,53 +9,7 @@ namespace util { namespace { -BOOST_AUTO_TEST_CASE(simple) { - PieceIterator<' '> it("single spaced words."); - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("single"), *it); - ++it; - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("spaced"), *it); - ++it; - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("words."), *it); - ++it; - BOOST_CHECK(!it); -} - -BOOST_AUTO_TEST_CASE(null_delimiter) { - const char str[] = "\0first\0\0second\0\0\0third\0fourth\0\0\0"; - PieceIterator<'\0'> it(StringPiece(str, sizeof(str) - 1)); - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("first"), *it); - ++it; - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("second"), *it); - ++it; - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("third"), *it); - ++it; - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("fourth"), *it); - ++it; - BOOST_CHECK(!it); -} - -BOOST_AUTO_TEST_CASE(null_entries) { - const char str[] = "\0split\0\0 \0me\0 "; - PieceIterator<' '> it(StringPiece(str, sizeof(str) - 1)); - BOOST_REQUIRE(it); - const char first[] = "\0split\0\0"; - BOOST_CHECK_EQUAL(StringPiece(first, sizeof(first) - 1), *it); - ++it; - BOOST_REQUIRE(it); - const char second[] = "\0me\0"; - BOOST_CHECK_EQUAL(StringPiece(second, sizeof(second) - 1), *it); - ++it; - BOOST_CHECK(!it); -} - -/*BOOST_AUTO_TEST_CASE(pipe_pipe_none) { +BOOST_AUTO_TEST_CASE(pipe_pipe_none) { const char str[] = "nodelimit at all"; TokenIter<MultiCharacter> it(str, MultiCharacter("|||")); BOOST_REQUIRE(it); @@ -79,7 +33,7 @@ BOOST_AUTO_TEST_CASE(remove_empty) { const char str[] = "|||"; TokenIter<MultiCharacter, true> it(str, MultiCharacter("|||")); BOOST_CHECK(!it); -}*/ +} BOOST_AUTO_TEST_CASE(remove_empty_keep) { const char str[] = " |||"; |