diff options
-rw-r--r-- | include/llfio/revision.hpp | 6 | ||||
-rw-r--r-- | include/llfio/v2.0/detail/impl/fast_random_file_handle.ipp | 88 | ||||
-rw-r--r-- | include/llfio/v2.0/fast_random_file_handle.hpp | 71 | ||||
-rw-r--r-- | test/tests/fast_random_file_handle.cpp | 39 |
4 files changed, 161 insertions, 43 deletions
diff --git a/include/llfio/revision.hpp b/include/llfio/revision.hpp index ee51dfb3..c6402746 100644 --- a/include/llfio/revision.hpp +++ b/include/llfio/revision.hpp @@ -1,4 +1,4 @@ // Note the second line of this file must ALWAYS be the git SHA, third line ALWAYS the git SHA update time -#define LLFIO_PREVIOUS_COMMIT_REF 5f576c797bfbadfbcc9092ce7e1b98089156eebc -#define LLFIO_PREVIOUS_COMMIT_DATE "2018-10-18 08:50:15 +00:00" -#define LLFIO_PREVIOUS_COMMIT_UNIQUE 5f576c79 +#define LLFIO_PREVIOUS_COMMIT_REF d5aa6ed95238f51a9184308f55b552fba836b50e +#define LLFIO_PREVIOUS_COMMIT_DATE "2018-10-19 15:59:03 +00:00" +#define LLFIO_PREVIOUS_COMMIT_UNIQUE d5aa6ed9 diff --git a/include/llfio/v2.0/detail/impl/fast_random_file_handle.ipp b/include/llfio/v2.0/detail/impl/fast_random_file_handle.ipp index 489e2c04..ca042427 100644 --- a/include/llfio/v2.0/detail/impl/fast_random_file_handle.ipp +++ b/include/llfio/v2.0/detail/impl/fast_random_file_handle.ipp @@ -24,19 +24,6 @@ Distributed under the Boost Software License, Version 1.0. #include "../../fast_random_file_handle.hpp" -#ifdef __has_include -#if __has_include("../../quickcpplib/include/algorithm/hash.hpp") -#include "../../quickcpplib/include/algorithm/hash.hpp" -#else -#include "quickcpplib/include/algorithm/hash.hpp" -#endif -#elif __PCPP_ALWAYS_TRUE__ -#include "quickcpplib/include/algorithm/hash.hpp" -#else -#include "../../quickcpplib/include/algorithm/hash.hpp" -#endif - - LLFIO_V2_NAMESPACE_EXPORT_BEGIN fast_random_file_handle::io_result<fast_random_file_handle::buffers_type> fast_random_file_handle::read(io_request<buffers_type> reqs, deadline /* unused */) noexcept @@ -50,7 +37,7 @@ fast_random_file_handle::io_result<fast_random_file_handle::buffers_type> fast_r } return std::move(reqs.buffers); } - extent_type hashoffset, togo = _length - reqs.offset; + extent_type togo = _length - reqs.offset; // Fill the scatter buffers for(auto &buffer : reqs.buffers) { @@ -62,10 +49,10 @@ fast_random_file_handle::io_result<fast_random_file_handle::buffers_type> fast_r { for(extent_type i = 0; i < buffer.size();) { - hashoffset = reqs.offset >> 4; // add this offset to the state - _iv[0] = hashoffset; - auto hash = QUICKCPPLIB_NAMESPACE::algorithm::hash::fast_hash::hash((const char *) _iv, 16); - extent_type thisblockoffset = reqs.offset - (hashoffset << 4), thisblocklen = 16 - thisblockoffset; + // How much can we do at once? + auto hashoffset = reqs.offset >> 2; // place this offset into the state + auto thisblockoffset = reqs.offset - (hashoffset << 2); // offset into our buffer due to request offset misalignment + extent_type thisblocklen = 64; // maximum possible block this iteration if(thisblocklen > buffer.size() - i) { thisblocklen = buffer.size() - i; @@ -74,7 +61,70 @@ fast_random_file_handle::io_result<fast_random_file_handle::buffers_type> fast_r { thisblocklen = togo; } - memcpy(buffer.data() + i, hash.as_bytes + thisblockoffset, thisblocklen); + + struct _p + { + uint32_t hash[16]; + } blk, *p = &blk; + // If not copying partial buffers + if(thisblockoffset == 0) + { + // If destination is on 4 byte multiple, we can write direct + if((((uintptr_t) buffer.data() + i) & 3) == 0) + p = (_p *) (buffer.data() + i); + } + if(thisblocklen == 64) + { + auto &hash = p->hash; + prng __prngs[16] = {_prng, _prng, _prng, _prng, _prng, _prng, _prng, _prng, _prng, _prng, _prng, _prng, _prng, _prng, _prng, _prng}; + hash[0] = __prngs[0](hashoffset + 0); + hash[1] = __prngs[1](hashoffset + 1); + hash[2] = __prngs[2](hashoffset + 2); + hash[3] = __prngs[3](hashoffset + 3); + hash[4] = __prngs[4](hashoffset + 4); + hash[5] = __prngs[5](hashoffset + 5); + hash[6] = __prngs[6](hashoffset + 6); + hash[7] = __prngs[7](hashoffset + 7); + hash[8] = __prngs[8](hashoffset + 8); + hash[9] = __prngs[9](hashoffset + 9); + hash[10] = __prngs[10](hashoffset + 10); + hash[11] = __prngs[11](hashoffset + 11); + hash[12] = __prngs[12](hashoffset + 12); + hash[13] = __prngs[13](hashoffset + 13); + hash[14] = __prngs[14](hashoffset + 14); + hash[15] = __prngs[15](hashoffset + 15); + if(thisblocklen > 64 - thisblockoffset) + { + thisblocklen = 64 - thisblockoffset; + } + } + else if(thisblocklen == 16) + { + auto &hash = p->hash; + prng __prngs[4] = {_prng, _prng, _prng, _prng}; + hash[0] = __prngs[0](hashoffset + 0); + hash[1] = __prngs[1](hashoffset + 1); + hash[2] = __prngs[2](hashoffset + 2); + hash[3] = __prngs[3](hashoffset + 3); + if(thisblocklen > 16 - thisblockoffset) + { + thisblocklen = 16 - thisblockoffset; + } + } + else + { + auto &hash = p->hash; + auto __prng(_prng); + hash[0] = __prng(hashoffset); + if(thisblocklen > 4 - thisblockoffset) + { + thisblocklen = 4 - thisblockoffset; + } + } + if(p == &blk) + { + memcpy(buffer.data() + i, ((const char *) p) + thisblockoffset, thisblocklen); + } reqs.offset += thisblocklen; i += thisblocklen; togo -= thisblocklen; diff --git a/include/llfio/v2.0/fast_random_file_handle.hpp b/include/llfio/v2.0/fast_random_file_handle.hpp index 9467f80d..fb51024f 100644 --- a/include/llfio/v2.0/fast_random_file_handle.hpp +++ b/include/llfio/v2.0/fast_random_file_handle.hpp @@ -27,6 +27,8 @@ Distributed under the Boost Software License, Version 1.0. #include "file_handle.hpp" +#include "quickcpplib/include/algorithm/small_prng.hpp" + //! \file fast_random_file_handle.hpp Provides `fast_random_file_handle`. LLFIO_V2_NAMESPACE_EXPORT_BEGIN @@ -35,18 +37,18 @@ LLFIO_V2_NAMESPACE_EXPORT_BEGIN \brief A handle to synthesised, non-cryptographic, pseudo-random data. This implementation of file handle provides read-only file data of random bits -based on Bob Jenkin's 128-bit SpookyHash (http://burtleburtle.net/bob/hash/spooky.html). -It works by initialising the hash state from the 128-bit seed you provide at construction, -and then for each 16 byte block it copies the 304 bytes of hash state, adds the scatter buffer -offset divided by 16 to the hash, generating a unique 128-bit hash for that seed and -file offset. SpookyHash is not a cryptographically secure hash, so do not use this -class for anything secure. However it certainly ought to produce very random data -indeed, as SpookyHash is an excellent quality fast hash. +based on Bob Jenkins' 32-bit JSF PRNG (http://burtleburtle.net/bob/rand/smallprng.html). +It works by initialising the prng state from the seed you provide at construction, +and then for each 4 byte block it copies the 16 bytes of prng state, overwrites the +first eight bytes with the offset divided by 4, and performs a PRNG round to generate +a fairly unique number. As there are eight bytes of randomness being mixed with eight +bytes of counter, this will not be a particularly random stream, but it's probably not +awful either. Note that writes to this handle are permitted if it was opened with write permission, but writes have no effect. -The use for a file handle full of very random data may not be obvious. The first is +The use for a file handle full of random data may not be obvious. The first is to obfuscate another file's data using `algorithm::xor_file_handle`. The second is for mock ups in testing, where this file handle stands in for some other (large) file, and you are testing throughput or latency in processing code. @@ -55,9 +57,22 @@ The third is for unit testing randomly corrupted file data. `algorithm::mix_file can randomly mix scatter gather buffers from this file handle into another file handle in order to test how well handling code copes with random data corruption. -VS2017: -- 416Mb/sec static function -- 539Mb/sec preinit, add, finalise +## Benchmarks: + +On a 3.1Ghz Intel Skylake CPU where `memcpy()` can do ~12Gb/sec: + +- GCC7: 4659 Mb/sec +- VS2017: 3653 Mb/sec + +The current implementation spots when it can do 16x simultaneous PRNG rounds, and thus +can fill a cache line at a time. The Skylake CPU used to benchmark the code dispatches +around four times the throughput with this, however there is likely still performance +left on the table. + +If someone were bothered to rewrite the JSF PRNG into SIMD, it is possible one could +approach `memcpy()` in performance. One would probably need to use AVX-512 however, +as the JSF PRNG makes heavy use of bit rotation, which is slow before AVX-512 as it +must be emulated with copious bit shifting and masking. */ class fast_random_file_handle : public file_handle { @@ -80,7 +95,27 @@ public: template <class T> using io_result = io_handle::io_result<T>; protected: - uint64_t _iv[12]{}; // sc_blockSize from SpookyHash + struct prng : public QUICKCPPLIB_NAMESPACE::algorithm::small_prng::small_prng + { + using _base = QUICKCPPLIB_NAMESPACE::algorithm::small_prng::small_prng; + prng() = default; + // Construct an instance with `seed` + explicit prng(span<const byte> seed) noexcept + { + a = 0xf1ea5eed; + b = c = d = 0xdeadbeef; + memcpy(&a, seed.data(), std::min(seed.size(), sizeof(*this))); + for(size_t i = 0; i < 20; ++i) + _base::operator()(); + } + // Return randomness for a given extent + _base::value_type operator()(extent_type offset) noexcept + { + a = offset & 0xffffffff; + b = (offset >> 32) & 0xffffffff; + return _base::operator()(); + } + } _prng; extent_type _length{0}; result<void> _perms_check() const noexcept @@ -94,16 +129,16 @@ protected: public: //! Default constructor - constexpr fast_random_file_handle() {} // NOLINT - //! Constructor. Seed can be of up to 88 bytes. + fast_random_file_handle() = default; + //! Constructor. Seed is not much use past sixteen bytes. fast_random_file_handle(extent_type length, span<const byte> seed) - : _length(length) + : _prng(seed) + , _length(length) { - memcpy(&_iv[1], seed.begin(), (seed.size() < (sizeof(_iv) - sizeof(_iv[0]))) ? seed.size() : (sizeof(_iv) - sizeof(_iv[0]))); } //! Implicit move construction of fast_random_file_handle permitted - fast_random_file_handle(fast_random_file_handle &&o) noexcept : _length(o._length) { memcpy(_iv, o._iv, sizeof(_iv)); } + fast_random_file_handle(fast_random_file_handle &&o) noexcept : _prng(o._prng), _length(o._length) {} //! No copy construction (use `clone()`) fast_random_file_handle(const fast_random_file_handle &) = delete; //! Move assignment of fast_random_file_handle permitted @@ -136,7 +171,7 @@ public: { return errc::invalid_argument; } - byte _seed[88]; + byte _seed[16]; if(seed.empty()) { utils::random_fill((char *) _seed, sizeof(_seed)); diff --git a/test/tests/fast_random_file_handle.cpp b/test/tests/fast_random_file_handle.cpp index acd8ceb0..d528c44f 100644 --- a/test/tests/fast_random_file_handle.cpp +++ b/test/tests/fast_random_file_handle.cpp @@ -74,10 +74,43 @@ static inline void TestFastRandomFileHandlePerformance() memset(store.begin(), 1, store.size()); fast_random_file_handle h = fast_random_file_handle::fast_random_file(testbytes).value(); auto begin = std::chrono::high_resolution_clock::now(); - h.read(0, {{store.begin(), store.size()}}).value(); + while(std::chrono::duration_cast<std::chrono::seconds>(std::chrono::high_resolution_clock::now() - begin).count() < 1) + ; + QUICKCPPLIB_NAMESPACE::algorithm::small_prng::small_prng prng; + begin = std::chrono::high_resolution_clock::now(); + for(size_t n = 0; n < 10; n++) + { + for(size_t i = 0; i < store.size(); i += 64) + { + *(uint32_t *) (&store[i + 0]) = prng(); + *(uint32_t *) (&store[i + 4]) = prng(); + *(uint32_t *) (&store[i + 8]) = prng(); + *(uint32_t *) (&store[i + 12]) = prng(); + *(uint32_t *) (&store[i + 16]) = prng(); + *(uint32_t *) (&store[i + 20]) = prng(); + *(uint32_t *) (&store[i + 24]) = prng(); + *(uint32_t *) (&store[i + 28]) = prng(); + *(uint32_t *) (&store[i + 32]) = prng(); + *(uint32_t *) (&store[i + 36]) = prng(); + *(uint32_t *) (&store[i + 40]) = prng(); + *(uint32_t *) (&store[i + 44]) = prng(); + *(uint32_t *) (&store[i + 48]) = prng(); + *(uint32_t *) (&store[i + 52]) = prng(); + *(uint32_t *) (&store[i + 56]) = prng(); + *(uint32_t *) (&store[i + 60]) = prng(); + } + } auto end = std::chrono::high_resolution_clock::now(); - auto diff = std::chrono::duration_cast<std::chrono::microseconds>(end - begin); - std::cout << "fast_random_file_handle produces randomness at " << ((testbytes / 1024.0 / 1024.0) / (diff.count() / 1000000.0)) << " Mb/sec" << std::endl; + auto diff1 = std::chrono::duration_cast<std::chrono::microseconds>(end - begin); + begin = std::chrono::high_resolution_clock::now(); + for(size_t n = 0; n < 10; n++) + { + h.read(0, {{store.begin(), store.size()}}).value(); + } + end = std::chrono::high_resolution_clock::now(); + auto diff2 = std::chrono::duration_cast<std::chrono::microseconds>(end - begin); + std::cout << "small_prng produces randomness at " << ((testbytes / 1024.0 / 1024.0) / (diff1.count() / 10000000.0)) << " Mb/sec" << std::endl; + std::cout << "fast_random_file_handle produces randomness at " << ((testbytes / 1024.0 / 1024.0) / (diff2.count() / 10000000.0)) << " Mb/sec" << std::endl; } KERNELTEST_TEST_KERNEL(integration, llfio, fast_random_file_handle, works, "Tests that fast random file handle works as expected", TestFastRandomFileHandleWorks()) |