From 67226948b9f00aebbf33c232d10c417ba1abb289 Mon Sep 17 00:00:00 2001 From: "Niall Douglas (s [underscore] sourceforge {at} nedprod [dot] com)" Date: Tue, 16 Mar 2021 12:31:35 +0000 Subject: Benchmarks now show LLFIO dynamic_thread_pool_group either matching or significantly beating all other alternatives in performance. --- CMakeLists.txt | 8 +- .../v2.0/detail/impl/dynamic_thread_pool_group.ipp | 259 +++++++++++---------- include/llfio/v2.0/dynamic_thread_pool_group.hpp | 31 ++- include/llfio/v2.0/llfio.hpp | 10 +- 4 files changed, 173 insertions(+), 135 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 11750c94..aa6e8d20 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,11 +28,15 @@ include(QuickCppLibUtils) include(QuickCppLibPolicies) option(LLFIO_USE_EXPERIMENTAL_SG14_STATUS_CODE "Whether to use SG14 status_code for failure handling" OFF) -option(LLFIO_DISABLE_LIBDISPATCH "Whether to disable automatic discovery of libdispatch/Grand Unified Dispatch" OFF) option(LLFIO_ENABLE_DEPENDENCY_SMOKE_TEST "Whether to build executables which are smoke tests that LLFIO is fully working. Used by various package managers such as vcpkg." OFF) option(LLFIO_ASSUME_CROSS_COMPILING "Whether to assume we are cross compiling. Normally automatically detected, but if automatic detection doesn't work, a working will not be found during cmake configure." OFF) option(UNIT_TESTS_BUILD_ALL "Whether to run all of the unit test suite." 
OFF) set(UNIT_TESTS_CXX_VERSION "latest" CACHE STRING "The version of C++ to use in the header-only unit tests") +if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD" OR APPLE) + option(LLFIO_USE_LIBDISPATCH "Whether to use libdispatch/Grand Unified Dispatch (defaults on on BSD/Mac OS)" ON) +else() + option(LLFIO_USE_LIBDISPATCH "Whether to use libdispatch/Grand Unified Dispatch (defaults on on BSD/Mac OS)" OFF) +endif() ensure_git_subrepo("${CMAKE_CURRENT_SOURCE_DIR}/include/llfio/ntkernel-error-category/include" "https://github.com/ned14/ntkernel-error-category.git") @@ -285,7 +289,7 @@ int main() { endif() endif() # Do we have Grand Central Dispatch on this platform? -if(NOT LLFIO_DISABLE_LIBDISPATCH) +if(LLFIO_USE_LIBDISPATCH) function(check_have_libdispatch postfix) set(CMAKE_REQUIRED_LIBRARIES ${ARGN}) check_cxx_source_compiles(" diff --git a/include/llfio/v2.0/detail/impl/dynamic_thread_pool_group.ipp b/include/llfio/v2.0/detail/impl/dynamic_thread_pool_group.ipp index e3c5c347..9dff9425 100644 --- a/include/llfio/v2.0/detail/impl/dynamic_thread_pool_group.ipp +++ b/include/llfio/v2.0/detail/impl/dynamic_thread_pool_group.ipp @@ -67,141 +67,158 @@ Distributed under the Boost Software License, Version 1.0. #define LLFIO_DYNAMIC_THREAD_POOL_GROUP_PRINTING 0 -/* NOTE that the Linux results are from a VM with half the CPUs of the Windows results, +/* NOTE that the Linux results are from a VM on the same machine as the Windows results, so they are not directly comparable. Linux 4Kb and 64Kb Benchmarking asio ... - For 1 work items got 33635.5 SHA256 hashes/sec with 1 maximum concurrency. - For 2 work items got 59420.4 SHA256 hashes/sec with 2 maximum concurrency. - For 4 work items got 65653.8 SHA256 hashes/sec with 4 maximum concurrency. - For 8 work items got 42216.3 SHA256 hashes/sec with 8 maximum concurrency. - For 16 work items got 458911 SHA256 hashes/sec with 16 maximum concurrency. - For 32 work items got 578462 SHA256 hashes/sec with 27 maximum concurrency. 
- For 64 work items got 572456 SHA256 hashes/sec with 27 maximum concurrency. - For 128 work items got 572326 SHA256 hashes/sec with 26 maximum concurrency. - For 256 work items got 568558 SHA256 hashes/sec with 25 maximum concurrency. - For 512 work items got 570342 SHA256 hashes/sec with 26 maximum concurrency. - For 1024 work items got 567351 SHA256 hashes/sec with 26 maximum concurrency. + For 1 work items got 38182.6 SHA256 hashes/sec with 1 maximum concurrency. + For 2 work items got 68664 SHA256 hashes/sec with 2 maximum concurrency. + For 4 work items got 87036.4 SHA256 hashes/sec with 4 maximum concurrency. + For 8 work items got 78702.2 SHA256 hashes/sec with 8 maximum concurrency. + For 16 work items got 51911.2 SHA256 hashes/sec with 16 maximum concurrency. + For 32 work items got 553964 SHA256 hashes/sec with 31 maximum concurrency. + For 64 work items got 713844 SHA256 hashes/sec with 36 maximum concurrency. + For 128 work items got 700172 SHA256 hashes/sec with 37 maximum concurrency. + For 256 work items got 716099 SHA256 hashes/sec with 37 maximum concurrency. + For 512 work items got 703323 SHA256 hashes/sec with 37 maximum concurrency. + For 1024 work items got 722827 SHA256 hashes/sec with 38 maximum concurrency. Benchmarking asio ... - For 1 work items got 3768.07 SHA256 hashes/sec with 1 maximum concurrency. - For 2 work items got 7672.47 SHA256 hashes/sec with 2 maximum concurrency. - For 4 work items got 14169.5 SHA256 hashes/sec with 4 maximum concurrency. - For 8 work items got 21785.9 SHA256 hashes/sec with 8 maximum concurrency. - For 16 work items got 30875 SHA256 hashes/sec with 16 maximum concurrency. - For 32 work items got 43614.4 SHA256 hashes/sec with 32 maximum concurrency. - For 64 work items got 46075.4 SHA256 hashes/sec with 32 maximum concurrency. - For 128 work items got 47111.6 SHA256 hashes/sec with 32 maximum concurrency. - For 256 work items got 45926.6 SHA256 hashes/sec with 32 maximum concurrency. 
- For 512 work items got 45923.9 SHA256 hashes/sec with 32 maximum concurrency. - For 1024 work items got 46250.9 SHA256 hashes/sec with 32 maximum concurrency. + For 1 work items got 3917.88 SHA256 hashes/sec with 1 maximum concurrency. + For 2 work items got 7798.29 SHA256 hashes/sec with 2 maximum concurrency. + For 4 work items got 14395.2 SHA256 hashes/sec with 4 maximum concurrency. + For 8 work items got 23633.4 SHA256 hashes/sec with 8 maximum concurrency. + For 16 work items got 31771.1 SHA256 hashes/sec with 16 maximum concurrency. + For 32 work items got 57978 SHA256 hashes/sec with 32 maximum concurrency. + For 64 work items got 66200.6 SHA256 hashes/sec with 64 maximum concurrency. + For 128 work items got 65706.5 SHA256 hashes/sec with 64 maximum concurrency. + For 256 work items got 65717.5 SHA256 hashes/sec with 64 maximum concurrency. + For 512 work items got 65652.4 SHA256 hashes/sec with 64 maximum concurrency. + For 1024 work items got 65580.3 SHA256 hashes/sec with 64 maximum concurrency. Windows 4Kb and 64kB Benchmarking asio ... - For 1 work items got 49443.6 SHA256 hashes/sec with 1 maximum concurrency. - For 2 work items got 97189 SHA256 hashes/sec with 2 maximum concurrency. - For 4 work items got 185187 SHA256 hashes/sec with 4 maximum concurrency. - For 8 work items got 328105 SHA256 hashes/sec with 8 maximum concurrency. - For 16 work items got 513294 SHA256 hashes/sec with 16 maximum concurrency. - For 32 work items got 493040 SHA256 hashes/sec with 32 maximum concurrency. - For 64 work items got 1.00736e+06 SHA256 hashes/sec with 64 maximum concurrency. - For 128 work items got 996193 SHA256 hashes/sec with 64 maximum concurrency. - For 256 work items got 993805 SHA256 hashes/sec with 64 maximum concurrency. - For 512 work items got 998211 SHA256 hashes/sec with 64 maximum concurrency. - For 1024 work items got 990231 SHA256 hashes/sec with 64 maximum concurrency. 
+ For 1 work items got 51216.7 SHA256 hashes/sec with 1 maximum concurrency. + For 2 work items got 97691 SHA256 hashes/sec with 2 maximum concurrency. + For 4 work items got 184381 SHA256 hashes/sec with 4 maximum concurrency. + For 8 work items got 305270 SHA256 hashes/sec with 8 maximum concurrency. + For 16 work items got 520728 SHA256 hashes/sec with 16 maximum concurrency. + For 32 work items got 482729 SHA256 hashes/sec with 32 maximum concurrency. + For 64 work items got 1.02629e+06 SHA256 hashes/sec with 64 maximum concurrency. + For 128 work items got 1.01816e+06 SHA256 hashes/sec with 64 maximum concurrency. + For 256 work items got 1.01672e+06 SHA256 hashes/sec with 64 maximum concurrency. + For 512 work items got 1.01727e+06 SHA256 hashes/sec with 64 maximum concurrency. + For 1024 work items got 1.01477e+06 SHA256 hashes/sec with 64 maximum concurrency. Benchmarking asio ... - For 1 work items got 3797.05 SHA256 hashes/sec with 1 maximum concurrency. - For 2 work items got 7869.94 SHA256 hashes/sec with 2 maximum concurrency. - For 4 work items got 15612 SHA256 hashes/sec with 4 maximum concurrency. - For 8 work items got 28481.1 SHA256 hashes/sec with 8 maximum concurrency. - For 16 work items got 41255.2 SHA256 hashes/sec with 16 maximum concurrency. - For 32 work items got 66182.4 SHA256 hashes/sec with 32 maximum concurrency. - For 64 work items got 67230.5 SHA256 hashes/sec with 64 maximum concurrency. - For 128 work items got 66988.5 SHA256 hashes/sec with 64 maximum concurrency. - For 256 work items got 66926.1 SHA256 hashes/sec with 64 maximum concurrency. - For 512 work items got 66964.7 SHA256 hashes/sec with 64 maximum concurrency. - For 1024 work items got 66911 SHA256 hashes/sec with 64 maximum concurrency. + For 1 work items got 4069.92 SHA256 hashes/sec with 1 maximum concurrency. + For 2 work items got 8099.1 SHA256 hashes/sec with 2 maximum concurrency. + For 4 work items got 16021.7 SHA256 hashes/sec with 4 maximum concurrency. 
+ For 8 work items got 30275.2 SHA256 hashes/sec with 8 maximum concurrency. + For 16 work items got 40972.5 SHA256 hashes/sec with 16 maximum concurrency. + For 32 work items got 70919.2 SHA256 hashes/sec with 32 maximum concurrency. + For 64 work items got 71917 SHA256 hashes/sec with 64 maximum concurrency. + For 128 work items got 71111.8 SHA256 hashes/sec with 64 maximum concurrency. + For 256 work items got 70963.5 SHA256 hashes/sec with 64 maximum concurrency. + For 512 work items got 70956.3 SHA256 hashes/sec with 64 maximum concurrency. + For 1024 work items got 70989.9 SHA256 hashes/sec with 64 maximum concurrency. */ + /* Linux 4Kb and 64Kb libdispatch Benchmarking llfio (Grand Central Dispatch) ... - For 1 work items got 32058.2 SHA256 hashes/sec with 1 maximum concurrency. - For 2 work items got 26084.1 SHA256 hashes/sec with 2 maximum concurrency. - For 4 work items got 24906.8 SHA256 hashes/sec with 4 maximum concurrency. - For 8 work items got 24729.5 SHA256 hashes/sec with 8 maximum concurrency. - For 16 work items got 73749.1 SHA256 hashes/sec with 16 maximum concurrency. - For 32 work items got 526656 SHA256 hashes/sec with 21 maximum concurrency. - For 64 work items got 535043 SHA256 hashes/sec with 27 maximum concurrency. - For 128 work items got 541809 SHA256 hashes/sec with 30 maximum concurrency. - For 256 work items got 543568 SHA256 hashes/sec with 33 maximum concurrency. - For 512 work items got 545540 SHA256 hashes/sec with 37 maximum concurrency. - For 1024 work items got 542017 SHA256 hashes/sec with 41 maximum concurrency. + For 1 work items got 33942.7 SHA256 hashes/sec with 1 maximum concurrency. + For 2 work items got 91275.8 SHA256 hashes/sec with 2 maximum concurrency. + For 4 work items got 191446 SHA256 hashes/sec with 4 maximum concurrency. + For 8 work items got 325776 SHA256 hashes/sec with 8 maximum concurrency. + For 16 work items got 405282 SHA256 hashes/sec with 16 maximum concurrency. 
+ For 32 work items got 408015 SHA256 hashes/sec with 31 maximum concurrency. + For 64 work items got 412343 SHA256 hashes/sec with 32 maximum concurrency. + For 128 work items got 450024 SHA256 hashes/sec with 41 maximum concurrency. + For 256 work items got 477885 SHA256 hashes/sec with 46 maximum concurrency. + For 512 work items got 531752 SHA256 hashes/sec with 48 maximum concurrency. + For 1024 work items got 608181 SHA256 hashes/sec with 44 maximum concurrency. Benchmarking llfio (Grand Central Dispatch) ... - For 1 work items got 3857.82 SHA256 hashes/sec with 1 maximum concurrency. - For 2 work items got 7666.2 SHA256 hashes/sec with 2 maximum concurrency. - For 4 work items got 14993.6 SHA256 hashes/sec with 4 maximum concurrency. - For 8 work items got 25160 SHA256 hashes/sec with 8 maximum concurrency. - For 16 work items got 39015.5 SHA256 hashes/sec with 16 maximum concurrency. - For 32 work items got 43494.4 SHA256 hashes/sec with 16 maximum concurrency. - For 64 work items got 42874.5 SHA256 hashes/sec with 16 maximum concurrency. - For 128 work items got 42678.7 SHA256 hashes/sec with 16 maximum concurrency. - For 256 work items got 42661.7 SHA256 hashes/sec with 16 maximum concurrency. - For 512 work items got 42670.9 SHA256 hashes/sec with 16 maximum concurrency. - For 1024 work items got 44609.5 SHA256 hashes/sec with 16 maximum concurrency. + For 1 work items got 3977.21 SHA256 hashes/sec with 1 maximum concurrency. + For 2 work items got 7980.09 SHA256 hashes/sec with 2 maximum concurrency. + For 4 work items got 15075.6 SHA256 hashes/sec with 4 maximum concurrency. + For 8 work items got 24427.3 SHA256 hashes/sec with 8 maximum concurrency. + For 16 work items got 41858.7 SHA256 hashes/sec with 16 maximum concurrency. + For 32 work items got 64896.4 SHA256 hashes/sec with 32 maximum concurrency. + For 64 work items got 65683.6 SHA256 hashes/sec with 34 maximum concurrency. 
+ For 128 work items got 65476.1 SHA256 hashes/sec with 35 maximum concurrency. + For 256 work items got 65210.6 SHA256 hashes/sec with 36 maximum concurrency. + For 512 work items got 65241.1 SHA256 hashes/sec with 36 maximum concurrency. + For 1024 work items got 65205.3 SHA256 hashes/sec with 37 maximum concurrency. */ -/* Windows 4Kb and 64Kb Win32 thread pool - -Benchmarking llfio (Win32 thread pool (Vista+)) ... - For 1 work items got 56553.8 SHA256 hashes/sec with 1 maximum concurrency. - For 2 work items got 110711 SHA256 hashes/sec with 2 maximum concurrency. - For 4 work items got 207273 SHA256 hashes/sec with 4 maximum concurrency. - For 8 work items got 269391 SHA256 hashes/sec with 8 maximum concurrency. - For 16 work items got 245053 SHA256 hashes/sec with 16 maximum concurrency. - For 32 work items got 260854 SHA256 hashes/sec with 21 maximum concurrency. - For 64 work items got 402240 SHA256 hashes/sec with 19 maximum concurrency. - For 128 work items got 400192 SHA256 hashes/sec with 19 maximum concurrency. - For 256 work items got 405973 SHA256 hashes/sec with 20 maximum concurrency. - For 512 work items got 406156 SHA256 hashes/sec with 22 maximum concurrency. - For 1024 work items got 405901 SHA256 hashes/sec with 23 maximum concurrency. - -Benchmarking llfio (Win32 thread pool (Vista+)) ... - For 1 work items got 4020.88 SHA256 hashes/sec with 1 maximum concurrency. - For 2 work items got 8028.79 SHA256 hashes/sec with 2 maximum concurrency. - For 4 work items got 15813 SHA256 hashes/sec with 4 maximum concurrency. - For 8 work items got 25539.4 SHA256 hashes/sec with 8 maximum concurrency. - For 16 work items got 40522.3 SHA256 hashes/sec with 16 maximum concurrency. - For 32 work items got 65182 SHA256 hashes/sec with 32 maximum concurrency. - For 64 work items got 65572.9 SHA256 hashes/sec with 33 maximum concurrency. - For 128 work items got 66462.3 SHA256 hashes/sec with 33 maximum concurrency. 
- For 256 work items got 66315.3 SHA256 hashes/sec with 33 maximum concurrency. - For 512 work items got 66341.5 SHA256 hashes/sec with 33 maximum concurrency. - For 1024 work items got 66416.2 SHA256 hashes/sec with 33 maximum concurrency. +/* Linux 4Kb and 64Kb native + +Benchmarking llfio (Linux native) ... + For 1 work items got 65160.3 SHA256 hashes/sec with 1 maximum concurrency. + For 2 work items got 126586 SHA256 hashes/sec with 2 maximum concurrency. + For 4 work items got 246616 SHA256 hashes/sec with 4 maximum concurrency. + For 8 work items got 478938 SHA256 hashes/sec with 8 maximum concurrency. + For 16 work items got 529919 SHA256 hashes/sec with 15 maximum concurrency. + For 32 work items got 902885 SHA256 hashes/sec with 32 maximum concurrency. + For 64 work items got 919633 SHA256 hashes/sec with 34 maximum concurrency. + For 128 work items got 919695 SHA256 hashes/sec with 35 maximum concurrency. + For 256 work items got 923159 SHA256 hashes/sec with 36 maximum concurrency. + For 512 work items got 922961 SHA256 hashes/sec with 37 maximum concurrency. + For 1024 work items got 926624 SHA256 hashes/sec with 38 maximum concurrency. + +Benchmarking llfio (Linux native) ... + For 1 work items got 4193.79 SHA256 hashes/sec with 1 maximum concurrency. + For 2 work items got 8422.44 SHA256 hashes/sec with 2 maximum concurrency. + For 4 work items got 12521.7 SHA256 hashes/sec with 3 maximum concurrency. + For 8 work items got 20028.4 SHA256 hashes/sec with 6 maximum concurrency. + For 16 work items got 30657.4 SHA256 hashes/sec with 10 maximum concurrency. + For 32 work items got 53217.4 SHA256 hashes/sec with 20 maximum concurrency. + For 64 work items got 65452.3 SHA256 hashes/sec with 32 maximum concurrency. + For 128 work items got 65396.3 SHA256 hashes/sec with 32 maximum concurrency. + For 256 work items got 65363.7 SHA256 hashes/sec with 32 maximum concurrency. + For 512 work items got 65198.2 SHA256 hashes/sec with 32 maximum concurrency. 
+ For 1024 work items got 65003.9 SHA256 hashes/sec with 34 maximum concurrency. */ -/* The Win32 thread pool numbers match those for ASIO on Windows for 64Kb SHA256 -so the base implementation is probably good enough. - -1. Need multiple work queues, with speculative locking for insert/remove. -2. Pumping timers needs to not be in work queue loop: - - If there is a waiting thread, it can pump timers. - - - Otherwise a separate timer thread would need to be launched. +/* Windows 4Kb and 64Kb Win32 thread pool -3. List counts for doubly linked lists need optional atomic count, so add fake -atomic type. +Benchmarking llfio (Win32 thread pool (Vista+)) ... + For 1 work items got 57995.3 SHA256 hashes/sec with 1 maximum concurrency. + For 2 work items got 120267 SHA256 hashes/sec with 2 maximum concurrency. + For 4 work items got 238139 SHA256 hashes/sec with 4 maximum concurrency. + For 8 work items got 413488 SHA256 hashes/sec with 8 maximum concurrency. + For 16 work items got 575423 SHA256 hashes/sec with 16 maximum concurrency. + For 32 work items got 720938 SHA256 hashes/sec with 31 maximum concurrency. + For 64 work items got 703460 SHA256 hashes/sec with 30 maximum concurrency. + For 128 work items got 678257 SHA256 hashes/sec with 29 maximum concurrency. + For 256 work items got 678898 SHA256 hashes/sec with 29 maximum concurrency. + For 512 work items got 671729 SHA256 hashes/sec with 28 maximum concurrency. + For 1024 work items got 674433 SHA256 hashes/sec with 30 maximum concurrency. +Benchmarking llfio (Win32 thread pool (Vista+)) ... + For 1 work items got 4132.18 SHA256 hashes/sec with 1 maximum concurrency. + For 2 work items got 8197.21 SHA256 hashes/sec with 2 maximum concurrency. + For 4 work items got 16281.3 SHA256 hashes/sec with 4 maximum concurrency. + For 8 work items got 27447.5 SHA256 hashes/sec with 8 maximum concurrency. + For 16 work items got 42621.3 SHA256 hashes/sec with 16 maximum concurrency. 
+ For 32 work items got 69857.7 SHA256 hashes/sec with 32 maximum concurrency. + For 64 work items got 68797.9 SHA256 hashes/sec with 33 maximum concurrency. + For 128 work items got 68980.4 SHA256 hashes/sec with 33 maximum concurrency. + For 256 work items got 70370.8 SHA256 hashes/sec with 33 maximum concurrency. + For 512 work items got 70365.8 SHA256 hashes/sec with 33 maximum concurrency. + For 1024 work items got 70794.6 SHA256 hashes/sec with 33 maximum concurrency. */ + LLFIO_V2_NAMESPACE_BEGIN namespace detail @@ -210,6 +227,7 @@ namespace detail { using std::unique_lock::unique_lock; }; +#if 0 template class fake_atomic { T _v; @@ -232,6 +250,7 @@ namespace detail return _v + v; } }; +#endif struct global_dynamic_thread_pool_impl_workqueue_item { const size_t nesting_level; @@ -1167,8 +1186,7 @@ namespace detail return success(); } - inline void _submit_work_item(dynamic_thread_pool_group_impl_guard &g, bool submit_into_highest_priority, dynamic_thread_pool_group::work_item *workitem, - bool defer_pool_wake); + inline void _submit_work_item(bool submit_into_highest_priority, dynamic_thread_pool_group::work_item *workitem, bool defer_pool_wake); inline result submit(dynamic_thread_pool_group_impl_guard &g, dynamic_thread_pool_group_impl *group, span work) noexcept; @@ -1625,10 +1643,9 @@ namespace detail } #endif - inline void global_dynamic_thread_pool_impl::_submit_work_item(dynamic_thread_pool_group_impl_guard &g, bool submit_into_highest_priority, - dynamic_thread_pool_group::work_item *workitem, bool defer_pool_wake) + inline void global_dynamic_thread_pool_impl::_submit_work_item(bool submit_into_highest_priority, dynamic_thread_pool_group::work_item *workitem, + bool defer_pool_wake) { - (void) g; (void) submit_into_highest_priority; (void) defer_pool_wake; if(workitem->_nextwork != -1) @@ -1793,7 +1810,6 @@ namespace detail const auto active_work_items = total_submitted_workitems.fetch_add(1, std::memory_order_relaxed) + 1; if(!defer_pool_wake) { 
- g.unlock(); // unlock group { threadpool_guard gg(threadpool_lock); if(threadpool_active.count == 0 && threadpool_sleeping.count == 0) @@ -1813,7 +1829,6 @@ namespace detail } } } - g.lock(); // lock group } #endif } @@ -1882,6 +1897,7 @@ namespace detail } } uninit.release(); + g.unlock(); { for(auto *i : work) { @@ -1889,7 +1905,7 @@ namespace detail group->_newly_added_active_work_items++; group->_active_work_items_remaining++; #endif - _submit_work_item(g, true, i, i != work.back()); + _submit_work_item(true, i, i != work.back()); } } return success(); @@ -2237,10 +2253,10 @@ namespace detail assert(workitem->_nextwork != -1); assert(workitem->_has_timer_set()); auto *parent = workitem->_parent.load(std::memory_order_relaxed); - dynamic_thread_pool_group_impl_guard g(parent->_lock); // lock group // std::cout << "*** _timerthread " << workitem << std::endl; if(parent->_stopping.load(std::memory_order_relaxed)) { + dynamic_thread_pool_group_impl_guard g(parent->_lock); // lock group _work_item_done(g, workitem); return; } @@ -2251,7 +2267,7 @@ namespace detail if(workitem->_timepoint1 - now > std::chrono::seconds(0)) { // Timer fired short, so schedule it again - _submit_work_item(g, false, workitem, false); + _submit_work_item(false, workitem, false); return; } #endif @@ -2264,7 +2280,7 @@ namespace detail if(workitem->_timepoint2 - now > std::chrono::seconds(0)) { // Timer fired short, so schedule it again - _submit_work_item(g, false, workitem, false); + _submit_work_item(false, workitem, false); return; } #endif @@ -2278,19 +2294,21 @@ namespace detail auto r2 = _prepare_work_item_delay(workitem, parent->_grouph, d); if(!r2) { + dynamic_thread_pool_group_impl_guard g(parent->_lock); // lock group (void) stop(g, parent, std::move(r2)); _work_item_done(g, workitem); return; } if(-1 == workitem->_nextwork) { + dynamic_thread_pool_group_impl_guard g(parent->_lock); // lock group _work_item_done(g, workitem); return; } - _submit_work_item(g, false, workitem, 
false); + _submit_work_item(false, workitem, false); return; } - _submit_work_item(g, false, workitem, false); + _submit_work_item(false, workitem, false); } // Worker thread entry point @@ -2318,16 +2336,17 @@ namespace detail auto r = (*workitem)(workitem->_nextwork); workitem->_nextwork = 0; // call next() next time tls = old_thread_local_state; - dynamic_thread_pool_group_impl_guard g(parent->_lock); // lock group // std::cout << "*** _workerthread " << workitem << " ends with work " << workitem->_nextwork << std::endl; if(!r) { + dynamic_thread_pool_group_impl_guard g(parent->_lock); // lock group (void) stop(g, parent, std::move(r)); _work_item_done(g, workitem); workitem = nullptr; } else if(parent->_stopping.load(std::memory_order_relaxed)) { + dynamic_thread_pool_group_impl_guard g(parent->_lock); // lock group _work_item_done(g, workitem); } else @@ -2337,16 +2356,18 @@ namespace detail auto r2 = _prepare_work_item_delay(workitem, parent->_grouph, d); if(!r2) { + dynamic_thread_pool_group_impl_guard g(parent->_lock); // lock group (void) stop(g, parent, std::move(r2)); _work_item_done(g, workitem); return; } if(-1 == workitem->_nextwork) { + dynamic_thread_pool_group_impl_guard g(parent->_lock); // lock group _work_item_done(g, workitem); return; } - _submit_work_item(g, false, workitem, false); + _submit_work_item(false, workitem, false); } } } // namespace detail diff --git a/include/llfio/v2.0/dynamic_thread_pool_group.hpp b/include/llfio/v2.0/dynamic_thread_pool_group.hpp index 2a549efa..2f89ce98 100644 --- a/include/llfio/v2.0/dynamic_thread_pool_group.hpp +++ b/include/llfio/v2.0/dynamic_thread_pool_group.hpp @@ -135,28 +135,32 @@ of the initial `make_dynamic_thread_pool_group()`. The Win32 thread pool API may perform dynamic memory allocation internally, but that is outside our control. +Overhead of LLFIO above the Win32 thread pool API is very low, statistically +unmeasurable. 
+ ### POSIX -If an installation of libdispatch is detected by LLFIO cmake during -configuration, it is used preferentially. libdispatch is better known as +If not on Linux, you will need libdispatch, which is detected by LLFIO cmake +during configuration. libdispatch is better known as Grand Central Dispatch, originally a Mac OS technology but since ported to a high quality kernel based implementation on recent FreeBSDs, and to a lower quality userspace based implementation on Linux. Generally libdispatch should get automatically found on Mac OS without additional effort; on FreeBSD it may need installing from ports; on Linux you would need to explicitly install `libdispatch-dev` or the equivalent. You can -disable the automatic discovery in cmake of libdispatch by setting the -cmake variable `LLFIO_DISABLE_LIBDISPATCH` to On. +force the use of libdispatch in cmake by setting the cmake variable +`LLFIO_USE_LIBDISPATCH` to On. + +Overhead of LLFIO above the libdispatch API is very low, statistically +unmeasurable. ### Linux -If libdispatch is not found, we have a custom Linux only userspace -implementation. A a similar strategy to Microsoft Windows' approach is used. We +On Linux only, we have a custom userspace implementation with superior performance. +A similar strategy to Microsoft Windows' approach is used. We dynamically increase the number of kernel threads until none are sleeping -awaiting i/o. If more kernel threads are running than 1.5x the number of +awaiting i/o. If the count of running kernel threads exceeds three plus the number of CPUs in the system, the number of kernel threads is dynamically reduced. -For portability, we also gate the maximum number of kernel threads to 500, -except where threads have been detected as being in prolonged wait states. Note that **all** the kernel threads for the current process are considered, not just the kernel threads created by this thread pool implementation. 
Therefore, if you have alternative thread pool implementations (e.g. OpenMP, @@ -166,6 +170,15 @@ As this is wholly implemented by this library, dynamic memory allocation occurs in the initial `make_dynamic_thread_pool_group()` and per thread creation, but otherwise the implementation does not perform dynamic memory allocations. + +After multiple rewrites, eventually I got this custom userspace implementation +to have superior performance to both ASIO and libdispatch. For larger work +items the difference is meaningless between all three, however for smaller +work items I benchmarked this custom userspace implementation as beating +(non-dynamic) ASIO by approx 29% and Linux libdispatch by approx 52% (note +that Linux libdispatch appears to have a scale up bug when work items are +small and few, it is often less than half the performance of LLFIO's custom +implementation). */ class LLFIO_DECL dynamic_thread_pool_group { diff --git a/include/llfio/v2.0/llfio.hpp b/include/llfio/v2.0/llfio.hpp index dc7ed70a..a7c23b27 100644 --- a/include/llfio/v2.0/llfio.hpp +++ b/include/llfio/v2.0/llfio.hpp @@ -34,7 +34,10 @@ // If C++ Modules are on and we are not compiling the library, // we are either generating the interface or importing -#if defined(__cpp_modules) +#if !defined(__cpp_modules) || defined(GENERATING_LLFIO_MODULE_INTERFACE) || LLFIO_DISABLE_CXX_MODULES +// C++ Modules not on, therefore include as usual +#define LLFIO_INCLUDE_ALL +#else #if defined(GENERATING_LLFIO_MODULE_INTERFACE) // We are generating this module's interface #define QUICKCPPLIB_HEADERS_ONLY 0 @@ -48,9 +51,6 @@ import LLFIO_MODULE_NAME; #undef LLFIO_INCLUDE_ALL #endif -#else -// C++ Modules not on, therefore include as usual -#define LLFIO_INCLUDE_ALL #endif #ifdef LLFIO_INCLUDE_ALL @@ -86,10 +86,10 @@ import LLFIO_MODULE_NAME; #include "algorithm/summarize.hpp" #ifndef LLFIO_EXCLUDE_MAPPED_FILE_HANDLE -#include "mapped.hpp" #include "algorithm/handle_adapter/xor.hpp" #include 
"algorithm/shared_fs_mutex/memory_map.hpp" #include "algorithm/trivial_vector.hpp" +#include "mapped.hpp" #endif #endif -- cgit v1.2.3