From 658fd8df0bd2427cd77e7fc4bcca8a102f67b626 Mon Sep 17 00:00:00 2001 From: Jacques Lucke Date: Fri, 26 Nov 2021 11:05:47 +0100 Subject: Geometry Nodes: refactor multi-threading in field evaluation Previously, there was a fixed grain size for all multi-functions. That was not sufficient because some functions could benefit a lot from smaller grain sizes. This refactors adds a new `MultiFunction::call_auto` method which has the same effect as just calling `MultiFunction::call` but additionally figures out how to execute the specific multi-function efficiently. It determines a good grain size and decides whether the mask indices should be shifted or not. Most multi-function evaluations benefit from this, but medium sized work loads (1000 - 50000 elements) benefit from it the most. Especially when expensive multi-functions (e.g. noise) is involved. This is because for smaller work loads, threading is rarely used and for larger work loads threading worked fine before already. With this patch, multi-functions can specify execution hints, that allow the caller to execute it most efficiently. These execution hints still have to be added to more functions. Some performance measurements of a field evaluation involving noise and math nodes, ordered by the number of elements being evaluated: ``` 1,000,000: 133 ms -> 120 ms 100,000: 30 ms -> 18 ms 10,000: 20 ms -> 2.7 ms 1,000: 4 ms -> 0.5 ms 100: 0.5 ms -> 0.4 ms ``` --- source/blender/functions/CMakeLists.txt | 3 +- source/blender/functions/FN_multi_function.hh | 28 +++++ .../functions/FN_multi_function_parallel.hh | 39 ------ .../blender/functions/FN_multi_function_params.hh | 29 +++-- .../FN_multi_function_procedure_executor.hh | 3 + source/blender/functions/intern/field.cc | 10 +- source/blender/functions/intern/multi_function.cc | 133 +++++++++++++++++++++ .../functions/intern/multi_function_parallel.cc | 93 -------------- .../functions/intern/multi_function_params.cc | 44 +++++++ .../intern/multi_function_procedure_executor.cc | 10 +- 10 files changed, 234 insertions(+), 158 deletions(-) delete mode 100644 source/blender/functions/FN_multi_function_parallel.hh delete mode 100644 source/blender/functions/intern/multi_function_parallel.cc create mode 100644 source/blender/functions/intern/multi_function_params.cc (limited to 'source/blender/functions') diff --git a/source/blender/functions/CMakeLists.txt b/source/blender/functions/CMakeLists.txt index 54670c0d1b3..63c11164275 100644 --- a/source/blender/functions/CMakeLists.txt +++ b/source/blender/functions/CMakeLists.txt @@ -34,7 +34,7 @@ set(SRC intern/generic_virtual_vector_array.cc intern/multi_function.cc intern/multi_function_builder.cc - intern/multi_function_parallel.cc + intern/multi_function_params.cc intern/multi_function_procedure.cc intern/multi_function_procedure_builder.cc intern/multi_function_procedure_executor.cc @@ -54,7 +54,6 @@ set(SRC FN_multi_function_builder.hh FN_multi_function_context.hh FN_multi_function_data_type.hh - FN_multi_function_parallel.hh FN_multi_function_param_type.hh FN_multi_function_params.hh FN_multi_function_procedure.hh diff --git a/source/blender/functions/FN_multi_function.hh b/source/blender/functions/FN_multi_function.hh index 3059fe59ca7..af60e54808e 100644 --- a/source/blender/functions/FN_multi_function.hh +++ b/source/blender/functions/FN_multi_function.hh @@ -60,6 +60,7 @@ class MultiFunction { { } + void call_auto(IndexMask mask, MFParams params, MFContext context) const; virtual void call(IndexMask mask, MFParams params, MFContext context) const = 0; virtual uint64_t hash() const @@ -110,6 +111,31 @@ class MultiFunction { return *signature_ref_; } + /** + * Information about how the multi-function behaves that help a caller to execute it efficiently. + */ + struct ExecutionHints { + /** + * Suggested minimum workload under which multi-threading does not really help. + * This should be lowered when the multi-function is doing something computationally expensive. + */ + int64_t min_grain_size = 10000; + /** + * Indicates that the multi-function will allocate an array large enough to hold all indices + * passed in as mask. This tells the caller that it would be preferable to pass in smaller + * indices. Also maybe the full mask should be split up into smaller segments to decrease peak + * memory usage. + */ + bool allocates_array = false; + /** + * Tells the caller that every execution takes about the same time. This helps making a more + * educated guess about a good grain size. + */ + bool uniform_execution_time = true; + }; + + ExecutionHints execution_hints() const; + protected: /* Make the function use the given signature. This should be called once in the constructor of * child classes. No copy of the signature is made, so the caller has to make sure that the @@ -121,6 +147,8 @@ class MultiFunction { BLI_assert(signature != nullptr); signature_ref_ = signature; } + + virtual ExecutionHints get_execution_hints() const; }; inline MFParamsBuilder::MFParamsBuilder(const MultiFunction &fn, int64_t mask_size) diff --git a/source/blender/functions/FN_multi_function_parallel.hh b/source/blender/functions/FN_multi_function_parallel.hh deleted file mode 100644 index 84c57efd434..00000000000 --- a/source/blender/functions/FN_multi_function_parallel.hh +++ /dev/null @@ -1,39 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ - -#pragma once - -/** \file - * \ingroup fn - */ - -#include "FN_multi_function.hh" - -namespace blender::fn { - -class ParallelMultiFunction : public MultiFunction { - private: - const MultiFunction &fn_; - const int64_t grain_size_; - bool threading_supported_; - - public: - ParallelMultiFunction(const MultiFunction &fn, const int64_t grain_size); - - void call(IndexMask mask, MFParams params, MFContext context) const override; -}; - -} // namespace blender::fn diff --git a/source/blender/functions/FN_multi_function_params.hh b/source/blender/functions/FN_multi_function_params.hh index 5c7e75230f3..f4ddc4f2881 100644 --- a/source/blender/functions/FN_multi_function_params.hh +++ b/source/blender/functions/FN_multi_function_params.hh @@ -25,6 +25,8 @@ * the function. `MFParams` is then used inside the called function to access the parameters. */ +#include + #include "BLI_resource_scope.hh" #include "FN_generic_pointer.hh" @@ -45,6 +47,9 @@ class MFParamsBuilder { Vector virtual_vector_arrays_; Vector vector_arrays_; + std::mutex mutex_; + Vector> dummy_output_spans_; + friend class MFParams; MFParamsBuilder(const MFSignature &signature, const IndexMask mask) @@ -62,8 +67,8 @@ class MFParamsBuilder { template void add_readonly_single_input_value(T value, StringRef expected_name = "") { - T *value_ptr = &scope_.add_value(std::move(value)); - this->add_readonly_single_input(value_ptr, expected_name); + this->add_readonly_single_input(VArray::ForSingle(std::move(value), min_array_size_), + expected_name); } template void add_readonly_single_input(const T *value, StringRef expected_name = "") { @@ -254,20 +259,12 @@ class MFParams { this->assert_correct_param(param_index, name, MFParamType::SingleOutput); int data_index = builder_->signature_->data_index(param_index); GMutableSpan span = builder_->mutable_spans_[data_index]; - if (span.is_empty()) { - /* The output is ignored by the caller, but the multi-function does not handle this case. So - * create a temporary buffer that the multi-function can write to. */ - const CPPType &type = span.type(); - void *buffer = builder_->scope_.linear_allocator().allocate( - builder_->min_array_size_ * type.size(), type.alignment()); - if (!type.is_trivially_destructible()) { - /* Make sure the temporary elements will be destructed in the end. */ - builder_->scope_.add_destruct_call( - [&type, buffer, mask = builder_->mask_]() { type.destruct_indices(buffer, mask); }); - } - span = GMutableSpan{type, buffer, builder_->min_array_size_}; + if (!span.is_empty()) { + return span; } - return span; + /* The output is ignored by the caller, but the multi-function does not handle this case. So + * create a temporary buffer that the multi-function can write to. */ + return this->ensure_dummy_single_output(data_index); } /** @@ -356,6 +353,8 @@ class MFParams { } #endif } + + GMutableSpan ensure_dummy_single_output(int data_index); }; } // namespace blender::fn diff --git a/source/blender/functions/FN_multi_function_procedure_executor.hh b/source/blender/functions/FN_multi_function_procedure_executor.hh index b12e3a91210..409a031e7ed 100644 --- a/source/blender/functions/FN_multi_function_procedure_executor.hh +++ b/source/blender/functions/FN_multi_function_procedure_executor.hh @@ -34,6 +34,9 @@ class MFProcedureExecutor : public MultiFunction { MFProcedureExecutor(const MFProcedure &procedure); void call(IndexMask mask, MFParams params, MFContext context) const override; + + private: + ExecutionHints get_execution_hints() const override; }; } // namespace blender::fn diff --git a/source/blender/functions/intern/field.cc b/source/blender/functions/intern/field.cc index 7934490a6d9..297df3c15cf 100644 --- a/source/blender/functions/intern/field.cc +++ b/source/blender/functions/intern/field.cc @@ -21,7 +21,6 @@ #include "BLI_vector_set.hh" #include "FN_field.hh" -#include "FN_multi_function_parallel.hh" namespace blender::fn { @@ -358,13 +357,8 @@ Vector evaluate_fields(ResourceScope &scope, build_multi_function_procedure_for_fields( procedure, scope, field_tree_info, varying_fields_to_evaluate); MFProcedureExecutor procedure_executor{procedure}; - /* Add multi threading capabilities to the field evaluation. */ - const int grain_size = 10000; - fn::ParallelMultiFunction parallel_procedure_executor{procedure_executor, grain_size}; - /* Utility variable to make easy to switch the executor. */ - const MultiFunction &executor_fn = parallel_procedure_executor; - MFParamsBuilder mf_params{executor_fn, &mask}; + MFParamsBuilder mf_params{procedure_executor, &mask}; MFContextBuilder mf_context; /* Provide inputs to the procedure executor. */ @@ -405,7 +399,7 @@ Vector evaluate_fields(ResourceScope &scope, mf_params.add_uninitialized_single_output(span); } - executor_fn.call(mask, mf_params, mf_context); + procedure_executor.call_auto(mask, mf_params, mf_context); } /* Evaluate constant fields if necessary. */ diff --git a/source/blender/functions/intern/multi_function.cc b/source/blender/functions/intern/multi_function.cc index ee2c69068db..3e5539d4248 100644 --- a/source/blender/functions/intern/multi_function.cc +++ b/source/blender/functions/intern/multi_function.cc @@ -16,8 +16,141 @@ #include "FN_multi_function.hh" +#include "BLI_task.hh" +#include "BLI_threads.h" + namespace blender::fn { +using ExecutionHints = MultiFunction::ExecutionHints; + +ExecutionHints MultiFunction::execution_hints() const +{ + return this->get_execution_hints(); +} + +ExecutionHints MultiFunction::get_execution_hints() const +{ + return ExecutionHints{}; +} + +static bool supports_threading_by_slicing_params(const MultiFunction &fn) +{ + for (const int i : fn.param_indices()) { + const MFParamType param_type = fn.param_type(i); + if (ELEM(param_type.interface_type(), + MFParamType::InterfaceType::Mutable, + MFParamType::InterfaceType::Output)) { + if (param_type.data_type().is_vector()) { + return false; + } + } + } + return true; +} + +static int64_t compute_grain_size(const ExecutionHints &hints, const IndexMask mask) +{ + int64_t grain_size = hints.min_grain_size; + if (hints.uniform_execution_time) { + const int thread_count = BLI_system_thread_count(); + /* Avoid using a small grain size even if it is not necessary. */ + const int64_t thread_based_grain_size = mask.size() / thread_count / 4; + grain_size = std::max(grain_size, thread_based_grain_size); + } + if (hints.allocates_array) { + const int64_t max_grain_size = 10000; + /* Avoid allocating many large intermediate arrays. Better process data in smaller chunks to + * keep peak memory usage lower. */ + grain_size = std::min(grain_size, max_grain_size); + } + return grain_size; +} + +/** + * The result is the same as using #call directly but this method has some additional features. + * - Automatic multi-threading when possible and appropriate. + * - Automatic index mask offsetting to avoid large temporary intermediate arrays that are mostly + * unused. + */ +void MultiFunction::call_auto(IndexMask mask, MFParams params, MFContext context) const +{ + if (mask.is_empty()) { + return; + } + const ExecutionHints hints = this->execution_hints(); + const int64_t grain_size = compute_grain_size(hints, mask); + + if (mask.size() <= grain_size) { + this->call(mask, params, context); + return; + } + + const bool supports_threading = supports_threading_by_slicing_params(*this); + if (!supports_threading) { + this->call(mask, params, context); + return; + } + + threading::parallel_for(mask.index_range(), grain_size, [&](const IndexRange sub_range) { + const IndexMask sliced_mask = mask.slice(sub_range); + if (!hints.allocates_array) { + /* There is no benefit to changing indices in this case. */ + this->call(sliced_mask, params, context); + return; + } + if (sliced_mask[0] < grain_size) { + /* The indices are low, no need to offset them. */ + this->call(sliced_mask, params, context); + return; + } + const int64_t input_slice_start = sliced_mask[0]; + const int64_t input_slice_size = sliced_mask.last() - input_slice_start + 1; + const IndexRange input_slice_range{input_slice_start, input_slice_size}; + + Vector offset_mask_indices; + const IndexMask offset_mask = mask.slice_and_offset(sub_range, offset_mask_indices); + + MFParamsBuilder offset_params{*this, offset_mask.min_array_size()}; + + /* Slice all parameters so that for the actual function call. */ + for (const int param_index : this->param_indices()) { + const MFParamType param_type = this->param_type(param_index); + switch (param_type.category()) { + case MFParamType::SingleInput: { + const GVArray &varray = params.readonly_single_input(param_index); + offset_params.add_readonly_single_input(varray.slice(input_slice_range)); + break; + } + case MFParamType::SingleMutable: { + const GMutableSpan span = params.single_mutable(param_index); + const GMutableSpan sliced_span = span.slice(input_slice_range); + offset_params.add_single_mutable(sliced_span); + break; + } + case MFParamType::SingleOutput: { + const GMutableSpan span = params.uninitialized_single_output_if_required(param_index); + if (span.is_empty()) { + offset_params.add_ignored_single_output(); + } + else { + const GMutableSpan sliced_span = span.slice(input_slice_range); + offset_params.add_uninitialized_single_output(sliced_span); + } + break; + } + case MFParamType::VectorInput: + case MFParamType::VectorMutable: + case MFParamType::VectorOutput: { + BLI_assert_unreachable(); + break; + } + } + } + + this->call(offset_mask, offset_params, context); + }); +} + std::string MultiFunction::debug_name() const { return signature_ref_->function_name; diff --git a/source/blender/functions/intern/multi_function_parallel.cc b/source/blender/functions/intern/multi_function_parallel.cc deleted file mode 100644 index eefe647644d..00000000000 --- a/source/blender/functions/intern/multi_function_parallel.cc +++ /dev/null @@ -1,93 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ - -#include "FN_multi_function_parallel.hh" - -#include "BLI_task.hh" - -namespace blender::fn { - -ParallelMultiFunction::ParallelMultiFunction(const MultiFunction &fn, const int64_t grain_size) - : fn_(fn), grain_size_(grain_size) -{ - this->set_signature(&fn.signature()); - - threading_supported_ = true; - for (const int param_index : fn.param_indices()) { - const MFParamType param_type = fn.param_type(param_index); - if (param_type.data_type().category() == MFDataType::Vector) { - /* Vector parameters do not support threading yet. */ - threading_supported_ = false; - break; - } - } -} - -void ParallelMultiFunction::call(IndexMask full_mask, MFParams params, MFContext context) const -{ - if (full_mask.size() <= grain_size_ || !threading_supported_) { - fn_.call(full_mask, params, context); - return; - } - - threading::parallel_for(full_mask.index_range(), grain_size_, [&](const IndexRange mask_slice) { - Vector sub_mask_indices; - const IndexMask sub_mask = full_mask.slice_and_offset(mask_slice, sub_mask_indices); - if (sub_mask.is_empty()) { - return; - } - const int64_t input_slice_start = full_mask[mask_slice.first()]; - const int64_t input_slice_size = full_mask[mask_slice.last()] - input_slice_start + 1; - const IndexRange input_slice_range{input_slice_start, input_slice_size}; - - MFParamsBuilder sub_params{fn_, sub_mask.min_array_size()}; - - /* All parameters are sliced so that the wrapped multi-function does not have to take care of - * the index offset. */ - for (const int param_index : fn_.param_indices()) { - const MFParamType param_type = fn_.param_type(param_index); - switch (param_type.category()) { - case MFParamType::SingleInput: { - const GVArray &varray = params.readonly_single_input(param_index); - sub_params.add_readonly_single_input(varray.slice(input_slice_range)); - break; - } - case MFParamType::SingleMutable: { - const GMutableSpan span = params.single_mutable(param_index); - const GMutableSpan sliced_span = span.slice(input_slice_start, input_slice_size); - sub_params.add_single_mutable(sliced_span); - break; - } - case MFParamType::SingleOutput: { - const GMutableSpan span = params.uninitialized_single_output(param_index); - const GMutableSpan sliced_span = span.slice(input_slice_start, input_slice_size); - sub_params.add_uninitialized_single_output(sliced_span); - break; - } - case MFParamType::VectorInput: - case MFParamType::VectorMutable: - case MFParamType::VectorOutput: { - BLI_assert_unreachable(); - break; - } - } - } - - fn_.call(sub_mask, sub_params, context); - }); -} - -} // namespace blender::fn diff --git a/source/blender/functions/intern/multi_function_params.cc b/source/blender/functions/intern/multi_function_params.cc new file mode 100644 index 00000000000..376c5b2deb7 --- /dev/null +++ b/source/blender/functions/intern/multi_function_params.cc @@ -0,0 +1,44 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include "FN_multi_function_params.hh" + +namespace blender::fn { + +GMutableSpan MFParams::ensure_dummy_single_output(int data_index) +{ + /* Lock because we are actually modifying #builder_ and it may be used by multiple threads. */ + std::lock_guard lock{builder_->mutex_}; + + for (const std::pair &items : builder_->dummy_output_spans_) { + if (items.first == data_index) { + return items.second; + } + } + + const CPPType &type = builder_->mutable_spans_[data_index].type(); + void *buffer = builder_->scope_.linear_allocator().allocate( + builder_->min_array_size_ * type.size(), type.alignment()); + if (!type.is_trivially_destructible()) { + builder_->scope_.add_destruct_call( + [&type, buffer, mask = builder_->mask_]() { type.destruct_indices(buffer, mask); }); + } + const GMutableSpan span{type, buffer, builder_->min_array_size_}; + builder_->dummy_output_spans_.append({data_index, span}); + return span; +} + +} // namespace blender::fn diff --git a/source/blender/functions/intern/multi_function_procedure_executor.cc b/source/blender/functions/intern/multi_function_procedure_executor.cc index 06c97fd1173..ab2fd7c098c 100644 --- a/source/blender/functions/intern/multi_function_procedure_executor.cc +++ b/source/blender/functions/intern/multi_function_procedure_executor.cc @@ -1045,7 +1045,7 @@ static void execute_call_instruction(const MFCallInstruction &instruction, } try { - fn.call(mask, params, context); + fn.call_auto(mask, params, context); } catch (...) { /* Multi-functions must not throw exceptions. */ @@ -1236,4 +1236,12 @@ void MFProcedureExecutor::call(IndexMask full_mask, MFParams params, MFContext c } } +MultiFunction::ExecutionHints MFProcedureExecutor::get_execution_hints() const +{ + ExecutionHints hints; + hints.allocates_array = true; + hints.min_grain_size = 10000; + return hints; +} + } // namespace blender::fn -- cgit v1.2.3