
git.blender.org/blender.git
author     Jacques Lucke <jacques@blender.org>  2021-11-26 13:05:47 +0300
committer  Jacques Lucke <jacques@blender.org>  2021-11-26 13:06:16 +0300
commit     658fd8df0bd2427cd77e7fc4bcca8a102f67b626 (patch)
tree       574c5a6f4c11db7047a98ca38c6d6f129a4b10e2 /source/blender/functions/intern/multi_function.cc
parent     004172de38d5483b715a5b13d06c2aa5dd3de3f5 (diff)
Geometry Nodes: refactor multi-threading in field evaluation
Previously, there was a fixed grain size for all multi-functions. That was not sufficient, because some functions benefit a lot from smaller grain sizes.

This refactor adds a new `MultiFunction::call_auto` method which has the same effect as just calling `MultiFunction::call`, but additionally figures out how to execute the specific multi-function efficiently. It determines a good grain size and decides whether the mask indices should be shifted or not.

Most multi-function evaluations benefit from this, but medium-sized workloads (1000 - 50000 elements) benefit from it the most, especially when expensive multi-functions (e.g. noise) are involved. This is because threading is rarely used for smaller workloads, and threading already worked fine for larger workloads.

With this patch, multi-functions can specify execution hints that allow the caller to execute them most efficiently. These execution hints still have to be added to more functions.

Some performance measurements of a field evaluation involving noise and math nodes, ordered by the number of elements being evaluated:
```
1,000,000: 133 ms -> 120 ms
  100,000:  30 ms ->  18 ms
   10,000:  20 ms -> 2.7 ms
    1,000:   4 ms -> 0.5 ms
      100: 0.5 ms -> 0.4 ms
```
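As a rough sketch of how these hints are meant to be used (the `ExecutionHints` fields and the `call_auto` signature are taken from the patch below; the class and the caller function are hypothetical, and the sketch assumes `get_execution_hints` is the virtual hook implied by the public `execution_hints()` wrapper):
```
#include "FN_multi_function.hh"

namespace blender::fn {

/* Hypothetical multi-function with roughly uniform per-element cost that allocates
 * intermediate arrays. Signature setup and the call() implementation are omitted. */
class NoiseLikeFunction : public MultiFunction {
 public:
  ExecutionHints get_execution_hints() const override
  {
    ExecutionHints hints;
    hints.min_grain_size = 512;          /* Illustrative: tiny workloads stay single-threaded. */
    hints.uniform_execution_time = true; /* Every element costs about the same. */
    hints.allocates_array = true;        /* Temporary buffers scale with the processed slice. */
    return hints;
  }
};

/* Caller side: same semantics as fn.call(mask, params, context), but grain size selection
 * and index offsetting are handled automatically. */
static void evaluate_field_backend(const MultiFunction &fn,
                                   const IndexMask mask,
                                   MFParams params,
                                   MFContext context)
{
  fn.call_auto(mask, params, context);
}

}  // namespace blender::fn
```
Functions without special needs simply do not override the hook and fall back to the defaults returned by `MultiFunction::get_execution_hints` in the patch below.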
Diffstat (limited to 'source/blender/functions/intern/multi_function.cc')
-rw-r--r--  source/blender/functions/intern/multi_function.cc  133
1 file changed, 133 insertions, 0 deletions
diff --git a/source/blender/functions/intern/multi_function.cc b/source/blender/functions/intern/multi_function.cc
index ee2c69068db..3e5539d4248 100644
--- a/source/blender/functions/intern/multi_function.cc
+++ b/source/blender/functions/intern/multi_function.cc
@@ -16,8 +16,141 @@
#include "FN_multi_function.hh"
+#include "BLI_task.hh"
+#include "BLI_threads.h"
+
namespace blender::fn {
+using ExecutionHints = MultiFunction::ExecutionHints;
+
+ExecutionHints MultiFunction::execution_hints() const
+{
+  return this->get_execution_hints();
+}
+
+ExecutionHints MultiFunction::get_execution_hints() const
+{
+  return ExecutionHints{};
+}
+
+static bool supports_threading_by_slicing_params(const MultiFunction &fn)
+{
+  for (const int i : fn.param_indices()) {
+    const MFParamType param_type = fn.param_type(i);
+    if (ELEM(param_type.interface_type(),
+             MFParamType::InterfaceType::Mutable,
+             MFParamType::InterfaceType::Output)) {
+      if (param_type.data_type().is_vector()) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+static int64_t compute_grain_size(const ExecutionHints &hints, const IndexMask mask)
+{
+  int64_t grain_size = hints.min_grain_size;
+  if (hints.uniform_execution_time) {
+    const int thread_count = BLI_system_thread_count();
+    /* Avoid using a small grain size when it is not necessary. */
+    const int64_t thread_based_grain_size = mask.size() / thread_count / 4;
+    grain_size = std::max(grain_size, thread_based_grain_size);
+  }
+  if (hints.allocates_array) {
+    const int64_t max_grain_size = 10000;
+    /* Avoid allocating many large intermediate arrays. It is better to process the data in
+     * smaller chunks to keep peak memory usage lower. */
+    grain_size = std::min(grain_size, max_grain_size);
+  }
+  return grain_size;
+}
+
+/**
+ * The result is the same as using #call directly, but this method has some additional features.
+ * - Automatic multi-threading when possible and appropriate.
+ * - Automatic index mask offsetting to avoid large temporary intermediate arrays that are mostly
+ *   unused.
+ */
+void MultiFunction::call_auto(IndexMask mask, MFParams params, MFContext context) const
+{
+  if (mask.is_empty()) {
+    return;
+  }
+  const ExecutionHints hints = this->execution_hints();
+  const int64_t grain_size = compute_grain_size(hints, mask);
+
+  if (mask.size() <= grain_size) {
+    this->call(mask, params, context);
+    return;
+  }
+
+  const bool supports_threading = supports_threading_by_slicing_params(*this);
+  if (!supports_threading) {
+    this->call(mask, params, context);
+    return;
+  }
+
+  threading::parallel_for(mask.index_range(), grain_size, [&](const IndexRange sub_range) {
+    const IndexMask sliced_mask = mask.slice(sub_range);
+    if (!hints.allocates_array) {
+      /* There is no benefit to changing indices in this case. */
+      this->call(sliced_mask, params, context);
+      return;
+    }
+    if (sliced_mask[0] < grain_size) {
+      /* The indices are low, no need to offset them. */
+      this->call(sliced_mask, params, context);
+      return;
+    }
+    const int64_t input_slice_start = sliced_mask[0];
+    const int64_t input_slice_size = sliced_mask.last() - input_slice_start + 1;
+    const IndexRange input_slice_range{input_slice_start, input_slice_size};
+
+    Vector<int64_t> offset_mask_indices;
+    const IndexMask offset_mask = mask.slice_and_offset(sub_range, offset_mask_indices);
+
+    MFParamsBuilder offset_params{*this, offset_mask.min_array_size()};
+
+    /* Slice all parameters so that they match the offset mask used in the call below. */
+    for (const int param_index : this->param_indices()) {
+      const MFParamType param_type = this->param_type(param_index);
+      switch (param_type.category()) {
+        case MFParamType::SingleInput: {
+          const GVArray &varray = params.readonly_single_input(param_index);
+          offset_params.add_readonly_single_input(varray.slice(input_slice_range));
+          break;
+        }
+        case MFParamType::SingleMutable: {
+          const GMutableSpan span = params.single_mutable(param_index);
+          const GMutableSpan sliced_span = span.slice(input_slice_range);
+          offset_params.add_single_mutable(sliced_span);
+          break;
+        }
+        case MFParamType::SingleOutput: {
+          const GMutableSpan span = params.uninitialized_single_output_if_required(param_index);
+          if (span.is_empty()) {
+            offset_params.add_ignored_single_output();
+          }
+          else {
+            const GMutableSpan sliced_span = span.slice(input_slice_range);
+            offset_params.add_uninitialized_single_output(sliced_span);
+          }
+          break;
+        }
+        case MFParamType::VectorInput:
+        case MFParamType::VectorMutable:
+        case MFParamType::VectorOutput: {
+          BLI_assert_unreachable();
+          break;
+        }
+      }
+    }
+
+    this->call(offset_mask, offset_params, context);
+  });
+}
+
std::string MultiFunction::debug_name() const
{
  return signature_ref_->function_name;
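For a rough sense of what `compute_grain_size` above produces, here is a small standalone sketch (not part of the patch) that replays its arithmetic for a few mask sizes, assuming 8 worker threads, a `min_grain_size` of 100, and both `uniform_execution_time` and `allocates_array` set; the real code queries `BLI_system_thread_count()` instead of hard-coding the thread count:
```
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
  const int64_t min_grain_size = 100; /* Illustrative stand-in for hints.min_grain_size. */
  const int64_t thread_count = 8;     /* Illustrative stand-in for BLI_system_thread_count(). */
  for (const int64_t mask_size : {int64_t(1'000'000), int64_t(100'000), int64_t(10'000)}) {
    /* Same formula as the uniform_execution_time branch in compute_grain_size(). */
    const int64_t thread_based_grain_size = mask_size / thread_count / 4;
    int64_t grain_size = std::max(min_grain_size, thread_based_grain_size);
    /* The allocates_array clamp keeps intermediate arrays reasonably small. */
    grain_size = std::min<int64_t>(grain_size, 10000);
    std::printf("%9lld elements -> grain size %lld\n",
                static_cast<long long>(mask_size),
                static_cast<long long>(grain_size));
  }
  return 0;
}
```
Under these assumptions the 1,000,000-element case is clamped from 31,250 down to 10,000 by the allocates_array limit, while the 10,000-element case ends up with a grain size of 312, which lines up with the medium-sized workloads that the commit message reports as benefiting the most.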