From ee3f71d747e3ffd5091335437d52b3ec518d7b67 Mon Sep 17 00:00:00 2001
From: Jacques Lucke
Date: Mon, 4 Apr 2022 11:57:39 +0200
Subject: Functions: allow for better compiler optimization

This extracts the inner loops into a separate function.
There are two main reasons for this:
* Allows using `__restrict` to indicate that no other parameter
  aliases with the output array. This allows for better optimization.
* Makes it easier to search for the generated assembly code,
  especially with the `BLI_NOINLINE`.
---
 .../blender/functions/FN_multi_function_builder.hh | 37 ++++++++++++++++------
 1 file changed, 27 insertions(+), 10 deletions(-)

(limited to 'source/blender/functions')

diff --git a/source/blender/functions/FN_multi_function_builder.hh b/source/blender/functions/FN_multi_function_builder.hh
index 2eaada5dea0..dfdd152e62a 100644
--- a/source/blender/functions/FN_multi_function_builder.hh
+++ b/source/blender/functions/FN_multi_function_builder.hh
@@ -49,15 +49,23 @@ template<typename In1, typename Out1> class CustomMF_SI_SO : public MultiFunctio
     return [=](IndexMask mask, const VArray<In1> &in1, MutableSpan<Out1> out1) {
       /* Devirtualization results in a 2-3x speedup for some simple functions. */
       devirtualize_varray(in1, [&](const auto &in1) {
-        mask.to_best_mask_type([&](const auto &mask) {
-          for (const int64_t i : mask) {
-            new (static_cast<void *>(&out1[i])) Out1(element_fn(in1[i]));
-          }
-        });
+        mask.to_best_mask_type(
+            [&](const auto &mask) { execute_SI_SO(element_fn, mask, in1, out1.data()); });
       });
     };
   }
 
+  template<typename ElementFuncT, typename MaskT, typename In1Array>
+  BLI_NOINLINE static void execute_SI_SO(const ElementFuncT &element_fn,
+                                         MaskT mask,
+                                         const In1Array &in1,
+                                         Out1 *__restrict r_out)
+  {
+    for (const int64_t i : mask) {
+      new (r_out + i) Out1(element_fn(in1[i]));
+    }
+  }
+
   void call(IndexMask mask, MFParams params, MFContext UNUSED(context)) const override
   {
     const VArray<In1> &in1 = params.readonly_single_input<In1>(0);
@@ -105,15 +113,24 @@ class CustomMF_SI_SI_SO : public MultiFunction {
                MutableSpan<Out1> out1) {
       /* Devirtualization results in a 2-3x speedup for some simple functions. */
       devirtualize_varray2(in1, in2, [&](const auto &in1, const auto &in2) {
-        mask.to_best_mask_type([&](const auto &mask) {
-          for (const int64_t i : mask) {
-            new (static_cast<void *>(&out1[i])) Out1(element_fn(in1[i], in2[i]));
-          }
-        });
+        mask.to_best_mask_type(
+            [&](const auto &mask) { execute_SI_SI_SO(element_fn, mask, in1, in2, out1.data()); });
       });
     };
   }
 
+  template<typename ElementFuncT, typename MaskT, typename In1Array, typename In2Array>
+  BLI_NOINLINE static void execute_SI_SI_SO(const ElementFuncT &element_fn,
+                                            MaskT mask,
+                                            const In1Array &in1,
+                                            const In2Array &in2,
+                                            Out1 *__restrict r_out)
+  {
+    for (const int64_t i : mask) {
+      new (r_out + i) Out1(element_fn(in1[i], in2[i]));
+    }
+  }
+
   void call(IndexMask mask, MFParams params, MFContext UNUSED(context)) const override
   {
     const VArray<In1> &in1 = params.readonly_single_input<In1>(0);
-- 
cgit v1.2.3