From 67c42e7f034aad2564d8cde1a9901d9629527daa Mon Sep 17 00:00:00 2001
From: Jacques Lucke
Date: Thu, 7 Apr 2022 18:48:14 +0200
Subject: Functions: optimize simple generated multi-functions

This implements two optimizations:
* Reduce virtual function call overhead when a non-standard virtual
  array is used as input.
* Use a lambda in `type_conversion.cc`.

In my test setup, which creates a float attribute filled with the index,
the running time drops from `4.0 ms` to `2.0 ms`.

Differential Revision: https://developer.blender.org/D14585
---
 .../blender/functions/FN_multi_function_builder.hh | 55 ++++++++++++++++++++--
 1 file changed, 51 insertions(+), 4 deletions(-)

(limited to 'source/blender/functions')

diff --git a/source/blender/functions/FN_multi_function_builder.hh b/source/blender/functions/FN_multi_function_builder.hh
index b041e67390c..ed587a87695 100644
--- a/source/blender/functions/FN_multi_function_builder.hh
+++ b/source/blender/functions/FN_multi_function_builder.hh
@@ -47,11 +47,46 @@ template<typename In1, typename Out1> class CustomMF_SI_SO : public MultiFunctio
   template<typename ElementFuncT> static FunctionT create_function(ElementFuncT element_fn)
   {
     return [=](IndexMask mask, const VArray<In1> &in1, MutableSpan<Out1> out1) {
-      /* Devirtualization results in a 2-3x speedup for some simple functions. */
-      devirtualize_varray(in1, [&](const auto &in1) {
+      if (in1.is_single()) {
+        /* Only evaluate the function once when the input is a single value. */
+        const In1 in1_single = in1.get_internal_single();
+        const Out1 out1_single = element_fn(in1_single);
+        out1.fill_indices(mask, out1_single);
+        return;
+      }
+
+      if (in1.is_span()) {
+        const Span<In1> in1_span = in1.get_internal_span();
         mask.to_best_mask_type(
-            [&](const auto &mask) { execute_SI_SO(element_fn, mask, in1, out1.data()); });
-      });
+            [&](auto mask) { execute_SI_SO(element_fn, mask, in1_span, out1.data()); });
+        return;
+      }
+
+      /* The input is an unknown virtual array type. To avoid virtual function call overhead for
+       * every element, elements are retrieved and processed in chunks. */
+
+      static constexpr int64_t MaxChunkSize = 32;
+      TypedBuffer<In1, MaxChunkSize> in1_buffer_owner;
+      MutableSpan<In1> in1_buffer{in1_buffer_owner.ptr(), MaxChunkSize};
+
+      const int64_t mask_size = mask.size();
+      for (int64_t chunk_start = 0; chunk_start < mask_size; chunk_start += MaxChunkSize) {
+        const int64_t chunk_size = std::min(mask_size - chunk_start, MaxChunkSize);
+        const IndexMask sliced_mask = mask.slice(chunk_start, chunk_size);
+
+        /* Load input from the virtual array. */
+        MutableSpan<In1> in1_chunk = in1_buffer.take_front(chunk_size);
+        in1.materialize_compressed_to_uninitialized(sliced_mask, in1_chunk);
+
+        if (sliced_mask.is_range()) {
+          execute_SI_SO(
+              element_fn, IndexRange(chunk_size), in1_chunk, out1.data() + sliced_mask[0]);
+        }
+        else {
+          execute_SI_SO_compressed(element_fn, sliced_mask, in1_chunk, out1.data());
+        }
+        destruct_n(in1_chunk.data(), chunk_size);
+      }
     };
   }
 
@@ -66,6 +101,18 @@ template<typename In1, typename Out1> class CustomMF_SI_SO : public MultiFunctio
     }
   }
 
+  /** Expects the input array to be "compressed", i.e. there are no gaps between the elements. */
+  template<typename ElementFuncT, typename MaskT, typename In1Array>
+  BLI_NOINLINE static void execute_SI_SO_compressed(const ElementFuncT &element_fn,
+                                                    MaskT mask,
+                                                    const In1Array &in1,
+                                                    Out1 *__restrict r_out)
+  {
+    for (const int64_t i : IndexRange(mask.size())) {
+      new (r_out + mask[i]) Out1(element_fn(in1[i]));
+    }
+  }
+
   void call(IndexMask mask, MFParams params, MFContext UNUSED(context)) const override
   {
     const VArray<In1> &in1 = params.readonly_single_input<In1>(0);
-- 
cgit v1.2.3