9 files changed, 151 insertions, 131 deletions
diff --git a/source/blender/blenkernel/intern/colortools.c b/source/blender/blenkernel/intern/colortools.c
index f82b8b6675c..3da384a2745 100644
--- a/source/blender/blenkernel/intern/colortools.c
+++ b/source/blender/blenkernel/intern/colortools.c
@@ -1383,8 +1383,6 @@ typedef struct ScopesUpdateData {
   struct ColormanageProcessor *cm_processor;
   const unsigned char *display_buffer;
   const int ycc_mode;
-
-  unsigned int *bin_lum, *bin_r, *bin_g, *bin_b, *bin_a;
 } ScopesUpdateData;
 
 typedef struct ScopesUpdateDataChunk {
@@ -1495,23 +1493,24 @@ static void scopes_update_cb(void *__restrict userdata,
   }
 }
 
-static void scopes_update_finalize(void *__restrict userdata, void *__restrict userdata_chunk)
+static void scopes_update_reduce(const void *__restrict UNUSED(userdata),
+                                 void *__restrict chunk_join,
+                                 void *__restrict chunk)
 {
-  const ScopesUpdateData *data = userdata;
-  const ScopesUpdateDataChunk *data_chunk = userdata_chunk;
-
-  unsigned int *bin_lum = data->bin_lum;
-  unsigned int *bin_r = data->bin_r;
-  unsigned int *bin_g = data->bin_g;
-  unsigned int *bin_b = data->bin_b;
-  unsigned int *bin_a = data->bin_a;
+  ScopesUpdateDataChunk *join_chunk = chunk_join;
+  const ScopesUpdateDataChunk *data_chunk = chunk;
+
+  unsigned int *bin_lum = join_chunk->bin_lum;
+  unsigned int *bin_r = join_chunk->bin_r;
+  unsigned int *bin_g = join_chunk->bin_g;
+  unsigned int *bin_b = join_chunk->bin_b;
+  unsigned int *bin_a = join_chunk->bin_a;
   const unsigned int *bin_lum_c = data_chunk->bin_lum;
   const unsigned int *bin_r_c = data_chunk->bin_r;
   const unsigned int *bin_g_c = data_chunk->bin_g;
   const unsigned int *bin_b_c = data_chunk->bin_b;
   const unsigned int *bin_a_c = data_chunk->bin_a;
 
-  float(*minmax)[2] = data->scopes->minmax;
   const float *min = data_chunk->min;
   const float *max = data_chunk->max;
 
@@ -1524,11 +1523,11 @@ static void scopes_update_finalize(void *__restrict userdata, void *__restrict u
   }
 
   for (int c = 3; c--;) {
-    if (min[c] < minmax[c][0]) {
-      minmax[c][0] = min[c];
+    if (min[c] < join_chunk->min[c]) {
+      join_chunk->min[c] = min[c];
     }
-    if (max[c] > minmax[c][1]) {
-      minmax[c][1] = max[c];
+    if (max[c] > join_chunk->max[c]) {
+      join_chunk->max[c] = max[c];
     }
   }
 }
@@ -1542,7 +1541,6 @@ void BKE_scopes_update(Scopes *scopes,
   unsigned int nl, na, nr, ng, nb;
   double divl, diva, divr, divg, divb;
   const unsigned char *display_buffer = NULL;
-  uint bin_lum[256] = {0}, bin_r[256] = {0}, bin_g[256] = {0}, bin_b[256] = {0}, bin_a[256] = {0};
   int ycc_mode = -1;
   void *cache_handle = NULL;
   struct ColormanageProcessor *cm_processor = NULL;
@@ -1638,11 +1636,6 @@ void BKE_scopes_update(Scopes *scopes,
       .cm_processor = cm_processor,
       .display_buffer = display_buffer,
       .ycc_mode = ycc_mode,
-      .bin_lum = bin_lum,
-      .bin_r = bin_r,
-      .bin_g = bin_g,
-      .bin_b = bin_b,
-      .bin_a = bin_a,
   };
   ScopesUpdateDataChunk data_chunk = {{0}};
   INIT_MINMAX(data_chunk.min, data_chunk.max);
@@ -1652,26 +1645,26 @@ void BKE_scopes_update(Scopes *scopes,
   settings.use_threading = (ibuf->y > 256);
   settings.userdata_chunk = &data_chunk;
   settings.userdata_chunk_size = sizeof(data_chunk);
-  settings.func_finalize = scopes_update_finalize;
+  settings.func_reduce = scopes_update_reduce;
   BLI_task_parallel_range(0, ibuf->y, &data, scopes_update_cb, &settings);
 
   /* convert hist data to float (proportional to max count) */
   nl = na = nr = nb = ng = 0;
   for (a = 0; a < 256; a++) {
-    if (bin_lum[a] > nl) {
-      nl = bin_lum[a];
+    if (data_chunk.bin_lum[a] > nl) {
+      nl = data_chunk.bin_lum[a];
     }
-    if (bin_r[a] > nr) {
-      nr = bin_r[a];
+    if (data_chunk.bin_r[a] > nr) {
+      nr = data_chunk.bin_r[a];
     }
-    if (bin_g[a] > ng) {
-      ng = bin_g[a];
+    if (data_chunk.bin_g[a] > ng) {
+      ng = data_chunk.bin_g[a];
     }
-    if (bin_b[a] > nb) {
-      nb = bin_b[a];
+    if (data_chunk.bin_b[a] > nb) {
+      nb = data_chunk.bin_b[a];
     }
-    if (bin_a[a] > na) {
-      na = bin_a[a];
+    if (data_chunk.bin_a[a] > na) {
+      na = data_chunk.bin_a[a];
     }
   }
   divl = nl ? 1.0 / (double)nl : 1.0;
@@ -1681,11 +1674,11 @@ void BKE_scopes_update(Scopes *scopes,
   divb = nb ? 1.0 / (double)nb : 1.0;
 
   for (a = 0; a < 256; a++) {
-    scopes->hist.data_luma[a] = bin_lum[a] * divl;
-    scopes->hist.data_r[a] = bin_r[a] * divr;
-    scopes->hist.data_g[a] = bin_g[a] * divg;
-    scopes->hist.data_b[a] = bin_b[a] * divb;
-    scopes->hist.data_a[a] = bin_a[a] * diva;
+    scopes->hist.data_luma[a] = data_chunk.bin_lum[a] * divl;
+    scopes->hist.data_r[a] = data_chunk.bin_r[a] * divr;
+    scopes->hist.data_g[a] = data_chunk.bin_g[a] * divg;
+    scopes->hist.data_b[a] = data_chunk.bin_b[a] * divb;
+    scopes->hist.data_a[a] = data_chunk.bin_a[a] * diva;
   }
 
   if (cm_processor) {
diff --git a/source/blender/blenkernel/intern/dynamicpaint.c b/source/blender/blenkernel/intern/dynamicpaint.c
index 4c78c88d168..3a8b846a41d 100644
--- a/source/blender/blenkernel/intern/dynamicpaint.c
+++ b/source/blender/blenkernel/intern/dynamicpaint.c
@@ -653,15 +653,15 @@ static void grid_bound_insert_cb_ex(void *__restrict userdata,
   boundInsert(grid_bound, bData->realCoord[bData->s_pos[i]].v);
 }
 
-static void grid_bound_insert_finalize(void *__restrict userdata, void *__restrict userdata_chunk)
+static void grid_bound_insert_reduce(const void *__restrict UNUSED(userdata),
+                                     void *__restrict chunk_join,
+                                     void *__restrict chunk)
 {
-  PaintBakeData *bData = userdata;
-  VolumeGrid *grid = bData->grid;
-
-  Bounds3D *grid_bound = userdata_chunk;
+  Bounds3D *join = chunk_join;
+  Bounds3D *grid_bound = chunk;
 
-  boundInsert(&grid->grid_bounds, grid_bound->min);
-  boundInsert(&grid->grid_bounds, grid_bound->max);
+  boundInsert(join, grid_bound->min);
+  boundInsert(join, grid_bound->max);
 }
 
 static void grid_cell_points_cb_ex(void *__restrict userdata,
@@ -685,17 +685,20 @@ static void grid_cell_points_cb_ex(void *__restrict userdata,
   s_num[temp_t_index[i]]++;
 }
 
-static void grid_cell_points_finalize(void *__restrict userdata, void *__restrict userdata_chunk)
+static void grid_cell_points_reduce(const void *__restrict userdata,
+                                    void *__restrict chunk_join,
+                                    void *__restrict chunk)
 {
-  PaintBakeData *bData = userdata;
-  VolumeGrid *grid = bData->grid;
+  const PaintBakeData *bData = userdata;
+  const VolumeGrid *grid = bData->grid;
   const int grid_cells = grid->dim[0] * grid->dim[1] * grid->dim[2];
 
-  int *s_num = userdata_chunk;
+  int *join_s_num = chunk_join;
+  int *s_num = chunk;
 
   /* calculate grid indexes */
   for (int i = 0; i < grid_cells; i++) {
-    grid->s_num[i] += s_num[i];
+    join_s_num[i] += s_num[i];
   }
 }
 
@@ -753,7 +756,7 @@ static void surfaceGenerateGrid(struct DynamicPaintSurface *surface)
       settings.use_threading = (sData->total_points > 1000);
       settings.userdata_chunk = &grid->grid_bounds;
       settings.userdata_chunk_size = sizeof(grid->grid_bounds);
-      settings.func_finalize = grid_bound_insert_finalize;
+      settings.func_reduce = grid_bound_insert_reduce;
       BLI_task_parallel_range(0, sData->total_points, bData, grid_bound_insert_cb_ex, &settings);
     }
     /* get dimensions */
@@ -814,7 +817,7 @@ static void surfaceGenerateGrid(struct DynamicPaintSurface *surface)
         settings.use_threading = (sData->total_points > 1000);
         settings.userdata_chunk = grid->s_num;
         settings.userdata_chunk_size = sizeof(*grid->s_num) * grid_cells;
-        settings.func_finalize = grid_cell_points_finalize;
+        settings.func_reduce = grid_cell_points_reduce;
         BLI_task_parallel_range(0, sData->total_points, bData, grid_cell_points_cb_ex, &settings);
       }
 
@@ -4880,7 +4883,7 @@ static void dynamicPaint_prepareAdjacencyData(DynamicPaintSurface *surface, cons
       0, sData->total_points, sData, dynamic_paint_prepare_adjacency_cb, &settings);
 
   /* calculate average values (single thread).
-   * Note: tried to put this in threaded callback (using _finalize feature),
+   * Note: tried to put this in threaded callback (using _reduce feature),
    * but gave ~30% slower result! */
   bData->average_dist = 0.0;
   for (index = 0; index < sData->total_points; index++) {
diff --git a/source/blender/blenkernel/intern/particle_system.c b/source/blender/blenkernel/intern/particle_system.c
index 5ef2f7aeeff..14b1ef7b87f 100644
--- a/source/blender/blenkernel/intern/particle_system.c
+++ b/source/blender/blenkernel/intern/particle_system.c
@@ -3692,10 +3692,11 @@ typedef struct DynamicStepSolverTaskData {
   SpinLock spin;
 } DynamicStepSolverTaskData;
 
-static void dynamics_step_finalize_sphdata(void *__restrict UNUSED(userdata),
-                                           void *__restrict tls_userdata_chunk)
+static void dynamics_step_sphdata_reduce(const void *__restrict UNUSED(userdata),
+                                         void *__restrict UNUSED(join_v),
+                                         void *__restrict chunk_v)
 {
-  SPHData *sphdata = tls_userdata_chunk;
+  SPHData *sphdata = chunk_v;
 
   psys_sph_flush_springs(sphdata);
 }
@@ -3986,7 +3987,7 @@ static void dynamics_step(ParticleSimulationData *sim, float cfra)
         settings.use_threading = (psys->totpart > 100);
         settings.userdata_chunk = &sphdata;
         settings.userdata_chunk_size = sizeof(sphdata);
-        settings.func_finalize = dynamics_step_finalize_sphdata;
+        settings.func_reduce = dynamics_step_sphdata_reduce;
         BLI_task_parallel_range(
             0, psys->totpart, &task_data, dynamics_step_sph_ddr_task_cb_ex, &settings);
 
@@ -4018,7 +4019,7 @@ static void dynamics_step(ParticleSimulationData *sim, float cfra)
           settings.use_threading = (psys->totpart > 100);
           settings.userdata_chunk = &sphdata;
           settings.userdata_chunk_size = sizeof(sphdata);
-          settings.func_finalize = dynamics_step_finalize_sphdata;
+          settings.func_reduce = dynamics_step_sphdata_reduce;
           BLI_task_parallel_range(0,
                                   psys->totpart,
                                   &task_data,
@@ -4033,7 +4034,7 @@ static void dynamics_step(ParticleSimulationData *sim, float cfra)
           settings.use_threading = (psys->totpart > 100);
           settings.userdata_chunk = &sphdata;
           settings.userdata_chunk_size = sizeof(sphdata);
-          settings.func_finalize = dynamics_step_finalize_sphdata;
+          settings.func_reduce = dynamics_step_sphdata_reduce;
           BLI_task_parallel_range(0,
                                   psys->totpart,
                                   &task_data,
@@ -4189,7 +4190,7 @@ static void particles_fluid_step(ParticleSimulationData *sim,
       ParticleSettings *part = psys->part;
       ParticleData *pa = NULL;
 
-      int p, totpart, tottypepart = 0;
+      int p, totpart = 0, tottypepart = 0;
       int flagActivePart, activeParts = 0;
       float posX, posY, posZ, velX, velY, velZ;
       float resX, resY, resZ;
diff --git a/source/blender/blenkernel/intern/subdiv_ccg.c b/source/blender/blenkernel/intern/subdiv_ccg.c
index 521aeb60e66..d99c41eaa3e 100644
--- a/source/blender/blenkernel/intern/subdiv_ccg.c
+++ b/source/blender/blenkernel/intern/subdiv_ccg.c
@@ -770,8 +770,8 @@ static void subdiv_ccg_recalc_inner_normal_task(void *__restrict userdata_v,
   subdiv_ccg_average_inner_face_normals(data->subdiv_ccg, data->key, tls, grid_index);
 }
 
-static void subdiv_ccg_recalc_inner_normal_finalize(void *__restrict UNUSED(userdata),
-                                                    void *__restrict tls_v)
+static void subdiv_ccg_recalc_inner_normal_free(const void *__restrict UNUSED(userdata),
+                                                void *__restrict tls_v)
 {
   RecalcInnerNormalsTLSData *tls = tls_v;
   MEM_SAFE_FREE(tls->face_normals);
@@ -791,7 +791,7 @@ static void subdiv_ccg_recalc_inner_grid_normals(SubdivCCG *subdiv_ccg)
   BLI_parallel_range_settings_defaults(&parallel_range_settings);
   parallel_range_settings.userdata_chunk = &tls_data;
   parallel_range_settings.userdata_chunk_size = sizeof(tls_data);
-  parallel_range_settings.func_finalize = subdiv_ccg_recalc_inner_normal_finalize;
+  parallel_range_settings.func_free = subdiv_ccg_recalc_inner_normal_free;
   BLI_task_parallel_range(0,
                           subdiv_ccg->num_grids,
                           &data,
@@ -834,8 +834,8 @@ static void subdiv_ccg_recalc_modified_inner_normal_task(void *__restrict userda
   subdiv_ccg_average_inner_face_grids(subdiv_ccg, key, face);
 }
 
-static void subdiv_ccg_recalc_modified_inner_normal_finalize(void *__restrict UNUSED(userdata),
-                                                             void *__restrict tls_v)
+static void subdiv_ccg_recalc_modified_inner_normal_free(const void *__restrict UNUSED(userdata),
+                                                         void *__restrict tls_v)
 {
   RecalcInnerNormalsTLSData *tls = tls_v;
   MEM_SAFE_FREE(tls->face_normals);
@@ -857,7 +857,7 @@ static void subdiv_ccg_recalc_modified_inner_grid_normals(SubdivCCG *subdiv_ccg,
   BLI_parallel_range_settings_defaults(&parallel_range_settings);
   parallel_range_settings.userdata_chunk = &tls_data;
   parallel_range_settings.userdata_chunk_size = sizeof(tls_data);
-  parallel_range_settings.func_finalize = subdiv_ccg_recalc_modified_inner_normal_finalize;
+  parallel_range_settings.func_free = subdiv_ccg_recalc_modified_inner_normal_free;
   BLI_task_parallel_range(0,
                           num_effected_faces,
                           &data,
@@ -1077,8 +1077,8 @@ static void subdiv_ccg_average_grids_boundaries_task(void *__restrict userdata_v
   subdiv_ccg_average_grids_boundary(subdiv_ccg, key, adjacent_edge, tls);
 }
 
-static void subdiv_ccg_average_grids_boundaries_finalize(void *__restrict UNUSED(userdata),
-                                                         void *__restrict tls_v)
+static void subdiv_ccg_average_grids_boundaries_free(const void *__restrict UNUSED(userdata),
+                                                     void *__restrict tls_v)
 {
   AverageGridsBoundariesTLSData *tls = tls_v;
   MEM_SAFE_FREE(tls->accumulators);
@@ -1136,7 +1136,7 @@ static void subdiv_ccg_average_all_boundaries(SubdivCCG *subdiv_ccg, CCGKey *key
   AverageGridsBoundariesTLSData tls_data = {NULL};
   parallel_range_settings.userdata_chunk = &tls_data;
   parallel_range_settings.userdata_chunk_size = sizeof(tls_data);
-  parallel_range_settings.func_finalize = subdiv_ccg_average_grids_boundaries_finalize;
+  parallel_range_settings.func_free = subdiv_ccg_average_grids_boundaries_free;
   BLI_task_parallel_range(0,
                           subdiv_ccg->num_adjacent_edges,
                           &boundaries_data,
diff --git a/source/blender/blenkernel/intern/subdiv_foreach.c b/source/blender/blenkernel/intern/subdiv_foreach.c
index b31fb2c9312..0884f40952f 100644
--- a/source/blender/blenkernel/intern/subdiv_foreach.c
+++ b/source/blender/blenkernel/intern/subdiv_foreach.c
@@ -1838,9 +1838,9 @@ static void subdiv_foreach_boundary_edges_task(void *__restrict userdata,
   subdiv_foreach_boundary_edges(ctx, tls->userdata_chunk, edge_index);
 }
 
-static void subdiv_foreach_finalize(void *__restrict userdata, void *__restrict userdata_chunk)
+static void subdiv_foreach_free(const void *__restrict userdata, void *__restrict userdata_chunk)
 {
-  SubdivForeachTaskContext *ctx = userdata;
+  const SubdivForeachTaskContext *ctx = userdata;
   ctx->foreach_context->user_data_tls_free(userdata_chunk);
 }
 
@@ -1873,7 +1873,7 @@ bool BKE_subdiv_foreach_subdiv_geometry(Subdiv *subdiv,
   parallel_range_settings.userdata_chunk_size = context->user_data_tls_size;
   parallel_range_settings.min_iter_per_thread = 1;
   if (context->user_data_tls_free != NULL) {
-    parallel_range_settings.func_finalize = subdiv_foreach_finalize;
+    parallel_range_settings.func_free = subdiv_foreach_free;
   }
 
   /* TODO(sergey): Possible optimization is to have a single pool and push all
diff --git a/source/blender/blenlib/BLI_task.h b/source/blender/blenlib/BLI_task.h
index 898e4be5f92..78370e98202 100644
--- a/source/blender/blenlib/BLI_task.h
+++ b/source/blender/blenlib/BLI_task.h
@@ -146,12 +146,14 @@ typedef struct TaskParallelTLS {
   void *userdata_chunk;
 } TaskParallelTLS;
 
-typedef void (*TaskParallelFinalizeFunc)(void *__restrict userdata,
-                                         void *__restrict userdata_chunk);
-
 typedef void (*TaskParallelRangeFunc)(void *__restrict userdata,
                                       const int iter,
                                       const TaskParallelTLS *__restrict tls);
+typedef void (*TaskParallelReduceFunc)(const void *__restrict userdata,
+                                       void *__restrict chunk_join,
+                                       void *__restrict chunk);
+
+typedef void (*TaskParallelFreeFunc)(const void *__restrict userdata, void *__restrict chunk);
 
 typedef struct TaskParallelSettings {
   /* Whether caller allows to do threading of the particular range.
@@ -171,7 +173,13 @@ typedef struct TaskParallelSettings {
   /* Function called from calling thread once whole range have been
    * processed.
    */
-  TaskParallelFinalizeFunc func_finalize;
+  /* Function called to join user data chunk into another, to reduce
+   * the result to the original userdata_chunk memory.
+   * The reduce functions should have no side effects, so that they
+   * can be run on any thread. */
+  TaskParallelReduceFunc func_reduce;
+  /* Function called to free data created by TaskParallelRangeFunc. */
+  TaskParallelFreeFunc func_free;
   /* Minimum allowed number of range iterators to be handled by a single
    * thread. This allows to achieve following:
    * - Reduce amount of threading overhead.
diff --git a/source/blender/blenlib/intern/task_iterator.c b/source/blender/blenlib/intern/task_iterator.c
index 4ac012fa578..0d1c96612e3 100644
--- a/source/blender/blenlib/intern/task_iterator.c
+++ b/source/blender/blenlib/intern/task_iterator.c
@@ -78,8 +78,13 @@ typedef struct TaskParallelRangeState {
   /* Number of 'tls' copies in the array, i.e. number of worker threads. */
   size_t num_elements_in_tls_storage;
 
-  /* Function called from calling thread once whole range have been processed. */
-  TaskParallelFinalizeFunc func_finalize;
+  /* Function called to join user data chunk into another, to reduce
+   * the result to the original userdata_chunk memory.
+   * The reduce functions should have no side effects, so that they
+   * can be run on any thread. */
+  TaskParallelReduceFunc func_reduce;
+  /* Function called to free data created by TaskParallelRangeFunc. */
+  TaskParallelFreeFunc func_free;
 
   /* Current value of the iterator, shared between all threads (atomically updated). */
   int iter_value;
@@ -256,23 +261,18 @@ static void parallel_range_single_thread(TaskParallelRangePool *range_pool)
 
     void *initial_tls_memory = state->initial_tls_memory;
     const size_t tls_data_size = state->tls_data_size;
-    void *flatten_tls_storage = NULL;
     const bool use_tls_data = (tls_data_size != 0) && (initial_tls_memory != NULL);
-    if (use_tls_data) {
-      flatten_tls_storage = MALLOCA(tls_data_size);
-      memcpy(flatten_tls_storage, initial_tls_memory, tls_data_size);
-    }
     TaskParallelTLS tls = {
         .thread_id = 0,
-        .userdata_chunk = flatten_tls_storage,
+        .userdata_chunk = initial_tls_memory,
     };
     for (int i = start; i < stop; i++) {
       func(userdata, i, &tls);
     }
-    if (state->func_finalize != NULL) {
-      state->func_finalize(userdata, flatten_tls_storage);
+    if (use_tls_data && state->func_free != NULL) {
+      /* `func_free` should only free data that was created during execution of `func`. */
+      state->func_free(userdata, initial_tls_memory);
     }
-    MALLOCA_FREE(flatten_tls_storage, tls_data_size);
   }
 }
 
@@ -303,7 +303,7 @@ void BLI_task_parallel_range(const int start,
       .iter_value = start,
       .initial_tls_memory = settings->userdata_chunk,
       .tls_data_size = settings->userdata_chunk_size,
-      .func_finalize = settings->func_finalize,
+      .func_free = settings->func_free,
   };
   TaskParallelRangePool range_pool = {
       .pool = NULL, .parallel_range_states = &state, .current_state = NULL, .settings = settings};
@@ -367,11 +367,15 @@ void BLI_task_parallel_range(const int start,
   BLI_task_pool_work_and_wait(task_pool);
   BLI_task_pool_free(task_pool);
 
-  if (use_tls_data) {
-    if (settings->func_finalize != NULL) {
-      for (i = 0; i < num_tasks; i++) {
-        void *userdata_chunk_local = (char *)flatten_tls_storage + (tls_data_size * (size_t)i);
-        settings->func_finalize(userdata, userdata_chunk_local);
+  if (use_tls_data && (settings->func_free != NULL || settings->func_reduce != NULL)) {
+    for (i = 0; i < num_tasks; i++) {
+      void *userdata_chunk_local = (char *)flatten_tls_storage + (tls_data_size * (size_t)i);
+      if (settings->func_reduce) {
+        settings->func_reduce(userdata, tls_data, userdata_chunk_local);
+      }
+      if (settings->func_free) {
+        /* `func_free` should only free data that was created during execution of `func`. */
+        settings->func_free(userdata, userdata_chunk_local);
       }
     }
     MALLOCA_FREE(flatten_tls_storage, tls_data_size * (size_t)num_tasks);
@@ -382,16 +386,17 @@ void BLI_task_parallel_range(const int start,
  * Initialize a task pool to parallelize several for loops at the same time.
  *
  * See public API doc of ParallelRangeSettings for description of all settings.
- * Note that loop-specific settings (like 'tls' data or finalize function) must be left NULL here.
- * Only settings controlling how iteration is parallelized must be defined, as those will affect
- * all loops added to that pool.
+ * Note that loop-specific settings (like 'tls' data or reduce/free functions) must be left NULL
+ * here. Only settings controlling how iteration is parallelized must be defined, as those will
+ * affect all loops added to that pool.
  */
 TaskParallelRangePool *BLI_task_parallel_range_pool_init(const TaskParallelSettings *settings)
 {
   TaskParallelRangePool *range_pool = MEM_callocN(sizeof(*range_pool), __func__);
 
   BLI_assert(settings->userdata_chunk == NULL);
-  BLI_assert(settings->func_finalize == NULL);
+  BLI_assert(settings->func_reduce == NULL);
+  BLI_assert(settings->func_free == NULL);
   range_pool->settings = MEM_mallocN(sizeof(*range_pool->settings), __func__);
   *range_pool->settings = *settings;
 
@@ -430,7 +435,8 @@ void BLI_task_parallel_range_pool_push(TaskParallelRangePool *range_pool,
   state->iter_value = start;
   state->initial_tls_memory = settings->userdata_chunk;
   state->tls_data_size = settings->userdata_chunk_size;
-  state->func_finalize = settings->func_finalize;
+  state->func_reduce = settings->func_reduce;
+  state->func_free = settings->func_free;
 
   state->next = range_pool->parallel_range_states;
   range_pool->parallel_range_states = state;
@@ -445,7 +451,13 @@ static void parallel_range_func_finalize(TaskPool *__restrict pool,
 
   for (int i = 0; i < range_pool->num_tasks; i++) {
     void *tls_data = (char *)state->flatten_tls_storage + (state->tls_data_size * (size_t)i);
-    state->func_finalize(state->userdata_shared, tls_data);
+    if (state->func_reduce != NULL) {
+      state->func_reduce(state->userdata_shared, state->initial_tls_memory, tls_data);
+    }
+    if (state->func_free != NULL) {
+      /* `func_free` should only free data that was created during execution of `func`. */
+      state->func_free(state->userdata_shared, tls_data);
+    }
   }
 }
 
@@ -531,7 +543,7 @@ void BLI_task_parallel_range_pool_work_and_wait(TaskParallelRangePool *range_poo
       continue;
     }
 
-    if (state->func_finalize != NULL) {
+    if (state->func_reduce != NULL || state->func_free != NULL) {
       BLI_task_pool_push_from_thread(
           task_pool, parallel_range_func_finalize, state, false, NULL, thread_id);
     }
@@ -677,11 +689,9 @@ static void task_parallel_iterator_no_threads(const TaskParallelSettings *settin
 
   parallel_iterator_func_do(state, userdata_chunk, 0);
 
-  if (use_userdata_chunk) {
-    if (settings->func_finalize != NULL) {
-      settings->func_finalize(state->userdata, userdata_chunk_local);
-    }
-    MALLOCA_FREE(userdata_chunk_local, userdata_chunk_size);
+  if (use_userdata_chunk && settings->func_free != NULL) {
+    /* `func_free` should only free data that was created during execution of `func`. */
+    settings->func_free(state->userdata, userdata_chunk_local);
   }
 }
 
@@ -740,11 +750,14 @@ static void task_parallel_iterator_do(const TaskParallelSettings *settings,
   BLI_task_pool_work_and_wait(task_pool);
   BLI_task_pool_free(task_pool);
 
-  if (use_userdata_chunk) {
-    if (settings->func_finalize != NULL) {
-      for (size_t i = 0; i < num_tasks; i++) {
-        userdata_chunk_local = (char *)userdata_chunk_array + (userdata_chunk_size * i);
-        settings->func_finalize(state->userdata, userdata_chunk_local);
+  if (use_userdata_chunk && (settings->func_reduce != NULL || settings->func_free != NULL)) {
+    for (size_t i = 0; i < num_tasks; i++) {
+      userdata_chunk_local = (char *)userdata_chunk_array + (userdata_chunk_size * i);
+      if (settings->func_reduce != NULL) {
+        settings->func_reduce(state->userdata, userdata_chunk, userdata_chunk_local);
+      }
+      if (settings->func_free != NULL) {
+        settings->func_free(state->userdata, userdata_chunk_local);
       }
     }
     MALLOCA_FREE(userdata_chunk_array, userdata_chunk_size * num_tasks);
diff --git a/source/blender/editors/physics/particle_edit.c b/source/blender/editors/physics/particle_edit.c
index 75ae0299318..3ce5aeedae8 100644
--- a/source/blender/editors/physics/particle_edit.c
+++ b/source/blender/editors/physics/particle_edit.c
@@ -4084,7 +4084,6 @@ typedef struct BrushAddCountIterData {
   short size;
   float imat[4][4];
   ParticleData *add_pars;
-  int num_added;
 } BrushAddCountIterData;
 
 typedef struct BrushAddCountIterTLSData {
@@ -4176,12 +4175,19 @@ static void brush_add_count_iter(void *__restrict iter_data_v,
   }
 }
 
-static void brush_add_count_iter_finalize(void *__restrict userdata_v,
-                                          void *__restrict userdata_chunk_v)
+static void brush_add_count_iter_reduce(const void *__restrict UNUSED(userdata),
+                                        void *__restrict join_v,
+                                        void *__restrict chunk_v)
+{
+  BrushAddCountIterTLSData *join = (BrushAddCountIterTLSData *)join_v;
+  BrushAddCountIterTLSData *tls = (BrushAddCountIterTLSData *)chunk_v;
+  join->num_added += tls->num_added;
+}
+
+static void brush_add_count_iter_free(const void *__restrict UNUSED(userdata_v),
+                                      void *__restrict chunk_v)
 {
-  BrushAddCountIterData *iter_data = (BrushAddCountIterData *)userdata_v;
-  BrushAddCountIterTLSData *tls = (BrushAddCountIterTLSData *)userdata_chunk_v;
-  iter_data->num_added += tls->num_added;
+  BrushAddCountIterTLSData *tls = (BrushAddCountIterTLSData *)chunk_v;
   if (tls->rng != NULL) {
     BLI_rng_free(tls->rng);
   }
@@ -4245,7 +4251,6 @@ static int brush_add(const bContext *C, PEData *data, short number)
   iter_data.number = number;
   iter_data.size = size;
   iter_data.add_pars = add_pars;
-  iter_data.num_added = 0;
   copy_m4_m4(iter_data.imat, imat);
 
   BrushAddCountIterTLSData tls = {NULL};
@@ -4255,13 +4260,14 @@ static int brush_add(const bContext *C, PEData *data, short number)
   settings.scheduling_mode = TASK_SCHEDULING_DYNAMIC;
   settings.userdata_chunk = &tls;
   settings.userdata_chunk_size = sizeof(BrushAddCountIterTLSData);
-  settings.func_finalize = brush_add_count_iter_finalize;
+  settings.func_reduce = brush_add_count_iter_reduce;
+  settings.func_free = brush_add_count_iter_free;
   BLI_task_parallel_range(0, number, &iter_data, brush_add_count_iter, &settings);
 
   /* Convert add_parse to a dense array, where all new particles are in the
    * beginning of the array.
    */
-  n = iter_data.num_added;
+  n = tls.num_added;
   for (int current_iter = 0, new_index = 0; current_iter < number; current_iter++) {
     if (add_pars[current_iter].num == DMCACHE_NOTFOUND) {
       continue;
diff --git a/source/blender/editors/space_sequencer/sequencer_scopes.c b/source/blender/editors/space_sequencer/sequencer_scopes.c
index b06bc7ad8b7..243a6e193eb 100644
--- a/source/blender/editors/space_sequencer/sequencer_scopes.c
+++ b/source/blender/editors/space_sequencer/sequencer_scopes.c
@@ -447,7 +447,6 @@ static void draw_histogram_bar(ImBuf *ibuf, int x, float val, int col)
 
 typedef struct MakeHistogramViewData {
   const ImBuf *ibuf;
-  uint32_t (*bins)[HIS_STEPS];
 } MakeHistogramViewData;
 
 static void make_histogram_view_from_ibuf_byte_fn(void *__restrict userdata,
@@ -469,17 +468,16 @@ static void make_histogram_view_from_ibuf_byte_fn(void *__restrict userdata,
   }
 }
 
-static void make_histogram_view_from_ibuf_finalize(void *__restrict userdata,
-                                                   void *__restrict userdata_chunk)
+static void make_histogram_view_from_ibuf_reduce(const void *__restrict UNUSED(userdata),
+                                                 void *__restrict chunk_join,
+                                                 void *__restrict chunk)
 {
-  MakeHistogramViewData *data = userdata;
-  uint32_t(*bins)[HIS_STEPS] = data->bins;
-
-  uint32_t(*cur_bins)[HIS_STEPS] = userdata_chunk;
+  uint32_t(*join_bins)[HIS_STEPS] = chunk_join;
+  uint32_t(*bins)[HIS_STEPS] = chunk;
 
   for (int j = 3; j--;) {
     for (int i = 0; i < HIS_STEPS; i++) {
-      bins[j][i] += cur_bins[j][i];
+      join_bins[j][i] += bins[j][i];
     }
   }
 }
@@ -496,14 +494,13 @@ static ImBuf *make_histogram_view_from_ibuf_byte(ImBuf *ibuf)
 
   MakeHistogramViewData data = {
       .ibuf = ibuf,
-      .bins = bins,
   };
   TaskParallelSettings settings;
   BLI_parallel_range_settings_defaults(&settings);
   settings.use_threading = (ibuf->y >= 256);
   settings.userdata_chunk = bins;
   settings.userdata_chunk_size = sizeof(bins);
-  settings.func_finalize = make_histogram_view_from_ibuf_finalize;
+  settings.func_reduce = make_histogram_view_from_ibuf_reduce;
   BLI_task_parallel_range(0, ibuf->y, &data, make_histogram_view_from_ibuf_byte_fn, &settings);
 
   nr = nb = ng = 0;
@@ -582,14 +579,13 @@ static ImBuf *make_histogram_view_from_ibuf_float(ImBuf *ibuf)
 
   MakeHistogramViewData data = {
       .ibuf = ibuf,
-      .bins = bins,
   };
   TaskParallelSettings settings;
   BLI_parallel_range_settings_defaults(&settings);
   settings.use_threading = (ibuf->y >= 256);
   settings.userdata_chunk = bins;
   settings.userdata_chunk_size = sizeof(bins);
-  settings.func_finalize = make_histogram_view_from_ibuf_finalize;
+  settings.func_reduce = make_histogram_view_from_ibuf_reduce;
   BLI_task_parallel_range(0, ibuf->y, &data, make_histogram_view_from_ibuf_float_fn, &settings);
 
   nr = nb = ng = 0;