Task: Separate Finalize into Reduce And Free

In preparation of TBB we need to split the finalize function into reduce and free. Reduce is used to combine results and free for freeing any allocated memory. The reduce function is called to join user data chunk into another, to reduce the result to the original userdata_chunk memory. These functions should have no side effects so that they can be run on any thread. The free functions should free data created during execution (TaskParallelRangeFunc). Original patch by Brecht van Lommel {rB61f49db843cf5095203112226ae386f301be1e1a}. Reviewed By: Brecht van Lommel, Bastien Montagne Differential Revision: https://developer.blender.org/D7394
author: Jeroen Bakker <j.bakker@atmind.nl> 2020-04-17 11:00:54 +0300
committer: Jeroen Bakker <j.bakker@atmind.nl> 2020-04-17 17:06:54 +0300
commit: d923fb784f4f429c066ceb807c669a4308c1d9b4 (patch)
tree: 60b1cc5c5af606b1df26637b08f5b4bbc714c827
parent: 74fcb531de04e049d197274b2f376b2a92b3b2f3 (diff)
10 files changed, 168 insertions, 146 deletions
diff --git a/source/blender/blenkernel/intern/colortools.c b/source/blender/blenkernel/intern/colortools.c
index f82b8b6675c..3da384a2745 100644
--- a/source/blender/blenkernel/intern/colortools.c
+++ b/source/blender/blenkernel/intern/colortools.c
@@ -1383,8 +1383,6 @@ typedef struct ScopesUpdateData {
   struct ColormanageProcessor *cm_processor;
   const unsigned char *display_buffer;
   const int ycc_mode;
-
-  unsigned int *bin_lum, *bin_r, *bin_g, *bin_b, *bin_a;
 } ScopesUpdateData;
 
 typedef struct ScopesUpdateDataChunk {
@@ -1495,23 +1493,24 @@ static void scopes_update_cb(void *__restrict userdata,
   }
 }
 
-static void scopes_update_finalize(void *__restrict userdata, void *__restrict userdata_chunk)
+static void scopes_update_reduce(const void *__restrict UNUSED(userdata),
+                                 void *__restrict chunk_join,
+                                 void *__restrict chunk)
 {
-  const ScopesUpdateData *data = userdata;
-  const ScopesUpdateDataChunk *data_chunk = userdata_chunk;
-
-  unsigned int *bin_lum = data->bin_lum;
-  unsigned int *bin_r = data->bin_r;
-  unsigned int *bin_g = data->bin_g;
-  unsigned int *bin_b = data->bin_b;
-  unsigned int *bin_a = data->bin_a;
+  ScopesUpdateDataChunk *join_chunk = chunk_join;
+  const ScopesUpdateDataChunk *data_chunk = chunk;
+
+  unsigned int *bin_lum = join_chunk->bin_lum;
+  unsigned int *bin_r = join_chunk->bin_r;
+  unsigned int *bin_g = join_chunk->bin_g;
+  unsigned int *bin_b = join_chunk->bin_b;
+  unsigned int *bin_a = join_chunk->bin_a;
   const unsigned int *bin_lum_c = data_chunk->bin_lum;
   const unsigned int *bin_r_c = data_chunk->bin_r;
   const unsigned int *bin_g_c = data_chunk->bin_g;
   const unsigned int *bin_b_c = data_chunk->bin_b;
   const unsigned int *bin_a_c = data_chunk->bin_a;
 
-  float(*minmax)[2] = data->scopes->minmax;
   const float *min = data_chunk->min;
   const float *max = data_chunk->max;
 
@@ -1524,11 +1523,11 @@ static void scopes_update_finalize(void *__restrict userdata, void *__restrict u
   }
 
   for (int c = 3; c--;) {
-    if (min[c] < minmax[c][0]) {
-      minmax[c][0] = min[c];
+    if (min[c] < join_chunk->min[c]) {
+      join_chunk->min[c] = min[c];
     }
-    if (max[c] > minmax[c][1]) {
-      minmax[c][1] = max[c];
+    if (max[c] > join_chunk->max[c]) {
+      join_chunk->max[c] = max[c];
     }
   }
 }
@@ -1542,7 +1541,6 @@ void BKE_scopes_update(Scopes *scopes,
   unsigned int nl, na, nr, ng, nb;
   double divl, diva, divr, divg, divb;
   const unsigned char *display_buffer = NULL;
-  uint bin_lum[256] = {0}, bin_r[256] = {0}, bin_g[256] = {0}, bin_b[256] = {0}, bin_a[256] = {0};
   int ycc_mode = -1;
   void *cache_handle = NULL;
   struct ColormanageProcessor *cm_processor = NULL;
@@ -1638,11 +1636,6 @@ void BKE_scopes_update(Scopes *scopes,
       .cm_processor = cm_processor,
       .display_buffer = display_buffer,
       .ycc_mode = ycc_mode,
-      .bin_lum = bin_lum,
-      .bin_r = bin_r,
-      .bin_g = bin_g,
-      .bin_b = bin_b,
-      .bin_a = bin_a,
   };
   ScopesUpdateDataChunk data_chunk = {{0}};
   INIT_MINMAX(data_chunk.min, data_chunk.max);
@@ -1652,26 +1645,26 @@ void BKE_scopes_update(Scopes *scopes,
   settings.use_threading = (ibuf->y > 256);
   settings.userdata_chunk = &data_chunk;
   settings.userdata_chunk_size = sizeof(data_chunk);
-  settings.func_finalize = scopes_update_finalize;
+  settings.func_reduce = scopes_update_reduce;
   BLI_task_parallel_range(0, ibuf->y, &data, scopes_update_cb, &settings);
 
   /* convert hist data to float (proportional to max count) */
   nl = na = nr = nb = ng = 0;
   for (a = 0; a < 256; a++) {
-    if (bin_lum[a] > nl) {
-      nl = bin_lum[a];
+    if (data_chunk.bin_lum[a] > nl) {
+      nl = data_chunk.bin_lum[a];
     }
-    if (bin_r[a] > nr) {
-      nr = bin_r[a];
+    if (data_chunk.bin_r[a] > nr) {
+      nr = data_chunk.bin_r[a];
     }
-    if (bin_g[a] > ng) {
-      ng = bin_g[a];
+    if (data_chunk.bin_g[a] > ng) {
+      ng = data_chunk.bin_g[a];
     }
-    if (bin_b[a] > nb) {
-      nb = bin_b[a];
+    if (data_chunk.bin_b[a] > nb) {
+      nb = data_chunk.bin_b[a];
     }
-    if (bin_a[a] > na) {
-      na = bin_a[a];
+    if (data_chunk.bin_a[a] > na) {
+      na = data_chunk.bin_a[a];
     }
   }
   divl = nl ? 1.0 / (double)nl : 1.0;
@@ -1681,11 +1674,11 @@ void BKE_scopes_update(Scopes *scopes,
   divb = nb ? 1.0 / (double)nb : 1.0;
 
   for (a = 0; a < 256; a++) {
-    scopes->hist.data_luma[a] = bin_lum[a] * divl;
-    scopes->hist.data_r[a] = bin_r[a] * divr;
-    scopes->hist.data_g[a] = bin_g[a] * divg;
-    scopes->hist.data_b[a] = bin_b[a] * divb;
-    scopes->hist.data_a[a] = bin_a[a] * diva;
+    scopes->hist.data_luma[a] = data_chunk.bin_lum[a] * divl;
+    scopes->hist.data_r[a] = data_chunk.bin_r[a] * divr;
+    scopes->hist.data_g[a] = data_chunk.bin_g[a] * divg;
+    scopes->hist.data_b[a] = data_chunk.bin_b[a] * divb;
+    scopes->hist.data_a[a] = data_chunk.bin_a[a] * diva;
   }
 
   if (cm_processor) {
diff --git a/source/blender/blenkernel/intern/dynamicpaint.c b/source/blender/blenkernel/intern/dynamicpaint.c
index 4c78c88d168..3a8b846a41d 100644
--- a/source/blender/blenkernel/intern/dynamicpaint.c
+++ b/source/blender/blenkernel/intern/dynamicpaint.c
@@ -653,15 +653,15 @@ static void grid_bound_insert_cb_ex(void *__restrict userdata,
   boundInsert(grid_bound, bData->realCoord[bData->s_pos[i]].v);
 }
 
-static void grid_bound_insert_finalize(void *__restrict userdata, void *__restrict userdata_chunk)
+static void grid_bound_insert_reduce(const void *__restrict UNUSED(userdata),
+                                     void *__restrict chunk_join,
+                                     void *__restrict chunk)
 {
-  PaintBakeData *bData = userdata;
-  VolumeGrid *grid = bData->grid;
-
-  Bounds3D *grid_bound = userdata_chunk;
+  Bounds3D *join = chunk_join;
+  Bounds3D *grid_bound = chunk;
 
-  boundInsert(&grid->grid_bounds, grid_bound->min);
-  boundInsert(&grid->grid_bounds, grid_bound->max);
+  boundInsert(join, grid_bound->min);
+  boundInsert(join, grid_bound->max);
 }
 
 static void grid_cell_points_cb_ex(void *__restrict userdata,
@@ -685,17 +685,20 @@ static void grid_cell_points_cb_ex(void *__restrict userdata,
   s_num[temp_t_index[i]]++;
 }
 
-static void grid_cell_points_finalize(void *__restrict userdata, void *__restrict userdata_chunk)
+static void grid_cell_points_reduce(const void *__restrict userdata,
+                                    void *__restrict chunk_join,
+                                    void *__restrict chunk)
 {
-  PaintBakeData *bData = userdata;
-  VolumeGrid *grid = bData->grid;
+  const PaintBakeData *bData = userdata;
+  const VolumeGrid *grid = bData->grid;
   const int grid_cells = grid->dim[0] * grid->dim[1] * grid->dim[2];
 
-  int *s_num = userdata_chunk;
+  int *join_s_num = chunk_join;
+  int *s_num = chunk;
 
   /* calculate grid indexes */
   for (int i = 0; i < grid_cells; i++) {
-    grid->s_num[i] += s_num[i];
+    join_s_num[i] += s_num[i];
   }
 }
 
@@ -753,7 +756,7 @@ static void surfaceGenerateGrid(struct DynamicPaintSurface *surface)
       settings.use_threading = (sData->total_points > 1000);
       settings.userdata_chunk = &grid->grid_bounds;
       settings.userdata_chunk_size = sizeof(grid->grid_bounds);
-      settings.func_finalize = grid_bound_insert_finalize;
+      settings.func_reduce = grid_bound_insert_reduce;
       BLI_task_parallel_range(0, sData->total_points, bData, grid_bound_insert_cb_ex, &settings);
     }
     /* get dimensions */
@@ -814,7 +817,7 @@ static void surfaceGenerateGrid(struct DynamicPaintSurface *surface)
         settings.use_threading = (sData->total_points > 1000);
         settings.userdata_chunk = grid->s_num;
         settings.userdata_chunk_size = sizeof(*grid->s_num) * grid_cells;
-        settings.func_finalize = grid_cell_points_finalize;
+        settings.func_reduce = grid_cell_points_reduce;
         BLI_task_parallel_range(0, sData->total_points, bData, grid_cell_points_cb_ex, &settings);
       }
 
@@ -4880,7 +4883,7 @@ static void dynamicPaint_prepareAdjacencyData(DynamicPaintSurface *surface, cons
       0, sData->total_points, sData, dynamic_paint_prepare_adjacency_cb, &settings);
 
   /* calculate average values (single thread).
-   * Note: tried to put this in threaded callback (using _finalize feature),
+   * Note: tried to put this in threaded callback (using _reduce feature),
    * but gave ~30% slower result! */
   bData->average_dist = 0.0;
   for (index = 0; index < sData->total_points; index++) {
diff --git a/source/blender/blenkernel/intern/particle_system.c b/source/blender/blenkernel/intern/particle_system.c
index 5ef2f7aeeff..14b1ef7b87f 100644
--- a/source/blender/blenkernel/intern/particle_system.c
+++ b/source/blender/blenkernel/intern/particle_system.c
@@ -3692,10 +3692,11 @@ typedef struct DynamicStepSolverTaskData {
   SpinLock spin;
 } DynamicStepSolverTaskData;
 
-static void dynamics_step_finalize_sphdata(void *__restrict UNUSED(userdata),
-                                           void *__restrict tls_userdata_chunk)
+static void dynamics_step_sphdata_reduce(const void *__restrict UNUSED(userdata),
+                                         void *__restrict UNUSED(join_v),
+                                         void *__restrict chunk_v)
 {
-  SPHData *sphdata = tls_userdata_chunk;
+  SPHData *sphdata = chunk_v;
 
   psys_sph_flush_springs(sphdata);
 }
@@ -3986,7 +3987,7 @@ static void dynamics_step(ParticleSimulationData *sim, float cfra)
         settings.use_threading = (psys->totpart > 100);
         settings.userdata_chunk = &sphdata;
         settings.userdata_chunk_size = sizeof(sphdata);
-        settings.func_finalize = dynamics_step_finalize_sphdata;
+        settings.func_reduce = dynamics_step_sphdata_reduce;
         BLI_task_parallel_range(
             0, psys->totpart, &task_data, dynamics_step_sph_ddr_task_cb_ex, &settings);
 
@@ -4018,7 +4019,7 @@ static void dynamics_step(ParticleSimulationData *sim, float cfra)
           settings.use_threading = (psys->totpart > 100);
           settings.userdata_chunk = &sphdata;
           settings.userdata_chunk_size = sizeof(sphdata);
-          settings.func_finalize = dynamics_step_finalize_sphdata;
+          settings.func_reduce = dynamics_step_sphdata_reduce;
           BLI_task_parallel_range(0,
                                   psys->totpart,
                                   &task_data,
@@ -4033,7 +4034,7 @@ static void dynamics_step(ParticleSimulationData *sim, float cfra)
           settings.use_threading = (psys->totpart > 100);
           settings.userdata_chunk = &sphdata;
           settings.userdata_chunk_size = sizeof(sphdata);
-          settings.func_finalize = dynamics_step_finalize_sphdata;
+          settings.func_reduce = dynamics_step_sphdata_reduce;
           BLI_task_parallel_range(0,
                                   psys->totpart,
                                   &task_data,
@@ -4189,7 +4190,7 @@ static void particles_fluid_step(ParticleSimulationData *sim,
       ParticleSettings *part = psys->part;
       ParticleData *pa = NULL;
 
-      int p, totpart, tottypepart = 0;
+      int p, totpart = 0, tottypepart = 0;
       int flagActivePart, activeParts = 0;
       float posX, posY, posZ, velX, velY, velZ;
       float resX, resY, resZ;
diff --git a/source/blender/blenkernel/intern/subdiv_ccg.c b/source/blender/blenkernel/intern/subdiv_ccg.c
index 521aeb60e66..d99c41eaa3e 100644
--- a/source/blender/blenkernel/intern/subdiv_ccg.c
+++ b/source/blender/blenkernel/intern/subdiv_ccg.c
@@ -770,8 +770,8 @@ static void subdiv_ccg_recalc_inner_normal_task(void *__restrict userdata_v,
   subdiv_ccg_average_inner_face_normals(data->subdiv_ccg, data->key, tls, grid_index);
 }
 
-static void subdiv_ccg_recalc_inner_normal_finalize(void *__restrict UNUSED(userdata),
-                                                    void *__restrict tls_v)
+static void subdiv_ccg_recalc_inner_normal_free(const void *__restrict UNUSED(userdata),
+                                                void *__restrict tls_v)
 {
   RecalcInnerNormalsTLSData *tls = tls_v;
   MEM_SAFE_FREE(tls->face_normals);
@@ -791,7 +791,7 @@ static void subdiv_ccg_recalc_inner_grid_normals(SubdivCCG *subdiv_ccg)
   BLI_parallel_range_settings_defaults(&parallel_range_settings);
   parallel_range_settings.userdata_chunk = &tls_data;
   parallel_range_settings.userdata_chunk_size = sizeof(tls_data);
-  parallel_range_settings.func_finalize = subdiv_ccg_recalc_inner_normal_finalize;
+  parallel_range_settings.func_free = subdiv_ccg_recalc_inner_normal_free;
   BLI_task_parallel_range(0,
                           subdiv_ccg->num_grids,
                           &data,
@@ -834,8 +834,8 @@ static void subdiv_ccg_recalc_modified_inner_normal_task(void *__restrict userda
   subdiv_ccg_average_inner_face_grids(subdiv_ccg, key, face);
 }
 
-static void subdiv_ccg_recalc_modified_inner_normal_finalize(void *__restrict UNUSED(userdata),
-                                                             void *__restrict tls_v)
+static void subdiv_ccg_recalc_modified_inner_normal_free(const void *__restrict UNUSED(userdata),
+                                                         void *__restrict tls_v)
 {
   RecalcInnerNormalsTLSData *tls = tls_v;
   MEM_SAFE_FREE(tls->face_normals);
@@ -857,7 +857,7 @@ static void subdiv_ccg_recalc_modified_inner_grid_normals(SubdivCCG *subdiv_ccg,
   BLI_parallel_range_settings_defaults(&parallel_range_settings);
   parallel_range_settings.userdata_chunk = &tls_data;
   parallel_range_settings.userdata_chunk_size = sizeof(tls_data);
-  parallel_range_settings.func_finalize = subdiv_ccg_recalc_modified_inner_normal_finalize;
+  parallel_range_settings.func_free = subdiv_ccg_recalc_modified_inner_normal_free;
   BLI_task_parallel_range(0,
                           num_effected_faces,
                           &data,
@@ -1077,8 +1077,8 @@ static void subdiv_ccg_average_grids_boundaries_task(void *__restrict userdata_v
   subdiv_ccg_average_grids_boundary(subdiv_ccg, key, adjacent_edge, tls);
 }
 
-static void subdiv_ccg_average_grids_boundaries_finalize(void *__restrict UNUSED(userdata),
-                                                         void *__restrict tls_v)
+static void subdiv_ccg_average_grids_boundaries_free(const void *__restrict UNUSED(userdata),
+                                                     void *__restrict tls_v)
 {
   AverageGridsBoundariesTLSData *tls = tls_v;
   MEM_SAFE_FREE(tls->accumulators);
@@ -1136,7 +1136,7 @@ static void subdiv_ccg_average_all_boundaries(SubdivCCG *subdiv_ccg, CCGKey *key
   AverageGridsBoundariesTLSData tls_data = {NULL};
   parallel_range_settings.userdata_chunk = &tls_data;
   parallel_range_settings.userdata_chunk_size = sizeof(tls_data);
-  parallel_range_settings.func_finalize = subdiv_ccg_average_grids_boundaries_finalize;
+  parallel_range_settings.func_free = subdiv_ccg_average_grids_boundaries_free;
   BLI_task_parallel_range(0,
                           subdiv_ccg->num_adjacent_edges,
                           &boundaries_data,
diff --git a/source/blender/blenkernel/intern/subdiv_foreach.c b/source/blender/blenkernel/intern/subdiv_foreach.c
index b31fb2c9312..0884f40952f 100644
--- a/source/blender/blenkernel/intern/subdiv_foreach.c
+++ b/source/blender/blenkernel/intern/subdiv_foreach.c
@@ -1838,9 +1838,9 @@ static void subdiv_foreach_boundary_edges_task(void *__restrict userdata,
   subdiv_foreach_boundary_edges(ctx, tls->userdata_chunk, edge_index);
 }
 
-static void subdiv_foreach_finalize(void *__restrict userdata, void *__restrict userdata_chunk)
+static void subdiv_foreach_free(const void *__restrict userdata, void *__restrict userdata_chunk)
 {
-  SubdivForeachTaskContext *ctx = userdata;
+  const SubdivForeachTaskContext *ctx = userdata;
   ctx->foreach_context->user_data_tls_free(userdata_chunk);
 }
 
@@ -1873,7 +1873,7 @@ bool BKE_subdiv_foreach_subdiv_geometry(Subdiv *subdiv,
   parallel_range_settings.userdata_chunk_size = context->user_data_tls_size;
   parallel_range_settings.min_iter_per_thread = 1;
   if (context->user_data_tls_free != NULL) {
-    parallel_range_settings.func_finalize = subdiv_foreach_finalize;
+    parallel_range_settings.func_free = subdiv_foreach_free;
   }
 
   /* TODO(sergey): Possible optimization is to have a single pool and push all
diff --git a/source/blender/blenlib/BLI_task.h b/source/blender/blenlib/BLI_task.h
index 898e4be5f92..78370e98202 100644
--- a/source/blender/blenlib/BLI_task.h
+++ b/source/blender/blenlib/BLI_task.h
@@ -146,12 +146,14 @@ typedef struct TaskParallelTLS {
   void *userdata_chunk;
 } TaskParallelTLS;
 
-typedef void (*TaskParallelFinalizeFunc)(void *__restrict userdata,
-                                         void *__restrict userdata_chunk);
-
 typedef void (*TaskParallelRangeFunc)(void *__restrict userdata,
                                       const int iter,
                                       const TaskParallelTLS *__restrict tls);
+typedef void (*TaskParallelReduceFunc)(const void *__restrict userdata,
+                                       void *__restrict chunk_join,
+                                       void *__restrict chunk);
+
+typedef void (*TaskParallelFreeFunc)(const void *__restrict userdata, void *__restrict chunk);
 
 typedef struct TaskParallelSettings {
   /* Whether caller allows to do threading of the particular range.
@@ -171,7 +173,13 @@ typedef struct TaskParallelSettings {
   /* Function called from calling thread once whole range have been
    * processed.
    */
-  TaskParallelFinalizeFunc func_finalize;
+  /* Function called to join user data chunk into another, to reduce
+   * the result to the original userdata_chunk memory.
+   * The reduce functions should have no side effects, so that they
+   * can be run on any thread. */
+  TaskParallelReduceFunc func_reduce;
+  /* Function called to free data created by TaskParallelRangeFunc. */
+  TaskParallelFreeFunc func_free;
   /* Minimum allowed number of range iterators to be handled by a single
    * thread. This allows to achieve following:
    * - Reduce amount of threading overhead.
diff --git a/source/blender/blenlib/intern/task_iterator.c b/source/blender/blenlib/intern/task_iterator.c
index 4ac012fa578..0d1c96612e3 100644
--- a/source/blender/blenlib/intern/task_iterator.c
+++ b/source/blender/blenlib/intern/task_iterator.c
@@ -78,8 +78,13 @@ typedef struct TaskParallelRangeState {
   /* Number of 'tls' copies in the array, i.e. number of worker threads. */
   size_t num_elements_in_tls_storage;
 
-  /* Function called from calling thread once whole range have been processed. */
-  TaskParallelFinalizeFunc func_finalize;
+  /* Function called to join user data chunk into another, to reduce
+   * the result to the original userdata_chunk memory.
+   * The reduce functions should have no side effects, so that they
+   * can be run on any thread. */
+  TaskParallelReduceFunc func_reduce;
+  /* Function called to free data created by TaskParallelRangeFunc. */
+  TaskParallelFreeFunc func_free;
 
   /* Current value of the iterator, shared between all threads (atomically updated). */
   int iter_value;
@@ -256,23 +261,18 @@ static void parallel_range_single_thread(TaskParallelRangePool *range_pool)
 
     void *initial_tls_memory = state->initial_tls_memory;
     const size_t tls_data_size = state->tls_data_size;
-    void *flatten_tls_storage = NULL;
     const bool use_tls_data = (tls_data_size != 0) && (initial_tls_memory != NULL);
-    if (use_tls_data) {
-      flatten_tls_storage = MALLOCA(tls_data_size);
-      memcpy(flatten_tls_storage, initial_tls_memory, tls_data_size);
-    }
     TaskParallelTLS tls = {
         .thread_id = 0,
-        .userdata_chunk = flatten_tls_storage,
+        .userdata_chunk = initial_tls_memory,
     };
     for (int i = start; i < stop; i++) {
       func(userdata, i, &tls);
     }
-    if (state->func_finalize != NULL) {
-      state->func_finalize(userdata, flatten_tls_storage);
+    if (use_tls_data && state->func_free != NULL) {
+      /* `func_free` should only free data that was created during execution of `func`. */
+      state->func_free(userdata, initial_tls_memory);
     }
-    MALLOCA_FREE(flatten_tls_storage, tls_data_size);
   }
 }
 
@@ -303,7 +303,7 @@ void BLI_task_parallel_range(const int start,
       .iter_value = start,
       .initial_tls_memory = settings->userdata_chunk,
       .tls_data_size = settings->userdata_chunk_size,
-      .func_finalize = settings->func_finalize,
+      .func_free = settings->func_free,
   };
   TaskParallelRangePool range_pool = {
       .pool = NULL, .parallel_range_states = &state, .current_state = NULL, .settings = settings};
@@ -367,11 +367,15 @@ void BLI_task_parallel_range(const int start,
   BLI_task_pool_work_and_wait(task_pool);
   BLI_task_pool_free(task_pool);
 
-  if (use_tls_data) {
-    if (settings->func_finalize != NULL) {
-      for (i = 0; i < num_tasks; i++) {
-        void *userdata_chunk_local = (char *)flatten_tls_storage + (tls_data_size * (size_t)i);
-        settings->func_finalize(userdata, userdata_chunk_local);
+  if (use_tls_data && (settings->func_free != NULL || settings->func_reduce != NULL)) {
+    for (i = 0; i < num_tasks; i++) {
+      void *userdata_chunk_local = (char *)flatten_tls_storage + (tls_data_size * (size_t)i);
+      if (settings->func_reduce) {
+        settings->func_reduce(userdata, tls_data, userdata_chunk_local);
+      }
+      if (settings->func_free) {
+        /* `func_free` should only free data that was created during execution of `func`. */
+        settings->func_free(userdata, userdata_chunk_local);
       }
     }
     MALLOCA_FREE(flatten_tls_storage, tls_data_size * (size_t)num_tasks);
@@ -382,16 +386,17 @@ void BLI_task_parallel_range(const int start,
  * Initialize a task pool to parallelize several for loops at the same time.
  *
  * See public API doc of ParallelRangeSettings for description of all settings.
- * Note that loop-specific settings (like 'tls' data or finalize function) must be left NULL here.
- * Only settings controlling how iteration is parallelized must be defined, as those will affect
- * all loops added to that pool.
+ * Note that loop-specific settings (like 'tls' data or reduce/free functions) must be left NULL
+ * here. Only settings controlling how iteration is parallelized must be defined, as those will
+ * affect all loops added to that pool.
  */
 TaskParallelRangePool *BLI_task_parallel_range_pool_init(const TaskParallelSettings *settings)
 {
   TaskParallelRangePool *range_pool = MEM_callocN(sizeof(*range_pool), __func__);
 
   BLI_assert(settings->userdata_chunk == NULL);
-  BLI_assert(settings->func_finalize == NULL);
+  BLI_assert(settings->func_reduce == NULL);
+  BLI_assert(settings->func_free == NULL);
   range_pool->settings = MEM_mallocN(sizeof(*range_pool->settings), __func__);
   *range_pool->settings = *settings;
 
@@ -430,7 +435,8 @@ void BLI_task_parallel_range_pool_push(TaskParallelRangePool *range_pool,
   state->iter_value = start;
   state->initial_tls_memory = settings->userdata_chunk;
   state->tls_data_size = settings->userdata_chunk_size;
-  state->func_finalize = settings->func_finalize;
+  state->func_reduce = settings->func_reduce;
+  state->func_free = settings->func_free;
 
   state->next = range_pool->parallel_range_states;
   range_pool->parallel_range_states = state;
@@ -445,7 +451,13 @@ static void parallel_range_func_finalize(TaskPool *__restrict pool,
 
   for (int i = 0; i < range_pool->num_tasks; i++) {
     void *tls_data = (char *)state->flatten_tls_storage + (state->tls_data_size * (size_t)i);
-    state->func_finalize(state->userdata_shared, tls_data);
+    if (state->func_reduce != NULL) {
+      state->func_reduce(state->userdata_shared, state->initial_tls_memory, tls_data);
+    }
+    if (state->func_free != NULL) {
+      /* `func_free` should only free data that was created during execution of `func`. */
+      state->func_free(state->userdata_shared, tls_data);
+    }
   }
 }
 
@@ -531,7 +543,7 @@ void BLI_task_parallel_range_pool_work_and_wait(TaskParallelRangePool *range_poo
       continue;
     }
 
-    if (state->func_finalize != NULL) {
+    if (state->func_reduce != NULL || state->func_free != NULL) {
       BLI_task_pool_push_from_thread(
           task_pool, parallel_range_func_finalize, state, false, NULL, thread_id);
     }
@@ -677,11 +689,9 @@ static void task_parallel_iterator_no_threads(const TaskParallelSettings *settin
 
   parallel_iterator_func_do(state, userdata_chunk, 0);
 
-  if (use_userdata_chunk) {
-    if (settings->func_finalize != NULL) {
-      settings->func_finalize(state->userdata, userdata_chunk_local);
-    }
-    MALLOCA_FREE(userdata_chunk_local, userdata_chunk_size);
+  if (use_userdata_chunk && settings->func_free != NULL) {
+    /* `func_free` should only free data that was created during execution of `func`. */
+    settings->func_free(state->userdata, userdata_chunk_local);
   }
 }
 
@@ -740,11 +750,14 @@ static void task_parallel_iterator_do(const TaskParallelSettings *settings,
   BLI_task_pool_work_and_wait(task_pool);
   BLI_task_pool_free(task_pool);
 
-  if (use_userdata_chunk) {
-    if (settings->func_finalize != NULL) {
-      for (size_t i = 0; i < num_tasks; i++) {
-        userdata_chunk_local = (char *)userdata_chunk_array + (userdata_chunk_size * i);
-        settings->func_finalize(state->userdata, userdata_chunk_local);
+  if (use_userdata_chunk && (settings->func_reduce != NULL || settings->func_free != NULL)) {
+    for (size_t i = 0; i < num_tasks; i++) {
+      userdata_chunk_local = (char *)userdata_chunk_array + (userdata_chunk_size * i);
+      if (settings->func_reduce != NULL) {
+        settings->func_reduce(state->userdata, userdata_chunk, userdata_chunk_local);
+      }
+      if (settings->func_free != NULL) {
+        settings->func_free(state->userdata, userdata_chunk_local);
       }
     }
     MALLOCA_FREE(userdata_chunk_array, userdata_chunk_size * num_tasks);
diff --git a/source/blender/editors/physics/particle_edit.c b/source/blender/editors/physics/particle_edit.c
index 75ae0299318..3ce5aeedae8 100644
--- a/source/blender/editors/physics/particle_edit.c
+++ b/source/blender/editors/physics/particle_edit.c
@@ -4084,7 +4084,6 @@ typedef struct BrushAddCountIterData {
   short size;
   float imat[4][4];
   ParticleData *add_pars;
-  int num_added;
 } BrushAddCountIterData;
 
 typedef struct BrushAddCountIterTLSData {
@@ -4176,12 +4175,19 @@ static void brush_add_count_iter(void *__restrict iter_data_v,
   }
 }
 
-static void brush_add_count_iter_finalize(void *__restrict userdata_v,
-                                          void *__restrict userdata_chunk_v)
+static void brush_add_count_iter_reduce(const void *__restrict UNUSED(userdata),
+                                        void *__restrict join_v,
+                                        void *__restrict chunk_v)
+{
+  BrushAddCountIterTLSData *join = (BrushAddCountIterTLSData *)join_v;
+  BrushAddCountIterTLSData *tls = (BrushAddCountIterTLSData *)chunk_v;
+  join->num_added += tls->num_added;
+}
+
+static void brush_add_count_iter_free(const void *__restrict UNUSED(userdata_v),
+                                      void *__restrict chunk_v)
 {
-  BrushAddCountIterData *iter_data = (BrushAddCountIterData *)userdata_v;
-  BrushAddCountIterTLSData *tls = (BrushAddCountIterTLSData *)userdata_chunk_v;
-  iter_data->num_added += tls->num_added;
+  BrushAddCountIterTLSData *tls = (BrushAddCountIterTLSData *)chunk_v;
   if (tls->rng != NULL) {
     BLI_rng_free(tls->rng);
   }
@@ -4245,7 +4251,6 @@ static int brush_add(const bContext *C, PEData *data, short number)
   iter_data.number = number;
   iter_data.size = size;
   iter_data.add_pars = add_pars;
-  iter_data.num_added = 0;
   copy_m4_m4(iter_data.imat, imat);
 
   BrushAddCountIterTLSData tls = {NULL};
@@ -4255,13 +4260,14 @@ static int brush_add(const bContext *C, PEData *data, short number)
   settings.scheduling_mode = TASK_SCHEDULING_DYNAMIC;
   settings.userdata_chunk = &tls;
   settings.userdata_chunk_size = sizeof(BrushAddCountIterTLSData);
-  settings.func_finalize = brush_add_count_iter_finalize;
+  settings.func_reduce = brush_add_count_iter_reduce;
+  settings.func_free = brush_add_count_iter_free;
   BLI_task_parallel_range(0, number, &iter_data, brush_add_count_iter, &settings);
 
   /* Convert add_parse to a dense array, where all new particles are in the
    * beginning of the array.
    */
-  n = iter_data.num_added;
+  n = tls.num_added;
   for (int current_iter = 0, new_index = 0; current_iter < number; current_iter++) {
     if (add_pars[current_iter].num == DMCACHE_NOTFOUND) {
       continue;
diff --git a/source/blender/editors/space_sequencer/sequencer_scopes.c b/source/blender/editors/space_sequencer/sequencer_scopes.c
index b06bc7ad8b7..243a6e193eb 100644
--- a/source/blender/editors/space_sequencer/sequencer_scopes.c
+++ b/source/blender/editors/space_sequencer/sequencer_scopes.c
@@ -447,7 +447,6 @@ static void draw_histogram_bar(ImBuf *ibuf, int x, float val, int col)
 
 typedef struct MakeHistogramViewData {
   const ImBuf *ibuf;
-  uint32_t (*bins)[HIS_STEPS];
 } MakeHistogramViewData;
 
 static void make_histogram_view_from_ibuf_byte_fn(void *__restrict userdata,
@@ -469,17 +468,16 @@ static void make_histogram_view_from_ibuf_byte_fn(void *__restrict userdata,
   }
 }
 
-static void make_histogram_view_from_ibuf_finalize(void *__restrict userdata,
-                                                   void *__restrict userdata_chunk)
+static void make_histogram_view_from_ibuf_reduce(const void *__restrict UNUSED(userdata),
+                                                 void *__restrict chunk_join,
+                                                 void *__restrict chunk)
 {
-  MakeHistogramViewData *data = userdata;
-  uint32_t(*bins)[HIS_STEPS] = data->bins;
-
-  uint32_t(*cur_bins)[HIS_STEPS] = userdata_chunk;
+  uint32_t(*join_bins)[HIS_STEPS] = chunk_join;
+  uint32_t(*bins)[HIS_STEPS] = chunk;
 
   for (int j = 3; j--;) {
     for (int i = 0; i < HIS_STEPS; i++) {
-      bins[j][i] += cur_bins[j][i];
+      join_bins[j][i] += bins[j][i];
     }
   }
 }
@@ -496,14 +494,13 @@ static ImBuf *make_histogram_view_from_ibuf_byte(ImBuf *ibuf)
 
   MakeHistogramViewData data = {
       .ibuf = ibuf,
-      .bins = bins,
   };
   TaskParallelSettings settings;
   BLI_parallel_range_settings_defaults(&settings);
   settings.use_threading = (ibuf->y >= 256);
   settings.userdata_chunk = bins;
   settings.userdata_chunk_size = sizeof(bins);
-  settings.func_finalize = make_histogram_view_from_ibuf_finalize;
+  settings.func_reduce = make_histogram_view_from_ibuf_reduce;
   BLI_task_parallel_range(0, ibuf->y, &data, make_histogram_view_from_ibuf_byte_fn, &settings);
 
   nr = nb = ng = 0;
@@ -582,14 +579,13 @@ static ImBuf *make_histogram_view_from_ibuf_float(ImBuf *ibuf)
 
   MakeHistogramViewData data = {
       .ibuf = ibuf,
-      .bins = bins,
   };
   TaskParallelSettings settings;
   BLI_parallel_range_settings_defaults(&settings);
   settings.use_threading = (ibuf->y >= 256);
   settings.userdata_chunk = bins;
   settings.userdata_chunk_size = sizeof(bins);
-  settings.func_finalize = make_histogram_view_from_ibuf_finalize;
+  settings.func_reduce = make_histogram_view_from_ibuf_reduce;
   BLI_task_parallel_range(0, ibuf->y, &data, make_histogram_view_from_ibuf_float_fn, &settings);
 
   nr = nb = ng = 0;
diff --git a/tests/gtests/blenlib/BLI_task_test.cc b/tests/gtests/blenlib/BLI_task_test.cc
index d4ab9de13c4..1f8c092c66a 100644
--- a/tests/gtests/blenlib/BLI_task_test.cc
+++ b/tests/gtests/blenlib/BLI_task_test.cc
@@ -27,17 +27,19 @@ static void task_range_iter_func(void *userdata, int index, const TaskParallelTL
   //  printf("%d, %d, %d\n", index, data[index], *((int *)tls->userdata_chunk));
 }
 
-static void task_range_iter_finalize_func(void *__restrict userdata,
-                                          void *__restrict userdata_chunk)
+static void task_range_iter_reduce_func(const void *__restrict UNUSED(userdata),
+                                        void *__restrict join_v,
+                                        void *__restrict userdata_chunk)
 {
-  int *data = (int *)userdata;
-  data[NUM_ITEMS] += *(int *)userdata_chunk;
+  int *join = (int *)join_v;
+  int *chunk = (int *)userdata_chunk;
+  *join += *chunk;
   //  printf("%d, %d\n", data[NUM_ITEMS], *((int *)userdata_chunk));
 }
 
 TEST(task, RangeIter)
 {
-  int data[NUM_ITEMS + 1] = {0};
+  int data[NUM_ITEMS] = {0};
   int sum = 0;
 
   BLI_threadapi_init();
@@ -48,7 +50,7 @@ TEST(task, RangeIter)
 
   settings.userdata_chunk = &sum;
   settings.userdata_chunk_size = sizeof(sum);
-  settings.func_finalize = task_range_iter_finalize_func;
+  settings.func_reduce = task_range_iter_reduce_func;
 
   BLI_task_parallel_range(0, NUM_ITEMS, data, task_range_iter_func, &settings);
 
@@ -60,7 +62,7 @@ TEST(task, RangeIter)
     EXPECT_EQ(data[i], i);
     expected_sum += i;
   }
-  EXPECT_EQ(data[NUM_ITEMS], expected_sum);
+  EXPECT_EQ(sum, expected_sum);
 
   BLI_threadapi_exit();
 }
@@ -68,7 +70,7 @@ TEST(task, RangeIter)
 TEST(task, RangeIterPool)
 {
   const int num_tasks = 10;
-  int data[num_tasks][NUM_ITEMS + 1] = {{0}};
+  int data[num_tasks][NUM_ITEMS] = {{0}};
   int sum = 0;
 
   BLI_threadapi_init();
@@ -82,7 +84,7 @@ TEST(task, RangeIterPool)
   for (int j = 0; j < num_tasks; j++) {
     settings.userdata_chunk = &sum;
     settings.userdata_chunk_size = sizeof(sum);
-    settings.func_finalize = task_range_iter_finalize_func;
+    settings.func_reduce = task_range_iter_reduce_func;
 
     BLI_task_parallel_range_pool_push(
         range_pool, 0, NUM_ITEMS, data[j], task_range_iter_func, &settings);
@@ -93,16 +95,16 @@ TEST(task, RangeIterPool)
   /* Those checks should ensure us all items of the listbase were processed once, and only once -
    * as expected. */
 
+  int expected_sum = 0;
   for (int j = 0; j < num_tasks; j++) {
-    int expected_sum = 0;
     for (int i = 0; i < NUM_ITEMS; i++) {
-      //      EXPECT_EQ(data[j][i], i);
+      // EXPECT_EQ(data[j][i], i);
       expected_sum += i;
     }
-    EXPECT_EQ(data[j][NUM_ITEMS], expected_sum);
   }
+  EXPECT_EQ(sum, expected_sum);
 
-  /* A pool can be re-used untill it is freed. */
+  /* A pool can be re-used until it is freed. */
 
   for (int j = 0; j < num_tasks; j++) {
     memset(data[j], 0, sizeof(data[j]));
@@ -112,7 +114,7 @@ TEST(task, RangeIterPool)
   for (int j = 0; j < num_tasks; j++) {
     settings.userdata_chunk = &sum;
     settings.userdata_chunk_size = sizeof(sum);
-    settings.func_finalize = task_range_iter_finalize_func;
+    settings.func_reduce = task_range_iter_reduce_func;
 
     BLI_task_parallel_range_pool_push(
         range_pool, 0, NUM_ITEMS, data[j], task_range_iter_func, &settings);
@@ -131,8 +133,8 @@ TEST(task, RangeIterPool)
       //      EXPECT_EQ(data[j][i], i);
       expected_sum += i;
     }
-    EXPECT_EQ(data[j][NUM_ITEMS], expected_sum);
   }
+  EXPECT_EQ(sum, expected_sum);
 
   BLI_threadapi_exit();
 }
author	Jeroen Bakker <j.bakker@atmind.nl>	2020-04-17 11:00:54 +0300
committer	Jeroen Bakker <j.bakker@atmind.nl>	2020-04-17 17:06:54 +0300
commit	d923fb784f4f429c066ceb807c669a4308c1d9b4 (patch)
tree	60b1cc5c5af606b1df26637b08f5b4bbc714c827
parent	74fcb531de04e049d197274b2f376b2a92b3b2f3 (diff)