From b9c257019fe256af6c2978b94b4948d18a6cd4eb Mon Sep 17 00:00:00 2001
From: Bastien Montagne
Date: Tue, 30 Jul 2019 14:36:59 +0200
Subject: BLI_task: tweak default chunk size for `BLI_task_parallel_range()`.

Previously we were setting it to 1 (aka no 'chunking'), to follow
previous behavior. However, this is far from optimal, especially with
CPUs that can have tens of threads nowadays.

Now taking a heuristic approach (inspired by the one already existing
for `BLI_task_parallel_listbase()`), which tries to guesstimate the best
chunk size based on several factors (number of threads/parallel tasks,
total number of items, ...).

Think this is a reasonable starting point; more optimization here would
of course be possible.

Note that code that was already explicitly setting some value here
won't be affected at all by this change.
---
 source/blender/blenlib/BLI_task.h    |  7 ++---
 source/blender/blenlib/intern/task.c | 53 ++++++++++++++++++++++++++++++------
 2 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/source/blender/blenlib/BLI_task.h b/source/blender/blenlib/BLI_task.h
index d11ba15a7e0..85680d8aae8 100644
--- a/source/blender/blenlib/BLI_task.h
+++ b/source/blender/blenlib/BLI_task.h
@@ -216,11 +216,8 @@ BLI_INLINE void BLI_parallel_range_settings_defaults(ParallelRangeSettings *sett
   memset(settings, 0, sizeof(*settings));
   settings->use_threading = true;
   settings->scheduling_mode = TASK_SCHEDULING_STATIC;
-  /* NOTE: Current value mimics old behavior, but it's not ideal by any
-   * means. Would be cool to find a common value which will work good enough
-   * for both static and dynamic scheduling.
-   */
-  settings->min_iter_per_thread = 1;
+  /* Use default heuristic to define actual chunk size. */
+  settings->min_iter_per_thread = 0;
 }
 
 #ifdef __cplusplus
diff --git a/source/blender/blenlib/intern/task.c b/source/blender/blenlib/intern/task.c
index 85d39f2f98e..cc54fb6bd38 100644
--- a/source/blender/blenlib/intern/task.c
+++ b/source/blender/blenlib/intern/task.c
@@ -1054,6 +1054,49 @@ typedef struct ParallelRangeState {
   int chunk_size;
 } ParallelRangeState;
 
+BLI_INLINE void task_parallel_range_calc_chunk_size(const ParallelRangeSettings *settings,
+                                                    const int num_tasks,
+                                                    ParallelRangeState *state)
+{
+  const int tot_items = state->stop - state->start;
+  int chunk_size = 0;
+
+  if (settings->min_iter_per_thread > 0) {
+    /* Already set by user, no need to do anything here. */
+    chunk_size = settings->min_iter_per_thread;
+  }
+  else {
+    /* Basic heuristic to avoid threading on low amount of items. We could make that limit
+     * configurable in settings too... */
+    if (tot_items > 0 && tot_items < 256) {
+      chunk_size = tot_items;
+    }
+    /* NOTE: The idea here is to compensate for rather measurable threading
+     * overhead caused by fetching tasks. With too many CPU threads we are starting
+     * to spend too much time in those overheads. */
+    else if (num_tasks > 32) {
+      chunk_size = 128;
+    }
+    else if (num_tasks > 16) {
+      chunk_size = 64;
+    }
+    else {
+      chunk_size = 32;
+    }
+  }
+
+  BLI_assert(chunk_size > 0);
+
+  switch (settings->scheduling_mode) {
+    case TASK_SCHEDULING_STATIC:
+      state->chunk_size = max_ii(chunk_size, tot_items / (num_tasks));
+      break;
+    case TASK_SCHEDULING_DYNAMIC:
+      state->chunk_size = chunk_size;
+      break;
+  }
+}
+
 BLI_INLINE bool parallel_range_next_iter_get(ParallelRangeState *__restrict state,
                                              int *__restrict iter,
                                              int *__restrict count)
@@ -1162,16 +1205,8 @@ void BLI_task_parallel_range(const int start,
   state.userdata = userdata;
   state.func = func;
   state.iter = start;
-  switch (settings->scheduling_mode) {
-    case TASK_SCHEDULING_STATIC:
-      state.chunk_size = max_ii(settings->min_iter_per_thread, (stop - start) / (num_tasks));
-      break;
-    case TASK_SCHEDULING_DYNAMIC:
-      /* TODO(sergey): Make it configurable from min_iter_per_thread. */
-      state.chunk_size = 32;
-      break;
-  }
+  task_parallel_range_calc_chunk_size(settings, num_tasks, &state);
 
   num_tasks = min_ii(num_tasks, max_ii(1, (stop - start) / state.chunk_size));
 
   if (num_tasks == 1) {
--
cgit v1.2.3
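For illustration, below is a minimal standalone C sketch of the new default heuristic.
It mirrors task_parallel_range_calc_chunk_size() from the patch above, but uses plain C
(no Blender types) and hypothetical item/task counts, so it can be compiled and run on
its own; it is not part of the patch itself.

/* Standalone sketch of the new default chunk-size heuristic (static scheduling,
 * min_iter_per_thread left at its new default of 0 so the heuristic decides). */
#include <assert.h>
#include <stdio.h>

static int max_ii(int a, int b)
{
  return (a > b) ? a : b;
}

static int default_static_chunk_size(const int tot_items, const int num_tasks)
{
  int chunk_size;

  if (tot_items > 0 && tot_items < 256) {
    /* Few items: a single chunk, threading would mostly add overhead. */
    chunk_size = tot_items;
  }
  else if (num_tasks > 32) {
    /* Many threads: larger chunks compensate for task-fetching overhead. */
    chunk_size = 128;
  }
  else if (num_tasks > 16) {
    chunk_size = 64;
  }
  else {
    chunk_size = 32;
  }

  assert(chunk_size > 0);

  /* Static scheduling additionally spreads the items evenly over the tasks. */
  return max_ii(chunk_size, tot_items / num_tasks);
}

int main(void)
{
  /* Hypothetical workloads, not taken from the patch. */
  printf("%d\n", default_static_chunk_size(100, 8));     /* -> 100 (fewer than 256 items) */
  printf("%d\n", default_static_chunk_size(1000000, 8)); /* -> 125000 (even split dominates) */
  printf("%d\n", default_static_chunk_size(2000, 48));   /* -> 128 (overhead floor for many threads) */
  return 0;
}

With dynamic scheduling the patch skips the final max_ii() and uses the heuristic
chunk size as-is, since tasks keep fetching further chunks until the range is done.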