git.blender.org/blender.git
author    Bastien Montagne <b.mont29@gmail.com>  2019-11-26 16:26:47 +0300
committer Bastien Montagne <b.mont29@gmail.com>  2019-11-26 16:30:41 +0300
commit    fcbec6e97e649eee33f06e0202455ee11dcfe46e (patch)
tree      05b21434a9a5974be893326ad378f69d4d3521b4 /tests
parent    9ecc30250aaea4e102e869f181e4c896c315e5ef (diff)
BLI_task: Add pooled threaded index range iterator, Take II.
This code allows pushing a set of different operations, all based on iterations over a range of indices, and then processing them all at once over multiple threads.

This commit also adds unit tests for both the old un-pooled and the new pooled task_parallel_range families of functions, as well as some basic performance tests.

As expected, this is mainly interesting for relatively low numbers of individual tasks. E.g. performance tests on a 32-thread machine, for a set of 10 different tasks, show the following improvements when using the pooled version instead of ten sequential calls to BLI_task_parallel_range():

| Num Items | Sequential | Pooled  | Speed-up |
| --------- | ---------- | ------- | -------- |
| 10K       | 365 us     | 138 us  | 2.5x     |
| 100K      | 877 us     | 530 us  | 1.66x    |
| 1000K     | 5521 us    | 4625 us | 1.25x    |

Differential Revision: https://developer.blender.org/D6189

Note: Compared to yesterday's commit, this reworks the atomic handling in the parallel iterator code and fixes a silly double-free bug. We now use only the two critical synchronization values taken directly from the results of the atomic calls, which is the proper way to do things: reading a value back after an atomic operation does not guarantee you get the latest value in all cases (especially on Windows release builds, it seems).
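For orientation, the pooled API added here follows an init / push / work-and-wait / free pattern. The following is a minimal usage sketch based only on the calls exercised by the tests in this diff; my_range_func and the buffer sizes are illustrative, not part of the commit:

#include "BLI_task.h"

/* Hypothetical callback; the signature matches the one used by the
 * tests in this diff. */
static void my_range_func(void *userdata, int index, const TaskParallelTLS *__restrict tls)
{
  (void)tls; /* Unused here. */
  int *values = (int *)userdata;
  values[index] = 2 * index; /* Some per-index work. */
}

static void example_pooled_ranges(void)
{
  static int data_a[1000], data_b[2000]; /* Illustrative buffers. */

  TaskParallelSettings settings;
  BLI_parallel_range_settings_defaults(&settings);

  TaskParallelRangePool *range_pool = BLI_task_parallel_range_pool_init(&settings);

  /* Push any number of independent index ranges; nothing runs yet. */
  BLI_task_parallel_range_pool_push(range_pool, 0, 1000, data_a, my_range_func, &settings);
  BLI_task_parallel_range_pool_push(range_pool, 0, 2000, data_b, my_range_func, &settings);

  /* Process all pushed ranges at once over the available threads. */
  BLI_task_parallel_range_pool_work_and_wait(range_pool);

  /* The pool can be re-used with further pushes until it is freed. */
  BLI_task_parallel_range_pool_free(range_pool);
}

The win over sequential BLI_task_parallel_range() calls comes from paying the thread wake-up and scheduling cost once for the whole batch, which is why the speed-up in the table above is largest for small item counts.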
Diffstat (limited to 'tests')
-rw-r--r--  tests/gtests/blenlib/BLI_task_performance_test.cc |  90
-rw-r--r--  tests/gtests/blenlib/BLI_task_test.cc             | 120
2 files changed, 208 insertions(+), 2 deletions(-)
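The note above about atomics is worth a concrete illustration. The rule the commit applies is: synchronize on the value returned by the atomic operation itself, never on a separate read performed afterwards. A generic sketch of the difference, written here with C11 atomics rather than Blender's own atomic wrappers:

#include <stdatomic.h>

static atomic_int next_index;

/* WRONG: another thread can modify next_index between the atomic
 * increment and the separate read, so the re-read value need not
 * correspond to this thread's increment. */
static int take_index_racy(void)
{
  atomic_fetch_add(&next_index, 1);
  return atomic_load(&next_index) - 1; /* Racy re-read. */
}

/* RIGHT: use the result returned by the atomic call itself; it is
 * the one value guaranteed to belong to this thread's operation. */
static int take_index_safe(void)
{
  return atomic_fetch_add(&next_index, 1);
}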
diff --git a/tests/gtests/blenlib/BLI_task_performance_test.cc b/tests/gtests/blenlib/BLI_task_performance_test.cc
index dc8981f8064..84b7b8b6439 100644
--- a/tests/gtests/blenlib/BLI_task_performance_test.cc
+++ b/tests/gtests/blenlib/BLI_task_performance_test.cc
@@ -19,8 +19,6 @@ extern "C" {
#include "MEM_guardedalloc.h"
}
-/* *** Parallel iterations over double-linked list items. *** */
-
#define NUM_RUN_AVERAGED 100
static uint gen_pseudo_random_number(uint num)
@@ -38,6 +36,94 @@ static uint gen_pseudo_random_number(uint num)
return ((num & 255) << 6) + 1;
}
+/* *** Parallel iterations over a range of indices. *** */
+
+static void task_parallel_range_func(void *UNUSED(userdata),
+ int index,
+ const TaskParallelTLS *__restrict UNUSED(tls))
+{
+ const uint limit = gen_pseudo_random_number((uint)index);
+ for (uint i = (uint)index; i < limit;) {
+ i += gen_pseudo_random_number(i);
+ }
+}
+
+static void task_parallel_range_test_do(const char *id,
+ const int num_items,
+ const bool use_threads)
+{
+ TaskParallelSettings settings;
+ BLI_parallel_range_settings_defaults(&settings);
+ settings.use_threading = use_threads;
+
+ double averaged_timing = 0.0;
+ for (int i = 0; i < NUM_RUN_AVERAGED; i++) {
+ const double init_time = PIL_check_seconds_timer();
+ for (int j = 0; j < 10; j++) {
+ BLI_task_parallel_range(i + j, i + j + num_items, NULL, task_parallel_range_func, &settings);
+ }
+ averaged_timing += PIL_check_seconds_timer() - init_time;
+ }
+
+ printf("\t%s: non-pooled done in %fs on average over %d runs\n",
+ id,
+ averaged_timing / NUM_RUN_AVERAGED,
+ NUM_RUN_AVERAGED);
+
+ averaged_timing = 0.0;
+ for (int i = 0; i < NUM_RUN_AVERAGED; i++) {
+ const double init_time = PIL_check_seconds_timer();
+ TaskParallelRangePool *range_pool = BLI_task_parallel_range_pool_init(&settings);
+ for (int j = 0; j < 10; j++) {
+ BLI_task_parallel_range_pool_push(
+ range_pool, i + j, i + j + num_items, NULL, task_parallel_range_func, &settings);
+ }
+ BLI_task_parallel_range_pool_work_and_wait(range_pool);
+ BLI_task_parallel_range_pool_free(range_pool);
+ averaged_timing += PIL_check_seconds_timer() - init_time;
+ }
+
+ printf("\t%s: pooled done in %fs on average over %d runs\n",
+ id,
+ averaged_timing / NUM_RUN_AVERAGED,
+ NUM_RUN_AVERAGED);
+}
+
+TEST(task, RangeIter10KNoThread)
+{
+ task_parallel_range_test_do(
+ "Range parallel iteration - Single thread - 10K items", 10000, false);
+}
+
+TEST(task, RangeIter10k)
+{
+ task_parallel_range_test_do("Range parallel iteration - Threaded - 10K items", 10000, true);
+}
+
+TEST(task, RangeIter100KNoThread)
+{
+ task_parallel_range_test_do(
+ "Range parallel iteration - Single thread - 100K items", 100000, false);
+}
+
+TEST(task, RangeIter100k)
+{
+ task_parallel_range_test_do("Range parallel iteration - Threaded - 100K items", 100000, true);
+}
+
+TEST(task, RangeIter1000KNoThread)
+{
+ task_parallel_range_test_do(
+ "Range parallel iteration - Single thread - 1000K items", 1000000, false);
+}
+
+TEST(task, RangeIter1000k)
+{
+ task_parallel_range_test_do("Range parallel iteration - Threaded - 1000K items", 1000000, true);
+}
+
+/* *** Parallel iterations over double-linked list items. *** */
+
static void task_listbase_light_iter_func(void *UNUSED(userdata),
void *item,
int index,
diff --git a/tests/gtests/blenlib/BLI_task_test.cc b/tests/gtests/blenlib/BLI_task_test.cc
index 62ae0baaec9..a682e8aedf6 100644
--- a/tests/gtests/blenlib/BLI_task_test.cc
+++ b/tests/gtests/blenlib/BLI_task_test.cc
@@ -17,6 +17,126 @@ extern "C" {
#define NUM_ITEMS 10000
+/* *** Parallel iterations over a range of integer values. *** */
+
+static void task_range_iter_func(void *userdata, int index, const TaskParallelTLS *__restrict tls)
+{
+ int *data = (int *)userdata;
+ data[index] = index;
+ *((int *)tls->userdata_chunk) += index;
+ // printf("%d, %d, %d\n", index, data[index], *((int *)tls->userdata_chunk));
+}
+
+static void task_range_iter_finalize_func(void *__restrict userdata,
+ void *__restrict userdata_chunk)
+{
+ int *data = (int *)userdata;
+ data[NUM_ITEMS] += *(int *)userdata_chunk;
+ // printf("%d, %d\n", data[NUM_ITEMS], *((int *)userdata_chunk));
+}
+
+TEST(task, RangeIter)
+{
+ int data[NUM_ITEMS + 1] = {0};
+ int sum = 0;
+
+ BLI_threadapi_init();
+
+ TaskParallelSettings settings;
+ BLI_parallel_range_settings_defaults(&settings);
+ settings.min_iter_per_thread = 1;
+
+ settings.userdata_chunk = &sum;
+ settings.userdata_chunk_size = sizeof(sum);
+ settings.func_finalize = task_range_iter_finalize_func;
+
+ BLI_task_parallel_range(0, NUM_ITEMS, data, task_range_iter_func, &settings);
+
+ /* These checks ensure that all items of the range were processed once, and only once, as
+ * expected. */
+
+ int expected_sum = 0;
+ for (int i = 0; i < NUM_ITEMS; i++) {
+ EXPECT_EQ(data[i], i);
+ expected_sum += i;
+ }
+ EXPECT_EQ(data[NUM_ITEMS], expected_sum);
+
+ BLI_threadapi_exit();
+}
+
+TEST(task, RangeIterPool)
+{
+ const int num_tasks = 10;
+ int data[num_tasks][NUM_ITEMS + 1] = {{0}};
+ int sum = 0;
+
+ BLI_threadapi_init();
+
+ TaskParallelSettings settings;
+ BLI_parallel_range_settings_defaults(&settings);
+ settings.min_iter_per_thread = 1;
+
+ TaskParallelRangePool *range_pool = BLI_task_parallel_range_pool_init(&settings);
+
+ for (int j = 0; j < num_tasks; j++) {
+ settings.userdata_chunk = &sum;
+ settings.userdata_chunk_size = sizeof(sum);
+ settings.func_finalize = task_range_iter_finalize_func;
+
+ BLI_task_parallel_range_pool_push(
+ range_pool, 0, NUM_ITEMS, data[j], task_range_iter_func, &settings);
+ }
+
+ BLI_task_parallel_range_pool_work_and_wait(range_pool);
+
+ /* These checks ensure that all items of each pushed range were processed once, and only once,
+ * as expected. */
+
+ for (int j = 0; j < num_tasks; j++) {
+ int expected_sum = 0;
+ for (int i = 0; i < NUM_ITEMS; i++) {
+ // EXPECT_EQ(data[j][i], i);
+ expected_sum += i;
+ }
+ EXPECT_EQ(data[j][NUM_ITEMS], expected_sum);
+ }
+
+ /* A pool can be re-used until it is freed. */
+
+ for (int j = 0; j < num_tasks; j++) {
+ memset(data[j], 0, sizeof(data[j]));
+ }
+ sum = 0;
+
+ for (int j = 0; j < num_tasks; j++) {
+ settings.userdata_chunk = &sum;
+ settings.userdata_chunk_size = sizeof(sum);
+ settings.func_finalize = task_range_iter_finalize_func;
+
+ BLI_task_parallel_range_pool_push(
+ range_pool, 0, NUM_ITEMS, data[j], task_range_iter_func, &settings);
+ }
+
+ BLI_task_parallel_range_pool_work_and_wait(range_pool);
+
+ BLI_task_parallel_range_pool_free(range_pool);
+
+ /* These checks ensure that all items of each pushed range were processed once, and only once,
+ * as expected. */
+
+ for (int j = 0; j < num_tasks; j++) {
+ int expected_sum = 0;
+ for (int i = 0; i < NUM_ITEMS; i++) {
+ // EXPECT_EQ(data[j][i], i);
+ expected_sum += i;
+ }
+ EXPECT_EQ(data[j][NUM_ITEMS], expected_sum);
+ }
+
+ BLI_threadapi_exit();
+}
+
/* *** Parallel iterations over mempool items. *** */
static void task_mempool_iter_func(void *userdata, MempoolIterData *item)