From e43b74d87a43ab919b86434db9881608c5b9f762 Mon Sep 17 00:00:00 2001
From: Sergey Sharybin
Date: Mon, 3 Nov 2014 18:24:08 +0100
Subject: Optimization of parallel range

It now supports two scheduling schemes: static and dynamic. Static
scheduling is the default; it splits the work into chunks of an equal
number of range iterations per task. Dynamic scheduling allocates chunks
of 32 iterations which are then dynamically handed to whichever thread
is currently idle. This gives slightly better performance.

Further tricks are still possible. For example, we could use smarter
static scheduling where a thread steals tasks from other threads once
it runs out of work of its own.

Also removed an unneeded spin lock in the mesh deform evaluation: at
first glance it seemed a reduction was involved there, but in fact the
threads only add values to the original vertex coordinates. No write
access to the same element of vertexCos ever happens from separate
threads.
---
 source/blender/blenlib/BLI_task.h    |  3 +-
 source/blender/blenlib/intern/task.c | 55 +++++++++++++++++++++++-------------
 2 files changed, 38 insertions(+), 20 deletions(-)

(limited to 'source/blender/blenlib')

diff --git a/source/blender/blenlib/BLI_task.h b/source/blender/blenlib/BLI_task.h
index 8c22a25fe14..28da673ea97 100644
--- a/source/blender/blenlib/BLI_task.h
+++ b/source/blender/blenlib/BLI_task.h
@@ -106,7 +106,8 @@ void BLI_task_parallel_range_ex(
         int start, int stop,
         void *userdata,
         TaskParallelRangeFunc func,
-        const int range_threshold);
+        const int range_threshold,
+        const bool use_dynamic_scheduling);
 void BLI_task_parallel_range(
         int start, int stop,
         void *userdata,
diff --git a/source/blender/blenlib/intern/task.c b/source/blender/blenlib/intern/task.c
index 07c67f001f9..219ccb18d98 100644
--- a/source/blender/blenlib/intern/task.c
+++ b/source/blender/blenlib/intern/task.c
@@ -29,6 +29,7 @@
 #include "MEM_guardedalloc.h"
 
 #include "BLI_listbase.h"
+#include "BLI_math.h"
 #include "BLI_task.h"
 #include "BLI_threads.h"
 
@@ -452,18 +453,21 @@ typedef struct ParallelRangeState {
 	TaskParallelRangeFunc func;
 
 	int iter;
+	int chunk_size;
 	SpinLock lock;
 } ParallelRangeState;
 
 BLI_INLINE bool parallel_range_next_iter_get(
-        ParallelRangeState *state,
-        int *iter)
+        ParallelRangeState * __restrict state,
+        int * __restrict iter, int * __restrict count)
 {
 	bool result = false;
 	if (state->iter < state->stop) {
 		BLI_spin_lock(&state->lock);
 		if (state->iter < state->stop) {
-			*iter = state->iter++;
+			*count = min_ii(state->chunk_size, state->stop - state->iter);
+			*iter = state->iter;
+			state->iter += *count;
 			result = true;
 		}
 		BLI_spin_unlock(&state->lock);
@@ -472,14 +476,17 @@ BLI_INLINE bool parallel_range_next_iter_get(
 }
 
 static void parallel_range_func(
-        TaskPool *pool,
+        TaskPool * __restrict pool,
         void *UNUSED(taskdata),
         int UNUSED(threadid))
 {
-	ParallelRangeState *state = BLI_task_pool_userdata(pool);
-	int iter;
-	while (parallel_range_next_iter_get(state, &iter)) {
-		state->func(state->userdata, iter);
+	ParallelRangeState * __restrict state = BLI_task_pool_userdata(pool);
+	int iter, count;
+	while (parallel_range_next_iter_get(state, &iter, &count)) {
+		int i;
+		for (i = 0; i < count; ++i) {
+			state->func(state->userdata, iter + i);
+		}
 	}
 }
 
@@ -487,12 +494,13 @@ void BLI_task_parallel_range_ex(
         int start, int stop,
         void *userdata,
         TaskParallelRangeFunc func,
-        const int range_threshold)
+        const int range_threshold,
+        const bool use_dynamic_scheduling)
 {
 	TaskScheduler *task_scheduler;
 	TaskPool *task_pool;
 	ParallelRangeState state;
-	int i;
+	int i, num_threads, num_tasks;
 
 	BLI_assert(start < stop);
 
@@ -506,21 +514,30 @@ void BLI_task_parallel_range_ex(
 		return;
 	}
 
-	BLI_spin_init(&state.lock);
-	state.start = start;
-	state.stop = stop;
-	state.userdata = userdata;
-	state.func = func;
-	state.iter = start;
-
 	task_scheduler = BLI_task_scheduler_get();
 	task_pool = BLI_task_pool_create(task_scheduler, &state);
+	num_threads = BLI_task_scheduler_num_threads(task_scheduler);
 
 	/* The idea here is to prevent creating task for each of the loop iterations
 	 * and instead have tasks which are evenly distributed across CPU cores and
 	 * pull next iter to be crunched using the queue.
 	 */
-	for (i = 0; i < 2 * BLI_task_scheduler_num_threads(task_scheduler); i++) {
+	num_tasks = num_threads * 2;
+
+	BLI_spin_init(&state.lock);
+	state.start = start;
+	state.stop = stop;
+	state.userdata = userdata;
+	state.func = func;
+	state.iter = start;
+	if (use_dynamic_scheduling) {
+		state.chunk_size = 32;
+	}
+	else {
+		state.chunk_size = (stop - start) / (num_tasks);
+	}
+
+	for (i = 0; i < num_tasks; i++) {
 		BLI_task_pool_push(task_pool, parallel_range_func,
 		                   NULL, false,
@@ -538,5 +555,5 @@ void BLI_task_parallel_range(
         void *userdata,
         TaskParallelRangeFunc func)
 {
-	BLI_task_parallel_range_ex(start, stop, userdata, func, 64);
+	BLI_task_parallel_range_ex(start, stop, userdata, func, 64, false);
 }
--
cgit v1.2.3
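
Editor's note: for reference, a minimal sketch of how a caller might opt into
the new dynamic scheduling. The callback and data struct here (accum_vert_cb,
AccumData, accumulate_offsets) are hypothetical illustrations, not part of the
patch; only BLI_task_parallel_range_ex and its parameters come from the code
above.

#include <stdbool.h>

#include "BLI_task.h"

typedef struct AccumData {
	float (*vertexCos)[3];  /* hypothetical per-vertex output array */
	float (*offsets)[3];    /* hypothetical per-vertex input array */
} AccumData;

/* Matches TaskParallelRangeFunc: invoked once per iteration of the range.
 * Each call writes only vertexCos[iter], never a shared element, so no lock
 * or reduction is needed -- the same reasoning the commit message gives for
 * dropping the spin lock in the mesh deform evaluation. */
static void accum_vert_cb(void *userdata, int iter)
{
	AccumData *data = userdata;
	data->vertexCos[iter][0] += data->offsets[iter][0];
	data->vertexCos[iter][1] += data->offsets[iter][1];
	data->vertexCos[iter][2] += data->offsets[iter][2];
}

static void accumulate_offsets(AccumData *data, int totvert)
{
	/* 64 is the serial/parallel threshold (the same default that
	 * BLI_task_parallel_range uses); 'true' requests dynamic scheduling,
	 * so idle threads pull 32-iteration chunks from the shared counter
	 * instead of getting a fixed pre-split share of the range. */
	BLI_task_parallel_range_ex(0, totvert, data, accum_vert_cb, 64, true);
}

With use_dynamic_scheduling = false the same call would instead pre-split the
range into chunks of (stop - start) / (num_threads * 2) iterations, which is
cheaper when every iteration costs roughly the same.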