
git.blender.org/blender.git
author     Sergey Sharybin <sergey.vfx@gmail.com>    2014-11-03 20:24:08 +0300
committer  Sergey Sharybin <sergey.vfx@gmail.com>    2014-11-03 20:44:29 +0300
commit     e43b74d87a43ab919b86434db9881608c5b9f762 (patch)
tree       3e885e7530db301a26d68a632147b36925855bc4 /source/blender/blenlib
parent     4b3f1b7540c43999b94c5147eabd6b0b7a6693f8 (diff)
Optimization of parallel range
It now supports two scheduling schemas: dynamic and static. Static is the default; it splits the work into equal ranges of iterations, one per task. Dynamic hands out chunks of 32 iterations, which are dynamically sent to whichever thread is currently idle. This gives slightly better performance.

Further tricks are still possible: for example, smarter static scheduling in which a thread that runs out of work steals tasks from other threads.

Also removed an unneeded spin lock in the mesh deform evaluation. At first glance it looked like a reduction was involved there, but in fact the threads are just adding values to the original vertex coordinates: no two threads ever write to the same element of vertexCos.
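A minimal caller sketch of the extended API, based only on the signature added in this patch; the userdata struct, callback, and wrapper function below are hypothetical, invented for illustration. It also shows why the mesh-deform spin lock could go: each invocation writes a distinct element of vertexCos.

    #include "BLI_math.h"
    #include "BLI_task.h"

    typedef struct DeformUserdata {
        float (*vertexCos)[3];     /* accumulated vertex coordinates */
        const float (*deltas)[3];  /* per-vertex offsets to add */
    } DeformUserdata;

    static void accum_vertex_cos(void *userdata, int iter)
    {
        DeformUserdata *data = userdata;
        /* Only vertexCos[iter] is written here, so concurrent
         * invocations never touch the same element and no lock
         * is required. */
        add_v3_v3(data->vertexCos[iter], data->deltas[iter]);
    }

    void deform_verts_threaded(DeformUserdata *data, int totvert)
    {
        /* use_dynamic_scheduling = false keeps the default static
         * schema (equal split across tasks); pass true when the
         * per-iteration cost varies and idle threads should pull
         * 32-iteration chunks instead. */
        BLI_task_parallel_range_ex(0, totvert, data, accum_vertex_cos,
                                   64, false);
    }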
Diffstat (limited to 'source/blender/blenlib')
-rw-r--r--  source/blender/blenlib/BLI_task.h     |  3
-rw-r--r--  source/blender/blenlib/intern/task.c  | 55
2 files changed, 38 insertions, 20 deletions
diff --git a/source/blender/blenlib/BLI_task.h b/source/blender/blenlib/BLI_task.h
index 8c22a25fe14..28da673ea97 100644
--- a/source/blender/blenlib/BLI_task.h
+++ b/source/blender/blenlib/BLI_task.h
@@ -106,7 +106,8 @@ void BLI_task_parallel_range_ex(
int start, int stop,
void *userdata,
TaskParallelRangeFunc func,
- const int range_threshold);
+ const int range_threshold,
+ const bool use_dynamic_scheduling);
void BLI_task_parallel_range(
int start, int stop,
void *userdata,
diff --git a/source/blender/blenlib/intern/task.c b/source/blender/blenlib/intern/task.c
index 07c67f001f9..219ccb18d98 100644
--- a/source/blender/blenlib/intern/task.c
+++ b/source/blender/blenlib/intern/task.c
@@ -29,6 +29,7 @@
#include "MEM_guardedalloc.h"
#include "BLI_listbase.h"
+#include "BLI_math.h"
#include "BLI_task.h"
#include "BLI_threads.h"
@@ -452,18 +453,21 @@ typedef struct ParallelRangeState {
TaskParallelRangeFunc func;
int iter;
+ int chunk_size;
SpinLock lock;
} ParallelRangeState;
BLI_INLINE bool parallel_range_next_iter_get(
- ParallelRangeState *state,
- int *iter)
+ ParallelRangeState * __restrict state,
+ int * __restrict iter, int * __restrict count)
{
bool result = false;
if (state->iter < state->stop) {
BLI_spin_lock(&state->lock);
if (state->iter < state->stop) {
- *iter = state->iter++;
+ *count = min_ii(state->chunk_size, state->stop - state->iter);
+ *iter = state->iter;
+ state->iter += *count;
result = true;
}
BLI_spin_unlock(&state->lock);
@@ -472,14 +476,17 @@ BLI_INLINE bool parallel_range_next_iter_get(
}
static void parallel_range_func(
- TaskPool *pool,
+ TaskPool * __restrict pool,
void *UNUSED(taskdata),
int UNUSED(threadid))
{
- ParallelRangeState *state = BLI_task_pool_userdata(pool);
- int iter;
- while (parallel_range_next_iter_get(state, &iter)) {
- state->func(state->userdata, iter);
+ ParallelRangeState * __restrict state = BLI_task_pool_userdata(pool);
+ int iter, count;
+ while (parallel_range_next_iter_get(state, &iter, &count)) {
+ int i;
+ for (i = 0; i < count; ++i) {
+ state->func(state->userdata, iter + i);
+ }
}
}
@@ -487,12 +494,13 @@ void BLI_task_parallel_range_ex(
int start, int stop,
void *userdata,
TaskParallelRangeFunc func,
- const int range_threshold)
+ const int range_threshold,
+ const bool use_dynamic_scheduling)
{
TaskScheduler *task_scheduler;
TaskPool *task_pool;
ParallelRangeState state;
- int i;
+ int i, num_threads, num_tasks;
BLI_assert(start < stop);
@@ -506,21 +514,30 @@ void BLI_task_parallel_range_ex(
return;
}
- BLI_spin_init(&state.lock);
- state.start = start;
- state.stop = stop;
- state.userdata = userdata;
- state.func = func;
- state.iter = start;
-
task_scheduler = BLI_task_scheduler_get();
task_pool = BLI_task_pool_create(task_scheduler, &state);
+ num_threads = BLI_task_scheduler_num_threads(task_scheduler);
/* The idea here is to prevent creating task for each of the loop iterations
* and instead have tasks which are evenly distributed across CPU cores and
* pull next iter to be crunched using the queue.
*/
- for (i = 0; i < 2 * BLI_task_scheduler_num_threads(task_scheduler); i++) {
+ num_tasks = num_threads * 2;
+
+ BLI_spin_init(&state.lock);
+ state.start = start;
+ state.stop = stop;
+ state.userdata = userdata;
+ state.func = func;
+ state.iter = start;
+ if (use_dynamic_scheduling) {
+ state.chunk_size = 32;
+ }
+ else {
+ state.chunk_size = (stop - start) / (num_tasks);
+ }
+
+ for (i = 0; i < num_tasks; i++) {
BLI_task_pool_push(task_pool,
parallel_range_func,
NULL, false,
@@ -538,5 +555,5 @@ void BLI_task_parallel_range(
void *userdata,
TaskParallelRangeFunc func)
{
- BLI_task_parallel_range_ex(start, stop, userdata, func, 64);
+ BLI_task_parallel_range_ex(start, stop, userdata, func, 64, false);
}
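For a concrete feel of the chunking math in the hunk above, here is a small standalone check; the range and thread counts are made-up example numbers, not values from the patch.

    #include <stdio.h>

    int main(void)
    {
        const int start = 0, stop = 1000;  /* example iteration range */
        const int num_threads = 8;         /* example scheduler width */
        const int num_tasks = num_threads * 2;

        /* Static schema: equal split, so each of the 16 tasks grabs
         * (1000 - 0) / 16 == 62 iterations per lock acquisition; the
         * remainder is covered because every grab takes
         * min(chunk_size, stop - iter). */
        printf("static chunk_size:  %d\n", (stop - start) / num_tasks);

        /* Dynamic schema: fixed 32-iteration chunks, which idle
         * threads keep pulling until the shared counter hits stop. */
        printf("dynamic chunk_size: %d\n", 32);
        return 0;
    }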