From 01e8e7dc6d9dd3710e0e3872b2d70196ac654d14 Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Tue, 20 Nov 2018 12:17:03 +0100 Subject: Task scheduler: Optimize parallel loop over lists The goal is to address performance regression when going from few threads to 10s of threads. On a systems with more than 32 CPU threads the benefit of threaded loop was actually harmful. There are following tweaks now: - The chunk size is adaptive for the number of threads, which minimizes scheduling overhead. - The number of tasks is adaptive to the list size and chunk size. Here comes performance comparison on the production shot: Number of threads DEG time before DEG time after 44 0.09 0.02 32 0.055 0.025 16 0.025 0.025 8 0.035 0.033 --- source/blender/blenlib/intern/task.c | 63 ++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 21 deletions(-) (limited to 'source/blender/blenlib/intern/task.c') diff --git a/source/blender/blenlib/intern/task.c b/source/blender/blenlib/intern/task.c index 2bb5d5397a9..b6d704d8d82 100644 --- a/source/blender/blenlib/intern/task.c +++ b/source/blender/blenlib/intern/task.c @@ -1221,6 +1221,31 @@ static void parallel_listbase_func( } } +static void task_parallel_listbase_no_threads( + struct ListBase *listbase, + void *userdata, + TaskParallelListbaseFunc func) +{ + int i = 0; + for (Link *link = listbase->first; link != NULL; link = link->next, ++i) { + func(userdata, link, i); + } +} + +/* NOTE: The idea here is to compensate for rather measurable threading + * overhead caused by fetching tasks. With too many CPU threads we are starting + * to spend too much time in those overheads. */ +BLI_INLINE int task_parallel_listbasecalc_chunk_size(const int num_threads) +{ + if (num_threads > 32) { + return 128; + } + else if (num_threads > 16) { + return 64; + } + return 32; +} + /** * This function allows to parallelize for loops over ListBase items. * @@ -1238,41 +1263,37 @@ void BLI_task_parallel_listbase( TaskParallelListbaseFunc func, const bool use_threading) { - TaskScheduler *task_scheduler; - TaskPool *task_pool; - ParallelListState state; - int i, num_threads, num_tasks; - if (BLI_listbase_is_empty(listbase)) { return; } - if (!use_threading) { - i = 0; - for (Link *link = listbase->first; link != NULL; link = link->next, ++i) { - func(userdata, link, i); - } + task_parallel_listbase_no_threads(listbase, userdata, func); + return; + } + TaskScheduler *task_scheduler = BLI_task_scheduler_get(); + const int num_threads = BLI_task_scheduler_num_threads(task_scheduler); + /* TODO(sergey): Consider making chunk size configurable. */ + const int chunk_size = task_parallel_listbasecalc_chunk_size(num_threads); + const int num_tasks = min_ii( + num_threads, + BLI_listbase_count(listbase) / chunk_size); + if (num_tasks <= 1) { + task_parallel_listbase_no_threads(listbase, userdata, func); return; } - task_scheduler = BLI_task_scheduler_get(); - task_pool = BLI_task_pool_create_suspended(task_scheduler, &state); - num_threads = BLI_task_scheduler_num_threads(task_scheduler); - - /* The idea here is to prevent creating task for each of the loop iterations - * and instead have tasks which are evenly distributed across CPU cores and - * pull next iter to be crunched using the queue. - */ - num_tasks = num_threads + 2; + ParallelListState state; + TaskPool *task_pool = BLI_task_pool_create_suspended(task_scheduler, &state); state.index = 0; state.link = listbase->first; state.userdata = userdata; state.func = func; - state.chunk_size = 32; + state.chunk_size = chunk_size; BLI_spin_init(&state.lock); - for (i = 0; i < num_tasks; i++) { + BLI_assert(num_tasks > 0); + for (int i = 0; i < num_tasks; i++) { /* Use this pool's pre-allocated tasks. */ BLI_task_pool_push_from_thread(task_pool, parallel_listbase_func, -- cgit v1.2.3