diff options
author | Hristo Gueorguiev <prem.nirved@gmail.com> | 2017-05-03 16:30:45 +0300 |
---|---|---|
committer | Hristo Gueorguiev <prem.nirved@gmail.com> | 2017-05-03 16:30:45 +0300 |
commit | 6bf4115c13962c99d1cdc97f2be92c4922f3fd33 (patch) | |
tree | 569c512a242caf2ea4465f2eef561933ed937a2f /intern/cycles/kernel | |
parent | 6f9c839f444f92c4b0c336a6f5e31cb9660d7dbc (diff) |
Cycles: Split kernel - sort shaders
Reduce thread divergence in kernel_shader_eval.
Rays are sorted in blocks of 2048 according to shader->id.
On R9 290 Classroom is ~30% faster, and Pabellon Barcelone is ~8% faster.
No sorting for CUDA split kernel.
Reviewers: sergey, maiself
Reviewed By: maiself
Differential Revision: https://developer.blender.org/D2598
Diffstat (limited to 'intern/cycles/kernel')
-rw-r--r-- | intern/cycles/kernel/CMakeLists.txt | 6 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_types.h | 15 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cpu/kernel_cpu.h | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h | 8 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cuda/kernel_split.cu | 6 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl | 3 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl | 27 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl | 28 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/opencl/kernel_split.cl | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/split/kernel_shader_eval.h | 52 | ||||
-rw-r--r-- | intern/cycles/kernel/split/kernel_shader_setup.h | 70 | ||||
-rw-r--r-- | intern/cycles/kernel/split/kernel_shader_sort.h | 97 | ||||
-rw-r--r-- | intern/cycles/kernel/split/kernel_split_data_types.h | 5 |
13 files changed, 284 insertions, 37 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index a92e8bc4aee..9bb0455b9d5 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -21,6 +21,8 @@ set(SRC kernels/opencl/kernel_lamp_emission.cl kernels/opencl/kernel_do_volume.cl kernels/opencl/kernel_indirect_background.cl + kernels/opencl/kernel_shader_setup.cl + kernels/opencl/kernel_shader_sort.cl kernels/opencl/kernel_shader_eval.cl kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl kernels/opencl/kernel_subsurface_scatter.cl @@ -248,6 +250,8 @@ set(SRC_SPLIT_HEADERS split/kernel_path_init.h split/kernel_queue_enqueue.h split/kernel_scene_intersect.h + split/kernel_shader_setup.h + split/kernel_shader_sort.h split/kernel_shader_eval.h split/kernel_shadow_blocked_ao.h split/kernel_shadow_blocked_dl.h @@ -457,6 +461,8 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_interse delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_do_volume.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_background.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_sort.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_subsurface_scatter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 6417f621c8f..9b354457b91 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -64,6 +64,18 @@ CCL_NAMESPACE_BEGIN # define WORK_POOL_SIZE WORK_POOL_SIZE_CPU #endif + +#define SHADER_SORT_BLOCK_SIZE 2048 + +#ifdef __KERNEL_OPENCL__ +# define SHADER_SORT_LOCAL_SIZE 64 +#elif defined(__KERNEL_CUDA__) +# define SHADER_SORT_LOCAL_SIZE 32 +#else +# define SHADER_SORT_LOCAL_SIZE 1 +#endif + + /* device capabilities */ #ifdef __KERNEL_CPU__ # ifdef __KERNEL_SSE2__ @@ -1321,6 +1333,9 @@ enum QueueNumber { */ QUEUE_SHADOW_RAY_CAST_DL_RAYS, + /* Rays sorted according to shader->id */ + QUEUE_SHADER_SORTED_RAYS, + #ifdef __BRANCHED_PATH__ /* All rays moving to next iteration of the indirect loop for light */ QUEUE_LIGHT_INDIRECT_ITER, diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h index 896b80d783e..39c9a9cf33c 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h @@ -77,6 +77,8 @@ DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission) DECLARE_SPLIT_KERNEL_FUNCTION(do_volume) DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue) DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background) +DECLARE_SPLIT_KERNEL_FUNCTION(shader_setup) +DECLARE_SPLIT_KERNEL_FUNCTION(shader_sort) DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval) DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index 96f54bb427e..8c05dd1d9ef 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -44,6 +44,8 @@ # include "kernel/split/kernel_do_volume.h" # include "kernel/split/kernel_queue_enqueue.h" # include "kernel/split/kernel_indirect_background.h" +# include "kernel/split/kernel_shader_setup.h" +# include "kernel/split/kernel_shader_sort.h" # include "kernel/split/kernel_shader_eval.h" # include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" # include "kernel/split/kernel_subsurface_scatter.h" @@ -181,7 +183,9 @@ DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) @@ -209,6 +213,8 @@ void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, REGISTER(do_volume); REGISTER(queue_enqueue); REGISTER(indirect_background); + REGISTER(shader_setup); + REGISTER(shader_sort); REGISTER(shader_eval); REGISTER(holdout_emission_blurring_pathtermination_ao); REGISTER(subsurface_scatter); diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu index 585b91876a9..8b7f1a8d405 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -31,6 +31,8 @@ #include "kernel/split/kernel_do_volume.h" #include "kernel/split/kernel_queue_enqueue.h" #include "kernel/split/kernel_indirect_background.h" +#include "kernel/split/kernel_shader_setup.h" +#include "kernel/split/kernel_shader_sort.h" #include "kernel/split/kernel_shader_eval.h" #include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" #include "kernel/split/kernel_subsurface_scatter.h" @@ -108,7 +110,9 @@ DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl index 6baee460986..5bfb31b193a 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl @@ -22,6 +22,5 @@ __kernel void kernel_ocl_path_trace_shader_eval( ccl_global char *kg, ccl_constant KernelData *data) { - ccl_local unsigned int local_queue_atomics; - kernel_shader_eval((KernelGlobals*)kg, &local_queue_atomics); + kernel_shader_eval((KernelGlobals*)kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl new file mode 100644 index 00000000000..38bfd04ad4c --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl @@ -0,0 +1,27 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shader_setup.h" + +__kernel void kernel_ocl_path_trace_shader_setup( + ccl_global char *kg, + ccl_constant KernelData *data) +{ + ccl_local unsigned int local_queue_atomics; + kernel_shader_setup((KernelGlobals*)kg, &local_queue_atomics); +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl new file mode 100644 index 00000000000..6f722915d45 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl @@ -0,0 +1,28 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shader_sort.h" + +__attribute__((reqd_work_group_size(64, 1, 1))) +__kernel void kernel_ocl_path_trace_shader_sort( + ccl_global char *kg, + ccl_constant KernelData *data) +{ + ccl_local ShaderSortLocals locals; + kernel_shader_sort((KernelGlobals*)kg, &locals); +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl index 732cda30115..8de82db7afe 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_split.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl @@ -23,6 +23,8 @@ #include "kernel/kernels/opencl/kernel_do_volume.cl" #include "kernel/kernels/opencl/kernel_indirect_background.cl" #include "kernel/kernels/opencl/kernel_queue_enqueue.cl" +#include "kernel/kernels/opencl/kernel_shader_setup.cl" +#include "kernel/kernels/opencl/kernel_shader_sort.cl" #include "kernel/kernels/opencl/kernel_shader_eval.cl" #include "kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" #include "kernel/kernels/opencl/kernel_subsurface_scatter.cl" diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h index 957d70958d9..2801b32f285 100644 --- a/intern/cycles/kernel/split/kernel_shader_eval.h +++ b/intern/cycles/kernel/split/kernel_shader_eval.h @@ -1,5 +1,5 @@ /* - * Copyright 2011-2015 Blender Foundation + * Copyright 2011-2017 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,54 +16,40 @@ CCL_NAMESPACE_BEGIN -/* This kernel sets up the ShaderData structure from the values computed +/* This kernel evaluates ShaderData structure from the values computed * by the previous kernels. - * - * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them - * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */ -ccl_device void kernel_shader_eval(KernelGlobals *kg, - ccl_local_param unsigned int *local_queue_atomics) +ccl_device void kernel_shader_eval(KernelGlobals *kg) { - /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */ - if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + /* Sorting on cuda split is not implemented */ +#ifdef __KERNEL_CUDA__ + int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; +#else + int queue_index = kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS]; +#endif + if(ray_index >= queue_index) { + return; + } ray_index = get_ray_index(kg, ray_index, +#ifdef __KERNEL_CUDA__ QUEUE_ACTIVE_AND_REGENERATED_RAYS, +#else + QUEUE_SHADER_SORTED_RAYS, +#endif kernel_split_state.queue_data, kernel_split_params.queue_size, 0); - ccl_global char *ray_state = kernel_split_state.ray_state; - - char enqueue_flag = 0; - if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { - enqueue_flag = 1; + if(ray_index == QUEUE_EMPTY_SLOT) { + return; } - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); - - /* Continue on with shader evaluation. */ + ccl_global char *ray_state = kernel_split_state.ray_state; if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - Intersection isect = kernel_split_state.isect[ray_index]; RNG rng = kernel_split_state.rng[ray_index]; ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - Ray ray = kernel_split_state.ray[ray_index]; - - shader_setup_from_ray(kg, - &kernel_split_state.sd[ray_index], - &isect, - &ray); #ifndef __BRANCHED_PATH__ float rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF); diff --git a/intern/cycles/kernel/split/kernel_shader_setup.h b/intern/cycles/kernel/split/kernel_shader_setup.h new file mode 100644 index 00000000000..0432689d9fa --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shader_setup.h @@ -0,0 +1,70 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* This kernel sets up the ShaderData structure from the values computed + * by the previous kernels. + * + * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them + * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. + */ +ccl_device void kernel_shader_setup(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) +{ + /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */ + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; + if(ray_index >= queue_index) { + return; + } + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + + char enqueue_flag = (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0; + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + + /* Continue on with shader evaluation. */ + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { + Intersection isect = kernel_split_state.isect[ray_index]; + Ray ray = kernel_split_state.ray[ray_index]; + + shader_setup_from_ray(kg, + &kernel_split_state.sd[ray_index], + &isect, + &ray); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h new file mode 100644 index 00000000000..297decb0bc2 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shader_sort.h @@ -0,0 +1,97 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + + +ccl_device void kernel_shader_sort(KernelGlobals *kg, + ccl_local_param ShaderSortLocals *locals) +{ +#ifndef __KERNEL_CUDA__ + int tid = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + uint qsize = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; + if(tid == 0) { + kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS] = qsize; + } + + uint offset = (tid/SHADER_SORT_LOCAL_SIZE)*SHADER_SORT_BLOCK_SIZE; + if(offset >= qsize) { + return; + } + + int lid = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); + uint input = QUEUE_ACTIVE_AND_REGENERATED_RAYS * (kernel_split_params.queue_size); + uint output = QUEUE_SHADER_SORTED_RAYS * (kernel_split_params.queue_size); + ccl_local uint *local_value = &locals->local_value[0]; + ccl_local ushort *local_index = &locals->local_index[0]; + + /* copy to local memory */ + for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { + uint idx = offset + i + lid; + uint add = input + idx; + uint value = (~0); + if(idx < qsize) { + int ray_index = kernel_split_state.queue_data[add]; + bool valid = (ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); + if(valid) { + value = kernel_split_state.sd[ray_index].shader & SHADER_MASK; + } + } + local_value[i + lid] = value; + local_index[i + lid] = i + lid; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + /* skip sorting for cpu split kernel */ +# ifdef __KERNEL_OPENCL__ + + /* bitonic sort */ + for (uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) { + for (uint inc = length; inc > 0; inc >>= 1) { + for (uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) { + uint i = lid + ii; + bool direction = ((i & (length << 1)) != 0); + uint j = i ^ inc; + ushort ioff = local_index[i]; + ushort joff = local_index[j]; + uint iKey = local_value[ioff]; + uint jKey = local_value[joff]; + bool smaller = (jKey < iKey) || (jKey == iKey && j < i); + bool swap = smaller ^ (j < i) ^ direction; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + local_index[i] = (swap) ? joff : ioff; + local_index[j] = (swap) ? ioff : joff; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + } + } + } +# endif /* __KERNEL_OPENCL__ */ + + /* copy to destination */ + for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { + uint idx = offset + i + lid; + uint lidx = local_index[i + lid]; + uint outi = output + idx; + uint ini = input + offset + lidx; + uint value = local_value[lidx]; + if(idx < qsize) { + kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT : kernel_split_state.queue_data[ini]; + } + } +#endif /* __KERNEL_CUDA__ */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h index a2cb4f6ae98..913e0dfd08d 100644 --- a/intern/cycles/kernel/split/kernel_split_data_types.h +++ b/intern/cycles/kernel/split/kernel_split_data_types.h @@ -162,6 +162,11 @@ typedef struct BackgroundAOLocals { uint queue_atomics_ao; } BackgroundAOLocals; +typedef struct ShaderSortLocals { + uint local_value[SHADER_SORT_BLOCK_SIZE]; + ushort local_index[SHADER_SORT_BLOCK_SIZE]; +} ShaderSortLocals; + CCL_NAMESPACE_END #endif /* __KERNEL_SPLIT_DATA_TYPES_H__ */ |