diff options
author | Mai Lavelle <mai.lavelle@gmail.com> | 2017-02-14 13:50:29 +0300 |
---|---|---|
committer | Mai Lavelle <mai.lavelle@gmail.com> | 2017-03-08 09:24:53 +0300 |
commit | 817873cc83034c460f1be6bf410c95ff009f3ae2 (patch) | |
tree | d50373c256ff02d5f12b067be50c7401c326332b /intern/cycles/kernel/kernels | |
parent | 0892352bfe6d5a9aa6ec4c088e67f8bbbbfae610 (diff) |
Cycles: CUDA implementation of split kernel
Diffstat (limited to 'intern/cycles/kernel/kernels')
-rw-r--r-- | intern/cycles/kernel/kernels/cuda/kernel.cu | 100 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cuda/kernel_config.h | 110 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cuda/kernel_split.cu | 118 |
3 files changed, 231 insertions, 97 deletions
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu index 090ab2c50c2..52e541321e3 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel.cu @@ -16,7 +16,10 @@ /* CUDA kernel entry points */ +#ifdef __CUDA_ARCH__ + #include "../../kernel_compat_cuda.h" +#include "kernel_config.h" #include "../../kernel_math.h" #include "../../kernel_types.h" #include "../../kernel_globals.h" @@ -25,104 +28,7 @@ #include "../../kernel_path_branched.h" #include "../../kernel_bake.h" -/* device data taken from CUDA occupancy calculator */ - -#ifdef __CUDA_ARCH__ - -/* 2.0 and 2.1 */ -#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 32 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40 - -/* 3.0 and 3.5 */ -#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 3.2 */ -#elif __CUDA_ARCH__ == 320 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 3.7 */ -#elif __CUDA_ARCH__ == 370 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 5.0, 5.2, 5.3, 6.0, 6.1 */ -#elif __CUDA_ARCH__ >= 500 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 48 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* unknown architecture */ -#else -# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" -#endif - -/* compute number of threads per block and minimum blocks per multiprocessor - * given the maximum number of registers per thread */ - -#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ - __launch_bounds__( \ - threads_block_width*threads_block_width, \ - CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \ - ) - -/* sanity checks */ - -#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS -# error "Maximum number of threads per block exceeded" -#endif - -#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS -# error "Maximum number of blocks per multiprocessor exceeded" -#endif - -#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS -# error "Maximum number of registers per thread exceeded" -#endif - -#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS -# error "Maximum number of registers per thread exceeded" -#endif - /* kernels */ - extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride) diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h new file mode 100644 index 00000000000..9fa39dc9ebb --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h @@ -0,0 +1,110 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* device data taken from CUDA occupancy calculator */ + +/* 2.0 and 2.1 */ +#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 32 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40 + +/* 3.0 and 3.5 */ +#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 3.2 */ +#elif __CUDA_ARCH__ == 320 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 3.7 */ +#elif __CUDA_ARCH__ == 370 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 5.0, 5.2, 5.3, 6.0, 6.1 */ +#elif __CUDA_ARCH__ >= 500 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 48 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* unknown architecture */ +#else +# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" +#endif + +/* compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread */ + +#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ + __launch_bounds__( \ + threads_block_width*threads_block_width, \ + CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \ + ) + +/* sanity checks */ + +#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS +# error "Maximum number of threads per block exceeded" +#endif + +#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS +# error "Maximum number of blocks per multiprocessor exceeded" +#endif + +#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif + +#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif + diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu new file mode 100644 index 00000000000..441cd96fafa --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -0,0 +1,118 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CUDA split kernel entry points */ + +#ifdef __CUDA_ARCH__ + +#define __SPLIT_KERNEL__ + +#include "../../kernel_compat_cuda.h" +#include "kernel_config.h" + +#include "../../split/kernel_split_common.h" +#include "../../split/kernel_data_init.h" +#include "../../split/kernel_scene_intersect.h" +#include "../../split/kernel_lamp_emission.h" +#include "../../split/kernel_queue_enqueue.h" +#include "../../split/kernel_background_buffer_update.h" +#include "../../split/kernel_shader_eval.h" +#include "../../split/kernel_holdout_emission_blurring_pathtermination_ao.h" +#include "../../split/kernel_direct_lighting.h" +#include "../../split/kernel_shadow_blocked.h" +#include "../../split/kernel_next_iteration_setup.h" +#include "../../split/kernel_sum_all_radiance.h" + +#include "../../kernel_film.h" + +/* kernels */ +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_path_trace_data_init( + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + ccl_global uint *rng_state, + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, + ccl_global int *Queue_index, + int queuesize, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, + ccl_global float *buffer) +{ + kernel_data_init(NULL, + NULL, + split_data_buffer, + num_elements, + ray_state, + rng_state, + start_sample, + end_sample, + sx, sy, sw, sh, offset, stride, + Queue_index, + queuesize, + use_queues_flag, + work_pool_wgs, + num_samples, + buffer); +} + +#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + extern "C" __global__ void \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \ + kernel_cuda_##name() \ + { \ + kernel_##name(NULL); \ + } + +DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) +DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) +DEFINE_SPLIT_KERNEL_FUNCTION(queue_enqueue) +DEFINE_SPLIT_KERNEL_FUNCTION(background_buffer_update) +DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) +DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) +DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting) +DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked) +DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) +DEFINE_SPLIT_KERNEL_FUNCTION(sum_all_radiance) + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) +{ + int x = sx + blockDim.x*blockIdx.x + threadIdx.x; + int y = sy + blockDim.y*blockIdx.y + threadIdx.y; + + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride); +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) +{ + int x = sx + blockDim.x*blockIdx.x + threadIdx.x; + int y = sy + blockDim.y*blockIdx.y + threadIdx.y; + + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride); +} + +#endif + |