diff options
Diffstat (limited to 'intern/cycles/kernel/kernels')
8 files changed, 34 insertions, 52 deletions
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp index 1a7b2040da1..254025be4e2 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp @@ -25,6 +25,7 @@ #else /* SSE optimization disabled for now on 32 bit, see bug #36316 */ # if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ # define __KERNEL_SSE2__ # define __KERNEL_SSE3__ # define __KERNEL_SSSE3__ diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h index 9fa39dc9ebb..7ae205b7e14 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_config.h +++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h @@ -81,8 +81,13 @@ # error "Unknown or unsupported CUDA architecture, can't determine launch bounds" #endif -/* compute number of threads per block and minimum blocks per multiprocessor - * given the maximum number of registers per thread */ +/* For split kernel using all registers seems fastest for now, but this + * is unlikely to be optimal once we resolve other bottlenecks. */ + +#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS + +/* Compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread. */ #define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ __launch_bounds__( \ diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu index 628891b1458..e97e87285a5 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -90,7 +90,7 @@ kernel_cuda_path_trace_data_init( #define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ extern "C" __global__ void \ - CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ kernel_cuda_##name() \ { \ kernel_##name(NULL); \ @@ -98,7 +98,7 @@ kernel_cuda_path_trace_data_init( #define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ extern "C" __global__ void \ - CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ kernel_cuda_##name() \ { \ ccl_local type locals; \ diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl index ba53ba4b26f..f015ac47d8a 100644 --- a/intern/cycles/kernel/kernels/opencl/filter.cl +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -235,7 +235,7 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(int dx, } __kernel void kernel_ocl_filter_finalize(int w, - int h, + int h, ccl_global float *buffer, ccl_global int *rank, ccl_global float *XtWX, diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl index 078acc1631e..b7108f3d0f8 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel.cl @@ -52,9 +52,7 @@ __kernel void kernel_ocl_path_trace( ccl_global float *buffer, ccl_global uint *rng_state, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" + KERNEL_BUFFER_PARAMS, int sample, int sx, int sy, int sw, int sh, int offset, int stride) @@ -63,9 +61,8 @@ __kernel void kernel_ocl_path_trace( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "kernel/kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); int x = sx + ccl_global_id(0); int y = sy + ccl_global_id(1); @@ -82,9 +79,7 @@ __kernel void kernel_ocl_shader( ccl_global float4 *output, ccl_global float *output_luma, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" + KERNEL_BUFFER_PARAMS, int type, int sx, int sw, int offset, int sample) { @@ -92,9 +87,8 @@ __kernel void kernel_ocl_shader( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "kernel/kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); int x = sx + ccl_global_id(0); @@ -114,9 +108,7 @@ __kernel void kernel_ocl_bake( ccl_global uint4 *input, ccl_global float4 *output, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" + KERNEL_BUFFER_PARAMS, int type, int filter, int sx, int sw, int offset, int sample) { @@ -124,9 +116,8 @@ __kernel void kernel_ocl_bake( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "kernel/kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); int x = sx + ccl_global_id(0); @@ -144,9 +135,7 @@ __kernel void kernel_ocl_convert_to_byte( ccl_global uchar4 *rgba, ccl_global float *buffer, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" + KERNEL_BUFFER_PARAMS, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) @@ -155,9 +144,8 @@ __kernel void kernel_ocl_convert_to_byte( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "kernel/kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); int x = sx + ccl_global_id(0); int y = sy + ccl_global_id(1); @@ -171,9 +159,7 @@ __kernel void kernel_ocl_convert_to_half_float( ccl_global uchar4 *rgba, ccl_global float *buffer, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" + KERNEL_BUFFER_PARAMS, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) @@ -182,9 +168,8 @@ __kernel void kernel_ocl_convert_to_half_float( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "kernel/kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); int x = sx + ccl_global_id(0); int y = sy + ccl_global_id(1); @@ -193,7 +178,7 @@ __kernel void kernel_ocl_convert_to_half_float( kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); } -__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, ulong size, ulong offset) +__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, uint64_t size, uint64_t offset) { size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0); diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl index 8b85d362f8a..95b35e40a45 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl @@ -25,11 +25,7 @@ __kernel void kernel_ocl_path_trace_data_init( int num_elements, ccl_global char *ray_state, ccl_global uint *rng_state, - -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" - + KERNEL_BUFFER_PARAMS, int start_sample, int end_sample, int sx, int sy, int sw, int sh, int offset, int stride, @@ -46,10 +42,7 @@ __kernel void kernel_ocl_path_trace_data_init( num_elements, ray_state, rng_state, - -#define KERNEL_TEX(type, ttype, name) name, -#include "kernel/kernel_textures.h" - + KERNEL_BUFFER_ARGS, start_sample, end_sample, sx, sy, sw, sh, offset, stride, diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl index 651addb02f4..4cbda1bc2e7 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_split.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl @@ -14,6 +14,9 @@ * limitations under the License. */ +#include "kernel/kernel_compat_opencl.h" // PRECOMPILED +#include "kernel/split/kernel_split_common.h" // PRECOMPILED + #include "kernel/kernels/opencl/kernel_state_buffer_size.cl" #include "kernel/kernels/opencl/kernel_data_init.cl" #include "kernel/kernels/opencl/kernel_path_init.cl" diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h index f1e914a70d4..591c3846ef2 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h +++ b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h @@ -25,9 +25,7 @@ __kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)( ccl_global char *ray_state, ccl_global uint *rng_state, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" + KERNEL_BUFFER_PARAMS, ccl_global int *queue_index, ccl_global char *use_queues_flag, @@ -52,12 +50,9 @@ __kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)( split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state); -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "kernel/kernel_textures.h" } - ccl_barrier(CCL_LOCAL_MEM_FENCE); + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); KERNEL_NAME_EVAL(kernel, KERNEL_NAME)( kg |