From 0892352bfe6d5a9aa6ec4c088e67f8bbbbfae610 Mon Sep 17 00:00:00 2001 From: Mai Lavelle Date: Tue, 14 Feb 2017 06:20:48 -0500 Subject: Cycles: CPU implementation of split kernel --- intern/cycles/blender/addon/properties.py | 1 + intern/cycles/blender/addon/ui.py | 1 + intern/cycles/blender/blender_python.cpp | 1 + intern/cycles/device/device_cpu.cpp | 335 ++++++++++++++++++++- intern/cycles/kernel/CMakeLists.txt | 11 + intern/cycles/kernel/kernel.h | 2 + intern/cycles/kernel/kernel_compat_cpu.h | 9 + intern/cycles/kernel/kernel_globals.h | 7 + intern/cycles/kernel/kernel_types.h | 25 +- intern/cycles/kernel/kernels/cpu/kernel_cpu.h | 35 +++ intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h | 89 +++++- intern/cycles/kernel/kernels/cpu/kernel_split.cpp | 63 ++++ .../cycles/kernel/kernels/cpu/kernel_split_avx.cpp | 38 +++ .../kernel/kernels/cpu/kernel_split_avx2.cpp | 40 +++ .../kernel/kernels/cpu/kernel_split_sse2.cpp | 34 +++ .../kernel/kernels/cpu/kernel_split_sse3.cpp | 36 +++ .../kernel/kernels/cpu/kernel_split_sse41.cpp | 37 +++ intern/cycles/kernel/osl/osl_closures.cpp | 1 + intern/cycles/kernel/osl/osl_services.cpp | 1 + intern/cycles/kernel/osl/osl_shader.cpp | 1 + intern/cycles/kernel/split/kernel_data_init.h | 4 + intern/cycles/kernel/split/kernel_split_common.h | 12 +- intern/cycles/util/util_debug.cpp | 8 +- intern/cycles/util/util_debug.h | 3 + intern/cycles/util/util_types.h | 3 + 25 files changed, 776 insertions(+), 21 deletions(-) create mode 100644 intern/cycles/kernel/kernels/cpu/kernel_split.cpp create mode 100644 intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp create mode 100644 intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp create mode 100644 intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp create mode 100644 intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp create mode 100644 intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 5c51f9afc28..1f0b712c93e 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -665,6 +665,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): cls.debug_use_cpu_sse3 = BoolProperty(name="SSE3", default=True) cls.debug_use_cpu_sse2 = BoolProperty(name="SSE2", default=True) cls.debug_use_qbvh = BoolProperty(name="QBVH", default=True) + cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False) cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False) diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 44af5f7efed..8d3fe877597 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -1518,6 +1518,7 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel): row.prop(cscene, "debug_use_cpu_avx", toggle=True) row.prop(cscene, "debug_use_cpu_avx2", toggle=True) col.prop(cscene, "debug_use_qbvh") + col.prop(cscene, "debug_use_cpu_split_kernel") col = layout.column() col.label('CUDA Flags:') diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index 438abc49f88..ed410e15e7b 100644 --- a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -67,6 +67,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene) flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3"); flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2"); flags.cpu.qbvh = get_boolean(cscene, "debug_use_qbvh"); + flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel"); /* Synchronize CUDA flags. */ flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile"); /* Synchronize OpenCL kernel type. */ diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 78e2e3ea711..702f2a9136b 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -26,10 +26,12 @@ #include "device.h" #include "device_intern.h" +#include "device_split_kernel.h" #include "kernel.h" #include "kernel_compat_cpu.h" #include "kernel_types.h" +#include "split/kernel_split_data.h" #include "kernel_globals.h" #include "osl_shader.h" @@ -41,6 +43,7 @@ #include "util_foreach.h" #include "util_function.h" #include "util_logging.h" +#include "util_map.h" #include "util_opengl.h" #include "util_progress.h" #include "util_system.h" @@ -48,8 +51,92 @@ CCL_NAMESPACE_BEGIN +class CPUDevice; + +class CPUSplitKernel : public DeviceSplitKernel { + CPUDevice *device; +public: + explicit CPUSplitKernel(CPUDevice *device); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data_, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs); + + virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&); + virtual int2 split_kernel_local_size(); + virtual int2 split_kernel_global_size(DeviceTask *task); +}; + class CPUDevice : public Device { + static unordered_map kernel_functions; + + static void register_kernel_function(const char* name, void* func) + { + kernel_functions[name] = func; + } + + static const char* get_arch_name() + { +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) { + return "cpu_avx2"; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if(system_cpu_support_avx()) { + return "cpu_avx"; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + if(system_cpu_support_sse41()) { + return "cpu_sse41"; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 + if(system_cpu_support_sse3()) { + return "cpu_sse3"; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + if(system_cpu_support_sse2()) { + return "cpu_sse2"; + } + else +#endif + { + return "cpu"; + } + } + + template + static F get_kernel_function(string name) + { + name = string("kernel_") + get_arch_name() + "_" + name; + + unordered_map::iterator it = kernel_functions.find(name); + + if(it == kernel_functions.end()) { + assert(!"kernel function not found"); + return NULL; + } + + return (F)it->second; + } + + friend class CPUSplitKernel; + public: TaskPool task_pool; KernelGlobals kernel_globals; @@ -57,10 +144,15 @@ public: #ifdef WITH_OSL OSLGlobals osl_globals; #endif + + bool use_split_kernel; + + DeviceRequestedFeatures requested_features; CPUDevice(DeviceInfo& info, Stats &stats, bool background) : Device(info, stats, background) { + #ifdef WITH_OSL kernel_globals.osl = &osl_globals; #endif @@ -105,6 +197,28 @@ public: { VLOG(1) << "Will be using regular kernels."; } + + use_split_kernel = DebugFlags().cpu.split_kernel; + if(use_split_kernel) { + VLOG(1) << "Will be using split kernel."; + } + + kernel_cpu_register_functions(register_kernel_function); +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + kernel_cpu_sse2_register_functions(register_kernel_function); +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 + kernel_cpu_sse3_register_functions(register_kernel_function); +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + kernel_cpu_sse41_register_functions(register_kernel_function); +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + kernel_cpu_avx_register_functions(register_kernel_function); +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + kernel_cpu_avx2_register_functions(register_kernel_function); +#endif } ~CPUDevice() @@ -205,8 +319,14 @@ public: void thread_run(DeviceTask *task) { - if(task->type == DeviceTask::PATH_TRACE) - thread_path_trace(*task); + if(task->type == DeviceTask::PATH_TRACE) { + if(!use_split_kernel) { + thread_path_trace(*task); + } + else { + thread_path_trace_split(*task); + } + } else if(task->type == DeviceTask::FILM_CONVERT) thread_film_convert(*task); else if(task->type == DeviceTask::SHADER) @@ -267,7 +387,7 @@ public: { path_trace_kernel = kernel_cpu_path_trace; } - + while(task.acquire_tile(this, tile)) { float *render_buffer = (float*)tile.buffer; uint *rng_state = (uint*)tile.rng_state; @@ -303,6 +423,49 @@ public: thread_kernel_globals_free(&kg); } + void thread_path_trace_split(DeviceTask& task) + { + if(task_pool.canceled()) { + if(task.need_finish_queue == false) + return; + } + + RenderTile tile; + + CPUSplitKernel split_kernel(this); + + /* allocate buffer for kernel globals */ + device_memory kgbuffer; + kgbuffer.resize(sizeof(KernelGlobals)); + mem_alloc(kgbuffer, MEM_READ_WRITE); + + KernelGlobals *kg = (KernelGlobals*)kgbuffer.device_pointer; + *kg = thread_kernel_globals_init(); + + requested_features.max_closure = MAX_CLOSURE; + if(!split_kernel.load_kernels(requested_features)) { + thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); + mem_free(kgbuffer); + + return; + } + + while(task.acquire_tile(this, tile)) { + device_memory data; + split_kernel.path_trace(&task, tile, kgbuffer, data); + + task.release_tile(tile); + + if(task_pool.canceled()) { + if(task.need_finish_queue == false) + break; + } + } + + thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); + mem_free(kgbuffer); + } + void thread_film_convert(DeviceTask& task) { float sample_scale = 1.0f/(task.sample + 1); @@ -510,6 +673,10 @@ protected: inline void thread_kernel_globals_free(KernelGlobals *kg) { + if(kg == NULL) { + return; + } + if(kg->transparent_shadow_intersections != NULL) { free(kg->transparent_shadow_intersections); } @@ -524,8 +691,170 @@ protected: OSLShader::thread_free(kg); #endif } + + virtual bool load_kernels(DeviceRequestedFeatures& requested_features_) { + requested_features = requested_features_; + + return true; + } }; +/* split kernel */ + +class CPUSplitKernelFunction : public SplitKernelFunction { +public: + CPUDevice* device; + void (*func)(KernelGlobals *kg, KernelData *data); + + CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {} + ~CPUSplitKernelFunction() {} + + virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data) + { + if(!func) { + return false; + } + + KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; + kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); + + for(int y = 0; y < dim.global_size[1]; y++) { + for(int x = 0; x < dim.global_size[0]; x++) { + kg->global_id = make_int2(x, y); + + func(kg, (KernelData*)data.device_pointer); + } + } + + return true; + } +}; + +CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device) +{ +} + +bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& data, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flags, + device_memory& work_pool_wgs) +{ + typedef void(*data_init_t)(KernelGlobals *kg, + ccl_constant KernelData *data, + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + ccl_global uint *rng_state, + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, + ccl_global int *Queue_index, + int queuesize, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, + ccl_global float *buffer); + + data_init_t data_init; + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) { + data_init = kernel_cpu_avx2_data_init; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if(system_cpu_support_avx()) { + data_init = kernel_cpu_avx_data_init; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + if(system_cpu_support_sse41()) { + data_init = kernel_cpu_sse41_data_init; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 + if(system_cpu_support_sse3()) { + data_init = kernel_cpu_sse3_data_init; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + if(system_cpu_support_sse2()) { + data_init = kernel_cpu_sse2_data_init; + } + else +#endif + { + data_init = kernel_cpu_data_init; + } + + KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; + kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); + + for(int y = 0; y < dim.global_size[1]; y++) { + for(int x = 0; x < dim.global_size[0]; x++) { + kg->global_id = make_int2(x, y); + + data_init((KernelGlobals*)kernel_globals.device_pointer, + (KernelData*)data.device_pointer, + (void*)split_data.device_pointer, + num_global_elements, + (char*)ray_state.device_pointer, + (uint*)rtile.rng_state, + rtile.start_sample, + rtile.start_sample + rtile.num_samples, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + rtile.offset, + rtile.stride, + (int*)queue_index.device_pointer, + dim.global_size[0] * dim.global_size[1], + (char*)use_queues_flags.device_pointer, + (uint*)work_pool_wgs.device_pointer, + rtile.num_samples, + (float*)rtile.buffer); + } + } + + return true; +} + +SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) +{ + CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device); + + kernel->func = device->get_kernel_function(kernel_name); + if(!kernel->func) { + delete kernel; + return NULL; + } + + return kernel; +} + +int2 CPUSplitKernel::split_kernel_local_size() +{ + return make_int2(1, 1); +} + +int2 CPUSplitKernel::split_kernel_global_size(DeviceTask *task) { + /* TODO(mai): this needs investigation but cpu gives incorrect render if global size doesnt match tile size */ + return task->requested_tile_size; +} + +unordered_map CPUDevice::kernel_functions; + Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background) { return new CPUDevice(info, stats, background); diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index cc1ad5ce48c..d844ba007aa 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -13,6 +13,7 @@ set(INC_SYS set(SRC kernels/cpu/kernel.cpp + kernels/cpu/kernel_split.cpp kernels/opencl/kernel.cl kernels/opencl/kernel_data_init.cl kernels/opencl/kernel_queue_enqueue.cl @@ -316,25 +317,35 @@ if(CXX_HAS_SSE) kernels/cpu/kernel_sse2.cpp kernels/cpu/kernel_sse3.cpp kernels/cpu/kernel_sse41.cpp + kernels/cpu/kernel_split_sse2.cpp + kernels/cpu/kernel_split_sse3.cpp + kernels/cpu/kernel_split_sse41.cpp ) set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX) list(APPEND SRC kernels/cpu/kernel_avx.cpp + kernels/cpu/kernel_split_avx.cpp ) set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX2) list(APPEND SRC kernels/cpu/kernel_avx2.cpp + kernels/cpu/kernel_split_avx2.cpp ) set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") endif() add_library(cycles_kernel diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index 9279a94c13a..cd339e6237e 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -20,6 +20,7 @@ /* CPU Kernel Interface */ #include "util_types.h" +#include "kernel_types.h" CCL_NAMESPACE_BEGIN @@ -28,6 +29,7 @@ CCL_NAMESPACE_BEGIN #define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) struct KernelGlobals; +struct KernelData; KernelGlobals *kernel_globals_create(); void kernel_globals_free(KernelGlobals *kg); diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index 9d1f3bdc918..e347a1eca18 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -44,6 +44,15 @@ #define ccl_addr_space +#define ccl_local_id(d) 0 +#define ccl_global_id(d) (kg->global_id[d]) + +#define ccl_local_size(d) 1 +#define ccl_global_size(d) (kg->global_size[d]) + +#define ccl_group_id(d) ccl_global_id(d) +#define ccl_num_groups(d) ccl_global_size(d) + /* On x86_64, versions of glibc < 2.16 have an issue where expf is * much slower than the double version. This was fixed in glibc 2.16. */ diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h index e994836f6a2..5dbea4481a3 100644 --- a/intern/cycles/kernel/kernel_globals.h +++ b/intern/cycles/kernel/kernel_globals.h @@ -64,6 +64,13 @@ typedef struct KernelGlobals { /* Storage for decoupled volume steps. */ VolumeStep *decoupled_volume_steps[2]; int decoupled_volume_steps_index; + + /* split kernel */ + SplitData split_data; + SplitParams split_param_data; + + int2 global_size; + int2 global_id; } KernelGlobals; #endif /* __KERNEL_CPU__ */ diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 279cd6dedab..a016e5293ca 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -32,6 +32,11 @@ # define ccl_addr_space #endif +#if defined(__SPLIT_KERNEL__) && !defined(__COMPUTE_DEVICE_GPU__) +/* TODO(mai): need to investigate how this effects the kernel, as cpu kernel crashes without this right now */ +#define __COMPUTE_DEVICE_GPU__ +#endif + CCL_NAMESPACE_BEGIN /* constants */ @@ -65,17 +70,23 @@ CCL_NAMESPACE_BEGIN # endif # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# define __BRANCHED_PATH__ +# ifndef __SPLIT_KERNEL__ +# define __BRANCHED_PATH__ +# endif # ifdef WITH_OSL # define __OSL__ # endif -# define __SUBSURFACE__ +# ifndef __SPLIT_KERNEL__ +# define __SUBSURFACE__ +# endif # define __CMJ__ -# define __VOLUME__ -# define __VOLUME_DECOUPLED__ -# define __VOLUME_SCATTER__ -# define __SHADOW_RECORD_ALL__ -# define __VOLUME_RECORD_ALL__ +# ifndef __SPLIT_KERNEL__ +# define __VOLUME__ +# define __VOLUME_DECOUPLED__ +# define __VOLUME_SCATTER__ +# define __SHADOW_RECORD_ALL__ +# define __VOLUME_RECORD_ALL__ +# endif #endif /* __KERNEL_CPU__ */ #ifdef __KERNEL_CUDA__ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h index 1a07c705f1c..1d710157817 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h @@ -49,4 +49,39 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, int offset, int sample); +/* Split kernels */ + +void KERNEL_FUNCTION_FULL_NAME(data_init)( + KernelGlobals *kg, + ccl_constant KernelData *data, + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + ccl_global uint *rng_state, + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, + ccl_global int *Queue_index, + int queuesize, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, + ccl_global float *buffer); + +#define DECLARE_SPLIT_KERNEL_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData *data); + +DECLARE_SPLIT_KERNEL_FUNCTION(scene_intersect) +DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission) +DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue) +DECLARE_SPLIT_KERNEL_FUNCTION(background_buffer_update) +DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval) +DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) +DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting) +DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked) +DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) +DECLARE_SPLIT_KERNEL_FUNCTION(sum_all_radiance) + +void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func)); + #undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index ec82d4b4c22..c59f4892546 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -21,17 +21,39 @@ */ #include "kernel_compat_cpu.h" -#include "kernel_math.h" -#include "kernel_types.h" -#include "kernel_globals.h" -#include "kernel_cpu_image.h" -#include "kernel_film.h" -#include "kernel_path.h" -#include "kernel_path_branched.h" -#include "kernel_bake.h" + +#ifndef __SPLIT_KERNEL__ +# include "kernel_math.h" +# include "kernel_types.h" + +# include "split/kernel_split_data.h" +# include "kernel_globals.h" + +# include "kernel_cpu_image.h" +# include "kernel_film.h" +# include "kernel_path.h" +# include "kernel_path_branched.h" +# include "kernel_bake.h" +#else +# include "split/kernel_split_common.h" + +# include "split/kernel_data_init.h" +# include "split/kernel_scene_intersect.h" +# include "split/kernel_lamp_emission.h" +# include "split/kernel_queue_enqueue.h" +# include "split/kernel_background_buffer_update.h" +# include "split/kernel_shader_eval.h" +# include "split/kernel_holdout_emission_blurring_pathtermination_ao.h" +# include "split/kernel_direct_lighting.h" +# include "split/kernel_shadow_blocked.h" +# include "split/kernel_next_iteration_setup.h" +# include "split/kernel_sum_all_radiance.h" +#endif CCL_NAMESPACE_BEGIN +#ifndef __SPLIT_KERNEL__ + /* Path Tracing */ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, @@ -131,4 +153,55 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, } } +#else /* __SPLIT_KERNEL__ */ + +/* Split Kernel Path Tracing */ + +#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + kernel_##name(kg); \ + } + +DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) +DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) +DEFINE_SPLIT_KERNEL_FUNCTION(queue_enqueue) +DEFINE_SPLIT_KERNEL_FUNCTION(background_buffer_update) +DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) +DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) +DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting) +DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked) +DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) +DEFINE_SPLIT_KERNEL_FUNCTION(sum_all_radiance) + +void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func)) +{ +#define REGISTER_NAME_STRING(name) #name +#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name) +#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name)); + + REGISTER(path_trace); + REGISTER(convert_to_byte); + REGISTER(convert_to_half_float); + REGISTER(shader); + + REGISTER(data_init); + REGISTER(scene_intersect); + REGISTER(lamp_emission); + REGISTER(queue_enqueue); + REGISTER(background_buffer_update); + REGISTER(shader_eval); + REGISTER(holdout_emission_blurring_pathtermination_ao); + REGISTER(direct_lighting); + REGISTER(shadow_blocked); + REGISTER(next_iteration_setup); + REGISTER(sum_all_radiance); + +#undef REGISTER +#undef REGISTER_EVAL_NAME +#undef REGISTER_NAME_STRING +} + +#endif /* __SPLIT_KERNEL__ */ + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp new file mode 100644 index 00000000000..30519dae53e --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp @@ -0,0 +1,63 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CPU kernel entry points */ + +/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this + * one with SSE2 intrinsics. + */ +#if defined(__x86_64__) || defined(_M_X64) +# define __KERNEL_SSE2__ +#endif + +#define __SPLIT_KERNEL__ + +/* When building kernel for native machine detect kernel features from the flags + * set by compiler. + */ +#ifdef WITH_KERNEL_NATIVE +# ifdef __SSE2__ +# ifndef __KERNEL_SSE2__ +# define __KERNEL_SSE2__ +# endif +# endif +# ifdef __SSE3__ +# define __KERNEL_SSE3__ +# endif +# ifdef __SSSE3__ +# define __KERNEL_SSSE3__ +# endif +# ifdef __SSE4_1__ +# define __KERNEL_SSE41__ +# endif +# ifdef __AVX__ +# define __KERNEL_AVX__ +# endif +# ifdef __AVX2__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX2__ +# endif +#endif + +/* quiet unused define warnings */ +#if defined(__KERNEL_SSE2__) + /* do nothing */ +#endif + +#include "kernel.h" +#define KERNEL_ARCH cpu +#include "kernel_cpu_impl.h" + diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp new file mode 100644 index 00000000000..335ad24bdc5 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp @@ -0,0 +1,38 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +#endif + +#define __SPLIT_KERNEL__ + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# include "kernel.h" +# define KERNEL_ARCH cpu_avx +# include "kernel_cpu_impl.h" +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp new file mode 100644 index 00000000000..765ba96aba3 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +#endif + +#define __SPLIT_KERNEL__ + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# include "kernel.h" +# define KERNEL_ARCH cpu_avx2 +# include "kernel_cpu_impl.h" +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp new file mode 100644 index 00000000000..af244c03929 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp @@ -0,0 +1,34 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +#endif + +#define __SPLIT_KERNEL__ + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# include "kernel.h" +# define KERNEL_ARCH cpu_sse2 +# include "kernel_cpu_impl.h" +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp new file mode 100644 index 00000000000..d1b579eeac5 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp @@ -0,0 +1,36 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +#endif + +#define __SPLIT_KERNEL__ + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# include "kernel.h" +# define KERNEL_ARCH cpu_sse3 +# include "kernel_cpu_impl.h" +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp new file mode 100644 index 00000000000..83d62de5aa5 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp @@ -0,0 +1,37 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +#endif + +#define __SPLIT_KERNEL__ + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# include "kernel.h" +# define KERNEL_ARCH cpu_sse41 +# include "kernel_cpu_impl.h" +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp index 94de782dca0..4cb46254bc7 100644 --- a/intern/cycles/kernel/osl/osl_closures.cpp +++ b/intern/cycles/kernel/osl/osl_closures.cpp @@ -42,6 +42,7 @@ #include "kernel_types.h" #include "kernel_compat_cpu.h" +#include "split/kernel_split_data.h" #include "kernel_globals.h" #include "kernel_montecarlo.h" #include "kernel_random.h" diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index 58bbdc33920..2d5b9c1c1cc 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -39,6 +39,7 @@ #include "util_string.h" #include "kernel_compat_cpu.h" +#include "split/kernel_split_data.h" #include "kernel_globals.h" #include "kernel_random.h" #include "kernel_projection.h" diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp index 0d762bbdb38..78848d7dfc9 100644 --- a/intern/cycles/kernel/osl/osl_shader.cpp +++ b/intern/cycles/kernel/osl/osl_shader.cpp @@ -19,6 +19,7 @@ #include "kernel_compat_cpu.h" #include "kernel_montecarlo.h" #include "kernel_types.h" +#include "split/kernel_split_data.h" #include "kernel_globals.h" #include "geom/geom_object.h" diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h index 3251c091468..5604363dcd9 100644 --- a/intern/cycles/kernel/split/kernel_data_init.h +++ b/intern/cycles/kernel/split/kernel_data_init.h @@ -51,7 +51,11 @@ CCL_NAMESPACE_BEGIN * The number of elements in the queues is initialized to 0; */ +#ifndef __KERNEL_CPU__ ccl_device void kernel_data_init( +#else +void KERNEL_FUNCTION_FULL_NAME(data_init)( +#endif KernelGlobals *kg, ccl_constant KernelData *data, ccl_global void *split_data_buffer, diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h index c3963667aea..dd0c3f9c941 100644 --- a/intern/cycles/kernel/split/kernel_split_common.h +++ b/intern/cycles/kernel/split/kernel_split_common.h @@ -23,7 +23,17 @@ #include "kernel_split_data.h" #include "kernel_globals.h" -#include "kernel_image_opencl.h" + +#ifdef __OSL__ +# include "osl_shader.h" +#endif + +#ifdef __KERNEL_OPENCL__ +# include "kernel_image_opencl.h" +#endif +#ifdef __KERNEL_CPU__ +# include "../kernels/cpu/kernel_cpu_image.h" +#endif #include "util_atomic.h" diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index 80d177d2cae..318248998aa 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -29,7 +29,8 @@ DebugFlags::CPU::CPU() sse41(true), sse3(true), sse2(true), - qbvh(true) + qbvh(true), + split_kernel(false) { reset(); } @@ -55,6 +56,7 @@ void DebugFlags::CPU::reset() #undef CHECK_CPU_FLAGS qbvh = true; + split_kernel = false; } DebugFlags::CUDA::CUDA() @@ -133,7 +135,9 @@ std::ostream& operator <<(std::ostream &os, << " AVX : " << string_from_bool(debug_flags.cpu.avx) << "\n" << " SSE4.1 : " << string_from_bool(debug_flags.cpu.sse41) << "\n" << " SSE3 : " << string_from_bool(debug_flags.cpu.sse3) << "\n" - << " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n"; + << " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n" + << " QBVH : " << string_from_bool(debug_flags.cpu.qbvh) << "\n" + << " Split : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n"; os << "CUDA flags:\n" << " Adaptive Compile: " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n"; diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h index 73fd228b5d9..171e43ec32a 100644 --- a/intern/cycles/util/util_debug.h +++ b/intern/cycles/util/util_debug.h @@ -46,6 +46,9 @@ public: /* Whether QBVH usage is allowed or not. */ bool qbvh; + + /* Whether split kernel is used */ + bool split_kernel; }; /* Descriptor of CUDA feature-set to be used. */ diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index 751d875fb22..36d2f1053c7 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -37,6 +37,9 @@ #define ccl_device_noinline static #define ccl_global #define ccl_constant +#define ccl_local +#define ccl_local_param +#define ccl_private #define ccl_restrict __restrict #define __KERNEL_WITH_SSE_ALIGN__ -- cgit v1.2.3