diff options
author | Martijn Berger <martijn.berger@gmail.com> | 2013-11-22 17:16:47 +0400 |
---|---|---|
committer | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2013-11-22 17:42:41 +0400 |
commit | e3a79258d17e6cdca26120eab7a2c48c7c4d4a0f (patch) | |
tree | 77d59694458125dd7525faf59ed56ce505533981 /intern/cycles/device | |
parent | 5feb0d2bfe8f6723bf48073b1760b732bc6a5ceb (diff) |
Cycles: test code for sse 4.1 kernel and alignment for some vector types.
This is mostly work towards enabling the __KERNEL_SSE__ option to start using
SIMD operations for vector math operations. This 4.1 kernel performes about 8%
faster with that option but overall is still slower than without the option.
WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 is the cmake flag for testing this kernel.
Alignment of int3, int4, float3, float4 to 16 bytes seems to give a slight 1-2%
speedup on tested systems with the current kernel already, so is enabled now.
Diffstat (limited to 'intern/cycles/device')
-rw-r--r-- | intern/cycles/device/CMakeLists.txt | 4 | ||||
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 50 |
2 files changed, 54 insertions, 0 deletions
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index fe2368b7ea8..920223dd8a4 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -13,6 +13,10 @@ set(INC_SYS ${GLEW_INCLUDE_PATH} ) +if(WITH_CYCLES_OPTIMIZED_KERNEL_SSE41) + add_definitions(-DWITH_CYCLES_OPTIMIZED_KERNEL_SSE41=1) +endif() + set(SRC device.cpp device_cpu.cpp diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index d04c5df82fb..85a7b9c186d 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -58,6 +58,7 @@ public: /* do now to avoid thread issues */ system_cpu_support_sse2(); system_cpu_support_sse3(); + system_cpu_support_sse41(); } ~CPUDevice() @@ -164,6 +165,28 @@ public: int end_sample = tile.start_sample + tile.num_samples; #ifdef WITH_OPTIMIZED_KERNEL +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + if(system_cpu_support_sse41()) { + for(int sample = start_sample; sample < end_sample; sample++) { + if (task.get_cancel() || task_pool.canceled()) { + if(task.need_finish_queue == false) + break; + } + + for(int y = tile.y; y < tile.y + tile.h; y++) { + for(int x = tile.x; x < tile.x + tile.w; x++) { + kernel_cpu_sse41_path_trace(&kg, render_buffer, rng_state, + sample, x, y, tile.offset, tile.stride); + } + } + + tile.sample = sample + 1; + + task.update_progress(tile); + } + } + else +#endif if(system_cpu_support_sse3()) { for(int sample = start_sample; sample < end_sample; sample++) { if (task.get_cancel() || task_pool.canceled()) { @@ -243,6 +266,15 @@ public: if(task.rgba_half) { #ifdef WITH_OPTIMIZED_KERNEL +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + if(system_cpu_support_sse41()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_sse41_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else +#endif if(system_cpu_support_sse3()) { for(int y = task.y; y < task.y + task.h; y++) for(int x = task.x; x < task.x + task.w; x++) @@ -266,6 +298,14 @@ public: } else { #ifdef WITH_OPTIMIZED_KERNEL +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + if(system_cpu_support_sse41()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_sse41_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } +#endif if(system_cpu_support_sse3()) { for(int y = task.y; y < task.y + task.h; y++) for(int x = task.x; x < task.x + task.w; x++) @@ -298,6 +338,16 @@ public: #endif #ifdef WITH_OPTIMIZED_KERNEL +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + if(system_cpu_support_sse41()) { + for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { + kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); + + if(task_pool.canceled()) + break; + } + } +#endif if(system_cpu_support_sse3()) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); |