diff options
author | Thomas Dinges <blender@dingto.org> | 2014-01-16 20:04:11 +0400 |
---|---|---|
committer | Thomas Dinges <blender@dingto.org> | 2014-01-16 20:04:11 +0400 |
commit | de28a4d4b2c9397c5233a5ee1dbf1400f450a15c (patch) | |
tree | 1fe23de963e206af3fb2ff2d9e2e3393cd89149c /intern/cycles/device | |
parent | 7c6d52eb07c4bd8142a95eca1dbdc794063859b8 (diff) |
Cycles: Add an AVX kernel for CPU rendering.
* AVX is available on Intel Sandy Bridge (and newer) and AMD Bulldozer (and newer) CPUs.
* We don't use dedicated AVX intrinsics yet, but gcc auto vectorization gives a 3% performance improvement for Caminandes. Tested on an i5-3570, Linux x64.
* No change for Windows yet, as MSVC 2008 does not support AVX.
Reviewed by: brecht
Differential Revision: https://developer.blender.org/D216
Diffstat (limited to 'intern/cycles/device')
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 52 |
1 file changed, 52 insertions, 0 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index b29d64eb454..76123fe44d2 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -61,6 +61,7 @@ public: system_cpu_support_sse2(); system_cpu_support_sse3(); system_cpu_support_sse41(); + system_cpu_support_avx(); } ~CPUDevice() @@ -166,6 +167,28 @@ public: int start_sample = tile.start_sample; int end_sample = tile.start_sample + tile.num_samples; +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if(system_cpu_support_avx()) { + for(int sample = start_sample; sample < end_sample; sample++) { + if (task.get_cancel() || task_pool.canceled()) { + if(task.need_finish_queue == false) + break; + } + + for(int y = tile.y; y < tile.y + tile.h; y++) { + for(int x = tile.x; x < tile.x + tile.w; x++) { + kernel_cpu_avx_path_trace(&kg, render_buffer, rng_state, + sample, x, y, tile.offset, tile.stride); + } + } + + tile.sample = sample + 1; + + task.update_progress(tile); + } + } + else +#endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { for(int sample = start_sample; sample < end_sample; sample++) { @@ -270,6 +293,15 @@ public: float sample_scale = 1.0f/(task.sample + 1); if(task.rgba_half) { +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if(system_cpu_support_avx()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_avx_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else +#endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { for(int y = task.y; y < task.y + task.h; y++) @@ -305,6 +337,15 @@ public: } } else { +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if(system_cpu_support_avx()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_avx_convert_to_byte(&kernel_globals, 
(uchar4*)task.rgba_byte, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else +#endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { for(int y = task.y; y < task.y + task.h; y++) @@ -349,6 +390,17 @@ public: OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); #endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if(system_cpu_support_avx()) { + for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { + kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); + + if(task_pool.canceled()) + break; + } + } + else +#endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { |