diff options
Diffstat (limited to 'intern')
-rw-r--r-- | intern/cycles/CMakeLists.txt | 6 | ||||
-rw-r--r-- | intern/cycles/SConscript | 25 | ||||
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 48 | ||||
-rw-r--r-- | intern/cycles/kernel/CMakeLists.txt | 6 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel.h | 13 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_bvh.h | 24 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_sse2.cpp (renamed from intern/cycles/kernel/kernel_optimized.cpp) | 6 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_sse3.cpp | 60 | ||||
-rw-r--r-- | intern/cycles/util/util_system.cpp | 15 | ||||
-rw-r--r-- | intern/cycles/util/util_system.h | 3 |
10 files changed, 166 insertions, 40 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index 535239a9205..226218ae512 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -13,10 +13,12 @@ if(WITH_RAYOPTIMIZATION AND SUPPORT_SSE_BUILD) endif() if(WIN32 AND MSVC) - set(CYCLES_OPTIMIZED_KERNEL_FLAGS "/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc") + set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc") + set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /EHsc") elseif(CMAKE_COMPILER_IS_GNUCC) - set(CYCLES_OPTIMIZED_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mfpmath=sse") + set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 -mfpmath=sse") + set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mfpmath=sse") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math") endif() diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript index 19af7dede9f..8a8ef9cce39 100644 --- a/intern/cycles/SConscript +++ b/intern/cycles/SConscript @@ -36,7 +36,8 @@ sources = cycles.Glob('bvh/*.cpp') + cycles.Glob('device/*.cpp') + cycles.Glob(' sources.remove(path.join('util', 'util_view.cpp')) sources.remove(path.join('render', 'film_response.cpp')) -sources.remove(path.join('kernel', 'kernel_optimized.cpp')) +sources.remove(path.join('kernel', 'kernel_sse2.cpp')) +sources.remove(path.join('kernel', 'kernel_sse3.cpp')) incs = [] defs = [] @@ -73,21 +74,29 @@ if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', ' # optimized kernel if env['WITH_BF_RAYOPTIMIZATION']: - optim_cxxflags = Split(env['CXXFLAGS']) + sse2_cxxflags = Split(env['CXXFLAGS']) + sse3_cxxflags = Split(env['CXXFLAGS']) if env['OURPLATFORM'] == 'win32-vc': - optim_cxxflags.append('/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split()) + sse2_cxxflags.append('/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast 
/EHsc'.split()) + sse3_cxxflags.append('/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split()) elif env['OURPLATFORM'] == 'win64-vc': - optim_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split()) + sse2_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split()) + sse3_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split()) else: - optim_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mfpmath=sse'.split()) + sse2_cxxflags.append('-ffast-math -msse -msse2 -mfpmath=sse'.split()) + sse3_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mfpmath=sse'.split()) defs.append('WITH_OPTIMIZED_KERNEL') optim_defs = defs[:] - optim_sources = [path.join('kernel', 'kernel_optimized.cpp')] - cycles_optim = cycles.Clone() - cycles_optim.BlenderLib('bf_intern_cycles_optimized', optim_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=optim_cxxflags) + cycles_sse3 = cycles.Clone() + sse3_sources = [path.join('kernel', 'kernel_sse3.cpp')] + cycles_sse3.BlenderLib('bf_intern_cycles_sse3', sse3_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse3_cxxflags) + + cycles_sse2 = cycles.Clone() + sse2_sources = [path.join('kernel', 'kernel_sse2.cpp')] + cycles_sse2.BlenderLib('bf_intern_cycles_sse2', sse2_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse2_cxxflags) cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], cxx_compileflags=cxxflags) diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index a1d7706a34e..1915245bb55 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -58,7 +58,8 @@ public: #endif /* do now to avoid thread issues */ - system_cpu_support_optimized(); + system_cpu_support_sse2(); + system_cpu_support_sse3(); } ~CPUDevice() @@ -170,7 +171,7 @@ public: int end_sample = tile.start_sample + tile.num_samples; #ifdef 
WITH_OPTIMIZED_KERNEL - if(system_cpu_support_optimized()) { + if(system_cpu_support_sse3()) { for(int sample = start_sample; sample < end_sample; sample++) { if (task.get_cancel() || task_pool.cancelled()) { if(task.need_finish_queue == false) @@ -179,7 +180,26 @@ public: for(int y = tile.y; y < tile.y + tile.h; y++) { for(int x = tile.x; x < tile.x + tile.w; x++) { - kernel_cpu_optimized_path_trace(&kg, render_buffer, rng_state, + kernel_cpu_sse3_path_trace(&kg, render_buffer, rng_state, + sample, x, y, tile.offset, tile.stride); + } + } + + tile.sample = sample + 1; + + task.update_progress(tile); + } + } + else if(system_cpu_support_sse2()) { + for(int sample = start_sample; sample < end_sample; sample++) { + if (task.get_cancel() || task_pool.cancelled()) { + if(task.need_finish_queue == false) + break; + } + + for(int y = tile.y; y < tile.y + tile.h; y++) { + for(int x = tile.x; x < tile.x + tile.w; x++) { + kernel_cpu_sse2_path_trace(&kg, render_buffer, rng_state, sample, x, y, tile.offset, tile.stride); } } @@ -227,10 +247,16 @@ public: void thread_tonemap(DeviceTask& task) { #ifdef WITH_OPTIMIZED_KERNEL - if(system_cpu_support_optimized()) { + if(system_cpu_support_sse3()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_sse3_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer, + task.sample, task.resolution, x, y, task.offset, task.stride); + } + else if(system_cpu_support_sse2()) { for(int y = task.y; y < task.y + task.h; y++) for(int x = task.x; x < task.x + task.w; x++) - kernel_cpu_optimized_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer, + kernel_cpu_sse2_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer, task.sample, task.resolution, x, y, task.offset, task.stride); } else @@ -252,9 +278,17 @@ public: #endif #ifdef WITH_OPTIMIZED_KERNEL - if(system_cpu_support_optimized()) { + if(system_cpu_support_sse3()) { + for(int x = task.shader_x; x <
task.shader_x + task.shader_w; x++) { + kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); + + if(task_pool.cancelled()) + break; + } + } + else if(system_cpu_support_sse2()) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { - kernel_cpu_optimized_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); + kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); if(task_pool.cancelled()) break; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 6d5b9a063a0..e83756b7c8a 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -12,7 +12,8 @@ set(INC_SYS set(SRC kernel.cpp - kernel_optimized.cpp + kernel_sse2.cpp + kernel_sse3.cpp kernel.cl kernel.cu ) @@ -149,7 +150,8 @@ include_directories(SYSTEM ${INC_SYS}) add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS}) if(WITH_CYCLES_OPTIMIZED_KERNEL) - set_source_files_properties(kernel_optimized.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_OPTIMIZED_KERNEL_FLAGS}") + set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") + set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") endif() if(WITH_CYCLES_CUDA) diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index 26c0bcd6d1a..20ea5a61906 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -44,11 +44,18 @@ void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i); #ifdef WITH_OPTIMIZED_KERNEL -void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, +void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int
stride); -void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, +void kernel_cpu_sse2_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, int sample, int resolution, int x, int y, int offset, int stride); -void kernel_cpu_optimized_shader(KernelGlobals *kg, uint4 *input, float4 *output, +void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, + int type, int i); + +void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, + int sample, int x, int y, int offset, int stride); +void kernel_cpu_sse3_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, + int sample, int resolution, int x, int y, int offset, int stride); +void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i); #endif diff --git a/intern/cycles/kernel/kernel_bvh.h b/intern/cycles/kernel/kernel_bvh.h index 1a85b5bbefd..2b9ebf35d0c 100644 --- a/intern/cycles/kernel/kernel_bvh.h +++ b/intern/cycles/kernel/kernel_bvh.h @@ -126,21 +126,21 @@ __device_inline void bvh_node_intersect(KernelGlobals *kg, /* intersect ray against child nodes */ float3 ood = P * idir; - float c0lox = n0xy.x * idir.x - ood.x; - float c0hix = n0xy.y * idir.x - ood.x; - float c0loy = n0xy.z * idir.y - ood.y; - float c0hiy = n0xy.w * idir.y - ood.y; - float c0loz = nz.x * idir.z - ood.z; - float c0hiz = nz.y * idir.z - ood.z; + NO_EXTENDED_PRECISION float c0lox = n0xy.x * idir.x - ood.x; + NO_EXTENDED_PRECISION float c0hix = n0xy.y * idir.x - ood.x; + NO_EXTENDED_PRECISION float c0loy = n0xy.z * idir.y - ood.y; + NO_EXTENDED_PRECISION float c0hiy = n0xy.w * idir.y - ood.y; + NO_EXTENDED_PRECISION float c0loz = nz.x * idir.z - ood.z; + NO_EXTENDED_PRECISION float c0hiz = nz.y * idir.z - ood.z; NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - float c1loz = 
nz.z * idir.z - ood.z; - float c1hiz = nz.w * idir.z - ood.z; - float c1lox = n1xy.x * idir.x - ood.x; - float c1hix = n1xy.y * idir.x - ood.x; - float c1loy = n1xy.z * idir.y - ood.y; - float c1hiy = n1xy.w * idir.y - ood.y; + NO_EXTENDED_PRECISION float c1loz = nz.z * idir.z - ood.z; + NO_EXTENDED_PRECISION float c1hiz = nz.w * idir.z - ood.z; + NO_EXTENDED_PRECISION float c1lox = n1xy.x * idir.x - ood.x; + NO_EXTENDED_PRECISION float c1hix = n1xy.y * idir.x - ood.x; + NO_EXTENDED_PRECISION float c1loy = n1xy.z * idir.y - ood.y; + NO_EXTENDED_PRECISION float c1hiy = n1xy.w * idir.y - ood.y; NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); diff --git a/intern/cycles/kernel/kernel_optimized.cpp b/intern/cycles/kernel/kernel_sse2.cpp index 0b662095133..7947107a43c 100644 --- a/intern/cycles/kernel/kernel_optimized.cpp +++ b/intern/cycles/kernel/kernel_sse2.cpp @@ -35,21 +35,21 @@ CCL_NAMESPACE_BEGIN /* Path Tracing */ -void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride) +void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride) { kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); } /* Tonemapping */ -void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, int sample, int resolution, int x, int y, int offset, int stride) +void kernel_cpu_sse2_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, int sample, int resolution, int x, int y, int offset, int stride) { kernel_film_tonemap(kg, rgba, buffer, sample, resolution, x, y, offset, stride); } /* Shader Evaluate */ -void kernel_cpu_optimized_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) +void 
kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) { kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); } diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernel_sse3.cpp new file mode 100644 index 00000000000..9a8b389cf68 --- /dev/null +++ b/intern/cycles/kernel/kernel_sse3.cpp @@ -0,0 +1,60 @@ +/* + * Copyright 2011, Blender Foundation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +#ifdef WITH_OPTIMIZED_KERNEL + +#include "kernel.h" +#include "kernel_compat_cpu.h" +#include "kernel_math.h" +#include "kernel_types.h" +#include "kernel_globals.h" +#include "kernel_film.h" +#include "kernel_path.h" +#include "kernel_displace.h" + +CCL_NAMESPACE_BEGIN + +/* Path Tracing */ + +void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride) +{ + kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); +} + +/* Tonemapping */ + +void kernel_cpu_sse3_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, int sample, int resolution, int x, int y, int offset, int stride) +{ + kernel_film_tonemap(kg, rgba, buffer, sample, resolution, x, y, offset, stride); +} + +/* Shader Evaluate */ + +void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) +{ + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); +} + +CCL_NAMESPACE_END + +#endif + diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp index 2d9f0fffae6..4fda090e09e 100644 --- a/intern/cycles/util/util_system.cpp +++ b/intern/cycles/util/util_system.cpp @@ -136,7 +136,7 @@ struct CPUCapabilities { bool fma4; }; -bool system_cpu_support_optimized() +static CPUCapabilities& system_cpu_capabilities() { static CPUCapabilities caps; static bool caps_init = false; @@ -182,7 +182,18 @@ bool system_cpu_support_optimized() caps_init = true; } - /* optimization flags use these */ + return caps; +} + +bool system_cpu_support_sse2() +{ + CPUCapabilities& caps = system_cpu_capabilities(); + return caps.sse && caps.sse2; +} + +bool system_cpu_support_sse3() +{ + CPUCapabilities& caps = system_cpu_capabilities(); return caps.sse && caps.sse2 && caps.sse3; } diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h index f25e009a250..257112883d1 100644 --- a/intern/cycles/util/util_system.h +++ 
b/intern/cycles/util/util_system.h @@ -26,7 +26,8 @@ CCL_NAMESPACE_BEGIN int system_cpu_thread_count(); string system_cpu_brand_string(); int system_cpu_bits(); -bool system_cpu_support_optimized(); +bool system_cpu_support_sse2(); +bool system_cpu_support_sse3(); CCL_NAMESPACE_END |