Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- intern/cycles/CMakeLists.txt 6
-rw-r--r-- intern/cycles/SConscript 25
-rw-r--r-- intern/cycles/device/device_cpu.cpp 48
-rw-r--r-- intern/cycles/kernel/CMakeLists.txt 6
-rw-r--r-- intern/cycles/kernel/kernel.h 13
-rw-r--r-- intern/cycles/kernel/kernel_bvh.h 24
-rw-r--r-- intern/cycles/kernel/kernel_sse2.cpp (renamed from intern/cycles/kernel/kernel_optimized.cpp) 6
-rw-r--r-- intern/cycles/kernel/kernel_sse3.cpp 60
-rw-r--r-- intern/cycles/util/util_system.cpp 15
-rw-r--r-- intern/cycles/util/util_system.h 3
10 files changed, 166 insertions, 40 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 535239a9205..226218ae512 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -13,10 +13,12 @@ if(WITH_RAYOPTIMIZATION AND SUPPORT_SSE_BUILD)
endif()
if(WIN32 AND MSVC)
- set(CYCLES_OPTIMIZED_KERNEL_FLAGS "/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc")
+ set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc")
+ set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /EHsc")
elseif(CMAKE_COMPILER_IS_GNUCC)
- set(CYCLES_OPTIMIZED_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mfpmath=sse")
+ set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 -mfpmath=sse")
+ set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mfpmath=sse")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
endif()
diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript
index 19af7dede9f..8a8ef9cce39 100644
--- a/intern/cycles/SConscript
+++ b/intern/cycles/SConscript
@@ -36,7 +36,8 @@ sources = cycles.Glob('bvh/*.cpp') + cycles.Glob('device/*.cpp') + cycles.Glob('
sources.remove(path.join('util', 'util_view.cpp'))
sources.remove(path.join('render', 'film_response.cpp'))
-sources.remove(path.join('kernel', 'kernel_optimized.cpp'))
+sources.remove(path.join('kernel', 'kernel_sse2.cpp'))
+sources.remove(path.join('kernel', 'kernel_sse3.cpp'))
incs = []
defs = []
@@ -73,21 +74,29 @@ if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', '
# optimized kernel
if env['WITH_BF_RAYOPTIMIZATION']:
- optim_cxxflags = Split(env['CXXFLAGS'])
+ sse2_cxxflags = Split(env['CXXFLAGS'])
+ sse3_cxxflags = Split(env['CXXFLAGS'])
if env['OURPLATFORM'] == 'win32-vc':
- optim_cxxflags.append('/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split())
+ sse2_cxxflags.append('/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split())
+ sse3_cxxflags.append('/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split())
elif env['OURPLATFORM'] == 'win64-vc':
- optim_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split())
+ sse2_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split())
+ sse3_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split())
else:
- optim_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mfpmath=sse'.split())
+ sse2_cxxflags.append('-ffast-math -msse -msse2 -mfpmath=sse'.split())
+ sse3_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mfpmath=sse'.split())
defs.append('WITH_OPTIMIZED_KERNEL')
optim_defs = defs[:]
- optim_sources = [path.join('kernel', 'kernel_optimized.cpp')]
- cycles_optim = cycles.Clone()
- cycles_optim.BlenderLib('bf_intern_cycles_optimized', optim_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=optim_cxxflags)
+ cycles_sse3 = cycles.Clone()
+ sse3_sources = [path.join('kernel', 'kernel_sse3.cpp')]
+ cycles_sse3.BlenderLib('bf_intern_cycles_sse3', sse3_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse3_cxxflags)
+
+ cycles_sse2 = cycles.Clone()
+ sse2_sources = [path.join('kernel', 'kernel_sse2.cpp')]
+ cycles_sse2.BlenderLib('bf_intern_cycles_sse2', sse2_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse2_cxxflags)
cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], cxx_compileflags=cxxflags)
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index a1d7706a34e..1915245bb55 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -58,7 +58,8 @@ public:
#endif
/* do now to avoid thread issues */
- system_cpu_support_optimized();
+ system_cpu_support_sse2();
+ system_cpu_support_sse3();
}
~CPUDevice()
@@ -170,7 +171,7 @@ public:
int end_sample = tile.start_sample + tile.num_samples;
#ifdef WITH_OPTIMIZED_KERNEL
- if(system_cpu_support_optimized()) {
+ /* SSE3 support implies SSE2 support, so SSE3 must be tested first;
+  * otherwise the SSE2 branch always wins and the SSE3 kernel is never used. */
+ if(system_cpu_support_sse3()) {
for(int sample = start_sample; sample < end_sample; sample++) {
if (task.get_cancel() || task_pool.cancelled()) {
if(task.need_finish_queue == false)
@@ -179,7 +180,26 @@ public:
for(int y = tile.y; y < tile.y + tile.h; y++) {
for(int x = tile.x; x < tile.x + tile.w; x++) {
- kernel_cpu_optimized_path_trace(&kg, render_buffer, rng_state,
+ kernel_cpu_sse3_path_trace(&kg, render_buffer, rng_state,
+ sample, x, y, tile.offset, tile.stride);
+ }
+ }
+
+ tile.sample = sample + 1;
+
+ task.update_progress(tile);
+ }
+ }
+ else if(system_cpu_support_sse2()) {
+ for(int sample = start_sample; sample < end_sample; sample++) {
+ if (task.get_cancel() || task_pool.cancelled()) {
+ if(task.need_finish_queue == false)
+ break;
+ }
+
+ for(int y = tile.y; y < tile.y + tile.h; y++) {
+ for(int x = tile.x; x < tile.x + tile.w; x++) {
+ kernel_cpu_sse2_path_trace(&kg, render_buffer, rng_state,
sample, x, y, tile.offset, tile.stride);
}
}
@@ -227,10 +247,16 @@ public:
void thread_tonemap(DeviceTask& task)
{
#ifdef WITH_OPTIMIZED_KERNEL
- if(system_cpu_support_optimized()) {
+ /* SSE3 support implies SSE2 support, so SSE3 must be tested first;
+  * otherwise the SSE2 branch always wins and the SSE3 kernel is never used. */
+ if(system_cpu_support_sse3()) {
+ for(int y = task.y; y < task.y + task.h; y++)
+ for(int x = task.x; x < task.x + task.w; x++)
+ kernel_cpu_sse3_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
+ task.sample, task.resolution, x, y, task.offset, task.stride);
+ }
+ else if(system_cpu_support_sse2()) {
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
- kernel_cpu_optimized_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
+ kernel_cpu_sse2_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
task.sample, task.resolution, x, y, task.offset, task.stride);
}
else
@@ -252,9 +278,17 @@ public:
#endif
#ifdef WITH_OPTIMIZED_KERNEL
- if(system_cpu_support_optimized()) {
+ /* SSE3 support implies SSE2 support, so SSE3 must be tested first;
+  * otherwise the SSE2 branch always wins and the SSE3 kernel is never used. */
+ if(system_cpu_support_sse3()) {
+ for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
+ kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+
+ if(task_pool.cancelled())
+ break;
+ }
+ }
+ else if(system_cpu_support_sse2()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
- kernel_cpu_optimized_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+ kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
if(task_pool.cancelled())
break;
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 6d5b9a063a0..e83756b7c8a 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -12,7 +12,8 @@ set(INC_SYS
set(SRC
kernel.cpp
- kernel_optimized.cpp
+ kernel_sse2.cpp
+ kernel_sse3.cpp
kernel.cl
kernel.cu
)
@@ -149,7 +150,8 @@ include_directories(SYSTEM ${INC_SYS})
add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS})
if(WITH_CYCLES_OPTIMIZED_KERNEL)
- set_source_files_properties(kernel_optimized.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_OPTIMIZED_KERNEL_FLAGS}")
+ set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+ set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
endif()
if(WITH_CYCLES_CUDA)
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 26c0bcd6d1a..20ea5a61906 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -44,11 +44,18 @@ void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output,
int type, int i);
#ifdef WITH_OPTIMIZED_KERNEL
-void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
+void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
int sample, int x, int y, int offset, int stride);
-void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+void kernel_cpu_sse2_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer,
int sample, int resolution, int x, int y, int offset, int stride);
-void kernel_cpu_optimized_shader(KernelGlobals *kg, uint4 *input, float4 *output,
+void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
+ int type, int i);
+
+void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
+ int sample, int x, int y, int offset, int stride);
+void kernel_cpu_sse3_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+ int sample, int resolution, int x, int y, int offset, int stride);
+void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output,
int type, int i);
#endif
diff --git a/intern/cycles/kernel/kernel_bvh.h b/intern/cycles/kernel/kernel_bvh.h
index 1a85b5bbefd..2b9ebf35d0c 100644
--- a/intern/cycles/kernel/kernel_bvh.h
+++ b/intern/cycles/kernel/kernel_bvh.h
@@ -126,21 +126,21 @@ __device_inline void bvh_node_intersect(KernelGlobals *kg,
/* intersect ray against child nodes */
float3 ood = P * idir;
- float c0lox = n0xy.x * idir.x - ood.x;
- float c0hix = n0xy.y * idir.x - ood.x;
- float c0loy = n0xy.z * idir.y - ood.y;
- float c0hiy = n0xy.w * idir.y - ood.y;
- float c0loz = nz.x * idir.z - ood.z;
- float c0hiz = nz.y * idir.z - ood.z;
+ NO_EXTENDED_PRECISION float c0lox = n0xy.x * idir.x - ood.x;
+ NO_EXTENDED_PRECISION float c0hix = n0xy.y * idir.x - ood.x;
+ NO_EXTENDED_PRECISION float c0loy = n0xy.z * idir.y - ood.y;
+ NO_EXTENDED_PRECISION float c0hiy = n0xy.w * idir.y - ood.y;
+ NO_EXTENDED_PRECISION float c0loz = nz.x * idir.z - ood.z;
+ NO_EXTENDED_PRECISION float c0hiz = nz.y * idir.z - ood.z;
NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
- float c1loz = nz.z * idir.z - ood.z;
- float c1hiz = nz.w * idir.z - ood.z;
- float c1lox = n1xy.x * idir.x - ood.x;
- float c1hix = n1xy.y * idir.x - ood.x;
- float c1loy = n1xy.z * idir.y - ood.y;
- float c1hiy = n1xy.w * idir.y - ood.y;
+ NO_EXTENDED_PRECISION float c1loz = nz.z * idir.z - ood.z;
+ NO_EXTENDED_PRECISION float c1hiz = nz.w * idir.z - ood.z;
+ NO_EXTENDED_PRECISION float c1lox = n1xy.x * idir.x - ood.x;
+ NO_EXTENDED_PRECISION float c1hix = n1xy.y * idir.x - ood.x;
+ NO_EXTENDED_PRECISION float c1loy = n1xy.z * idir.y - ood.y;
+ NO_EXTENDED_PRECISION float c1hiy = n1xy.w * idir.y - ood.y;
NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
diff --git a/intern/cycles/kernel/kernel_optimized.cpp b/intern/cycles/kernel/kernel_sse2.cpp
index 0b662095133..7947107a43c 100644
--- a/intern/cycles/kernel/kernel_optimized.cpp
+++ b/intern/cycles/kernel/kernel_sse2.cpp
@@ -35,21 +35,21 @@ CCL_NAMESPACE_BEGIN
/* Path Tracing */
-void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
+void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
{
kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
}
/* Tonemapping */
-void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, int sample, int resolution, int x, int y, int offset, int stride)
+void kernel_cpu_sse2_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, int sample, int resolution, int x, int y, int offset, int stride)
{
kernel_film_tonemap(kg, rgba, buffer, sample, resolution, x, y, offset, stride);
}
/* Shader Evaluate */
-void kernel_cpu_optimized_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
{
kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
}
diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernel_sse3.cpp
new file mode 100644
index 00000000000..9a8b389cf68
--- /dev/null
+++ b/intern/cycles/kernel/kernel_sse3.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2011, Blender Foundation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+#ifdef WITH_OPTIMIZED_KERNEL
+
+#include "kernel.h"
+#include "kernel_compat_cpu.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_displace.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing: SSE3 entry point; forwards all arguments unchanged to kernel_path_trace(). */
+
+void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
+{
+ kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+/* Tonemapping: SSE3 entry point; forwards all arguments unchanged to kernel_film_tonemap(). */
+
+void kernel_cpu_sse3_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, int sample, int resolution, int x, int y, int offset, int stride)
+{
+ kernel_film_tonemap(kg, rgba, buffer, sample, resolution, x, y, offset, stride);
+}
+
+/* Shader Evaluate: SSE3 entry point; casts type to ShaderEvalType and forwards to kernel_shader_evaluate(). */
+
+void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+{
+ kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+}
+
+CCL_NAMESPACE_END
+
+#endif
+
diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp
index 2d9f0fffae6..4fda090e09e 100644
--- a/intern/cycles/util/util_system.cpp
+++ b/intern/cycles/util/util_system.cpp
@@ -136,7 +136,7 @@ struct CPUCapabilities {
bool fma4;
};
-bool system_cpu_support_optimized()
+static CPUCapabilities& system_cpu_capabilities()
{
static CPUCapabilities caps;
static bool caps_init = false;
@@ -182,7 +182,18 @@ bool system_cpu_support_optimized()
caps_init = true;
}
- /* optimization flags use these */
+ return caps;
+}
+
+/* Returns true when the capabilities from system_cpu_capabilities() report both SSE and SSE2. */
+bool system_cpu_support_sse2()
+{
+ CPUCapabilities& caps = system_cpu_capabilities();
+ return caps.sse && caps.sse2;
+}
+
+/* Returns true when the capabilities from system_cpu_capabilities() report SSE, SSE2 and SSE3. */
+bool system_cpu_support_sse3()
+{
+ CPUCapabilities& caps = system_cpu_capabilities();
return caps.sse && caps.sse2 && caps.sse3;
}
diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h
index f25e009a250..257112883d1 100644
--- a/intern/cycles/util/util_system.h
+++ b/intern/cycles/util/util_system.h
@@ -26,7 +26,8 @@ CCL_NAMESPACE_BEGIN
int system_cpu_thread_count();
string system_cpu_brand_string();
int system_cpu_bits();
-bool system_cpu_support_optimized();
+bool system_cpu_support_sse2();
+bool system_cpu_support_sse3();
CCL_NAMESPACE_END