Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/intern
diff options
context:
space:
mode:
authorBrecht Van Lommel <brechtvanlommel@pandora.be>2011-11-15 19:13:38 +0400
committerBrecht Van Lommel <brechtvanlommel@pandora.be>2011-11-15 19:13:38 +0400
commitdb8024f4b54ac4cf83b5346fe1548c009fd21082 (patch)
treeb005764126eff5502fcbae63048b0d17a46d6778 /intern
parent2bc78219135eba9b8079dc69ea7fd062a283a9b3 (diff)
Fix #29259: cycles issues on certain processors. Now two versions of the kernel
are compiled, one SSE optimized and the other not, and it will choose between them at runtime.
Diffstat (limited to 'intern')
-rw-r--r--intern/cycles/CMakeLists.txt31
-rw-r--r--intern/cycles/SConscript28
-rw-r--r--intern/cycles/device/device_cpu.cpp63
-rw-r--r--intern/cycles/kernel/CMakeLists.txt8
-rw-r--r--intern/cycles/kernel/kernel.h7
-rw-r--r--intern/cycles/kernel/kernel_optimized.cpp60
-rw-r--r--intern/cycles/util/util_system.cpp73
-rw-r--r--intern/cycles/util/util_system.h1
8 files changed, 226 insertions, 45 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index d1ee5e0050d..cfff7485e61 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -9,31 +9,18 @@ include(cmake/external_libs.cmake)
# Build Flags
if(WITH_RAYOPTIMIZATION AND SUPPORT_SSE_BUILD)
- set(GCC_OPTIM_FLAGS "-ffast-math -msse -msse2 -msse3")
-endif()
-
-if(APPLE)
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_OPTIM_FLAGS}")
- set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
-endif()
+ set(WITH_CYCLES_OPTIMIZED_KERNEL ON)
-if(WIN32)
- if(MSVC)
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Ox /Ot /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /EHsc /fp:fast")
- set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
+ if(WIN32 AND MSVC)
+ set(CYCLES_OPTIMIZED_KERNEL_FLAGS "/Ox /Ot /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /EHsc /fp:fast")
elseif(CMAKE_COMPILER_IS_GNUCC)
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_OPTIM_FLAGS}")
- set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
+ set(CYCLES_OPTIMIZED_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3") # same SSE flag set as the SCons build; no extra defines
endif()
endif()
-if(UNIX AND NOT APPLE)
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_OPTIM_FLAGS}")
- set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
-endif()
-
-# not needed yet, is for open shading language
-set(RTTI_DISABLE_FLAGS "")
+# for OSL, not needed yet
+# set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
+# set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
# Definitions and Includes
@@ -42,6 +29,10 @@ add_definitions(${BOOST_DEFINITIONS} ${OPENIMAGEIO_DEFINITIONS})
add_definitions(-DCCL_NAMESPACE_BEGIN=namespace\ ccl\ {)
add_definitions(-DCCL_NAMESPACE_END=})
+if(WITH_CYCLES_OPTIMIZED_KERNEL)
+ add_definitions(-DWITH_OPTIMIZED_KERNEL)
+endif()
+
if(WITH_CYCLES_NETWORK)
add_definitions(-DWITH_NETWORK)
endif()
diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript
index e2c81edea37..1acb7321f09 100644
--- a/intern/cycles/SConscript
+++ b/intern/cycles/SConscript
@@ -10,11 +10,10 @@ sources = cycles.Glob('bvh/*.cpp') + cycles.Glob('device/*.cpp') + cycles.Glob('
sources.remove(path.join('util', 'util_view.cpp'))
sources.remove(path.join('render', 'film_response.cpp'))
+sources.remove(path.join('kernel', 'kernel_optimized.cpp'))
incs = []
defs = []
-ccflags = []
-cxxflags = []
defs.append('CCL_NAMESPACE_BEGIN=namespace ccl {')
defs.append('CCL_NAMESPACE_END=}')
@@ -23,14 +22,6 @@ defs.append('WITH_OPENCL')
defs.append('WITH_MULTI')
defs.append('WITH_CUDA')
-if env['OURPLATFORM'] in ('win32-mingw'):
- if env['WITH_BF_RAYOPTIMIZATION']:
- cxxflags.append('-ffast-math -msse -msse2 -msse3'.split())
- ccflags.append('-ffast-math -msse -msse2 -msse3'.split())
- # not needed yet, is for open shading language
- # cxxflags.append('-fno-rtti'.split())
- # defs.append('BOOST_NO_RTTI BOOST_NO_TYPEID'.split())
-
incs.extend('. bvh render device kernel kernel/osl kernel/svm util subd'.split())
incs.extend('#intern/guardedalloc #source/blender/makesrna #source/blender/makesdna'.split())
incs.extend('#source/blender/blenloader ../../source/blender/makesrna/intern'.split())
@@ -39,5 +30,20 @@ incs.append(cycles['BF_OIIO_INC'])
incs.append(cycles['BF_BOOST_INC'])
incs.append(cycles['BF_PYTHON_INC'])
-cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], compileflags=[None], cc_compileflags=ccflags, cxx_compileflags=cxxflags)
+# optimized kernel
+if env['WITH_BF_RAYOPTIMIZATION']:
+ optim_cxxflags = []
+
+ if env['OURPLATFORM'] in ('win32-vc', 'win64-vc'):
+ optim_cxxflags.append('/Ox /Ot /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /EHsc /fp:fast'.split())
+ else:
+ optim_cxxflags.append('-ffast-math -msse -msse2 -msse3'.split())
+
+ optim_defs = defs + ['WITH_OPTIMIZED_KERNEL']
+ optim_sources = [path.join('kernel', 'kernel_optimized.cpp')]
+
+ cycles_optim = cycles.Clone()
+ cycles_optim.BlenderLib('bf_intern_cycles_optimized', optim_sources, incs, optim_defs, libtype=['intern'], priority=[0], compileflags=[None], cxx_compileflags=optim_cxxflags)
+
+cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], compileflags=[None])
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index d6e1c200996..990b7cb94b0 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -48,6 +48,9 @@ public:
{
kg = kernel_globals_create();
+ /* do now to avoid thread issues */
+ system_cpu_support_optimized();
+
if(threads_num == 0)
threads_num = system_cpu_thread_count();
@@ -155,12 +158,26 @@ public:
OSLShader::thread_init(kg);
#endif
- for(int y = task.y; y < task.y + task.h; y++) {
- for(int x = task.x; x < task.x + task.w; x++)
- kernel_cpu_path_trace(kg, (float4*)task.buffer, (unsigned int*)task.rng_state, task.sample, x, y);
+#ifdef WITH_OPTIMIZED_KERNEL
+ if(system_cpu_support_optimized()) {
+ for(int y = task.y; y < task.y + task.h; y++) {
+ for(int x = task.x; x < task.x + task.w; x++)
+ kernel_cpu_optimized_path_trace(kg, (float4*)task.buffer, (unsigned int*)task.rng_state, task.sample, x, y);
- if(tasks.worker_cancel())
- break;
+ if(tasks.worker_cancel())
+ break;
+ }
+ }
+ else
+#endif
+ {
+ for(int y = task.y; y < task.y + task.h; y++) {
+ for(int x = task.x; x < task.x + task.w; x++)
+ kernel_cpu_path_trace(kg, (float4*)task.buffer, (unsigned int*)task.rng_state, task.sample, x, y);
+
+ if(tasks.worker_cancel())
+ break;
+ }
}
#ifdef WITH_OSL
@@ -171,9 +188,18 @@ public:
void thread_tonemap(DeviceTask& task)
{
- for(int y = task.y; y < task.y + task.h; y++) {
- for(int x = task.x; x < task.x + task.w; x++)
- kernel_cpu_tonemap(kg, (uchar4*)task.rgba, (float4*)task.buffer, task.sample, task.resolution, x, y);
+#ifdef WITH_OPTIMIZED_KERNEL
+ if(system_cpu_support_optimized()) {
+ for(int y = task.y; y < task.y + task.h; y++)
+ for(int x = task.x; x < task.x + task.w; x++)
+ kernel_cpu_optimized_tonemap(kg, (uchar4*)task.rgba, (float4*)task.buffer, task.sample, task.resolution, x, y);
+ }
+ else
+#endif
+ {
+ for(int y = task.y; y < task.y + task.h; y++)
+ for(int x = task.x; x < task.x + task.w; x++)
+ kernel_cpu_tonemap(kg, (uchar4*)task.rgba, (float4*)task.buffer, task.sample, task.resolution, x, y);
}
}
@@ -184,11 +210,24 @@ public:
OSLShader::thread_init(kg);
#endif
- for(int x = task.displace_x; x < task.displace_x + task.displace_w; x++) {
- kernel_cpu_displace(kg, (uint4*)task.displace_input, (float3*)task.displace_offset, x);
+#ifdef WITH_OPTIMIZED_KERNEL
+ if(system_cpu_support_optimized()) {
+ for(int x = task.displace_x; x < task.displace_x + task.displace_w; x++) {
+ kernel_cpu_optimized_displace(kg, (uint4*)task.displace_input, (float3*)task.displace_offset, x);
+
+ if(tasks.worker_cancel())
+ break;
+ }
+ }
+ else
+#endif
+ {
+ for(int x = task.displace_x; x < task.displace_x + task.displace_w; x++) {
+ kernel_cpu_displace(kg, (uint4*)task.displace_input, (float3*)task.displace_offset, x);
- if(tasks.worker_cancel())
- break;
+ if(tasks.worker_cancel())
+ break;
+ }
}
#ifdef WITH_OSL
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 2bfb6c58120..73425486be1 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -8,6 +8,7 @@ set(INC
set(SRC
kernel.cpp
+ kernel_optimized.cpp
kernel.cl
kernel.cu
)
@@ -123,11 +124,15 @@ include_directories(${INC})
add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_SVM_HEADERS})
+if(WITH_CYCLES_OPTIMIZED_KERNEL)
+ SET_SOURCE_FILES_PROPERTIES(kernel_optimized.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_OPTIMIZED_KERNEL_FLAGS}") # quoted: the flags are unset for compilers other than MSVC/GCC, and the property still needs a value
+endif()
+
if(WITH_CYCLES_CUDA)
add_dependencies(cycles_kernel cycles_kernel_cuda)
endif()
-# OPENCL kernel
+# OpenCL kernel
#set(KERNEL_PREPROCESSED ${CMAKE_CURRENT_BINARY_DIR}/kernel_preprocessed.cl)
#add_custom_command(
@@ -142,3 +147,4 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cu" ${CYCLES_INSTALL_PATH}/k
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
+
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 7f60730e8bf..700ee49c5f2 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -38,9 +38,14 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
void kernel_cpu_path_trace(KernelGlobals *kg, float4 *buffer, unsigned int *rng_state, int sample, int x, int y);
void kernel_cpu_tonemap(KernelGlobals *kg, uchar4 *rgba, float4 *buffer, int sample, int resolution, int x, int y);
-
void kernel_cpu_displace(KernelGlobals *kg, uint4 *input, float3 *offset, int i);
+#ifdef WITH_OPTIMIZED_KERNEL
+void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float4 *buffer, unsigned int *rng_state, int sample, int x, int y);
+void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float4 *buffer, int sample, int resolution, int x, int y);
+void kernel_cpu_optimized_displace(KernelGlobals *kg, uint4 *input, float3 *offset, int i);
+#endif
+
CCL_NAMESPACE_END
#endif /* __KERNEL_H__ */
diff --git a/intern/cycles/kernel/kernel_optimized.cpp b/intern/cycles/kernel/kernel_optimized.cpp
new file mode 100644
index 00000000000..85a2b798a62
--- /dev/null
+++ b/intern/cycles/kernel/kernel_optimized.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2011, Blender Foundation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3
+ optimization flags and nearly all functions inlined, while kernel.cpp
+ is compiled without them for other CPUs. */
+
+#ifdef WITH_OPTIMIZED_KERNEL
+
+#include "kernel.h"
+#include "kernel_compat_cpu.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_displace.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing: entry point built with the optimized kernel flags, forwards to kernel_path_trace() */
+
+void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float4 *buffer, unsigned int *rng_state, int sample, int x, int y)
+{
+ kernel_path_trace(kg, buffer, rng_state, sample, x, y);
+}
+
+/* Tonemapping: entry point built with the optimized kernel flags, forwards to kernel_film_tonemap() */
+
+void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float4 *buffer, int sample, int resolution, int x, int y)
+{
+ kernel_film_tonemap(kg, rgba, buffer, sample, resolution, x, y);
+}
+
+/* Displacement: entry point built with the optimized kernel flags, forwards to kernel_displace() */
+
+void kernel_cpu_optimized_displace(KernelGlobals *kg, uint4 *input, float3 *offset, int i)
+{
+ kernel_displace(kg, input, offset, i);
+}
+
+CCL_NAMESPACE_END
+
+#endif
+
diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp
index 8b09f227a74..abf5e08de97 100644
--- a/intern/cycles/util/util_system.cpp
+++ b/intern/cycles/util/util_system.cpp
@@ -118,5 +118,78 @@ int system_cpu_bits()
return (sizeof(void*)*8);
}
+#if defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86)
+
+struct CPUCapabilities { /* one flag per x86 feature, zeroed and then filled in by system_cpu_support_optimized() */
+ bool x64; /* only assigned by code that is currently commented out, so it stays false */
+ bool mmx;
+ bool sse;
+ bool sse2;
+ bool sse3;
+ bool ssse3;
+ bool sse41;
+ bool sse42;
+ bool sse4a; /* only assigned by code that is currently commented out, so it stays false */
+ bool avx;
+ bool xop; /* only assigned by code that is currently commented out, so it stays false */
+ bool fma3;
+ bool fma4; /* only assigned by code that is currently commented out, so it stays false */
+};
+
+bool system_cpu_support_optimized() /* true when the CPU reports SSE, SSE2 and SSE3; the CPUID result is cached after the first call */
+{
+ static CPUCapabilities caps;
+ static bool caps_init = false; /* plain flag with no locking here; NOTE(review): the first call is expected to happen before worker threads start -- confirm for all callers */
+
+ if(!caps_init) {
+ int result[4], num, num_ex;
+
+ memset(&caps, 0, sizeof(caps)); /* every capability defaults to false */
+
+ __cpuid(result, 0);
+ num = result[0]; /* checked below before leaf 1 is queried */
+
+ __cpuid(result, 0x80000000);
+ num_ex = result[0]; /* only read by the commented-out block further down */
+
+ if(num >= 1){
+ __cpuid(result, 0x00000001); /* feature bits; presumably result[2]/result[3] are ECX/EDX as with MSVC's __cpuid -- confirm against the __cpuid used by this file */
+ caps.mmx = (result[3] & ((int)1 << 23)) != 0;
+ caps.sse = (result[3] & ((int)1 << 25)) != 0;
+ caps.sse2 = (result[3] & ((int)1 << 26)) != 0;
+ caps.sse3 = (result[2] & ((int)1 << 0)) != 0;
+
+ caps.ssse3 = (result[2] & ((int)1 << 9)) != 0;
+ caps.sse41 = (result[2] & ((int)1 << 19)) != 0;
+ caps.sse42 = (result[2] & ((int)1 << 20)) != 0;
+
+ caps.avx = (result[2] & ((int)1 << 28)) != 0;
+ caps.fma3 = (result[2] & ((int)1 << 12)) != 0;
+ }
+
+ /*if(num_ex >= 0x80000001){
+ __cpuid(result, 0x80000001);
+ caps.x64 = (result[3] & ((int)1 << 29)) != 0;
+ caps.sse4a = (result[2] & ((int)1 << 6)) != 0;
+ caps.fma4 = (result[2] & ((int)1 << 16)) != 0;
+ caps.xop = (result[2] & ((int)1 << 11)) != 0;
+ }*/
+
+ caps_init = true;
+ }
+
+ /* the optimized kernel is built with flags up to SSE3 (-msse -msse2 -msse3 on GCC), so all three must be present */
+ return caps.sse && caps.sse2 && caps.sse3;
+}
+
+#else
+
+bool system_cpu_support_optimized() /* non-x86 build (the #else branch of the architecture check): the optimized kernel is never selected */
+{
+ return false;
+}
+
+#endif
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h
index 214b3a18ca3..f25e009a250 100644
--- a/intern/cycles/util/util_system.h
+++ b/intern/cycles/util/util_system.h
@@ -26,6 +26,7 @@ CCL_NAMESPACE_BEGIN
int system_cpu_thread_count();
string system_cpu_brand_string();
int system_cpu_bits();
+bool system_cpu_support_optimized();
CCL_NAMESPACE_END