Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Brecht Van Lommel <brechtvanlommel@pandora.be> 2011-11-15 19:13:38 +0400
committer Brecht Van Lommel <brechtvanlommel@pandora.be> 2011-11-15 19:13:38 +0400
commit db8024f4b54ac4cf83b5346fe1548c009fd21082 (patch)
tree b005764126eff5502fcbae63048b0d17a46d6778
parent 2bc78219135eba9b8079dc69ea7fd062a283a9b3 (diff)
Fix #29259: cycles issues on certain processors. Now two versions of the kernel
are compiled, one SSE optimized and the other not, and it will choose between them at runtime.
-rw-r--r-- intern/cycles/CMakeLists.txt 31
-rw-r--r-- intern/cycles/SConscript 28
-rw-r--r-- intern/cycles/device/device_cpu.cpp 63
-rw-r--r-- intern/cycles/kernel/CMakeLists.txt 8
-rw-r--r-- intern/cycles/kernel/kernel.h 7
-rw-r--r-- intern/cycles/kernel/kernel_optimized.cpp 60
-rw-r--r-- intern/cycles/util/util_system.cpp 73
-rw-r--r-- intern/cycles/util/util_system.h 1
8 files changed, 226 insertions, 45 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index d1ee5e0050d..cfff7485e61 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -9,31 +9,18 @@ include(cmake/external_libs.cmake)
# Build Flags
if(WITH_RAYOPTIMIZATION AND SUPPORT_SSE_BUILD)
- set(GCC_OPTIM_FLAGS "-ffast-math -msse -msse2 -msse3")
-endif()
-
-if(APPLE)
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_OPTIM_FLAGS}")
- set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
-endif()
+ set(WITH_CYCLES_OPTIMIZED_KERNEL ON)
-if(WIN32)
- if(MSVC)
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Ox /Ot /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /EHsc /fp:fast")
- set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
+ if(WIN32 AND MSVC)
+ set(CYCLES_OPTIMIZED_KERNEL_FLAGS "/Ox /Ot /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /EHsc /fp:fast")
elseif(CMAKE_COMPILER_IS_GNUCC)
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_OPTIM_FLAGS}")
- set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
+ set(CYCLES_OPTIMIZED_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -DGOGOGO")
endif()
endif()
-if(UNIX AND NOT APPLE)
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_OPTIM_FLAGS}")
- set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
-endif()
-
-# not needed yet, is for open shading language
-set(RTTI_DISABLE_FLAGS "")
+# for OSL, not needed yet
+# set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
+# set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
# Definitions and Includes
@@ -42,6 +29,10 @@ add_definitions(${BOOST_DEFINITIONS} ${OPENIMAGEIO_DEFINITIONS})
add_definitions(-DCCL_NAMESPACE_BEGIN=namespace\ ccl\ {)
add_definitions(-DCCL_NAMESPACE_END=})
+if(WITH_CYCLES_OPTIMIZED_KERNEL)
+ add_definitions(-DWITH_OPTIMIZED_KERNEL)
+endif()
+
if(WITH_CYCLES_NETWORK)
add_definitions(-DWITH_NETWORK)
endif()
diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript
index e2c81edea37..1acb7321f09 100644
--- a/intern/cycles/SConscript
+++ b/intern/cycles/SConscript
@@ -10,11 +10,10 @@ sources = cycles.Glob('bvh/*.cpp') + cycles.Glob('device/*.cpp') + cycles.Glob('
sources.remove(path.join('util', 'util_view.cpp'))
sources.remove(path.join('render', 'film_response.cpp'))
+sources.remove(path.join('kernel', 'kernel_optimized.cpp'))
incs = []
defs = []
-ccflags = []
-cxxflags = []
defs.append('CCL_NAMESPACE_BEGIN=namespace ccl {')
defs.append('CCL_NAMESPACE_END=}')
@@ -23,14 +22,6 @@ defs.append('WITH_OPENCL')
defs.append('WITH_MULTI')
defs.append('WITH_CUDA')
-if env['OURPLATFORM'] in ('win32-mingw'):
- if env['WITH_BF_RAYOPTIMIZATION']:
- cxxflags.append('-ffast-math -msse -msse2 -msse3'.split())
- ccflags.append('-ffast-math -msse -msse2 -msse3'.split())
- # not needed yet, is for open shading language
- # cxxflags.append('-fno-rtti'.split())
- # defs.append('BOOST_NO_RTTI BOOST_NO_TYPEID'.split())
-
incs.extend('. bvh render device kernel kernel/osl kernel/svm util subd'.split())
incs.extend('#intern/guardedalloc #source/blender/makesrna #source/blender/makesdna'.split())
incs.extend('#source/blender/blenloader ../../source/blender/makesrna/intern'.split())
@@ -39,5 +30,20 @@ incs.append(cycles['BF_OIIO_INC'])
incs.append(cycles['BF_BOOST_INC'])
incs.append(cycles['BF_PYTHON_INC'])
-cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], compileflags=[None], cc_compileflags=ccflags, cxx_compileflags=cxxflags)
+# optimized kernel
+if env['WITH_BF_RAYOPTIMIZATION']:
+ optim_cxxflags = []
+
+ if env['OURPLATFORM'] in ('win32-vc', 'win64-vc'):
+ optim_cxxflags.append('/Ox /Ot /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /EHsc /fp:fast'.split())
+ else:
+ optim_cxxflags.append('-ffast-math -msse -msse2 -msse3'.split())
+
+ optim_defs = defs + ['WITH_OPTIMIZED_KERNEL']
+ optim_sources = [path.join('kernel', 'kernel_optimized.cpp')]
+
+ cycles_optim = cycles.Clone()
+ cycles_optim.BlenderLib('bf_intern_cycles_optimized', optim_sources, incs, optim_defs, libtype=['intern'], priority=[0], compileflags=[None], cxx_compileflags=optim_cxxflags)
+
+cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], compileflags=[None])
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index d6e1c200996..990b7cb94b0 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -48,6 +48,9 @@ public:
{
kg = kernel_globals_create();
+ /* query CPU support once up front, so the cached result is already initialized and thread issues are avoided */
+ system_cpu_support_optimized();
+
if(threads_num == 0)
threads_num = system_cpu_thread_count();
@@ -155,12 +158,26 @@ public:
OSLShader::thread_init(kg);
#endif
- for(int y = task.y; y < task.y + task.h; y++) {
- for(int x = task.x; x < task.x + task.w; x++)
- kernel_cpu_path_trace(kg, (float4*)task.buffer, (unsigned int*)task.rng_state, task.sample, x, y);
+#ifdef WITH_OPTIMIZED_KERNEL
+ if(system_cpu_support_optimized()) {
+ for(int y = task.y; y < task.y + task.h; y++) {
+ for(int x = task.x; x < task.x + task.w; x++)
+ kernel_cpu_optimized_path_trace(kg, (float4*)task.buffer, (unsigned int*)task.rng_state, task.sample, x, y);
- if(tasks.worker_cancel())
- break;
+ if(tasks.worker_cancel())
+ break;
+ }
+ }
+ else
+#endif
+ {
+ for(int y = task.y; y < task.y + task.h; y++) {
+ for(int x = task.x; x < task.x + task.w; x++)
+ kernel_cpu_path_trace(kg, (float4*)task.buffer, (unsigned int*)task.rng_state, task.sample, x, y);
+
+ if(tasks.worker_cancel())
+ break;
+ }
}
#ifdef WITH_OSL
@@ -171,9 +188,18 @@ public:
void thread_tonemap(DeviceTask& task)
{
- for(int y = task.y; y < task.y + task.h; y++) {
- for(int x = task.x; x < task.x + task.w; x++)
- kernel_cpu_tonemap(kg, (uchar4*)task.rgba, (float4*)task.buffer, task.sample, task.resolution, x, y);
+#ifdef WITH_OPTIMIZED_KERNEL
+ if(system_cpu_support_optimized()) {
+ for(int y = task.y; y < task.y + task.h; y++)
+ for(int x = task.x; x < task.x + task.w; x++)
+ kernel_cpu_optimized_tonemap(kg, (uchar4*)task.rgba, (float4*)task.buffer, task.sample, task.resolution, x, y);
+ }
+ else
+#endif
+ {
+ for(int y = task.y; y < task.y + task.h; y++)
+ for(int x = task.x; x < task.x + task.w; x++)
+ kernel_cpu_tonemap(kg, (uchar4*)task.rgba, (float4*)task.buffer, task.sample, task.resolution, x, y);
}
}
@@ -184,11 +210,24 @@ public:
OSLShader::thread_init(kg);
#endif
- for(int x = task.displace_x; x < task.displace_x + task.displace_w; x++) {
- kernel_cpu_displace(kg, (uint4*)task.displace_input, (float3*)task.displace_offset, x);
+#ifdef WITH_OPTIMIZED_KERNEL
+ if(system_cpu_support_optimized()) {
+ for(int x = task.displace_x; x < task.displace_x + task.displace_w; x++) {
+ kernel_cpu_optimized_displace(kg, (uint4*)task.displace_input, (float3*)task.displace_offset, x);
+
+ if(tasks.worker_cancel())
+ break;
+ }
+ }
+ else
+#endif
+ {
+ for(int x = task.displace_x; x < task.displace_x + task.displace_w; x++) {
+ kernel_cpu_displace(kg, (uint4*)task.displace_input, (float3*)task.displace_offset, x);
- if(tasks.worker_cancel())
- break;
+ if(tasks.worker_cancel())
+ break;
+ }
}
#ifdef WITH_OSL
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 2bfb6c58120..73425486be1 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -8,6 +8,7 @@ set(INC
set(SRC
kernel.cpp
+ kernel_optimized.cpp
kernel.cl
kernel.cu
)
@@ -123,11 +124,15 @@ include_directories(${INC})
add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_SVM_HEADERS})
+if(WITH_CYCLES_OPTIMIZED_KERNEL)
+ SET_SOURCE_FILES_PROPERTIES(kernel_optimized.cpp PROPERTIES COMPILE_FLAGS ${CYCLES_OPTIMIZED_KERNEL_FLAGS})
+endif()
+
if(WITH_CYCLES_CUDA)
add_dependencies(cycles_kernel cycles_kernel_cuda)
endif()
-# OPENCL kernel
+# OpenCL kernel
#set(KERNEL_PREPROCESSED ${CMAKE_CURRENT_BINARY_DIR}/kernel_preprocessed.cl)
#add_custom_command(
@@ -142,3 +147,4 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cu" ${CYCLES_INSTALL_PATH}/k
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
+
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 7f60730e8bf..700ee49c5f2 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -38,9 +38,14 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
void kernel_cpu_path_trace(KernelGlobals *kg, float4 *buffer, unsigned int *rng_state, int sample, int x, int y);
void kernel_cpu_tonemap(KernelGlobals *kg, uchar4 *rgba, float4 *buffer, int sample, int resolution, int x, int y);
-
void kernel_cpu_displace(KernelGlobals *kg, uint4 *input, float3 *offset, int i);
+#ifdef WITH_OPTIMIZED_KERNEL
+void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float4 *buffer, unsigned int *rng_state, int sample, int x, int y);
+void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float4 *buffer, int sample, int resolution, int x, int y);
+void kernel_cpu_optimized_displace(KernelGlobals *kg, uint4 *input, float3 *offset, int i);
+#endif
+
CCL_NAMESPACE_END
#endif /* __KERNEL_H__ */
diff --git a/intern/cycles/kernel/kernel_optimized.cpp b/intern/cycles/kernel/kernel_optimized.cpp
new file mode 100644
index 00000000000..85a2b798a62
--- /dev/null
+++ b/intern/cycles/kernel/kernel_optimized.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2011, Blender Foundation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3
+ optimization flags and nearly all functions inlined, while kernel.cpp
+ is compiled without those flags for other CPUs. */
+
+#ifdef WITH_OPTIMIZED_KERNEL
+
+#include "kernel.h"
+#include "kernel_compat_cpu.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_displace.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing */
+
+void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float4 *buffer, unsigned int *rng_state, int sample, int x, int y)
+{
+ kernel_path_trace(kg, buffer, rng_state, sample, x, y);
+}
+
+/* Tonemapping */
+
+void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float4 *buffer, int sample, int resolution, int x, int y)
+{
+ kernel_film_tonemap(kg, rgba, buffer, sample, resolution, x, y);
+}
+
+/* Displacement */
+
+void kernel_cpu_optimized_displace(KernelGlobals *kg, uint4 *input, float3 *offset, int i)
+{
+ kernel_displace(kg, input, offset, i);
+}
+
+CCL_NAMESPACE_END
+
+#endif
+
diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp
index 8b09f227a74..abf5e08de97 100644
--- a/intern/cycles/util/util_system.cpp
+++ b/intern/cycles/util/util_system.cpp
@@ -118,5 +118,78 @@ int system_cpu_bits()
return (sizeof(void*)*8);
}
+#if defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86)
+
+struct CPUCapabilities {
+ bool x64;
+ bool mmx;
+ bool sse;
+ bool sse2;
+ bool sse3;
+ bool ssse3;
+ bool sse41;
+ bool sse42;
+ bool sse4a;
+ bool avx;
+ bool xop;
+ bool fma3;
+ bool fma4;
+};
+
+bool system_cpu_support_optimized()
+{
+ static CPUCapabilities caps;
+ static bool caps_init = false;
+
+ if(!caps_init) {
+ int result[4], num, num_ex;
+
+ memset(&caps, 0, sizeof(caps));
+
+ __cpuid(result, 0);
+ num = result[0];
+
+ __cpuid(result, 0x80000000);
+ num_ex = result[0];
+
+ if(num >= 1){
+ __cpuid(result, 0x00000001);
+ caps.mmx = (result[3] & ((int)1 << 23)) != 0;
+ caps.sse = (result[3] & ((int)1 << 25)) != 0;
+ caps.sse2 = (result[3] & ((int)1 << 26)) != 0;
+ caps.sse3 = (result[2] & ((int)1 << 0)) != 0;
+
+ caps.ssse3 = (result[2] & ((int)1 << 9)) != 0;
+ caps.sse41 = (result[2] & ((int)1 << 19)) != 0;
+ caps.sse42 = (result[2] & ((int)1 << 20)) != 0;
+
+ caps.avx = (result[2] & ((int)1 << 28)) != 0;
+ caps.fma3 = (result[2] & ((int)1 << 12)) != 0;
+ }
+
+ /*if(num_ex >= 0x80000001){
+ __cpuid(result, 0x80000001);
+ caps.x64 = (result[3] & ((int)1 << 29)) != 0;
+ caps.sse4a = (result[2] & ((int)1 << 6)) != 0;
+ caps.fma4 = (result[2] & ((int)1 << 16)) != 0;
+ caps.xop = (result[2] & ((int)1 << 11)) != 0;
+ }*/
+
+ caps_init = true;
+ }
+
+ /* the optimized kernel's compile flags enable these instruction sets, so require all of them */
+ return caps.sse && caps.sse2 && caps.sse3;
+}
+
+#else
+
+bool system_cpu_support_optimized()
+{
+ return false;
+}
+
+#endif
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h
index 214b3a18ca3..f25e009a250 100644
--- a/intern/cycles/util/util_system.h
+++ b/intern/cycles/util/util_system.h
@@ -26,6 +26,7 @@ CCL_NAMESPACE_BEGIN
int system_cpu_thread_count();
string system_cpu_brand_string();
int system_cpu_bits();
+bool system_cpu_support_optimized();
CCL_NAMESPACE_END