diff options
author | Brecht Van Lommel <brechtvanlommel@pandora.be> | 2011-11-15 19:13:38 +0400 |
---|---|---|
committer | Brecht Van Lommel <brechtvanlommel@pandora.be> | 2011-11-15 19:13:38 +0400 |
commit | db8024f4b54ac4cf83b5346fe1548c009fd21082 (patch) | |
tree | b005764126eff5502fcbae63048b0d17a46d6778 | |
parent | 2bc78219135eba9b8079dc69ea7fd062a283a9b3 (diff) |
Fix #29259: Cycles issues on certain processors. Two versions of the kernel
are now compiled, one SSE optimized and one not, and the CPU device chooses
between them at runtime based on the SSE, SSE2 and SSE3 support it detects.
-rw-r--r-- | intern/cycles/CMakeLists.txt | 31 | ||||
-rw-r--r-- | intern/cycles/SConscript | 28 | ||||
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 63 | ||||
-rw-r--r-- | intern/cycles/kernel/CMakeLists.txt | 8 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel.h | 7 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_optimized.cpp | 60 | ||||
-rw-r--r-- | intern/cycles/util/util_system.cpp | 73 | ||||
-rw-r--r-- | intern/cycles/util/util_system.h | 1 |
8 files changed, 226 insertions, 45 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index d1ee5e0050d..cfff7485e61 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -9,31 +9,18 @@ include(cmake/external_libs.cmake) # Build Flags if(WITH_RAYOPTIMIZATION AND SUPPORT_SSE_BUILD) - set(GCC_OPTIM_FLAGS "-ffast-math -msse -msse2 -msse3") -endif() - -if(APPLE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_OPTIM_FLAGS}") - set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID") -endif() + set(WITH_CYCLES_OPTIMIZED_KERNEL ON) -if(WIN32) - if(MSVC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Ox /Ot /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /EHsc /fp:fast") - set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID") + if(WIN32 AND MSVC) + set(CYCLES_OPTIMIZED_KERNEL_FLAGS "/Ox /Ot /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /EHsc /fp:fast") elseif(CMAKE_COMPILER_IS_GNUCC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_OPTIM_FLAGS}") - set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID") + set(CYCLES_OPTIMIZED_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -DGOGOGO") endif() endif() -if(UNIX AND NOT APPLE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_OPTIM_FLAGS}") - set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID") -endif() - -# not needed yet, is for open shading language -set(RTTI_DISABLE_FLAGS "") +# for OSL, not needed yet +# set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID") +# set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID") # Definitions and Includes @@ -42,6 +29,10 @@ add_definitions(${BOOST_DEFINITIONS} ${OPENIMAGEIO_DEFINITIONS}) add_definitions(-DCCL_NAMESPACE_BEGIN=namespace\ ccl\ {) add_definitions(-DCCL_NAMESPACE_END=}) +if(WITH_CYCLES_OPTIMIZED_KERNEL) + add_definitions(-DWITH_OPTIMIZED_KERNEL) +endif() + if(WITH_CYCLES_NETWORK) add_definitions(-DWITH_NETWORK) endif() diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript index 
e2c81edea37..1acb7321f09 100644 --- a/intern/cycles/SConscript +++ b/intern/cycles/SConscript @@ -10,11 +10,10 @@ sources = cycles.Glob('bvh/*.cpp') + cycles.Glob('device/*.cpp') + cycles.Glob(' sources.remove(path.join('util', 'util_view.cpp')) sources.remove(path.join('render', 'film_response.cpp')) +sources.remove(path.join('kernel', 'kernel_optimized.cpp')) incs = [] defs = [] -ccflags = [] -cxxflags = [] defs.append('CCL_NAMESPACE_BEGIN=namespace ccl {') defs.append('CCL_NAMESPACE_END=}') @@ -23,14 +22,6 @@ defs.append('WITH_OPENCL') defs.append('WITH_MULTI') defs.append('WITH_CUDA') -if env['OURPLATFORM'] in ('win32-mingw'): - if env['WITH_BF_RAYOPTIMIZATION']: - cxxflags.append('-ffast-math -msse -msse2 -msse3'.split()) - ccflags.append('-ffast-math -msse -msse2 -msse3'.split()) - # not needed yet, is for open shading language - # cxxflags.append('-fno-rtti'.split()) - # defs.append('BOOST_NO_RTTI BOOST_NO_TYPEID'.split()) - incs.extend('. bvh render device kernel kernel/osl kernel/svm util subd'.split()) incs.extend('#intern/guardedalloc #source/blender/makesrna #source/blender/makesdna'.split()) incs.extend('#source/blender/blenloader ../../source/blender/makesrna/intern'.split()) @@ -39,5 +30,20 @@ incs.append(cycles['BF_OIIO_INC']) incs.append(cycles['BF_BOOST_INC']) incs.append(cycles['BF_PYTHON_INC']) -cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], compileflags=[None], cc_compileflags=ccflags, cxx_compileflags=cxxflags) +# optimized kernel +if env['WITH_BF_RAYOPTIMIZATION']: + optim_cxxflags = [] + + if env['OURPLATFORM'] in ('win32-vc', 'win64-vc'): + optim_cxxflags.append('/Ox /Ot /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /EHsc /fp:fast'.split()) + else: + optim_cxxflags.append('-ffast-math -msse -msse2 -msse3'.split()) + + optim_defs = defs + ['WITH_OPTIMIZED_KERNEL'] + optim_sources = [path.join('kernel', 'kernel_optimized.cpp')] + + cycles_optim = cycles.Clone() + 
cycles_optim.BlenderLib('bf_intern_cycles_optimized', optim_sources, incs, optim_defs, libtype=['intern'], priority=[0], compileflags=[None], cxx_compileflags=optim_cxxflags) + +cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], compileflags=[None]) diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index d6e1c200996..990b7cb94b0 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -48,6 +48,9 @@ public: { kg = kernel_globals_create(); + /* do now to avoid thread issues */ + system_cpu_support_optimized(); + if(threads_num == 0) threads_num = system_cpu_thread_count(); @@ -155,12 +158,26 @@ public: OSLShader::thread_init(kg); #endif - for(int y = task.y; y < task.y + task.h; y++) { - for(int x = task.x; x < task.x + task.w; x++) - kernel_cpu_path_trace(kg, (float4*)task.buffer, (unsigned int*)task.rng_state, task.sample, x, y); +#ifdef WITH_OPTIMIZED_KERNEL + if(system_cpu_support_optimized()) { + for(int y = task.y; y < task.y + task.h; y++) { + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_optimized_path_trace(kg, (float4*)task.buffer, (unsigned int*)task.rng_state, task.sample, x, y); - if(tasks.worker_cancel()) - break; + if(tasks.worker_cancel()) + break; + } + } + else +#endif + { + for(int y = task.y; y < task.y + task.h; y++) { + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_path_trace(kg, (float4*)task.buffer, (unsigned int*)task.rng_state, task.sample, x, y); + + if(tasks.worker_cancel()) + break; + } } #ifdef WITH_OSL @@ -171,9 +188,18 @@ public: void thread_tonemap(DeviceTask& task) { - for(int y = task.y; y < task.y + task.h; y++) { - for(int x = task.x; x < task.x + task.w; x++) - kernel_cpu_tonemap(kg, (uchar4*)task.rgba, (float4*)task.buffer, task.sample, task.resolution, x, y); +#ifdef WITH_OPTIMIZED_KERNEL + if(system_cpu_support_optimized()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = 
task.x; x < task.x + task.w; x++) + kernel_cpu_optimized_tonemap(kg, (uchar4*)task.rgba, (float4*)task.buffer, task.sample, task.resolution, x, y); + } + else +#endif + { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_tonemap(kg, (uchar4*)task.rgba, (float4*)task.buffer, task.sample, task.resolution, x, y); } } @@ -184,11 +210,24 @@ public: OSLShader::thread_init(kg); #endif - for(int x = task.displace_x; x < task.displace_x + task.displace_w; x++) { - kernel_cpu_displace(kg, (uint4*)task.displace_input, (float3*)task.displace_offset, x); +#ifdef WITH_OPTIMIZED_KERNEL + if(system_cpu_support_optimized()) { + for(int x = task.displace_x; x < task.displace_x + task.displace_w; x++) { + kernel_cpu_optimized_displace(kg, (uint4*)task.displace_input, (float3*)task.displace_offset, x); + + if(tasks.worker_cancel()) + break; + } + } + else +#endif + { + for(int x = task.displace_x; x < task.displace_x + task.displace_w; x++) { + kernel_cpu_displace(kg, (uint4*)task.displace_input, (float3*)task.displace_offset, x); - if(tasks.worker_cancel()) - break; + if(tasks.worker_cancel()) + break; + } } #ifdef WITH_OSL diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 2bfb6c58120..73425486be1 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -8,6 +8,7 @@ set(INC set(SRC kernel.cpp + kernel_optimized.cpp kernel.cl kernel.cu ) @@ -123,11 +124,15 @@ include_directories(${INC}) add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_SVM_HEADERS}) +if(WITH_CYCLES_OPTIMIZED_KERNEL) + SET_SOURCE_FILES_PROPERTIES(kernel_optimized.cpp PROPERTIES COMPILE_FLAGS ${CYCLES_OPTIMIZED_KERNEL_FLAGS}) +endif() + if(WITH_CYCLES_CUDA) add_dependencies(cycles_kernel cycles_kernel_cuda) endif() -# OPENCL kernel +# OpenCL kernel #set(KERNEL_PREPROCESSED ${CMAKE_CURRENT_BINARY_DIR}/kernel_preprocessed.cl) #add_custom_command( @@ -142,3 +147,4 @@ 
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cu" ${CYCLES_INSTALL_PATH}/k delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel) + diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index 7f60730e8bf..700ee49c5f2 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -38,9 +38,14 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t void kernel_cpu_path_trace(KernelGlobals *kg, float4 *buffer, unsigned int *rng_state, int sample, int x, int y); void kernel_cpu_tonemap(KernelGlobals *kg, uchar4 *rgba, float4 *buffer, int sample, int resolution, int x, int y); - void kernel_cpu_displace(KernelGlobals *kg, uint4 *input, float3 *offset, int i); +#ifdef WITH_OPTIMIZED_KERNEL +void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float4 *buffer, unsigned int *rng_state, int sample, int x, int y); +void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float4 *buffer, int sample, int resolution, int x, int y); +void kernel_cpu_optimized_displace(KernelGlobals *kg, uint4 *input, float3 *offset, int i); +#endif + CCL_NAMESPACE_END #endif /* __KERNEL_H__ */ diff --git a/intern/cycles/kernel/kernel_optimized.cpp b/intern/cycles/kernel/kernel_optimized.cpp new file mode 100644 index 00000000000..85a2b798a62 --- /dev/null +++ b/intern/cycles/kernel/kernel_optimized.cpp @@ -0,0 +1,60 @@ +/* + * Copyright 2011, Blender Foundation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3 + optimization flags and nearly all functions inlined, while kernel.cpp + is compiled without for other CPU's. */ + +#ifdef WITH_OPTIMIZED_KERNEL + +#include "kernel.h" +#include "kernel_compat_cpu.h" +#include "kernel_math.h" +#include "kernel_types.h" +#include "kernel_globals.h" +#include "kernel_film.h" +#include "kernel_path.h" +#include "kernel_displace.h" + +CCL_NAMESPACE_BEGIN + +/* Path Tracing */ + +void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float4 *buffer, unsigned int *rng_state, int sample, int x, int y) +{ + kernel_path_trace(kg, buffer, rng_state, sample, x, y); +} + +/* Tonemapping */ + +void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float4 *buffer, int sample, int resolution, int x, int y) +{ + kernel_film_tonemap(kg, rgba, buffer, sample, resolution, x, y); +} + +/* Displacement */ + +void kernel_cpu_optimized_displace(KernelGlobals *kg, uint4 *input, float3 *offset, int i) +{ + kernel_displace(kg, input, offset, i); +} + +CCL_NAMESPACE_END + +#endif + diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp index 8b09f227a74..abf5e08de97 100644 --- a/intern/cycles/util/util_system.cpp +++ b/intern/cycles/util/util_system.cpp @@ -118,5 +118,78 @@ int system_cpu_bits() return (sizeof(void*)*8); } +#if defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86) + +struct CPUCapabilities { + bool x64; + bool mmx; + bool sse; + bool 
sse2; + bool sse3; + bool ssse3; + bool sse41; + bool sse42; + bool sse4a; + bool avx; + bool xop; + bool fma3; + bool fma4; +}; + +bool system_cpu_support_optimized() +{ + static CPUCapabilities caps; + static bool caps_init = false; + + if(!caps_init) { + int result[4], num, num_ex; + + memset(&caps, 0, sizeof(caps)); + + __cpuid(result, 0); + num = result[0]; + + __cpuid(result, 0x80000000); + num_ex = result[0]; + + if(num >= 1){ + __cpuid(result, 0x00000001); + caps.mmx = (result[3] & ((int)1 << 23)) != 0; + caps.sse = (result[3] & ((int)1 << 25)) != 0; + caps.sse2 = (result[3] & ((int)1 << 26)) != 0; + caps.sse3 = (result[2] & ((int)1 << 0)) != 0; + + caps.ssse3 = (result[2] & ((int)1 << 9)) != 0; + caps.sse41 = (result[2] & ((int)1 << 19)) != 0; + caps.sse42 = (result[2] & ((int)1 << 20)) != 0; + + caps.avx = (result[2] & ((int)1 << 28)) != 0; + caps.fma3 = (result[2] & ((int)1 << 12)) != 0; + } + + /*if(num_ex >= 0x80000001){ + __cpuid(result, 0x80000001); + caps.x64 = (result[3] & ((int)1 << 29)) != 0; + caps.sse4a = (result[2] & ((int)1 << 6)) != 0; + caps.fma4 = (result[2] & ((int)1 << 16)) != 0; + caps.xop = (result[2] & ((int)1 << 11)) != 0; + }*/ + + caps_init = true; + } + + /* optimization flags use these */ + return caps.sse && caps.sse2 && caps.sse3; +} + +#else + +bool system_cpu_support_optimized() +{ + return false; +} + +#endif + CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h index 214b3a18ca3..f25e009a250 100644 --- a/intern/cycles/util/util_system.h +++ b/intern/cycles/util/util_system.h @@ -26,6 +26,7 @@ CCL_NAMESPACE_BEGIN int system_cpu_thread_count(); string system_cpu_brand_string(); int system_cpu_bits(); +bool system_cpu_support_optimized(); CCL_NAMESPACE_END |