diff options
author | Thomas Dinges <blender@dingto.org> | 2014-01-16 20:04:11 +0400 |
---|---|---|
committer | Thomas Dinges <blender@dingto.org> | 2014-01-16 20:04:11 +0400 |
commit | de28a4d4b2c9397c5233a5ee1dbf1400f450a15c (patch) | |
tree | 1fe23de963e206af3fb2ff2d9e2e3393cd89149c | |
parent | 7c6d52eb07c4bd8142a95eca1dbdc794063859b8 (diff) |
Cycles: Add an AVX kernel for CPU rendering.
* AVX is available on Intel Sandy Bridge and newer and AMD Bulldozer and newer.
* We don't use dedicated AVX intrinsics yet, but gcc auto vectorization gives a 3% performance improvement for Caminandes. Tested on an i5-3570, Linux x64.
* No change for Windows yet, MSVC 2008 does not support AVX.
Reviewed by: brecht
Differential Revision: https://developer.blender.org/D216
-rw-r--r-- | intern/cycles/CMakeLists.txt | 4 | ||||
-rw-r--r-- | intern/cycles/SConscript | 9 | ||||
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 52 | ||||
-rw-r--r-- | intern/cycles/kernel/CMakeLists.txt | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel.h | 11 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_avx.cpp | 82 | ||||
-rw-r--r-- | intern/cycles/util/util_optimization.h | 4 | ||||
-rw-r--r-- | intern/cycles/util/util_system.cpp | 16 | ||||
-rw-r--r-- | intern/cycles/util/util_system.h | 1 |
9 files changed, 180 insertions, 1 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index 6fa6260c81e..cc1f5978dc1 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -14,10 +14,12 @@ if(WIN32 AND MSVC) set(CYCLES_SSE2_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /Gs-") set(CYCLES_SSE3_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /Gs-") set(CYCLES_SSE41_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /Gs-") + set(CYCLES_AVX_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /Gs-") #/arch:AVX for VC2012 and above else() set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /Gs-") set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /Gs-") set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /Gs-") + set(CYCLES_AVX_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /Gs-") #/arch:AVX for VC2012 and above endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /Gs-") @@ -28,11 +30,13 @@ elseif(CMAKE_COMPILER_IS_GNUCC) set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 -mfpmath=sse") set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse") set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse") + set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mfpmath=sse") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math") elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2") set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3") set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1") + set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math") endif() diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript index b1dfeee1560..7ced4613d64 100644 --- a/intern/cycles/SConscript +++ b/intern/cycles/SConscript @@ -38,6 +38,7 @@ sources.remove(path.join('util', 'util_view.cpp')) sources.remove(path.join('kernel', 'kernel_sse2.cpp')) sources.remove(path.join('kernel', 'kernel_sse3.cpp')) sources.remove(path.join('kernel', 'kernel_sse41.cpp')) +sources.remove(path.join('kernel', 'kernel_avx.cpp')) incs = [] defs = [] @@ -78,23 +79,31 @@ if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', ' sse2_cxxflags = Split(env['CXXFLAGS']) sse3_cxxflags = Split(env['CXXFLAGS']) sse41_cxxflags = Split(env['CXXFLAGS']) +avx_cxxflags = Split(env['CXXFLAGS']) if env['OURPLATFORM'] == 'win32-vc': # there is no /arch:SSE3, but intrinsics are available anyway sse2_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split()) sse3_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split()) sse41_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split()) + avx_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split()) #/arch:AVX for VC2012 and above elif env['OURPLATFORM'] == 'win64-vc': sse2_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split()) sse3_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split()) sse41_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split()) + avx_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split()) #/arch:AVX for VC2012 and above else: sse2_cxxflags.append('-ffast-math -msse -msse2 -mfpmath=sse'.split()) sse3_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse'.split()) sse41_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse'.split()) + avx_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mfpmath=sse'.split()) optim_defs = defs[:] +cycles_avx = cycles.Clone() +avx_sources = [path.join('kernel', 'kernel_avx.cpp')] +cycles_avx.BlenderLib('bf_intern_cycles_avx', avx_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=avx_cxxflags) + cycles_sse41 = cycles.Clone() sse41_sources = [path.join('kernel', 'kernel_sse41.cpp')] cycles_sse41.BlenderLib('bf_intern_cycles_sse41', sse41_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse41_cxxflags) diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index b29d64eb454..76123fe44d2 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -61,6 +61,7 @@ public: system_cpu_support_sse2(); system_cpu_support_sse3(); system_cpu_support_sse41(); + system_cpu_support_avx(); } ~CPUDevice() @@ -166,6 +167,28 @@ public: int start_sample = tile.start_sample; int end_sample = tile.start_sample + tile.num_samples; +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if(system_cpu_support_avx()) { + for(int sample = start_sample; sample < end_sample; sample++) { + if (task.get_cancel() || task_pool.canceled()) { + if(task.need_finish_queue == false) + break; + } + + for(int y = tile.y; y < tile.y + tile.h; y++) { + for(int x = tile.x; x < tile.x + tile.w; x++) { + kernel_cpu_avx_path_trace(&kg, render_buffer, rng_state, + sample, x, y, tile.offset, tile.stride); + } + } + + tile.sample = sample + 1; + + task.update_progress(tile); + } + } + else +#endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { for(int sample = start_sample; sample < end_sample; sample++) { @@ -270,6 +293,15 @@ public: float sample_scale = 1.0f/(task.sample + 1); if(task.rgba_half) { +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if(system_cpu_support_avx()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_avx_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else +#endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { for(int y = task.y; y < task.y + task.h; y++) @@ -305,6 +337,15 @@ public: } } else { +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if(system_cpu_support_avx()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_avx_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else +#endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { for(int y = task.y; y < task.y + task.h; y++) @@ -349,6 +390,17 @@ public: OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); #endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if(system_cpu_support_avx()) { + for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { + kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); + + if(task_pool.canceled()) + break; + } + } + else +#endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 08427b17879..6f5f744e311 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -15,6 +15,7 @@ set(SRC kernel_sse2.cpp kernel_sse3.cpp kernel_sse41.cpp + kernel_avx.cpp kernel.cl kernel.cu ) @@ -197,6 +198,7 @@ include_directories(SYSTEM ${INC_SYS}) set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") +set_source_files_properties(kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS}) diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index 01bea10c1e7..039dc791b08 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -76,6 +76,17 @@ void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i); #endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +void kernel_cpu_avx_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, + int sample, int x, int y, int offset, int stride); +void kernel_cpu_avx_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, + float sample_scale, int x, int y, int offset, int stride); +void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, + float sample_scale, int x, int y, int offset, int stride); +void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, + int type, int i); +#endif + CCL_NAMESPACE_END #endif /* __KERNEL_H__ */ diff --git a/intern/cycles/kernel/kernel_avx.cpp b/intern/cycles/kernel/kernel_avx.cpp new file mode 100644 index 00000000000..d2a7142c551 --- /dev/null +++ b/intern/cycles/kernel/kernel_avx.cpp @@ -0,0 +1,82 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +#define __KERNEL_SSE2__ +#define __KERNEL_SSE3__ +#define __KERNEL_SSSE3__ +#define __KERNEL_SSE41__ +#endif + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + +#include "kernel.h" +#include "kernel_compat_cpu.h" +#include "kernel_math.h" +#include "kernel_types.h" +#include "kernel_globals.h" +#include "kernel_film.h" +#include "kernel_path.h" +#include "kernel_displace.h" + +CCL_NAMESPACE_BEGIN + +/* Path Tracing */ + +void kernel_cpu_avx_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride) +{ +#ifdef __BRANCHED_PATH__ + if(kernel_data.integrator.branched) + kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); + else +#endif + kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); +} + +/* Film */ + +void kernel_cpu_avx_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride) +{ + kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); +} + +void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride) +{ + kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); +} + +/* Shader Evaluate */ + +void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) +{ + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); +} + +CCL_NAMESPACE_END +#else + +/* needed for some linkers in combination with scons making empty compilation unit in a library */ +void __dummy_function_cycles_avx(void); +void __dummy_function_cycles_avx(void){} + +#endif diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h index b7a2506c950..ac94d43c998 100644 --- a/intern/cycles/util/util_optimization.h +++ b/intern/cycles/util/util_optimization.h @@ -42,10 +42,12 @@ /* no SSE2 kernel on x86-64, part of regular kernel */ #define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 #define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +#define WITH_CYCLES_OPTIMIZED_KERNEL_AVX -/* VC2008 is not ready for sse41, probably broken blendv intrinsic... */ +/* MSVC 2008, no SSE41 (broken blendv intrinsic) and no AVX support */ #if defined(_MSC_VER) && (_MSC_VER < 1700) #undef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +#undef WITH_CYCLES_OPTIMIZED_KERNEL_AVX #endif #endif diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp index 79bf5fd26b7..3d7781f6146 100644 --- a/intern/cycles/util/util_system.cpp +++ b/intern/cycles/util/util_system.cpp @@ -198,6 +198,12 @@ bool system_cpu_support_sse41() CPUCapabilities& caps = system_cpu_capabilities(); return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41; } + +bool system_cpu_support_avx() +{ + CPUCapabilities& caps = system_cpu_capabilities(); + return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx; +} #else bool system_cpu_support_sse2() @@ -210,6 +216,16 @@ bool system_cpu_support_sse3() return false; } +bool system_cpu_support_sse41() +{ + return false; +} + +bool system_cpu_support_avx() +{ + return false; +} + #endif CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h index 64cfa4906b3..4409ea752cd 100644 --- a/intern/cycles/util/util_system.h +++ b/intern/cycles/util/util_system.h @@ -27,6 +27,7 @@ int system_cpu_bits(); bool system_cpu_support_sse2(); bool system_cpu_support_sse3(); bool system_cpu_support_sse41(); +bool system_cpu_support_avx(); CCL_NAMESPACE_END |