diff options
-rw-r--r-- | intern/cycles/CMakeLists.txt | 7 | ||||
-rw-r--r-- | intern/cycles/SConscript | 3 | ||||
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 53 | ||||
-rw-r--r-- | intern/cycles/kernel/CMakeLists.txt | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel.h | 11 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_avx2.cpp | 87 | ||||
-rw-r--r-- | intern/cycles/util/util_optimization.h | 5 | ||||
-rw-r--r-- | intern/cycles/util/util_system.cpp | 17 | ||||
-rw-r--r-- | intern/cycles/util/util_system.h | 1 |
9 files changed, 186 insertions, 0 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index a1b0030491e..5a6dc36b213 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -20,8 +20,10 @@ if(WIN32 AND MSVC) # /arch:AVX for VC2012 and above if(NOT MSVC_VERSION LESS 1700) set(CYCLES_AVX_ARCH_FLAGS "/arch:AVX") + set(CYCLES_AVX2_ARCH_FLAGS "/arch:AVX /arch:AVX2") elseif(NOT CMAKE_CL_64) set(CYCLES_AVX_ARCH_FLAGS "/arch:SSE2") + set(CYCLES_AVX2_ARCH_FLAGS "/arch:SSE2") endif() # there is no /arch:SSE3, but intrinsics are available anyway @@ -30,11 +32,13 @@ if(WIN32 AND MSVC) set(CYCLES_SSE3_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") set(CYCLES_SSE41_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") + set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") else() set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") + set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") @@ -48,6 +52,7 @@ elseif(CMAKE_COMPILER_IS_GNUCC) set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse") set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse") set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mfpmath=sse") + set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mbmi -mbmi2 -mfpmath=sse") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math") 
elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") @@ -57,6 +62,7 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3") set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1") set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx") + set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mbmi -mbmi2") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math") endif() @@ -67,6 +73,7 @@ if(CXX_HAS_SSE) -DWITH_KERNEL_SSE3 -DWITH_KERNEL_SSE41 -DWITH_KERNEL_AVX + -DWITH_KERNEL_AVX2 ) endif() diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript index 542bb82cf2a..dab8f25de4a 100644 --- a/intern/cycles/SConscript +++ b/intern/cycles/SConscript @@ -39,6 +39,7 @@ sources.remove(path.join('kernel', 'kernel_sse2.cpp')) sources.remove(path.join('kernel', 'kernel_sse3.cpp')) sources.remove(path.join('kernel', 'kernel_sse41.cpp')) sources.remove(path.join('kernel', 'kernel_avx.cpp')) +sources.remove(path.join('kernel', 'kernel_avx2.cpp')) incs = [] defs = [] @@ -98,6 +99,7 @@ elif env['OURPLATFORM'] == 'win64-vc': if env['MSVC_VERSION'] >= '12.0': kernel_flags['sse41'] = kernel_flags['sse3'] kernel_flags['avx'] = kernel_flags['sse41'] + ' /arch:AVX' + kernel_flags['avx2'] = kernel_flags['sse41'] + ' /arch:AVX /arch:AVX2' else: # -mavx only available with relatively new gcc/clang kernel_flags['sse2'] = '-ffast-math -msse -msse2 -mfpmath=sse' @@ -106,6 +108,7 @@ else: if (env['C_COMPILER_ID'] == 'gcc' and env['CCVERSION'] >= '4.6') or (env['C_COMPILER_ID'] == 'clang' and env['CCVERSION'] >= '3.1'): kernel_flags['avx'] = kernel_flags['sse41'] + ' -mavx' + kernel_flags['avx2'] = kernel_flags['avx'] + ' -mavx2 -mfma -mbmi -mbmi2' for kernel_type in kernel_flags.keys(): defs.append('WITH_KERNEL_' + kernel_type.upper()) diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 
71bf2d23d6e..7308d036fe3 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -62,6 +62,7 @@ public: system_cpu_support_sse3(); system_cpu_support_sse41(); system_cpu_support_avx(); + system_cpu_support_avx2(); } ~CPUDevice() @@ -167,6 +168,28 @@ public: int start_sample = tile.start_sample; int end_sample = tile.start_sample + tile.num_samples; +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) { + for(int sample = start_sample; sample < end_sample; sample++) { + if (task.get_cancel() || task_pool.canceled()) { + if(task.need_finish_queue == false) + break; + } + + for(int y = tile.y; y < tile.y + tile.h; y++) { + for(int x = tile.x; x < tile.x + tile.w; x++) { + kernel_cpu_avx2_path_trace(&kg, render_buffer, rng_state, + sample, x, y, tile.offset, tile.stride); + } + } + + tile.sample = sample + 1; + + task.update_progress(tile); + } + } + else +#endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX if(system_cpu_support_avx()) { for(int sample = start_sample; sample < end_sample; sample++) { @@ -293,6 +316,15 @@ public: float sample_scale = 1.0f/(task.sample + 1); if(task.rgba_half) { +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_avx2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else +#endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX if(system_cpu_support_avx()) { for(int y = task.y; y < task.y + task.h; y++) @@ -337,6 +369,15 @@ public: } } else { +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_avx2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else +#endif 
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX if(system_cpu_support_avx()) { for(int y = task.y; y < task.y + task.h; y++) @@ -390,6 +431,18 @@ public: OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); #endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) { + for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { + for(int sample = 0; sample < task.num_samples; sample++) + kernel_cpu_avx2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample); + + if(task.get_cancel() || task_pool.canceled()) + break; + } + } + else +#endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX if(system_cpu_support_avx()) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 62d56b46509..9896a55cf02 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -214,12 +214,14 @@ if(CXX_HAS_SSE) kernel_sse3.cpp kernel_sse41.cpp kernel_avx.cpp + kernel_avx2.cpp ) set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") set_source_files_properties(kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") endif() diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index b169b15b9b5..264e5e3e4d0 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -87,6 +87,17 @@ void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample); #endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +void kernel_cpu_avx2_path_trace(KernelGlobals *kg, 
float *buffer, unsigned int *rng_state, + int sample, int x, int y, int offset, int stride); +void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, + float sample_scale, int x, int y, int offset, int stride); +void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, + float sample_scale, int x, int y, int offset, int stride); +void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output, + int type, int i, int sample); +#endif + CCL_NAMESPACE_END #endif /* __KERNEL_H__ */ diff --git a/intern/cycles/kernel/kernel_avx2.cpp b/intern/cycles/kernel/kernel_avx2.cpp new file mode 100644 index 00000000000..339421a002b --- /dev/null +++ b/intern/cycles/kernel/kernel_avx2.cpp @@ -0,0 +1,87 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +#define __KERNEL_SSE2__ +#define __KERNEL_SSE3__ +#define __KERNEL_SSSE3__ +#define __KERNEL_SSE41__ +#define __KERNEL_AVX__ +#define __KERNEL_AVX2__ +#endif + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + +#include "kernel.h" +#include "kernel_compat_cpu.h" +#include "kernel_math.h" +#include "kernel_types.h" +#include "kernel_globals.h" +#include "kernel_film.h" +#include "kernel_path.h" +#include "kernel_bake.h" + +CCL_NAMESPACE_BEGIN + +/* Path Tracing */ + +void kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride) +{ +#ifdef __BRANCHED_PATH__ + if(kernel_data.integrator.branched) + kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); + else +#endif + kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); +} + +/* Film */ + +void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride) +{ + kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); +} + +void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride) +{ + kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); +} + +/* Shader Evaluate */ + +void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample) +{ + if(type >= SHADER_EVAL_BAKE) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample); + else + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample); +} + +CCL_NAMESPACE_END +#else + +/* needed for some linkers in combination with scons making empty compilation unit in a library */ +void 
__dummy_function_cycles_avx2(void); +void __dummy_function_cycles_avx2(void) {} + +#endif diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h index 0a6013cddd4..5d0fea34761 100644 --- a/intern/cycles/util/util_optimization.h +++ b/intern/cycles/util/util_optimization.h @@ -65,10 +65,15 @@ #define WITH_CYCLES_OPTIMIZED_KERNEL_AVX #endif +#ifdef WITH_KERNEL_AVX2 +#define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +#endif + /* MSVC 2008, no SSE41 (broken blendv intrinsic) and no AVX support */ #if defined(_MSC_VER) && (_MSC_VER < 1700) #undef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 #undef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +#undef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 #endif #endif diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp index 0764f7d9345..7c0445577e2 100644 --- a/intern/cycles/util/util_system.cpp +++ b/intern/cycles/util/util_system.cpp @@ -127,9 +127,12 @@ struct CPUCapabilities { bool sse42; bool sse4a; bool avx; + bool avx2; bool xop; bool fma3; bool fma4; + bool bmi1; + bool bmi2; }; static CPUCapabilities& system_cpu_capabilities() @@ -180,6 +183,11 @@ static CPUCapabilities& system_cpu_capabilities() #endif caps.avx = (xcr_feature_mask & 0x6) == 0x6; } + + __cpuid(result, 0x00000007); + caps.bmi1 = (result[1] & ((int)1 << 3)) != 0; + caps.bmi2 = (result[1] & ((int)1 << 8)) != 0; + caps.avx2 = (result[1] & ((int)1 << 5)) != 0; } #if 0 @@ -221,6 +229,11 @@ bool system_cpu_support_avx() CPUCapabilities& caps = system_cpu_capabilities(); return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx; } +bool system_cpu_support_avx2() +{ + CPUCapabilities& caps = system_cpu_capabilities(); + return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx && caps.avx2 && caps.fma3 && caps.bmi1 && caps.bmi2; +} #else bool system_cpu_support_sse2() @@ -242,6 +255,10 @@ bool system_cpu_support_avx() { return false; } +bool system_cpu_support_avx2() +{ + return false; 
+} #endif diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h index 4409ea752cd..0e8868c7dfc 100644 --- a/intern/cycles/util/util_system.h +++ b/intern/cycles/util/util_system.h @@ -28,6 +28,7 @@ bool system_cpu_support_sse2(); bool system_cpu_support_sse3(); bool system_cpu_support_sse41(); bool system_cpu_support_avx(); +bool system_cpu_support_avx2(); CCL_NAMESPACE_END |