git.blender.org/blender.git
author     Thomas Dinges <blender@dingto.org>    2014-06-14 00:23:58 +0400
committer  Thomas Dinges <blender@dingto.org>    2014-06-14 00:26:20 +0400
commit     866c7fb6e63d128fa4800e28e0a091f874112344 (patch)
tree       096daad79ca3eb7c47e339e7b1c568caf47a5733 /intern/cycles
parent     b4aa51f8d736f5431799fdf1df5f678a732ef6b9 (diff)
Cycles: Add an AVX2 CPU kernel.
The kernel is compiled with AVX2, FMA3, and BMI compiler flags. At the moment only Intel Haswell CPUs benefit from this, but future AMD CPUs will have these instructions as well. It makes rendering on Haswell a few percent faster, though so far this has only been benchmarked with Clang on OS X. Part of my GSoC 2014.
Diffstat (limited to 'intern/cycles')
-rw-r--r--  intern/cycles/CMakeLists.txt            |  7
-rw-r--r--  intern/cycles/SConscript                |  3
-rw-r--r--  intern/cycles/device/device_cpu.cpp     | 53
-rw-r--r--  intern/cycles/kernel/CMakeLists.txt     |  2
-rw-r--r--  intern/cycles/kernel/kernel.h           | 11
-rw-r--r--  intern/cycles/kernel/kernel_avx2.cpp    | 87
-rw-r--r--  intern/cycles/util/util_optimization.h  |  5
-rw-r--r--  intern/cycles/util/util_system.cpp      | 17
-rw-r--r--  intern/cycles/util/util_system.h        |  1
9 files changed, 186 insertions(+), 0 deletions(-)
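
For context on the commit message: the -mavx2/-mfma/-mbmi flags added below let the compiler emit 256-bit SIMD, fused multiply-add, and bit-manipulation instructions while recompiling otherwise unchanged kernel code. A minimal illustration (not Cycles code) of what FMA3 buys, assuming GCC or Clang and the standard immintrin.h intrinsics:

/* fma_sketch.cpp -- build with: g++ -O2 -mavx -mfma fma_sketch.cpp */
#include <immintrin.h>
#include <cstdio>

int main()
{
	__m256 a = _mm256_set1_ps(1.5f);
	__m256 b = _mm256_set1_ps(2.0f);
	__m256 c = _mm256_set1_ps(0.5f);

	/* Without FMA: separate multiply and add, two rounding steps. */
	__m256 mul_add = _mm256_add_ps(_mm256_mul_ps(a, b), c);

	/* With FMA3: one fused instruction, a single rounding step. With -mfma
	 * the compiler can contract ordinary float math into these
	 * automatically, without any source changes. */
	__m256 fused = _mm256_fmadd_ps(a, b, c);

	float r1[8], r2[8];
	_mm256_storeu_ps(r1, mul_add);
	_mm256_storeu_ps(r2, fused);
	std::printf("mul+add: %f  fma: %f\n", r1[0], r2[0]);
	return 0;
}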
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index a1b0030491e..5a6dc36b213 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -20,8 +20,10 @@ if(WIN32 AND MSVC)
# /arch:AVX for VC2012 and above
if(NOT MSVC_VERSION LESS 1700)
set(CYCLES_AVX_ARCH_FLAGS "/arch:AVX")
+ set(CYCLES_AVX2_ARCH_FLAGS "/arch:AVX /arch:AVX2")
elseif(NOT CMAKE_CL_64)
set(CYCLES_AVX_ARCH_FLAGS "/arch:SSE2")
+ set(CYCLES_AVX2_ARCH_FLAGS "/arch:SSE2")
endif()
# there is no /arch:SSE3, but intrinsics are available anyway
@@ -30,11 +32,13 @@ if(WIN32 AND MSVC)
set(CYCLES_SSE3_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
set(CYCLES_SSE41_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+ set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
else()
set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+ set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
@@ -48,6 +52,7 @@ elseif(CMAKE_COMPILER_IS_GNUCC)
set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse")
set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse")
set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mfpmath=sse")
+ set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mbmi -mbmi2 -mfpmath=sse")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
@@ -57,6 +62,7 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3")
set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1")
set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx")
+ set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mbmi -mbmi2")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
endif()
@@ -67,6 +73,7 @@ if(CXX_HAS_SSE)
-DWITH_KERNEL_SSE3
-DWITH_KERNEL_SSE41
-DWITH_KERNEL_AVX
+ -DWITH_KERNEL_AVX2
)
endif()
diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript
index 542bb82cf2a..dab8f25de4a 100644
--- a/intern/cycles/SConscript
+++ b/intern/cycles/SConscript
@@ -39,6 +39,7 @@ sources.remove(path.join('kernel', 'kernel_sse2.cpp'))
sources.remove(path.join('kernel', 'kernel_sse3.cpp'))
sources.remove(path.join('kernel', 'kernel_sse41.cpp'))
sources.remove(path.join('kernel', 'kernel_avx.cpp'))
+sources.remove(path.join('kernel', 'kernel_avx2.cpp'))
incs = []
defs = []
@@ -98,6 +99,7 @@ elif env['OURPLATFORM'] == 'win64-vc':
if env['MSVC_VERSION'] >= '12.0':
kernel_flags['sse41'] = kernel_flags['sse3']
kernel_flags['avx'] = kernel_flags['sse41'] + ' /arch:AVX'
+ kernel_flags['avx2'] = kernel_flags['sse41'] + ' /arch:AVX /arch:AVX2'
else:
# -mavx only available with relatively new gcc/clang
kernel_flags['sse2'] = '-ffast-math -msse -msse2 -mfpmath=sse'
@@ -106,6 +108,7 @@ else:
if (env['C_COMPILER_ID'] == 'gcc' and env['CCVERSION'] >= '4.6') or (env['C_COMPILER_ID'] == 'clang' and env['CCVERSION'] >= '3.1'):
kernel_flags['avx'] = kernel_flags['sse41'] + ' -mavx'
+ kernel_flags['avx2'] = kernel_flags['avx'] + ' -mavx2 -mfma -mbmi -mbmi2'
for kernel_type in kernel_flags.keys():
defs.append('WITH_KERNEL_' + kernel_type.upper())
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 71bf2d23d6e..7308d036fe3 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -62,6 +62,7 @@ public:
system_cpu_support_sse3();
system_cpu_support_sse41();
system_cpu_support_avx();
+ system_cpu_support_avx2();
}
~CPUDevice()
@@ -167,6 +168,28 @@ public:
int start_sample = tile.start_sample;
int end_sample = tile.start_sample + tile.num_samples;
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+ if(system_cpu_support_avx2()) {
+ for(int sample = start_sample; sample < end_sample; sample++) {
+ if (task.get_cancel() || task_pool.canceled()) {
+ if(task.need_finish_queue == false)
+ break;
+ }
+
+ for(int y = tile.y; y < tile.y + tile.h; y++) {
+ for(int x = tile.x; x < tile.x + tile.w; x++) {
+ kernel_cpu_avx2_path_trace(&kg, render_buffer, rng_state,
+ sample, x, y, tile.offset, tile.stride);
+ }
+ }
+
+ tile.sample = sample + 1;
+
+ task.update_progress(tile);
+ }
+ }
+ else
+#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
for(int sample = start_sample; sample < end_sample; sample++) {
@@ -293,6 +316,15 @@ public:
float sample_scale = 1.0f/(task.sample + 1);
if(task.rgba_half) {
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+ if(system_cpu_support_avx2()) {
+ for(int y = task.y; y < task.y + task.h; y++)
+ for(int x = task.x; x < task.x + task.w; x++)
+ kernel_cpu_avx2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+ sample_scale, x, y, task.offset, task.stride);
+ }
+ else
+#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
for(int y = task.y; y < task.y + task.h; y++)
@@ -337,6 +369,15 @@ public:
}
}
else {
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+ if(system_cpu_support_avx2()) {
+ for(int y = task.y; y < task.y + task.h; y++)
+ for(int x = task.x; x < task.x + task.w; x++)
+ kernel_cpu_avx2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+ sample_scale, x, y, task.offset, task.stride);
+ }
+ else
+#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
for(int y = task.y; y < task.y + task.h; y++)
@@ -390,6 +431,18 @@ public:
OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+ if(system_cpu_support_avx2()) {
+ for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
+ for(int sample = 0; sample < task.num_samples; sample++)
+ kernel_cpu_avx2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
+
+ if(task.get_cancel() || task_pool.canceled())
+ break;
+ }
+ }
+ else
+#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
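
A pattern worth noting in the device_cpu.cpp hunks above: every new branch is gated twice, at compile time (#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2, so builds whose compiler cannot produce the kernel simply drop the branch) and at run time (system_cpu_support_avx2(), so the same binary still works on older CPUs). Because each else sits immediately before its closing #endif, the guarded blocks chain into one if/else cascade that falls through to the next-best kernel. A self-contained sketch of that structure, with made-up names rather than the real Cycles symbols:

/* cascade_sketch.cpp -- illustrative only */
#include <cstdio>

#define SKETCH_WITH_KERNEL_AVX2  /* pretend this build produced an AVX2 kernel */
#define SKETCH_WITH_KERNEL_AVX   /* pretend this build produced an AVX kernel */

static bool sketch_cpu_has_avx2() { return false; }  /* pretend the CPU lacks AVX2 */
static bool sketch_cpu_has_avx()  { return true;  }  /* pretend the CPU has AVX */

int main()
{
#ifdef SKETCH_WITH_KERNEL_AVX2
	if(sketch_cpu_has_avx2()) {
		std::puts("dispatch: AVX2 kernel");
	}
	else
#endif
#ifdef SKETCH_WITH_KERNEL_AVX
	if(sketch_cpu_has_avx()) {
		std::puts("dispatch: AVX kernel");
	}
	else
#endif
	{
		std::puts("dispatch: generic kernel");
	}
	return 0;
}

Removing either SKETCH_WITH_KERNEL_* define drops that branch without breaking the chain, which is how the real kernels degrade when a compiler cannot build a given variant.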
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 62d56b46509..9896a55cf02 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -214,12 +214,14 @@ if(CXX_HAS_SSE)
kernel_sse3.cpp
kernel_sse41.cpp
kernel_avx.cpp
+ kernel_avx2.cpp
)
set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
set_source_files_properties(kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+ set_source_files_properties(kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index b169b15b9b5..264e5e3e4d0 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -87,6 +87,17 @@ void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output,
int type, int i, int sample);
#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+void kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
+ int sample, int x, int y, int offset, int stride);
+void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+ float sample_scale, int x, int y, int offset, int stride);
+void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+ float sample_scale, int x, int y, int offset, int stride);
+void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
+ int type, int i, int sample);
+#endif
+
CCL_NAMESPACE_END
#endif /* __KERNEL_H__ */
diff --git a/intern/cycles/kernel/kernel_avx2.cpp b/intern/cycles/kernel/kernel_avx2.cpp
new file mode 100644
index 00000000000..339421a002b
--- /dev/null
+++ b/intern/cycles/kernel/kernel_avx2.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#define __KERNEL_SSE2__
+#define __KERNEL_SSE3__
+#define __KERNEL_SSSE3__
+#define __KERNEL_SSE41__
+#define __KERNEL_AVX__
+#define __KERNEL_AVX2__
+#endif
+
+#include "util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+
+#include "kernel.h"
+#include "kernel_compat_cpu.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_bake.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing */
+
+void kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
+{
+#ifdef __BRANCHED_PATH__
+ if(kernel_data.integrator.branched)
+ kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+ else
+#endif
+ kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+/* Film */
+
+void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+ kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+ kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+/* Shader Evaluate */
+
+void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
+{
+ if(type >= SHADER_EVAL_BAKE)
+ kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+ else
+ kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+}
+
+CCL_NAMESPACE_END
+#else
+
+/* needed for some linkers in combination with scons making empty compilation unit in a library */
+void __dummy_function_cycles_avx2(void);
+void __dummy_function_cycles_avx2(void) {}
+
+#endif
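
kernel_avx2.cpp above contains no AVX2-specific code of its own: like the other kernel_*.cpp files, it defines the __KERNEL_*__ macros and then includes the shared kernel headers, so the identical path-tracing source is compiled once per instruction set, each time with its own optimization flags. The trailing __dummy_function_cycles_avx2 only keeps the translation unit non-empty when the optimized kernel is compiled out, which some linkers used by the SCons build otherwise reject. A two-file sketch of the "same source, compiled N times" idea, with hypothetical names:

/* --- render_impl_sketch.h: the single shared implementation --- */
#pragma once
#include <cstdio>

/* static gives each translation unit its own copy, so differing per-ISA
 * defines cannot cause one-definition-rule clashes. */
static void render_impl_pixel(int x, int y)
{
#ifdef SKETCH_KERNEL_AVX2
	std::printf("pixel %d,%d via code compiled for AVX2\n", x, y);
#else
	std::printf("pixel %d,%d via baseline code\n", x, y);
#endif
}

/* --- render_avx2_sketch.cpp: built with -mavx2 -mfma -mbmi -mbmi2 --- */
#define SKETCH_KERNEL_AVX2
#include "render_impl_sketch.h"

void render_avx2_pixel(int x, int y) { render_impl_pixel(x, y); }

/* --- render_sse2_sketch.cpp: built with baseline flags --- */
#include "render_impl_sketch.h"

void render_sse2_pixel(int x, int y) { render_impl_pixel(x, y); }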
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
index 0a6013cddd4..5d0fea34761 100644
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@@ -65,10 +65,15 @@
#define WITH_CYCLES_OPTIMIZED_KERNEL_AVX
#endif
+#ifdef WITH_KERNEL_AVX2
+#define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+#endif
+
/* MSVC 2008, no SSE41 (broken blendv intrinsic) and no AVX support */
#if defined(_MSC_VER) && (_MSC_VER < 1700)
#undef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
#undef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+#undef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
#endif
#endif
diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp
index 0764f7d9345..7c0445577e2 100644
--- a/intern/cycles/util/util_system.cpp
+++ b/intern/cycles/util/util_system.cpp
@@ -127,9 +127,12 @@ struct CPUCapabilities {
bool sse42;
bool sse4a;
bool avx;
+ bool avx2;
bool xop;
bool fma3;
bool fma4;
+ bool bmi1;
+ bool bmi2;
};
static CPUCapabilities& system_cpu_capabilities()
@@ -180,6 +183,11 @@ static CPUCapabilities& system_cpu_capabilities()
#endif
caps.avx = (xcr_feature_mask & 0x6) == 0x6;
}
+
+ __cpuid(result, 0x00000007);
+ caps.bmi1 = (result[1] & ((int)1 << 3)) != 0;
+ caps.bmi2 = (result[1] & ((int)1 << 8)) != 0;
+ caps.avx2 = (result[1] & ((int)1 << 5)) != 0;
}
#if 0
@@ -221,6 +229,11 @@ bool system_cpu_support_avx()
CPUCapabilities& caps = system_cpu_capabilities();
return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx;
}
+bool system_cpu_support_avx2()
+{
+ CPUCapabilities& caps = system_cpu_capabilities();
+ return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx && caps.avx2 && caps.fma3 && caps.bmi1 && caps.bmi2;
+}
#else
bool system_cpu_support_sse2()
@@ -242,6 +255,10 @@ bool system_cpu_support_avx()
{
return false;
}
+bool system_cpu_support_avx2()
+{
+ return false;
+}
#endif
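
The magic numbers in the util_system.cpp hunk map to CPUID leaf 7 (sub-leaf 0), register EBX: bit 3 is BMI1, bit 5 is AVX2, and bit 8 is BMI2; FMA3 comes from leaf 1, ECX bit 12 and is already tracked in CPUCapabilities. A standalone way to inspect the same bits on GCC/Clang, using the <cpuid.h> macros rather than the wrapper the Cycles code uses (a diagnostic sketch, not the Cycles helper):

/* cpuid_leaf7_sketch.cpp -- g++ cpuid_leaf7_sketch.cpp && ./a.out */
#include <cpuid.h>
#include <cstdio>

int main()
{
	unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

	if(__get_cpuid_max(0, nullptr) < 7) {
		std::puts("CPUID leaf 7 not available on this CPU");
		return 0;
	}

	/* Leaf 7, sub-leaf 0: structured extended feature flags (EBX). */
	__cpuid_count(7, 0, eax, ebx, ecx, edx);
	bool bmi1 = (ebx & (1u << 3)) != 0;
	bool avx2 = (ebx & (1u << 5)) != 0;
	bool bmi2 = (ebx & (1u << 8)) != 0;

	/* Leaf 1: ECX bit 12 is FMA3. */
	__cpuid(1, eax, ebx, ecx, edx);
	bool fma3 = (ecx & (1u << 12)) != 0;

	std::printf("AVX2:%d FMA3:%d BMI1:%d BMI2:%d\n", avx2, fma3, bmi1, bmi2);
	return 0;
}

Note that instruction-set bits alone are not sufficient for AVX/AVX2: the OS must also save the YMM registers, which the existing code verifies through the XGETBV/xcr_feature_mask check in the AVX path and which system_cpu_support_avx2() inherits by also requiring caps.avx.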
diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h
index 4409ea752cd..0e8868c7dfc 100644
--- a/intern/cycles/util/util_system.h
+++ b/intern/cycles/util/util_system.h
@@ -28,6 +28,7 @@ bool system_cpu_support_sse2();
bool system_cpu_support_sse3();
bool system_cpu_support_sse41();
bool system_cpu_support_avx();
+bool system_cpu_support_avx2();
CCL_NAMESPACE_END