Cycles: Add an AVX2 CPU kernel.

This kernel is compiled with the AVX2, FMA3, and BMI compiler flags. At the moment only Intel Haswell benefits from this, but future AMD CPUs will support these instructions as well. It makes rendering on Haswell CPUs a few percent faster, although this has only been benchmarked with clang on OS X so far. Part of my GSoC 2014.
author: Thomas Dinges <blender@dingto.org> 2014-06-14 00:23:58 +0400
committer: Thomas Dinges <blender@dingto.org> 2014-06-14 00:26:20 +0400
commit: 866c7fb6e63d128fa4800e28e0a091f874112344 (patch)
tree: 096daad79ca3eb7c47e339e7b1c568caf47a5733 /intern/cycles/kernel/kernel.h
parent: b4aa51f8d736f5431799fdf1df5f678a732ef6b9 (diff)
1 files changed, 11 insertions, 0 deletions
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index b169b15b9b5..264e5e3e4d0 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -87,6 +87,17 @@ void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output,
 	int type, int i, int sample);
 #endif
 
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+void kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
+	int sample, int x, int y, int offset, int stride);
+void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+	float sample_scale, int x, int y, int offset, int stride);
+void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+	float sample_scale, int x, int y, int offset, int stride);
+void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
+	int type, int i, int sample);
+#endif
+
 CCL_NAMESPACE_END
 
 #endif /* __KERNEL_H__ */
author	Thomas Dinges <blender@dingto.org>	2014-06-14 00:23:58 +0400
committer	Thomas Dinges <blender@dingto.org>	2014-06-14 00:26:20 +0400
commit	866c7fb6e63d128fa4800e28e0a091f874112344 (patch)
tree	096daad79ca3eb7c47e339e7b1c568caf47a5733 /intern/cycles/kernel/kernel.h
parent	b4aa51f8d736f5431799fdf1df5f678a732ef6b9 (diff)