4 files changed, 171 insertions, 22 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 438cb1cdd9f..6fc4ca2729d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2357,17 +2357,17 @@ endif()
 
 # See TEST_SSE_SUPPORT() for how this is defined.
 
-if(WITH_RAYOPTIMIZATION)
-	if(SUPPORT_SSE_BUILD)
-		set(PLATFORM_CFLAGS " ${COMPILER_SSE_FLAG} ${PLATFORM_CFLAGS}")
-		add_definitions(-D__SSE__ -D__MMX__)
-	endif()
-	if(SUPPORT_SSE2_BUILD)
-		set(PLATFORM_CFLAGS " ${COMPILER_SSE2_FLAG} ${PLATFORM_CFLAGS}")
-		add_definitions(-D__SSE2__)
-		if(NOT SUPPORT_SSE_BUILD) # dont double up
-			add_definitions(-D__MMX__)
-		endif()
+# Do it globally, SSE2 is required for quite some time now.
+# Doing it now allows to use SSE/SSE2 in inline headers.
+if(SUPPORT_SSE_BUILD)
+	set(PLATFORM_CFLAGS " ${COMPILER_SSE_FLAG} ${PLATFORM_CFLAGS}")
+	add_definitions(-D__SSE__ -D__MMX__)
+endif()
+if(SUPPORT_SSE2_BUILD)
+	set(PLATFORM_CFLAGS " ${COMPILER_SSE2_FLAG} ${PLATFORM_CFLAGS}")
+	add_definitions(-D__SSE2__)
+	if(NOT SUPPORT_SSE_BUILD) # dont double up
+		add_definitions(-D__MMX__)
 	endif()
 endif()
 
diff --git a/source/blender/blenlib/intern/math_base_inline.c b/source/blender/blenlib/intern/math_base_inline.c
index f70c12bd26e..7ab2b784812 100644
--- a/source/blender/blenlib/intern/math_base_inline.c
+++ b/source/blender/blenlib/intern/math_base_inline.c
@@ -35,6 +35,10 @@
 #include <stdlib.h>
 #include <string.h>
 
+#ifdef __SSE2__
+#  include <emmintrin.h>
+#endif
+
 #include "BLI_math_base.h"
 
 /* copied from BLI_utildefines.h */
@@ -311,4 +315,98 @@ MINLINE int signum_i(float a)
 	else          return  0;
 }
 
+/* Internal helpers for SSE2 implementation.
+ *
+ * NOTE: Are to be called ONLY from inside `#ifdef __SSE2__` !!!
+ */
+
+#ifdef __SSE2__
+
+/* Calculate initial guess for arg^exp based on float representation
+ * This method gives a constant bias, which can be easily compensated by
+ * multiplicating with bias_coeff.
+ * Gives better results for exponents near 1 (e. g. 4/5).
+ * exp = exponent, encoded as uint32_t
+ * e2coeff = 2^(127/exponent - 127) * bias_coeff^(1/exponent), encoded as
+ * uint32_t
+ *
+ * We hope that exp and e2coeff gets properly inlined
+ */
+MALWAYS_INLINE __m128 _bli_math_fastpow(const int exp,
+                                        const int e2coeff,
+                                        const __m128 arg)
+{
+	__m128 ret;
+	ret = _mm_mul_ps(arg, _mm_castsi128_ps(_mm_set1_epi32(e2coeff)));
+	ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));
+	ret = _mm_mul_ps(ret, _mm_castsi128_ps(_mm_set1_epi32(exp)));
+	ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));
+	return ret;
+}
+
+/* Improve x ^ 1.0f/5.0f solution with Newton-Raphson method */
+MALWAYS_INLINE __m128 _bli_math_improve_5throot_solution(
+    const __m128 old_result,
+    const __m128 x)
+{
+	__m128 approx2 = _mm_mul_ps(old_result, old_result);
+	__m128 approx4 = _mm_mul_ps(approx2, approx2);
+	__m128 t = _mm_div_ps(x, approx4);
+	__m128 summ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(4.0f), old_result), t); /* fma */
+	return _mm_mul_ps(summ, _mm_set1_ps(1.0f/5.0f));
+}
+
+/* Calculate powf(x, 2.4). Working domain: 1e-10 < x < 1e+10 */
+MALWAYS_INLINE __m128 _bli_math_fastpow24(const __m128 arg)
+{
+	/* max, avg and |avg| errors were calculated in gcc without FMA instructions
+	 * The final precision should be better than powf in glibc */
+
+	/* Calculate x^4/5, coefficient 0.994 was constructed manually to minimize
+	 * avg error.
+	 */
+	/* 0x3F4CCCCD = 4/5 */
+	/* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */
+	/* error max = 0.17	avg = 0.0018	|avg| = 0.05 */
+	__m128 x = _bli_math_fastpow(0x3F4CCCCD, 0x4F55A7FB, arg);
+	__m128 arg2 = _mm_mul_ps(arg, arg);
+	__m128 arg4 = _mm_mul_ps(arg2, arg2);
+	/* error max = 0.018        avg = 0.0031    |avg| = 0.0031  */
+	x = _bli_math_improve_5throot_solution(x, arg4);
+	/* error max = 0.00021    avg = 1.6e-05    |avg| = 1.6e-05 */
+	x = _bli_math_improve_5throot_solution(x, arg4);
+	/* error max = 6.1e-07    avg = 5.2e-08    |avg| = 1.1e-07 */
+	x = _bli_math_improve_5throot_solution(x, arg4);
+	return _mm_mul_ps(x, _mm_mul_ps(x, x));
+}
+
+/* Calculate powf(x, 1.0f / 2.4) */
+MALWAYS_INLINE __m128 _bli_math_fastpow512(const __m128 arg)
+{
+	/* 5/12 is too small, so compute the 4th root of 20/12 instead.
+	 * 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
+	 * weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
+	 */
+	__m128 xf = _bli_math_fastpow(0x3f2aaaab, 0x5eb504f3, arg);
+	__m128 xover = _mm_mul_ps(arg, xf);
+	__m128 xfm1 = _mm_rsqrt_ps(xf);
+	__m128 x2 = _mm_mul_ps(arg, arg);
+	__m128 xunder = _mm_mul_ps(x2, xfm1);
+	/* sqrt2 * over + 2 * sqrt2 * under */
+	__m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
+	                         _mm_add_ps(xover, xunder));
+	xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
+	xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
+	return xavg;
+}
+
+MALWAYS_INLINE __m128 _bli_math_blend_sse(const __m128 mask,
+                                          const __m128 a,
+                                          const __m128 b)
+{
+	return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
+}
+
+#endif  /* __SSE2__ */
+
 #endif /* __MATH_BASE_INLINE_C__ */
diff --git a/source/blender/blenlib/intern/math_color_inline.c b/source/blender/blenlib/intern/math_color_inline.c
index 45466226e72..c268481a5d9 100644
--- a/source/blender/blenlib/intern/math_color_inline.c
+++ b/source/blender/blenlib/intern/math_color_inline.c
@@ -28,6 +28,7 @@
  */
 
 
+#include "BLI_math_base.h"
 #include "BLI_math_color.h"
 #include "BLI_utildefines.h"
 
@@ -38,6 +39,52 @@
 
 /******************************** Color Space ********************************/
 
+#ifdef __SSE2__
+
+MALWAYS_INLINE __m128 srgb_to_linearrgb_v4_simd(const __m128 c)
+{
+	__m128 cmp = _mm_cmplt_ps(c, _mm_set1_ps(0.04045f));
+	__m128 lt = _mm_max_ps(_mm_mul_ps(c, _mm_set1_ps(1.0f/12.92f)),
+	                       _mm_set1_ps(0.0f));
+	__m128 gtebase = _mm_mul_ps(_mm_add_ps(c, _mm_set1_ps(0.055f)),
+	                            _mm_set1_ps(1.0f/1.055f)); /* fma */
+	__m128 gte = _bli_math_fastpow24(gtebase);
+	return _bli_math_blend_sse(cmp, lt, gte);
+}
+
+MALWAYS_INLINE __m128 linearrgb_to_srgb_v4_simd(const __m128 c)
+{
+	__m128 cmp = _mm_cmplt_ps(c, _mm_set1_ps(0.0031308f));
+	__m128 lt = _mm_max_ps(_mm_mul_ps(c, _mm_set1_ps(12.92f)),
+	                       _mm_set1_ps(0.0f));
+	__m128 gte = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.055f),
+	                                   _bli_math_fastpow512(c)),
+	                        _mm_set1_ps(-0.055f));
+	return _bli_math_blend_sse(cmp, lt, gte);
+}
+
+MINLINE void srgb_to_linearrgb_v3_v3(float linear[3], const float srgb[3])
+{
+	float r[4] = {srgb[0], srgb[1], srgb[2], 0.0f};
+	__m128 *rv = (__m128 *)&r;
+	*rv = srgb_to_linearrgb_v4_simd(*rv);
+	linear[0] = r[0];
+	linear[1] = r[1];
+	linear[2] = r[2];
+}
+
+MINLINE void linearrgb_to_srgb_v3_v3(float srgb[3], const float linear[3])
+{
+	float r[4] = {linear[0], linear[1], linear[2], 0.0f};
+	__m128 *rv = (__m128 *)&r;
+	*rv = linearrgb_to_srgb_v4_simd(*rv);
+	srgb[0] = r[0];
+	srgb[1] = r[1];
+	srgb[2] = r[2];
+}
+
+#else  /* __SSE2__ */
+
 MINLINE void srgb_to_linearrgb_v3_v3(float linear[3], const float srgb[3])
 {
 	linear[0] = srgb_to_linearrgb(srgb[0]);
@@ -51,6 +98,7 @@ MINLINE void linearrgb_to_srgb_v3_v3(float srgb[3], const float linear[3])
 	srgb[1] = linearrgb_to_srgb(linear[1]);
 	srgb[2] = linearrgb_to_srgb(linear[2]);
 }
+#endif  /* __SSE2__ */
 
 MINLINE void srgb_to_linearrgb_v4(float linear[4], const float srgb[4])
 {
@@ -98,10 +146,14 @@ MINLINE void srgb_to_linearrgb_predivide_v4(float linear[4], const float srgb[4]
 		inv_alpha = 1.0f / alpha;
 	}
 
-	linear[0] = srgb_to_linearrgb(srgb[0] * inv_alpha) * alpha;
-	linear[1] = srgb_to_linearrgb(srgb[1] * inv_alpha) * alpha;
-	linear[2] = srgb_to_linearrgb(srgb[2] * inv_alpha) * alpha;
+	linear[0] = srgb[0] * inv_alpha;
+	linear[1] = srgb[1] * inv_alpha;
+	linear[2] = srgb[2] * inv_alpha;
 	linear[3] = srgb[3];
+	srgb_to_linearrgb_v3_v3(linear, linear);
+	linear[0] *= alpha;
+	linear[1] *= alpha;
+	linear[2] *= alpha;
 }
 
 MINLINE void linearrgb_to_srgb_predivide_v4(float srgb[4], const float linear[4])
@@ -117,10 +169,14 @@ MINLINE void linearrgb_to_srgb_predivide_v4(float srgb[4], const float linear[4]
 		inv_alpha = 1.0f / alpha;
 	}
 
-	srgb[0] = linearrgb_to_srgb(linear[0] * inv_alpha) * alpha;
-	srgb[1] = linearrgb_to_srgb(linear[1] * inv_alpha) * alpha;
-	srgb[2] = linearrgb_to_srgb(linear[2] * inv_alpha) * alpha;
+	srgb[0] = linear[0] * inv_alpha;
+	srgb[1] = linear[1] * inv_alpha;
+	srgb[2] = linear[2] * inv_alpha;
 	srgb[3] = linear[3];
+	linearrgb_to_srgb_v3_v3(srgb, srgb);
+	srgb[0] *= alpha;
+	srgb[1] *= alpha;
+	srgb[2] *= alpha;
 }
 
 /* LUT accelerated conversions */
diff --git a/source/blender/compositor/CMakeLists.txt b/source/blender/compositor/CMakeLists.txt
index 35e5ec98e57..3180e7e4154 100644
--- a/source/blender/compositor/CMakeLists.txt
+++ b/source/blender/compositor/CMakeLists.txt
@@ -542,11 +542,6 @@ list(APPEND INC
 	${CMAKE_CURRENT_BINARY_DIR}/operations
 )
 
-if(MSVC AND NOT CMAKE_CL_64)
-	set(CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS} /arch:SSE2")
-	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:SSE2")
-endif()
-
 data_to_c(${CMAKE_CURRENT_SOURCE_DIR}/operations/COM_OpenCLKernels.cl
           ${CMAKE_CURRENT_BINARY_DIR}/operations/COM_OpenCLKernels.cl.h SRC)