diff options
-rw-r--r-- | CMakeLists.txt | 22 | ||||
-rw-r--r-- | source/blender/blenlib/intern/math_base_inline.c | 98 | ||||
-rw-r--r-- | source/blender/blenlib/intern/math_color_inline.c | 68 | ||||
-rw-r--r-- | source/blender/compositor/CMakeLists.txt | 5 |
4 files changed, 171 insertions, 22 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 438cb1cdd9f..6fc4ca2729d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2357,17 +2357,17 @@ endif() # See TEST_SSE_SUPPORT() for how this is defined. -if(WITH_RAYOPTIMIZATION) - if(SUPPORT_SSE_BUILD) - set(PLATFORM_CFLAGS " ${COMPILER_SSE_FLAG} ${PLATFORM_CFLAGS}") - add_definitions(-D__SSE__ -D__MMX__) - endif() - if(SUPPORT_SSE2_BUILD) - set(PLATFORM_CFLAGS " ${COMPILER_SSE2_FLAG} ${PLATFORM_CFLAGS}") - add_definitions(-D__SSE2__) - if(NOT SUPPORT_SSE_BUILD) # dont double up - add_definitions(-D__MMX__) - endif() +# Do it globally, SSE2 is required for quite some time now. +# Doing it now allows to use SSE/SSE2 in inline headers. +if(SUPPORT_SSE_BUILD) + set(PLATFORM_CFLAGS " ${COMPILER_SSE_FLAG} ${PLATFORM_CFLAGS}") + add_definitions(-D__SSE__ -D__MMX__) +endif() +if(SUPPORT_SSE2_BUILD) + set(PLATFORM_CFLAGS " ${COMPILER_SSE2_FLAG} ${PLATFORM_CFLAGS}") + add_definitions(-D__SSE2__) + if(NOT SUPPORT_SSE_BUILD) # dont double up + add_definitions(-D__MMX__) endif() endif() diff --git a/source/blender/blenlib/intern/math_base_inline.c b/source/blender/blenlib/intern/math_base_inline.c index f70c12bd26e..7ab2b784812 100644 --- a/source/blender/blenlib/intern/math_base_inline.c +++ b/source/blender/blenlib/intern/math_base_inline.c @@ -35,6 +35,10 @@ #include <stdlib.h> #include <string.h> +#ifdef __SSE2__ +# include <emmintrin.h> +#endif + #include "BLI_math_base.h" /* copied from BLI_utildefines.h */ @@ -311,4 +315,98 @@ MINLINE int signum_i(float a) else return 0; } +/* Internal helpers for SSE2 implementation. + * + * NOTE: Are to be called ONLY from inside `#ifdef __SSE2__` !!! + */ + +#ifdef __SSE2__ + +/* Calculate initial guess for arg^exp based on float representation + * This method gives a constant bias, which can be easily compensated by + * multiplicating with bias_coeff. + * Gives better results for exponents near 1 (e. g. 4/5). + * exp = exponent, encoded as uint32_t + * e2coeff = 2^(127/exponent - 127) * bias_coeff^(1/exponent), encoded as + * uint32_t + * + * We hope that exp and e2coeff gets properly inlined + */ +MALWAYS_INLINE __m128 _bli_math_fastpow(const int exp, + const int e2coeff, + const __m128 arg) +{ + __m128 ret; + ret = _mm_mul_ps(arg, _mm_castsi128_ps(_mm_set1_epi32(e2coeff))); + ret = _mm_cvtepi32_ps(_mm_castps_si128(ret)); + ret = _mm_mul_ps(ret, _mm_castsi128_ps(_mm_set1_epi32(exp))); + ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret)); + return ret; +} + +/* Improve x ^ 1.0f/5.0f solution with Newton-Raphson method */ +MALWAYS_INLINE __m128 _bli_math_improve_5throot_solution( + const __m128 old_result, + const __m128 x) +{ + __m128 approx2 = _mm_mul_ps(old_result, old_result); + __m128 approx4 = _mm_mul_ps(approx2, approx2); + __m128 t = _mm_div_ps(x, approx4); + __m128 summ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(4.0f), old_result), t); /* fma */ + return _mm_mul_ps(summ, _mm_set1_ps(1.0f/5.0f)); +} + +/* Calculate powf(x, 2.4). Working domain: 1e-10 < x < 1e+10 */ +MALWAYS_INLINE __m128 _bli_math_fastpow24(const __m128 arg) +{ + /* max, avg and |avg| errors were calculated in gcc without FMA instructions + * The final precision should be better than powf in glibc */ + + /* Calculate x^4/5, coefficient 0.994 was constructed manually to minimize + * avg error. + */ + /* 0x3F4CCCCD = 4/5 */ + /* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */ + /* error max = 0.17 avg = 0.0018 |avg| = 0.05 */ + __m128 x = _bli_math_fastpow(0x3F4CCCCD, 0x4F55A7FB, arg); + __m128 arg2 = _mm_mul_ps(arg, arg); + __m128 arg4 = _mm_mul_ps(arg2, arg2); + /* error max = 0.018 avg = 0.0031 |avg| = 0.0031 */ + x = _bli_math_improve_5throot_solution(x, arg4); + /* error max = 0.00021 avg = 1.6e-05 |avg| = 1.6e-05 */ + x = _bli_math_improve_5throot_solution(x, arg4); + /* error max = 6.1e-07 avg = 5.2e-08 |avg| = 1.1e-07 */ + x = _bli_math_improve_5throot_solution(x, arg4); + return _mm_mul_ps(x, _mm_mul_ps(x, x)); +} + +/* Calculate powf(x, 1.0f / 2.4) */ +MALWAYS_INLINE __m128 _bli_math_fastpow512(const __m128 arg) +{ + /* 5/12 is too small, so compute the 4th root of 20/12 instead. + * 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow. + * weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3 + */ + __m128 xf = _bli_math_fastpow(0x3f2aaaab, 0x5eb504f3, arg); + __m128 xover = _mm_mul_ps(arg, xf); + __m128 xfm1 = _mm_rsqrt_ps(xf); + __m128 x2 = _mm_mul_ps(arg, arg); + __m128 xunder = _mm_mul_ps(x2, xfm1); + /* sqrt2 * over + 2 * sqrt2 * under */ + __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), + _mm_add_ps(xover, xunder)); + xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); + xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); + return xavg; +} + +MALWAYS_INLINE __m128 _bli_math_blend_sse(const __m128 mask, + const __m128 a, + const __m128 b) +{ + return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b)); +} + +#endif /* __SSE2__ */ + #endif /* __MATH_BASE_INLINE_C__ */ diff --git a/source/blender/blenlib/intern/math_color_inline.c b/source/blender/blenlib/intern/math_color_inline.c index 45466226e72..c268481a5d9 100644 --- a/source/blender/blenlib/intern/math_color_inline.c +++ b/source/blender/blenlib/intern/math_color_inline.c @@ -28,6 +28,7 @@ */ +#include "BLI_math_base.h" #include "BLI_math_color.h" #include "BLI_utildefines.h" @@ -38,6 +39,52 @@ /******************************** Color Space ********************************/ +#ifdef __SSE2__ + +MALWAYS_INLINE __m128 srgb_to_linearrgb_v4_simd(const __m128 c) +{ + __m128 cmp = _mm_cmplt_ps(c, _mm_set1_ps(0.04045f)); + __m128 lt = _mm_max_ps(_mm_mul_ps(c, _mm_set1_ps(1.0f/12.92f)), + _mm_set1_ps(0.0f)); + __m128 gtebase = _mm_mul_ps(_mm_add_ps(c, _mm_set1_ps(0.055f)), + _mm_set1_ps(1.0f/1.055f)); /* fma */ + __m128 gte = _bli_math_fastpow24(gtebase); + return _bli_math_blend_sse(cmp, lt, gte); +} + +MALWAYS_INLINE __m128 linearrgb_to_srgb_v4_simd(const __m128 c) +{ + __m128 cmp = _mm_cmplt_ps(c, _mm_set1_ps(0.0031308f)); + __m128 lt = _mm_max_ps(_mm_mul_ps(c, _mm_set1_ps(12.92f)), + _mm_set1_ps(0.0f)); + __m128 gte = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.055f), + _bli_math_fastpow512(c)), + _mm_set1_ps(-0.055f)); + return _bli_math_blend_sse(cmp, lt, gte); +} + +MINLINE void srgb_to_linearrgb_v3_v3(float linear[3], const float srgb[3]) +{ + float r[4] = {srgb[0], srgb[1], srgb[2], 0.0f}; + __m128 *rv = (__m128 *)&r; + *rv = srgb_to_linearrgb_v4_simd(*rv); + linear[0] = r[0]; + linear[1] = r[1]; + linear[2] = r[2]; +} + +MINLINE void linearrgb_to_srgb_v3_v3(float srgb[3], const float linear[3]) +{ + float r[4] = {linear[0], linear[1], linear[2], 0.0f}; + __m128 *rv = (__m128 *)&r; + *rv = linearrgb_to_srgb_v4_simd(*rv); + srgb[0] = r[0]; + srgb[1] = r[1]; + srgb[2] = r[2]; +} + +#else /* __SSE2__ */ + MINLINE void srgb_to_linearrgb_v3_v3(float linear[3], const float srgb[3]) { linear[0] = srgb_to_linearrgb(srgb[0]); @@ -51,6 +98,7 @@ MINLINE void linearrgb_to_srgb_v3_v3(float srgb[3], const float linear[3]) srgb[1] = linearrgb_to_srgb(linear[1]); srgb[2] = linearrgb_to_srgb(linear[2]); } +#endif /* __SSE2__ */ MINLINE void srgb_to_linearrgb_v4(float linear[4], const float srgb[4]) { @@ -98,10 +146,14 @@ MINLINE void srgb_to_linearrgb_predivide_v4(float linear[4], const float srgb[4] inv_alpha = 1.0f / alpha; } - linear[0] = srgb_to_linearrgb(srgb[0] * inv_alpha) * alpha; - linear[1] = srgb_to_linearrgb(srgb[1] * inv_alpha) * alpha; - linear[2] = srgb_to_linearrgb(srgb[2] * inv_alpha) * alpha; + linear[0] = srgb[0] * inv_alpha; + linear[1] = srgb[1] * inv_alpha; + linear[2] = srgb[2] * inv_alpha; linear[3] = srgb[3]; + srgb_to_linearrgb_v3_v3(linear, linear); + linear[0] *= alpha; + linear[1] *= alpha; + linear[2] *= alpha; } MINLINE void linearrgb_to_srgb_predivide_v4(float srgb[4], const float linear[4]) @@ -117,10 +169,14 @@ MINLINE void linearrgb_to_srgb_predivide_v4(float srgb[4], const float linear[4] inv_alpha = 1.0f / alpha; } - srgb[0] = linearrgb_to_srgb(linear[0] * inv_alpha) * alpha; - srgb[1] = linearrgb_to_srgb(linear[1] * inv_alpha) * alpha; - srgb[2] = linearrgb_to_srgb(linear[2] * inv_alpha) * alpha; + srgb[0] = linear[0] * inv_alpha; + srgb[1] = linear[1] * inv_alpha; + srgb[2] = linear[2] * inv_alpha; srgb[3] = linear[3]; + linearrgb_to_srgb_v3_v3(srgb, srgb); + srgb[0] *= alpha; + srgb[1] *= alpha; + srgb[2] *= alpha; } /* LUT accelerated conversions */ diff --git a/source/blender/compositor/CMakeLists.txt b/source/blender/compositor/CMakeLists.txt index 35e5ec98e57..3180e7e4154 100644 --- a/source/blender/compositor/CMakeLists.txt +++ b/source/blender/compositor/CMakeLists.txt @@ -542,11 +542,6 @@ list(APPEND INC ${CMAKE_CURRENT_BINARY_DIR}/operations ) -if(MSVC AND NOT CMAKE_CL_64) - set(CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS} /arch:SSE2") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:SSE2") -endif() - data_to_c(${CMAKE_CURRENT_SOURCE_DIR}/operations/COM_OpenCLKernels.cl ${CMAKE_CURRENT_BINARY_DIR}/operations/COM_OpenCLKernels.cl.h SRC) |