Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAras Pranckevicius <aras@nesnausk.org>2022-06-15 20:51:12 +0300
committerAras Pranckevicius <aras@nesnausk.org>2022-06-15 20:51:25 +0300
commit004d858138fced20d45cf3cc0149fcef1922c8a0 (patch)
tree456ba308c9119d6796c85b93467485174b64dcc8 /source/blender/blenlib/intern
parent7e89bbb2ff3e060807beaef200d9173befeba528 (diff)
math: improve accuracy of Linear->sRGB conversion SIMD path
srgb_to_linearrgb_v3_v3 is using an approximation of powf that is SIMD. However, while the accuracy of it is ok, a larger issue is that it produces different results on Intel compared to ARM architectures. On ARM (e.g. AppleSilicon), the result of the SIMD code path is much closer to the reference implementation. This seems to be because of _mm_rsqrt_ps usage in _bli_math_fastpow512. The ARM/NEON code path emulates inverse square root with a combination of vrsqrteq_f32 followed by two Newton-Raphson iterations, because blender uses the SSE2NEON_PRECISE_SQRT define. This commit adds similar NR iterations to the "actual SSE" code path as well. Max error of srgb->linear->srgb conversion roundtrip goes from 0.000211 down to about 0.000062. Reviewed By: Sergey Sharybin Differential Revision: https://developer.blender.org/D15193
Diffstat (limited to 'source/blender/blenlib/intern')
-rw-r--r--source/blender/blenlib/intern/math_base_inline.c20
1 files changed, 17 insertions, 3 deletions
diff --git a/source/blender/blenlib/intern/math_base_inline.c b/source/blender/blenlib/intern/math_base_inline.c
index a983821f15e..4a213f5fe74 100644
--- a/source/blender/blenlib/intern/math_base_inline.c
+++ b/source/blender/blenlib/intern/math_base_inline.c
@@ -767,6 +767,20 @@ MALWAYS_INLINE __m128 _bli_math_fastpow24(const __m128 arg)
return _mm_mul_ps(x, _mm_mul_ps(x, x));
}
+MALWAYS_INLINE __m128 _bli_math_rsqrt(__m128 in)
+{
+ __m128 r = _mm_rsqrt_ps(in);
+ /* Only do additional Newton-Raphson iterations when using actual SSE
+ * code path. When we are emulating SSE on NEON via sse2neon, the
+ * additional NR iterations are already done inside _mm_rsqrt_ps
+ * emulation. */
+# if defined(__SSE2__)
+ r = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r),
+ _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(in, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+# endif
+ return r;
+}
+
/* Calculate powf(x, 1.0f / 2.4) */
MALWAYS_INLINE __m128 _bli_math_fastpow512(const __m128 arg)
{
@@ -776,14 +790,14 @@ MALWAYS_INLINE __m128 _bli_math_fastpow512(const __m128 arg)
*/
__m128 xf = _bli_math_fastpow(0x3f2aaaab, 0x5eb504f3, arg);
__m128 xover = _mm_mul_ps(arg, xf);
- __m128 xfm1 = _mm_rsqrt_ps(xf);
+ __m128 xfm1 = _bli_math_rsqrt(xf);
__m128 x2 = _mm_mul_ps(arg, arg);
__m128 xunder = _mm_mul_ps(x2, xfm1);
/* sqrt2 * over + 2 * sqrt2 * under */
__m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
_mm_add_ps(xover, xunder));
- xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
- xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
+ xavg = _mm_mul_ps(xavg, _bli_math_rsqrt(xavg));
+ xavg = _mm_mul_ps(xavg, _bli_math_rsqrt(xavg));
return xavg;
}