math: improve accuracy of Linear->sRGB conversion SIMD path

srgb_to_linearrgb_v3_v3 is using a SIMD approximation of powf. While its accuracy is OK, a larger issue is that it produces different results on Intel compared to ARM architectures. On ARM (e.g. Apple Silicon), the result of the SIMD code path is much closer to the reference implementation. This seems to be because of the _mm_rsqrt_ps usage in _bli_math_fastpow512. The ARM/NEON code path emulates the inverse square root with vrsqrteq_f32 followed by two Newton-Raphson iterations, because Blender uses the SSE2NEON_PRECISE_SQRT define.

This commit adds a similar NR iteration to the "actual SSE" code path as well. The max error of the srgb->linear->srgb conversion roundtrip goes from 0.000211 down to about 0.000062.

Reviewed By: Sergey Sharybin
Differential Revision: https://developer.blender.org/D15193
author: Aras Pranckevicius <aras@nesnausk.org> 2022-06-15 20:51:12 +0300
committer: Aras Pranckevicius <aras@nesnausk.org> 2022-06-15 20:51:25 +0300
commit: 004d858138fced20d45cf3cc0149fcef1922c8a0 (patch)
tree: 456ba308c9119d6796c85b93467485174b64dcc8 /source/blender/blenlib/intern
parent: 7e89bbb2ff3e060807beaef200d9173befeba528 (diff)
1 files changed, 17 insertions, 3 deletions
diff --git a/source/blender/blenlib/intern/math_base_inline.c b/source/blender/blenlib/intern/math_base_inline.c
index a983821f15e..4a213f5fe74 100644
--- a/source/blender/blenlib/intern/math_base_inline.c
+++ b/source/blender/blenlib/intern/math_base_inline.c
@@ -767,6 +767,20 @@ MALWAYS_INLINE __m128 _bli_math_fastpow24(const __m128 arg)
   return _mm_mul_ps(x, _mm_mul_ps(x, x));
 }
 
+MALWAYS_INLINE __m128 _bli_math_rsqrt(__m128 in)
+{
+  __m128 r = _mm_rsqrt_ps(in);
+  /* Refine the estimate with one Newton-Raphson step, r * (1.5 - 0.5 * in * r * r),
+   * only when using the actual SSE code path. When we are emulating SSE on NEON
+   * via sse2neon, the additional NR iterations are already done inside the
+   * _mm_rsqrt_ps emulation. */
+#  if defined(__SSE2__)
+  r = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r),
+                 _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(in, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+#  endif
+  return r;
+}
+
 /* Calculate powf(x, 1.0f / 2.4) */
 MALWAYS_INLINE __m128 _bli_math_fastpow512(const __m128 arg)
 {
@@ -776,14 +790,14 @@ MALWAYS_INLINE __m128 _bli_math_fastpow512(const __m128 arg)
    */
   __m128 xf = _bli_math_fastpow(0x3f2aaaab, 0x5eb504f3, arg);
   __m128 xover = _mm_mul_ps(arg, xf);
-  __m128 xfm1 = _mm_rsqrt_ps(xf);
+  __m128 xfm1 = _bli_math_rsqrt(xf);
   __m128 x2 = _mm_mul_ps(arg, arg);
   __m128 xunder = _mm_mul_ps(x2, xfm1);
   /* sqrt2 * over + 2 * sqrt2 * under */
   __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
                            _mm_add_ps(xover, xunder));
-  xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
-  xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
+  xavg = _mm_mul_ps(xavg, _bli_math_rsqrt(xavg));
+  xavg = _mm_mul_ps(xavg, _bli_math_rsqrt(xavg));
   return xavg;
 }
author	Aras Pranckevicius <aras@nesnausk.org>	2022-06-15 20:51:12 +0300
committer	Aras Pranckevicius <aras@nesnausk.org>	2022-06-15 20:51:25 +0300
commit	004d858138fced20d45cf3cc0149fcef1922c8a0 (patch)
tree	456ba308c9119d6796c85b93467485174b64dcc8 /source/blender/blenlib/intern
parent	7e89bbb2ff3e060807beaef200d9173befeba528 (diff)