2 files changed, 85 insertions, 3 deletions
diff --git a/source/blender/blenlib/intern/math_base_inline.c b/source/blender/blenlib/intern/math_base_inline.c
index a983821f15e..4a213f5fe74 100644
--- a/source/blender/blenlib/intern/math_base_inline.c
+++ b/source/blender/blenlib/intern/math_base_inline.c
@@ -767,6 +767,20 @@ MALWAYS_INLINE __m128 _bli_math_fastpow24(const __m128 arg)
   return _mm_mul_ps(x, _mm_mul_ps(x, x));
 }
 
+MALWAYS_INLINE __m128 _bli_math_rsqrt(__m128 in)
+{
+  __m128 r = _mm_rsqrt_ps(in);
+  /* Refine the estimate with one Newton-Raphson step, r * (1.5 - 0.5 * in * r * r),
+   * but only when using the actual SSE code path. When we are emulating SSE on
+   * NEON via sse2neon, the additional NR iterations are already done inside
+   * the _mm_rsqrt_ps emulation. */
+#  if defined(__SSE2__)
+  r = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r),
+                 _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(in, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+#  endif
+  return r;
+}
+
 /* Calculate powf(x, 1.0f / 2.4) */
 MALWAYS_INLINE __m128 _bli_math_fastpow512(const __m128 arg)
 {
@@ -776,14 +790,14 @@ MALWAYS_INLINE __m128 _bli_math_fastpow512(const __m128 arg)
    */
   __m128 xf = _bli_math_fastpow(0x3f2aaaab, 0x5eb504f3, arg);
   __m128 xover = _mm_mul_ps(arg, xf);
-  __m128 xfm1 = _mm_rsqrt_ps(xf);
+  __m128 xfm1 = _bli_math_rsqrt(xf);
   __m128 x2 = _mm_mul_ps(arg, arg);
   __m128 xunder = _mm_mul_ps(x2, xfm1);
   /* sqrt2 * over + 2 * sqrt2 * under */
   __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
                            _mm_add_ps(xover, xunder));
-  xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
-  xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
+  xavg = _mm_mul_ps(xavg, _bli_math_rsqrt(xavg));
+  xavg = _mm_mul_ps(xavg, _bli_math_rsqrt(xavg));
   return xavg;
 }
 
diff --git a/source/blender/blenlib/tests/BLI_math_color_test.cc b/source/blender/blenlib/tests/BLI_math_color_test.cc
index 7f2c0a3f1ca..4d928477870 100644
--- a/source/blender/blenlib/tests/BLI_math_color_test.cc
+++ b/source/blender/blenlib/tests/BLI_math_color_test.cc
@@ -74,3 +74,71 @@ TEST(math_color, LinearRGBTosRGBRoundtrip)
     EXPECT_NEAR(orig_linear_color, linear_color, 1e-5);
   }
 }
+
+TEST(math_color, linearrgb_to_srgb_v3_v3)
+{
+  float srgb_color[3];
+  {
+    const float kTolerance = 1.0e-8f;
+    const float linear_color[3] = {0.0023f, 0.0024f, 0.0025f};
+    linearrgb_to_srgb_v3_v3(srgb_color, linear_color);
+    EXPECT_NEAR(0.029716f, srgb_color[0], kTolerance);
+    EXPECT_NEAR(0.031008f, srgb_color[1], kTolerance);
+    EXPECT_NEAR(0.032300f, srgb_color[2], kTolerance);
+  }
+
+  {
+    /* SIMD implementation of linear->srgb for larger inputs
+     * is less accurate; use larger tolerance. */
+    const float kTolerance = 3.6e-5f;
+    const float linear_color[3] = {0.71f, 0.75f, 0.78f};
+    linearrgb_to_srgb_v3_v3(srgb_color, linear_color);
+    EXPECT_NEAR(0.859696f, srgb_color[0], kTolerance);
+    EXPECT_NEAR(0.880825f, srgb_color[1], kTolerance);
+    EXPECT_NEAR(0.896244f, srgb_color[2], kTolerance);
+  }
+
+  {
+    /* Uncommon but possible case: input values beyond the 1.0 range. */
+    const float kTolerance = 2.3e-4f;
+    const float linear_color[3] = {1.5f, 2.8f, 5.6f};
+    linearrgb_to_srgb_v3_v3(srgb_color, linear_color);
+    EXPECT_NEAR(1.19418f, srgb_color[0], kTolerance);
+    EXPECT_NEAR(1.56520f, srgb_color[1], kTolerance);
+    EXPECT_NEAR(2.10771f, srgb_color[2], kTolerance);
+  }
+}
+
+TEST(math_color, srgb_to_linearrgb_v3_v3)
+{
+  float linear_color[3];
+  {
+    const float kTolerance = 1.0e-8f;
+    const float srgb_color[3] = {0.0023f, 0.0024f, 0.0025f};
+    srgb_to_linearrgb_v3_v3(linear_color, srgb_color);
+    EXPECT_NEAR(0.000178019f, linear_color[0], kTolerance);
+    EXPECT_NEAR(0.000185759f, linear_color[1], kTolerance);
+    EXPECT_NEAR(0.000193498f, linear_color[2], kTolerance);
+  }
+
+  {
+    /* SIMD implementation of srgb->linear for larger inputs
+     * is less accurate; use larger tolerance. */
+    const float kTolerance = 1.5e-7f;
+    const float srgb_color[3] = {0.71f, 0.72f, 0.73f};
+    srgb_to_linearrgb_v3_v3(linear_color, srgb_color);
+    EXPECT_NEAR(0.4623615f, linear_color[0], kTolerance);
+    EXPECT_NEAR(0.4770000f, linear_color[1], kTolerance);
+    EXPECT_NEAR(0.4919052f, linear_color[2], kTolerance);
+  }
+
+  {
+    /* Uncommon but possible case: input values beyond the 1.0 range. */
+    const float kTolerance = 7.7e-6f;
+    const float srgb_color[3] = {1.1f, 2.5f, 5.6f};
+    srgb_to_linearrgb_v3_v3(linear_color, srgb_color);
+    EXPECT_NEAR(1.24277f, linear_color[0], kTolerance);
+    EXPECT_NEAR(8.35473f, linear_color[1], kTolerance);
+    EXPECT_NEAR(56.23833f, linear_color[2], kTolerance);
+  }
+}