Refactor: replace Cycles sse/avx types by vectorized float4/int4/float8/int8

The distinction existed for legacy reasons, to easily port of Embree intersection code without affecting the main vector types. However we are now using SIMD for these types as well, so no good reason to keep the distinction. Also more consistently pass these vector types by value in inline functions. Previously it was partially changed for functions used by Metal to avoid having to add address space qualifiers, simple to do it everywhere. Also removes function declarations for vector math headers, serves no real purpose. Differential Revision: https://developer.blender.org/D16146
author: Brecht Van Lommel <brecht> 2022-11-01 17:16:55 +0300
committer: Brecht Van Lommel <brecht@blender.org> 2022-11-08 14:28:40 +0300
commit: e1b3d9112730bc3b569ffff732a1558752ded146 (patch)
tree: 27caad945dcc9ce7313b4c84cd0efbab6f70503c /intern/cycles/util/color.h
parent: 32ec0521c542bb78a0080f8091856ec085030f09 (diff)
1 files changed, 23 insertions, 25 deletions
diff --git a/intern/cycles/util/color.h b/intern/cycles/util/color.h
index 537f8ab6771..93e984120f2 100644
--- a/intern/cycles/util/color.h
+++ b/intern/cycles/util/color.h
@@ -228,28 +228,27 @@ ccl_device float3 xyY_to_xyz(float x, float y, float Y)
  * exp = exponent, encoded as uint32_t
  * e2coeff = 2^(127/exponent - 127) * bias_coeff^(1/exponent), encoded as uint32_t
  */
-template<unsigned exp, unsigned e2coeff> ccl_device_inline ssef fastpow(const ssef &arg)
+template<unsigned exp, unsigned e2coeff> ccl_device_inline float4 fastpow(const float4 &arg)
 {
-  ssef ret;
-  ret = arg * cast(ssei(e2coeff));
-  ret = ssef(cast(ret));
-  ret = ret * cast(ssei(exp));
-  ret = cast(ssei(ret));
+  float4 ret = arg * cast(make_int4(e2coeff));
+  ret = make_float4(cast(ret));
+  ret = ret * cast(make_int4(exp));
+  ret = cast(make_int4(ret));
   return ret;
 }
 
 /* Improve x ^ 1.0f/5.0f solution with Newton-Raphson method */
-ccl_device_inline ssef improve_5throot_solution(const ssef &old_result, const ssef &x)
+ccl_device_inline float4 improve_5throot_solution(const float4 &old_result, const float4 &x)
 {
-  ssef approx2 = old_result * old_result;
-  ssef approx4 = approx2 * approx2;
-  ssef t = x / approx4;
-  ssef summ = madd(ssef(4.0f), old_result, t);
-  return summ * ssef(1.0f / 5.0f);
+  float4 approx2 = old_result * old_result;
+  float4 approx4 = approx2 * approx2;
+  float4 t = x / approx4;
+  float4 summ = madd(make_float4(4.0f), old_result, t);
+  return summ * make_float4(1.0f / 5.0f);
 }
 
 /* Calculate powf(x, 2.4). Working domain: 1e-10 < x < 1e+10 */
-ccl_device_inline ssef fastpow24(const ssef &arg)
+ccl_device_inline float4 fastpow24(const float4 &arg)
 {
   /* max, avg and |avg| errors were calculated in gcc without FMA instructions
    * The final precision should be better than powf in glibc */
@@ -257,9 +256,10 @@ ccl_device_inline ssef fastpow24(const ssef &arg)
   /* Calculate x^4/5, coefficient 0.994 was constructed manually to minimize avg error */
   /* 0x3F4CCCCD = 4/5 */
   /* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */
-  ssef x = fastpow<0x3F4CCCCD, 0x4F55A7FB>(arg);  // error max = 0.17  avg = 0.0018    |avg| = 0.05
-  ssef arg2 = arg * arg;
-  ssef arg4 = arg2 * arg2;
+  float4 x = fastpow<0x3F4CCCCD, 0x4F55A7FB>(
+      arg);  // error max = 0.17  avg = 0.0018    |avg| = 0.05
+  float4 arg2 = arg * arg;
+  float4 arg4 = arg2 * arg2;
 
   /* error max = 0.018     avg = 0.0031    |avg| = 0.0031 */
   x = improve_5throot_solution(x, arg4);
@@ -271,12 +271,12 @@ ccl_device_inline ssef fastpow24(const ssef &arg)
   return x * (x * x);
 }
 
-ccl_device ssef color_srgb_to_linear(const ssef &c)
+ccl_device float4 color_srgb_to_linear(const float4 &c)
 {
-  sseb cmp = c < ssef(0.04045f);
-  ssef lt = max(c * ssef(1.0f / 12.92f), ssef(0.0f));
-  ssef gtebase = (c + ssef(0.055f)) * ssef(1.0f / 1.055f); /* fma */
-  ssef gte = fastpow24(gtebase);
+  int4 cmp = c < make_float4(0.04045f);
+  float4 lt = max(c * make_float4(1.0f / 12.92f), make_float4(0.0f));
+  float4 gtebase = (c + make_float4(0.055f)) * make_float4(1.0f / 1.055f); /* fma */
+  float4 gte = fastpow24(gtebase);
   return select(cmp, lt, gte);
 }
 #endif /* __KERNEL_SSE2__ */
@@ -302,10 +302,8 @@ ccl_device float4 color_linear_to_srgb_v4(float4 c)
 ccl_device float4 color_srgb_to_linear_v4(float4 c)
 {
 #ifdef __KERNEL_SSE2__
-  ssef r_ssef;
-  float4 &r = (float4 &)r_ssef;
-  r = c;
-  r_ssef = color_srgb_to_linear(r_ssef);
+  float4 r = c;
+  r = color_srgb_to_linear(r);
   r.w = c.w;
   return r;
 #else
author	Brecht Van Lommel <brecht>	2022-11-01 17:16:55 +0300
committer	Brecht Van Lommel <brecht@blender.org>	2022-11-08 14:28:40 +0300
commit	e1b3d9112730bc3b569ffff732a1558752ded146 (patch)
tree	27caad945dcc9ce7313b4c84cd0efbab6f70503c /intern/cycles/util/color.h
parent	32ec0521c542bb78a0080f8091856ec085030f09 (diff)