Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/util/math_float4.h')
-rw-r--r--intern/cycles/util/math_float4.h475
1 files changed, 255 insertions, 220 deletions
diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h
index c2721873037..301d2d789c0 100644
--- a/intern/cycles/util/math_float4.h
+++ b/intern/cycles/util/math_float4.h
@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2013 Intel Corporation
* Copyright 2011-2022 Blender Foundation */
#ifndef __UTIL_MATH_FLOAT4_H__
@@ -10,85 +11,6 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
-#if !defined(__KERNEL_METAL__)
-ccl_device_inline float4 operator-(const float4 &a);
-ccl_device_inline float4 operator*(const float4 &a, const float4 &b);
-ccl_device_inline float4 operator*(const float4 &a, float f);
-ccl_device_inline float4 operator*(float f, const float4 &a);
-ccl_device_inline float4 operator/(const float4 &a, float f);
-ccl_device_inline float4 operator/(const float4 &a, const float4 &b);
-ccl_device_inline float4 operator+(const float4 &a, const float f);
-ccl_device_inline float4 operator+(const float4 &a, const float4 &b);
-ccl_device_inline float4 operator-(const float4 &a, const float f);
-ccl_device_inline float4 operator-(const float4 &a, const float4 &b);
-ccl_device_inline float4 operator+=(float4 &a, const float4 &b);
-ccl_device_inline float4 operator*=(float4 &a, const float4 &b);
-ccl_device_inline float4 operator*=(float4 &a, float f);
-ccl_device_inline float4 operator/=(float4 &a, float f);
-
-ccl_device_inline int4 operator<(const float4 &a, const float4 &b);
-ccl_device_inline int4 operator>=(const float4 &a, const float4 &b);
-ccl_device_inline int4 operator<=(const float4 &a, const float4 &b);
-ccl_device_inline bool operator==(const float4 &a, const float4 &b);
-
-ccl_device_inline float distance(const float4 &a, const float4 &b);
-ccl_device_inline float dot(const float4 &a, const float4 &b);
-ccl_device_inline float len_squared(const float4 &a);
-ccl_device_inline float4 rcp(const float4 &a);
-ccl_device_inline float4 sqrt(const float4 &a);
-ccl_device_inline float4 sqr(const float4 &a);
-ccl_device_inline float4 cross(const float4 &a, const float4 &b);
-ccl_device_inline bool is_zero(const float4 &a);
-ccl_device_inline float average(const float4 &a);
-ccl_device_inline float len(const float4 &a);
-ccl_device_inline float4 normalize(const float4 &a);
-ccl_device_inline float4 safe_normalize(const float4 &a);
-ccl_device_inline float4 min(const float4 &a, const float4 &b);
-ccl_device_inline float4 max(const float4 &a, const float4 &b);
-ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx);
-ccl_device_inline float4 fabs(const float4 &a);
-ccl_device_inline float4 floor(const float4 &a);
-ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
-#endif /* !__KERNEL_METAL__*/
-
-ccl_device_inline float4 safe_divide(const float4 a, const float4 b);
-ccl_device_inline float4 safe_divide(const float4 a, const float b);
-
-#ifdef __KERNEL_SSE__
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
-__forceinline const float4 shuffle(const float4 &b);
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
-__forceinline const float4 shuffle(const float4 &a, const float4 &b);
-
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b);
-
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &a, const float4 &b);
-template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 &a, const float4 &b);
-
-# ifdef __KERNEL_SSE3__
-template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 &b);
-template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b);
-# endif
-#endif /* __KERNEL_SSE__ */
-
-ccl_device_inline float reduce_min(const float4 a);
-ccl_device_inline float reduce_max(const float4 a);
-ccl_device_inline float reduce_add(const float4 a);
-
-ccl_device_inline bool isequal(const float4 a, const float4 b);
-
-#ifndef __KERNEL_GPU__
-ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b);
-#endif /* !__KERNEL_GPU__ */
-
-/*******************************************************************************
- * Definition.
- */
-
ccl_device_inline float4 zero_float4()
{
#ifdef __KERNEL_SSE__
@@ -103,6 +25,16 @@ ccl_device_inline float4 one_float4()
return make_float4(1.0f, 1.0f, 1.0f, 1.0f);
}
+/* Reinterpret the raw bits of every float component of `a` as an integer component.
+ * No numeric conversion takes place. */
+ccl_device_inline int4 cast(const float4 a)
+{
+#ifndef __KERNEL_SSE__
+ /* Scalar path: reinterpret one component at a time. */
+ const int bits_x = __float_as_int(a.x);
+ const int bits_y = __float_as_int(a.y);
+ const int bits_z = __float_as_int(a.z);
+ const int bits_w = __float_as_int(a.w);
+ return make_int4(bits_x, bits_y, bits_z, bits_w);
+#else
+ /* SSE path: a register-level cast of the whole 128-bit value. */
+ return int4(_mm_castps_si128(a));
+#endif
+}
+
#if !defined(__KERNEL_METAL__)
ccl_device_inline float4 operator-(const float4 &a)
{
@@ -114,7 +46,7 @@ ccl_device_inline float4 operator-(const float4 &a)
# endif
}
-ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
+ccl_device_inline float4 operator*(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_mul_ps(a.m128, b.m128));
@@ -123,7 +55,7 @@ ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline float4 operator*(const float4 &a, float f)
+ccl_device_inline float4 operator*(const float4 a, float f)
{
# if defined(__KERNEL_SSE__)
return a * make_float4(f);
@@ -132,17 +64,17 @@ ccl_device_inline float4 operator*(const float4 &a, float f)
# endif
}
-ccl_device_inline float4 operator*(float f, const float4 &a)
+ccl_device_inline float4 operator*(float f, const float4 a) /* Scalar-on-the-left product; forwards to `a * f`. */
{
return a * f;
}
-ccl_device_inline float4 operator/(const float4 &a, float f)
+ccl_device_inline float4 operator/(const float4 a, float f) /* Divide by a scalar: one reciprocal `1.0f / f`, then `a` times that reciprocal. */
{
return a * (1.0f / f);
}
-ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
+ccl_device_inline float4 operator/(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_div_ps(a.m128, b.m128));
@@ -151,12 +83,7 @@ ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline float4 operator+(const float4 &a, const float f)
-{
- return a + make_float4(f, f, f, f);
-}
-
-ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
+ccl_device_inline float4 operator+(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_add_ps(a.m128, b.m128));
@@ -165,12 +92,12 @@ ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline float4 operator-(const float4 &a, const float f)
+ccl_device_inline float4 operator+(const float4 a, const float f)
{
- return a - make_float4(f, f, f, f);
+ return a + make_float4(f);
}
-ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
+ccl_device_inline float4 operator-(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_sub_ps(a.m128, b.m128));
@@ -179,17 +106,22 @@ ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline float4 operator+=(float4 &a, const float4 &b)
+/* Subtract a scalar from a vector: `f` is turned into a float4 with the single-argument
+ * make_float4() and then subtracted with the float4 - float4 operator. */
+ccl_device_inline float4 operator-(const float4 a, const float f)
+{
+ const float4 rhs = make_float4(f);
+ return a - rhs;
+}
+
+ccl_device_inline float4 operator+=(float4 &a, const float4 b) /* In-place add: stores `a + b` into `a` and returns the new value by copy. */
{
return a = a + b;
}
-ccl_device_inline float4 operator-=(float4 &a, const float4 &b)
+ccl_device_inline float4 operator-=(float4 &a, const float4 b) /* In-place subtract: stores `a - b` into `a` and returns the new value by copy. */
{
return a = a - b;
}
-ccl_device_inline float4 operator*=(float4 &a, const float4 &b)
+ccl_device_inline float4 operator*=(float4 &a, const float4 b) /* In-place multiply: stores `a * b` into `a` and returns the new value by copy. */
{
return a = a * b;
}
@@ -204,7 +136,7 @@ ccl_device_inline float4 operator/=(float4 &a, float f)
return a = a / f;
}
-ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
+ccl_device_inline int4 operator<(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
@@ -213,7 +145,7 @@ ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
+ccl_device_inline int4 operator>=(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
@@ -222,7 +154,7 @@ ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
+ccl_device_inline int4 operator<=(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128)));
@@ -231,7 +163,7 @@ ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline bool operator==(const float4 &a, const float4 &b)
+ccl_device_inline bool operator==(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
@@ -240,160 +172,148 @@ ccl_device_inline bool operator==(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline float distance(const float4 &a, const float4 &b)
-{
- return len(a - b);
-}
-
-ccl_device_inline float dot(const float4 &a, const float4 &b)
+ccl_device_inline const float4 operator^(const float4 a, const float4 b)
{
-# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
- __m128 t = vmulq_f32(a, b);
- return vaddvq_f32(t);
-# else
- return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
-# endif
+# ifdef __KERNEL_SSE__
+ return float4(_mm_xor_ps(a.m128, b.m128));
# else
- return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
+ return make_float4(__uint_as_float(__float_as_uint(a.x) ^ __float_as_uint(b.x)),
+ __uint_as_float(__float_as_uint(a.y) ^ __float_as_uint(b.y)),
+ __uint_as_float(__float_as_uint(a.z) ^ __float_as_uint(b.z)),
+ __uint_as_float(__float_as_uint(a.w) ^ __float_as_uint(b.w)));
# endif
}
-ccl_device_inline float len_squared(const float4 &a)
-{
- return dot(a, a);
-}
-
-ccl_device_inline float4 rcp(const float4 &a)
+ccl_device_inline float4 min(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
- /* Don't use _mm_rcp_ps due to poor precision. */
- return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
+ return float4(_mm_min_ps(a.m128, b.m128));
# else
- return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
+ return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
# endif
}
-ccl_device_inline float4 sqrt(const float4 &a)
+ccl_device_inline float4 max(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
- return float4(_mm_sqrt_ps(a.m128));
+ return float4(_mm_max_ps(a.m128, b.m128));
# else
- return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
+ return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
# endif
}
-ccl_device_inline float4 sqr(const float4 &a)
+ccl_device_inline float4 clamp(const float4 a, const float4 mn, const float4 mx)
{
- return a * a;
+ return min(max(a, mn), mx);
}
+#endif /* !__KERNEL_METAL__*/
-ccl_device_inline float4 cross(const float4 &a, const float4 &b)
+ccl_device_inline const float4 madd(const float4 a, const float4 b, const float4 c)
{
-# ifdef __KERNEL_SSE__
- return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
- (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
+#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_NEON__
+ return float4(vfmaq_f32(c, a, b));
+# elif defined(__KERNEL_AVX2__)
+ return float4(_mm_fmadd_ps(a, b, c));
# else
- return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
+ return a * b + c;
# endif
+#else
+ return a * b + c;
+#endif
}
-ccl_device_inline bool is_zero(const float4 &a)
+ccl_device_inline float4 msub(const float4 a, const float4 b, const float4 c)
{
-# ifdef __KERNEL_SSE__
- return a == zero_float4();
+#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_NEON__
+ return float4(vfmaq_f32(vnegq_f32(c), a, b));
+# elif defined(__KERNEL_AVX2__)
+ return float4(_mm_fmsub_ps(a, b, c));
# else
- return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
+ return a * b - c;
# endif
+#else
+ return a * b - c;
+#endif
}
-ccl_device_inline float average(const float4 &a)
+#ifdef __KERNEL_SSE__
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+__forceinline const float4 shuffle(const float4 b)
{
- return reduce_add(a) * 0.25f;
+# ifdef __KERNEL_NEON__
+ return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(b.m128));
+# else
+ return float4(
+ _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0))));
+# endif
}
-ccl_device_inline float len(const float4 &a)
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 a)
{
- return sqrtf(dot(a, a));
+ return float4(_mm_movelh_ps(a, a));
}
-ccl_device_inline float4 normalize(const float4 &a)
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 a)
{
- return a / len(a);
+ return float4(_mm_movehl_ps(a, a));
}
-ccl_device_inline float4 safe_normalize(const float4 &a)
+# ifdef __KERNEL_SSE3__
+template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 b)
{
- float t = len(a);
- return (t != 0.0f) ? a / t : a;
+ return float4(_mm_moveldup_ps(b));
}
-ccl_device_inline float4 min(const float4 &a, const float4 &b)
+template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 b)
{
-# ifdef __KERNEL_SSE__
- return float4(_mm_min_ps(a.m128, b.m128));
-# else
- return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
-# endif
+ return float4(_mm_movehdup_ps(b));
}
+# endif /* __KERNEL_SSE3__ */
-ccl_device_inline float4 max(const float4 &a, const float4 &b)
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+__forceinline const float4 shuffle(const float4 a, const float4 b)
{
-# ifdef __KERNEL_SSE__
- return float4(_mm_max_ps(a.m128, b.m128));
+# ifdef __KERNEL_NEON__
+ return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b));
# else
- return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+ return float4(_mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)));
# endif
}
-ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx)
+template<size_t i0> __forceinline const float4 shuffle(const float4 b)
{
- return min(max(a, mn), mx);
+ return shuffle<i0, i0, i0, i0>(b);
}
-
-ccl_device_inline float4 fabs(const float4 &a)
+template<size_t i0> __forceinline const float4 shuffle(const float4 a, const float4 b)
{
-# if defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
- return float4(vabsq_f32(a));
-# else
- return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
-# endif
-# else
- return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
-# endif
-}
-
-ccl_device_inline float4 floor(const float4 &a)
-{
-# ifdef __KERNEL_SSE__
- return float4(_mm_floor_ps(a));
+# ifdef __KERNEL_NEON__
+ return float4(shuffle_neon<float32x4_t, i0, i0, i0, i0>(a, b));
# else
- return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
+ return float4(_mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0)));
# endif
}
-ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 a, const float4 b)
{
- return a + t * (b - a);
+ return float4(_mm_movelh_ps(a, b));
}
-ccl_device_inline float4 saturate(const float4 &a)
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 a, const float4 b)
{
- return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w));
+ return float4(_mm_movehl_ps(b, a));
}
-ccl_device_inline float4 exp(float4 v)
+template<size_t i> __forceinline float extract(const float4 a)
{
- return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.z));
+ return _mm_cvtss_f32(shuffle<i, i, i, i>(a));
}
-
-ccl_device_inline float4 log(float4 v)
+template<> __forceinline float extract<0>(const float4 a)
{
- return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.z));
+ return _mm_cvtss_f32(a);
}
-
-#endif /* !__KERNEL_METAL__*/
+#endif
ccl_device_inline float reduce_add(const float4 a)
{
@@ -440,77 +360,192 @@ ccl_device_inline float reduce_max(const float4 a)
#endif
}
-ccl_device_inline bool isequal(const float4 a, const float4 b)
+#if !defined(__KERNEL_METAL__)
+ccl_device_inline float dot(const float4 a, const float4 b)
{
-#if defined(__KERNEL_METAL__)
- return all(a == b);
-#else
- return a == b;
-#endif
+# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ __m128 t = vmulq_f32(a, b);
+ return vaddvq_f32(t);
+# else
+ return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
+# endif
+# else
+ return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
+# endif
}
+#endif /* !defined(__KERNEL_METAL__) */
-#ifdef __KERNEL_SSE__
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
-__forceinline const float4 shuffle(const float4 &b)
+ccl_device_inline float len(const float4 a)
{
-# if defined(__KERNEL_NEON__)
- return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(b.m128));
+ return sqrtf(dot(a, a));
+}
+
+/* Squared length of `a`: the dot product of the vector with itself (no square root taken). */
+ccl_device_inline float len_squared(const float4 a)
+{
+ const float squared = dot(a, a);
+ return squared;
+}
+
+#if !defined(__KERNEL_METAL__)
+/* Distance between `a` and `b`: the length of their difference. */
+ccl_device_inline float distance(const float4 a, const float4 b)
+{
+ const float4 delta = a - b;
+ return len(delta);
+}
+
+ccl_device_inline float4 rcp(const float4 a)
+{
+# ifdef __KERNEL_SSE__
+ /* Don't use _mm_rcp_ps due to poor precision. */
+ return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
# else
- return float4(_mm_castsi128_ps(
- _mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
+ return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
# endif
}
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
-__forceinline const float4 shuffle(const float4 &a, const float4 &b)
+ccl_device_inline float4 sqrt(const float4 a)
{
-# if defined(__KERNEL_NEON__)
- return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(a.m128, b.m128));
+# ifdef __KERNEL_SSE__
+ return float4(_mm_sqrt_ps(a.m128));
# else
- return float4(_mm_shuffle_ps(a.m128, b.m128, _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
+ return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
# endif
}
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b)
+ccl_device_inline float4 sqr(const float4 a)
{
- return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))));
+ return a * a;
}
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &a, const float4 &b)
+ccl_device_inline float4 cross(const float4 a, const float4 b)
{
- return float4(_mm_movelh_ps(a.m128, b.m128));
+# ifdef __KERNEL_SSE__
+ return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
+ (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
+# else
+ return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
+# endif
}
-template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 &a, const float4 &b)
+ccl_device_inline bool is_zero(const float4 a)
{
- return float4(_mm_movehl_ps(b.m128, a.m128));
+# ifdef __KERNEL_SSE__
+ return a == zero_float4();
+# else
+ return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
+# endif
}
-# ifdef __KERNEL_SSE3__
-template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 &b)
+ccl_device_inline float average(const float4 a)
{
- return float4(_mm_moveldup_ps(b));
+ return reduce_add(a) * 0.25f;
}
-template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b)
+ccl_device_inline float4 normalize(const float4 a)
{
- return float4(_mm_movehdup_ps(b));
+ return a / len(a);
+}
+
+/* Normalize `a`, except that a vector whose length is exactly zero is returned unchanged
+ * instead of being divided by zero. */
+ccl_device_inline float4 safe_normalize(const float4 a)
+{
+ const float length = len(a);
+ if (length == 0.0f) {
+ return a;
+ }
+ return a / length;
+}
+
+/* Component-wise absolute value of `a`. */
+ccl_device_inline float4 fabs(const float4 a)
+{
+# if !defined(__KERNEL_SSE__)
+ /* Scalar path. */
+ return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
+# elif defined(__KERNEL_NEON__)
+ return float4(vabsq_f32(a));
+# else
+ /* Keep every bit except the top (sign) bit of each 32-bit lane. */
+ const __m128 keep_magnitude = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+ return float4(_mm_and_ps(a.m128, keep_magnitude));
+# endif
+}
+
+/* Component-wise floor of `a`. */
+ccl_device_inline float4 floor(const float4 a)
+{
+# if !defined(__KERNEL_SSE__)
+ /* Scalar path. */
+ return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
+# elif defined(__KERNEL_NEON__)
+ return float4(vrndmq_f32(a));
+# else
+ return float4(_mm_floor_ps(a));
+# endif
+}
+
+/* Split `x` into a floored part and a remainder.
+ *
+ * The integer part of each component is written through `i`; the value returned is what is
+ * left of `x` after the floored part has been removed. The non-SSE path defers to the scalar
+ * floorfrac() overload for each component. */
+ccl_device_inline float4 floorfrac(const float4 x, ccl_private int4 *i)
+{
+# ifndef __KERNEL_SSE__
+ const float frac_x = floorfrac(x.x, &i->x);
+ const float frac_y = floorfrac(x.y, &i->y);
+ const float frac_z = floorfrac(x.z, &i->z);
+ const float frac_w = floorfrac(x.w, &i->w);
+ float4 fraction;
+ fraction.x = frac_x;
+ fraction.y = frac_y;
+ fraction.z = frac_z;
+ fraction.w = frac_w;
+ return fraction;
+# else
+ const float4 whole = floor(x);
+ /* `whole` already holds floored values, so the truncating conversion only changes the type. */
+ *i = int4(_mm_cvttps_epi32(whole.m128));
+ return x - whole;
+# endif
+}
+
+/* Linear interpolation with one scalar weight: yields `a` for t = 0 and moves towards `b` as
+ * `t` grows, computed as a + t * (b - a). */
+ccl_device_inline float4 mix(const float4 a, const float4 b, float t)
+{
+ const float4 span = b - a;
+ return a + t * span;
+}
+
+/* Linear interpolation with a separate weight per component, computed as a + t * (b - a). */
+ccl_device_inline float4 mix(const float4 a, const float4 b, const float4 t)
+{
+ const float4 span = b - a;
+ return a + t * span;
+}
+
+/* Apply the scalar saturatef() to each component of `a` independently. */
+ccl_device_inline float4 saturate(const float4 a)
+{
+ const float sat_x = saturatef(a.x);
+ const float sat_y = saturatef(a.y);
+ const float sat_z = saturatef(a.z);
+ const float sat_w = saturatef(a.w);
+ return make_float4(sat_x, sat_y, sat_z, sat_w);
+}
+
+/* Component-wise exponential: every lane of the result is expf() of the matching lane of `v`,
+ * including w, which must be computed from v.w and not from v.z. */
+ccl_device_inline float4 exp(float4 v)
+{
+ return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.w));
+}
+
+/* Component-wise natural logarithm: every lane of the result is logf() of the matching lane
+ * of `v`, including w, which must be computed from v.w and not from v.z. */
+ccl_device_inline float4 log(float4 v)
+{
+ return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.w));
+}
+
+#endif /* !__KERNEL_METAL__*/
+
+/* Whole-vector equality as a single bool.
+ * On Metal `a == b` is reduced with all(); on every other backend the bool result of the
+ * float4 operator== is returned directly. */
+ccl_device_inline bool isequal(const float4 a, const float4 b)
+{
+#if !defined(__KERNEL_METAL__)
+ return a == b;
+#else
+ return all(a == b);
+#endif
+}
-# endif /* __KERNEL_SSE3__ */
-#endif /* __KERNEL_SSE__ */
#ifndef __KERNEL_GPU__
-ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b)
+ccl_device_inline float4 select(const int4 mask, const float4 a, const float4 b) /* Per-component choice: take `a` where `mask` is set, otherwise `b`. */
{
# ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE41__ /* SSE4.1 path: single blend instruction. */
return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128)));
+# else /* Older SSE path: (mask & a) | (~mask & b). NOTE(review): the branches presumably agree only for lanes that are all-ones or all-zero -- confirm callers pass such masks. */
+ return float4(
+ _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(mask), a), _mm_andnot_ps(_mm_castsi128_ps(mask), b)));
+# endif
# else
return make_float4(
(mask.x) ? a.x : b.x, (mask.y) ? a.y : b.y, (mask.z) ? a.z : b.z, (mask.w) ? a.w : b.w);
# endif
}
-ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
+ccl_device_inline float4 mask(const int4 mask, const float4 a)
{
/* Replace elements of a with zero where mask isn't set. */
return select(mask, a, zero_float4());