Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/util')
-rw-r--r--intern/cycles/util/CMakeLists.txt1
-rw-r--r--intern/cycles/util/defines.h2
-rw-r--r--intern/cycles/util/math.h6
-rw-r--r--intern/cycles/util/math_fast.h2
-rw-r--r--intern/cycles/util/math_float3.h15
-rw-r--r--intern/cycles/util/math_float4.h140
-rw-r--r--intern/cycles/util/math_float8.h419
-rw-r--r--intern/cycles/util/math_intersect.h68
-rw-r--r--intern/cycles/util/string.cpp18
-rw-r--r--intern/cycles/util/string.h2
-rw-r--r--intern/cycles/util/transform.cpp19
-rw-r--r--intern/cycles/util/transform.h79
-rw-r--r--intern/cycles/util/types_float8.h15
-rw-r--r--intern/cycles/util/types_float8_impl.h23
14 files changed, 648 insertions, 161 deletions
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index fddac1dbbcf..9bc9f00e142 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -63,6 +63,7 @@ set(SRC_HEADERS
math_float2.h
math_float3.h
math_float4.h
+ math_float8.h
math_int2.h
math_int3.h
math_int4.h
diff --git a/intern/cycles/util/defines.h b/intern/cycles/util/defines.h
index 115a747cf1c..d0df1a221fc 100644
--- a/intern/cycles/util/defines.h
+++ b/intern/cycles/util/defines.h
@@ -81,7 +81,7 @@
/* macros */
/* hints for branch prediction, only use in code that runs a _lot_ */
-#if defined(__GNUC__) && defined(__KERNEL_CPU__)
+#if defined(__GNUC__) && !defined(__KERNEL_GPU__)
# define LIKELY(x) __builtin_expect(!!(x), 1)
# define UNLIKELY(x) __builtin_expect(!!(x), 0)
#else
diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h
index af2f1ea092d..2631304c84b 100644
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@@ -511,6 +511,11 @@ ccl_device_inline float4 float3_to_float4(const float3 a)
return make_float4(a.x, a.y, a.z, 1.0f);
}
+ccl_device_inline float4 float3_to_float4(const float3 a, const float w)
+{
+ return make_float4(a.x, a.y, a.z, w);
+}
+
ccl_device_inline float inverse_lerp(float a, float b, float x)
{
return (x - a) / (b - a);
@@ -535,6 +540,7 @@ CCL_NAMESPACE_END
#include "util/math_float2.h"
#include "util/math_float3.h"
#include "util/math_float4.h"
+#include "util/math_float8.h"
#include "util/rect.h"
diff --git a/intern/cycles/util/math_fast.h b/intern/cycles/util/math_fast.h
index 2221e7a9835..142a664a1d2 100644
--- a/intern/cycles/util/math_fast.h
+++ b/intern/cycles/util/math_fast.h
@@ -420,7 +420,7 @@ ccl_device_inline float fast_expf(float x)
return fast_exp2f(x / M_LN2_F);
}
-#if defined(__KERNEL_CPU__) && !defined(_MSC_VER)
+#if !defined(__KERNEL_GPU__) && !defined(_MSC_VER)
/* MSVC seems to have a code-gen bug here in at least SSE41/AVX, see
* T78047 and T78869 for details. Just disable for now, it only makes
* a small difference in denoising performance. */
diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h
index c02b4cdbf0d..c408eadf195 100644
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -147,8 +147,11 @@ ccl_device_inline float3 operator/(const float f, const float3 &a)
ccl_device_inline float3 operator/(const float3 &a, const float f)
{
- float invf = 1.0f / f;
- return a * invf;
+# if defined(__KERNEL_SSE__)
+ return float3(_mm_div_ps(a.m128, _mm_set1_ps(f)));
+# else
+ return make_float3(a.x / f, a.y / f, a.z / f);
+# endif
}
ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
@@ -284,8 +287,12 @@ ccl_device_inline float dot_xy(const float3 &a, const float3 &b)
ccl_device_inline float3 cross(const float3 &a, const float3 &b)
{
- float3 r = make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
- return r;
+# ifdef __KERNEL_SSE__
+ return float3(shuffle<1, 2, 0, 3>(
+ msub(ssef(a), shuffle<1, 2, 0, 3>(ssef(b)), shuffle<1, 2, 0, 3>(ssef(a)) * ssef(b))));
+# else
+ return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+# endif
}
ccl_device_inline float3 normalize(const float3 &a)
diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h
index 073c65c2d6a..c2721873037 100644
--- a/intern/cycles/util/math_float4.h
+++ b/intern/cycles/util/math_float4.h
@@ -55,6 +55,7 @@ ccl_device_inline float4 floor(const float4 &a);
ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
#endif /* !__KERNEL_METAL__*/
+ccl_device_inline float4 safe_divide(const float4 a, const float4 b);
ccl_device_inline float4 safe_divide(const float4 a, const float b);
#ifdef __KERNEL_SSE__
@@ -74,11 +75,14 @@ template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b);
# endif
#endif /* __KERNEL_SSE__ */
+ccl_device_inline float reduce_min(const float4 a);
+ccl_device_inline float reduce_max(const float4 a);
+ccl_device_inline float reduce_add(const float4 a);
+
+ccl_device_inline bool isequal(const float4 a, const float4 b);
+
#ifndef __KERNEL_GPU__
ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b);
-ccl_device_inline float4 reduce_min(const float4 &a);
-ccl_device_inline float4 reduce_max(const float4 &a);
-ccl_device_inline float4 reduce_add(const float4 &a);
#endif /* !__KERNEL_GPU__ */
/*******************************************************************************
@@ -303,27 +307,9 @@ ccl_device_inline bool is_zero(const float4 &a)
# endif
}
-ccl_device_inline float4 reduce_add(const float4 &a)
-{
-# if defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
- return float4(vdupq_n_f32(vaddvq_f32(a)));
-# elif defined(__KERNEL_SSE3__)
- float4 h(_mm_hadd_ps(a.m128, a.m128));
- return float4(_mm_hadd_ps(h.m128, h.m128));
-# else
- float4 h(shuffle<1, 0, 3, 2>(a) + a);
- return shuffle<2, 3, 0, 1>(h) + h;
-# endif
-# else
- float sum = (a.x + a.y) + (a.z + a.w);
- return make_float4(sum, sum, sum, sum);
-# endif
-}
-
ccl_device_inline float average(const float4 &a)
{
- return reduce_add(a).x * 0.25f;
+ return reduce_add(a) * 0.25f;
}
ccl_device_inline float len(const float4 &a)
@@ -392,8 +378,77 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
return a + t * (b - a);
}
+ccl_device_inline float4 saturate(const float4 &a)
+{
+ return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w));
+}
+
+ccl_device_inline float4 exp(float4 v)
+{
+ return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.z));
+}
+
+ccl_device_inline float4 log(float4 v)
+{
+ return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.z));
+}
+
#endif /* !__KERNEL_METAL__*/
+ccl_device_inline float reduce_add(const float4 a)
+{
+#if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ return vaddvq_f32(a);
+# elif defined(__KERNEL_SSE3__)
+ float4 h(_mm_hadd_ps(a.m128, a.m128));
+ return _mm_cvtss_f32(_mm_hadd_ps(h.m128, h.m128));
+# else
+ float4 h(shuffle<1, 0, 3, 2>(a) + a);
+ return _mm_cvtss_f32(shuffle<2, 3, 0, 1>(h) + h);
+# endif
+#else
+ return a.x + a.y + a.z + a.w;
+#endif
+}
+
+ccl_device_inline float reduce_min(const float4 a)
+{
+#if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ return vminvq_f32(a);
+# else
+ float4 h = min(shuffle<1, 0, 3, 2>(a), a);
+ return _mm_cvtss_f32(min(shuffle<2, 3, 0, 1>(h), h));
+# endif
+#else
+ return min(min(a.x, a.y), min(a.z, a.w));
+#endif
+}
+
+ccl_device_inline float reduce_max(const float4 a)
+{
+#if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ return vmaxvq_f32(a);
+# else
+ float4 h = max(shuffle<1, 0, 3, 2>(a), a);
+ return _mm_cvtss_f32(max(shuffle<2, 3, 0, 1>(h), h));
+# endif
+#else
+ return max(max(a.x, a.y), max(a.z, a.w));
+#endif
+}
+
+ccl_device_inline bool isequal(const float4 a, const float4 b)
+{
+#if defined(__KERNEL_METAL__)
+ return all(a == b);
+#else
+ return a == b;
+#endif
+}
+
#ifdef __KERNEL_SSE__
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4 &b)
@@ -461,34 +516,6 @@ ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
return select(mask, a, zero_float4());
}
-ccl_device_inline float4 reduce_min(const float4 &a)
-{
-# if defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
- return float4(vdupq_n_f32(vminvq_f32(a)));
-# else
- float4 h = min(shuffle<1, 0, 3, 2>(a), a);
- return min(shuffle<2, 3, 0, 1>(h), h);
-# endif
-# else
- return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
-# endif
-}
-
-ccl_device_inline float4 reduce_max(const float4 &a)
-{
-# if defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
- return float4(vdupq_n_f32(vmaxvq_f32(a)));
-# else
- float4 h = max(shuffle<1, 0, 3, 2>(a), a);
- return max(shuffle<2, 3, 0, 1>(h), h);
-# endif
-# else
- return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
-# endif
-}
-
ccl_device_inline float4 load_float4(ccl_private const float *v)
{
# ifdef __KERNEL_SSE__
@@ -505,6 +532,14 @@ ccl_device_inline float4 safe_divide(const float4 a, const float b)
return (b != 0.0f) ? a / b : zero_float4();
}
+ccl_device_inline float4 safe_divide(const float4 a, const float4 b)
+{
+ return make_float4((b.x != 0.0f) ? a.x / b.x : 0.0f,
+ (b.y != 0.0f) ? a.y / b.y : 0.0f,
+ (b.z != 0.0f) ? a.z / b.z : 0.0f,
+ (b.w != 0.0f) ? a.w / b.w : 0.0f);
+}
+
ccl_device_inline bool isfinite_safe(float4 v)
{
return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z) && isfinite_safe(v.w);
@@ -523,6 +558,11 @@ ccl_device_inline float4 ensure_finite(float4 v)
return v;
}
+ccl_device_inline float4 pow(float4 v, float e)
+{
+ return make_float4(powf(v.x, e), powf(v.y, e), powf(v.z, e), powf(v.z, e));
+}
+
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_FLOAT4_H__ */
diff --git a/intern/cycles/util/math_float8.h b/intern/cycles/util/math_float8.h
new file mode 100644
index 00000000000..8ed8d56a034
--- /dev/null
+++ b/intern/cycles/util/math_float8.h
@@ -0,0 +1,419 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2022 Blender Foundation */
+
+#ifndef __UTIL_MATH_FLOAT8_H__
+#define __UTIL_MATH_FLOAT8_H__
+
+#ifndef __UTIL_MATH_H__
+# error "Do not include this file directly, include util/types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+ccl_device_inline float8 operator+(const float8 &a, const float8 &b);
+ccl_device_inline float8 operator+(const float8 &a, const float f);
+ccl_device_inline float8 operator+(const float f, const float8 &a);
+
+ccl_device_inline float8 operator-(const float8 &a);
+ccl_device_inline float8 operator-(const float8 &a, const float8 &b);
+ccl_device_inline float8 operator-(const float8 &a, const float f);
+ccl_device_inline float8 operator-(const float f, const float8 &a);
+
+ccl_device_inline float8 operator*(const float8 &a, const float8 &b);
+ccl_device_inline float8 operator*(const float8 &a, const float f);
+ccl_device_inline float8 operator*(const float f, const float8 &a);
+
+ccl_device_inline float8 operator/(const float8 &a, const float8 &b);
+ccl_device_inline float8 operator/(const float8 &a, float f);
+ccl_device_inline float8 operator/(const float f, const float8 &a);
+
+ccl_device_inline float8 operator+=(float8 &a, const float8 &b);
+
+ccl_device_inline float8 operator*=(float8 &a, const float8 &b);
+ccl_device_inline float8 operator*=(float8 &a, float f);
+
+ccl_device_inline float8 operator/=(float8 &a, float f);
+
+ccl_device_inline bool operator==(const float8 &a, const float8 &b);
+
+ccl_device_inline float8 rcp(const float8 &a);
+ccl_device_inline float8 sqrt(const float8 &a);
+ccl_device_inline float8 sqr(const float8 &a);
+ccl_device_inline bool is_zero(const float8 &a);
+ccl_device_inline float average(const float8 &a);
+ccl_device_inline float8 min(const float8 &a, const float8 &b);
+ccl_device_inline float8 max(const float8 &a, const float8 &b);
+ccl_device_inline float8 clamp(const float8 &a, const float8 &mn, const float8 &mx);
+ccl_device_inline float8 fabs(const float8 &a);
+ccl_device_inline float8 mix(const float8 &a, const float8 &b, float t);
+
+ccl_device_inline float8 safe_divide(const float8 a, const float b);
+ccl_device_inline float8 safe_divide(const float8 a, const float8 b);
+
+ccl_device_inline float reduce_min(const float8 &a);
+ccl_device_inline float reduce_max(const float8 &a);
+ccl_device_inline float reduce_add(const float8 &a);
+
+ccl_device_inline float8 saturate(const float8 &a);
+ccl_device_inline bool isequal(const float8 a, const float8 b);
+
+/*******************************************************************************
+ * Definition.
+ */
+
+ccl_device_inline float8 zero_float8()
+{
+#ifdef __KERNEL_AVX2__
+ return float8(_mm256_setzero_ps());
+#else
+ return make_float8(0.0f);
+#endif
+}
+
+ccl_device_inline float8 one_float8()
+{
+ return make_float8(1.0f);
+}
+
+ccl_device_inline float8 operator+(const float8 &a, const float8 &b)
+{
+#ifdef __KERNEL_AVX2__
+ return float8(_mm256_add_ps(a.m256, b.m256));
+#else
+ return make_float8(
+ a.a + b.a, a.b + b.b, a.c + b.c, a.d + b.d, a.e + b.e, a.f + b.f, a.g + b.g, a.h + b.h);
+#endif
+}
+
+ccl_device_inline float8 operator+(const float8 &a, const float f)
+{
+ return a + make_float8(f);
+}
+
+ccl_device_inline float8 operator+(const float f, const float8 &a)
+{
+ return make_float8(f) + a;
+}
+
+ccl_device_inline float8 operator-(const float8 &a)
+{
+#ifdef __KERNEL_AVX2__
+ __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
+ return float8(_mm256_xor_ps(a.m256, mask));
+#else
+ return make_float8(-a.a, -a.b, -a.c, -a.d, -a.e, -a.f, -a.g, -a.h);
+#endif
+}
+
+ccl_device_inline float8 operator-(const float8 &a, const float8 &b)
+{
+#ifdef __KERNEL_AVX2__
+ return float8(_mm256_sub_ps(a.m256, b.m256));
+#else
+ return make_float8(
+ a.a - b.a, a.b - b.b, a.c - b.c, a.d - b.d, a.e - b.e, a.f - b.f, a.g - b.g, a.h - b.h);
+#endif
+}
+
+ccl_device_inline float8 operator-(const float8 &a, const float f)
+{
+ return a - make_float8(f);
+}
+
+ccl_device_inline float8 operator-(const float f, const float8 &a)
+{
+ return make_float8(f) - a;
+}
+
+ccl_device_inline float8 operator*(const float8 &a, const float8 &b)
+{
+#ifdef __KERNEL_AVX2__
+ return float8(_mm256_mul_ps(a.m256, b.m256));
+#else
+ return make_float8(
+ a.a * b.a, a.b * b.b, a.c * b.c, a.d * b.d, a.e * b.e, a.f * b.f, a.g * b.g, a.h * b.h);
+#endif
+}
+
+ccl_device_inline float8 operator*(const float8 &a, const float f)
+{
+ return a * make_float8(f);
+}
+
+ccl_device_inline float8 operator*(const float f, const float8 &a)
+{
+ return make_float8(f) * a;
+}
+
+ccl_device_inline float8 operator/(const float8 &a, const float8 &b)
+{
+#ifdef __KERNEL_AVX2__
+ return float8(_mm256_div_ps(a.m256, b.m256));
+#else
+ return make_float8(
+ a.a / b.a, a.b / b.b, a.c / b.c, a.d / b.d, a.e / b.e, a.f / b.f, a.g / b.g, a.h / b.h);
+#endif
+}
+
+ccl_device_inline float8 operator/(const float8 &a, const float f)
+{
+ return a / make_float8(f);
+}
+
+ccl_device_inline float8 operator/(const float f, const float8 &a)
+{
+ return make_float8(f) / a;
+}
+
+ccl_device_inline float8 operator+=(float8 &a, const float8 &b)
+{
+ return a = a + b;
+}
+
+ccl_device_inline float8 operator-=(float8 &a, const float8 &b)
+{
+ return a = a - b;
+}
+
+ccl_device_inline float8 operator*=(float8 &a, const float8 &b)
+{
+ return a = a * b;
+}
+
+ccl_device_inline float8 operator*=(float8 &a, float f)
+{
+ return a = a * f;
+}
+
+ccl_device_inline float8 operator/=(float8 &a, float f)
+{
+ return a = a / f;
+}
+
+ccl_device_inline bool operator==(const float8 &a, const float8 &b)
+{
+#ifdef __KERNEL_AVX2__
+ return (_mm256_movemask_ps(_mm256_castsi256_ps(
+ _mm256_cmpeq_epi32(_mm256_castps_si256(a.m256), _mm256_castps_si256(b.m256)))) &
+ 0b11111111) == 0b11111111;
+#else
+ return (a.a == b.a && a.b == b.b && a.c == b.c && a.d == b.d && a.e == b.e && a.f == b.f &&
+ a.g == b.g && a.h == b.h);
+#endif
+}
+
+ccl_device_inline float8 rcp(const float8 &a)
+{
+#ifdef __KERNEL_AVX2__
+ return float8(_mm256_rcp_ps(a.m256));
+#else
+ return make_float8(1.0f / a.a,
+ 1.0f / a.b,
+ 1.0f / a.c,
+ 1.0f / a.d,
+ 1.0f / a.e,
+ 1.0f / a.f,
+ 1.0f / a.g,
+ 1.0f / a.h);
+#endif
+}
+
+ccl_device_inline float8 sqrt(const float8 &a)
+{
+#ifdef __KERNEL_AVX2__
+ return float8(_mm256_sqrt_ps(a.m256));
+#else
+ return make_float8(sqrtf(a.a),
+ sqrtf(a.b),
+ sqrtf(a.c),
+ sqrtf(a.d),
+ sqrtf(a.e),
+ sqrtf(a.f),
+ sqrtf(a.g),
+ sqrtf(a.h));
+#endif
+}
+
+ccl_device_inline float8 sqr(const float8 &a)
+{
+ return a * a;
+}
+
+ccl_device_inline bool is_zero(const float8 &a)
+{
+ return a == make_float8(0.0f);
+}
+
+ccl_device_inline float average(const float8 &a)
+{
+ return reduce_add(a) / 8.0f;
+}
+
+ccl_device_inline float8 min(const float8 &a, const float8 &b)
+{
+#ifdef __KERNEL_AVX2__
+ return float8(_mm256_min_ps(a.m256, b.m256));
+#else
+ return make_float8(min(a.a, b.a),
+ min(a.b, b.b),
+ min(a.c, b.c),
+ min(a.d, b.d),
+ min(a.e, b.e),
+ min(a.f, b.f),
+ min(a.g, b.g),
+ min(a.h, b.h));
+#endif
+}
+
+ccl_device_inline float8 max(const float8 &a, const float8 &b)
+{
+#ifdef __KERNEL_AVX2__
+ return float8(_mm256_max_ps(a.m256, b.m256));
+#else
+ return make_float8(max(a.a, b.a),
+ max(a.b, b.b),
+ max(a.c, b.c),
+ max(a.d, b.d),
+ max(a.e, b.e),
+ max(a.f, b.f),
+ max(a.g, b.g),
+ max(a.h, b.h));
+#endif
+}
+
+ccl_device_inline float8 clamp(const float8 &a, const float8 &mn, const float8 &mx)
+{
+ return min(max(a, mn), mx);
+}
+
+ccl_device_inline float8 fabs(const float8 &a)
+{
+#ifdef __KERNEL_AVX2__
+ return float8(_mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
+#else
+ return make_float8(fabsf(a.a),
+ fabsf(a.b),
+ fabsf(a.c),
+ fabsf(a.d),
+ fabsf(a.e),
+ fabsf(a.f),
+ fabsf(a.g),
+ fabsf(a.h));
+#endif
+}
+
+ccl_device_inline float8 mix(const float8 &a, const float8 &b, float t)
+{
+ return a + t * (b - a);
+}
+
+ccl_device_inline float reduce_min(const float8 &a)
+{
+ return min(min(min(a.a, a.b), min(a.c, a.d)), min(min(a.e, a.f), min(a.g, a.h)));
+}
+
+ccl_device_inline float reduce_max(const float8 &a)
+{
+ return max(max(max(a.a, a.b), max(a.c, a.d)), max(max(a.e, a.f), max(a.g, a.h)));
+}
+
+ccl_device_inline float reduce_add(const float8 &a)
+{
+#ifdef __KERNEL_AVX2__
+ float8 b(_mm256_hadd_ps(a.m256, a.m256));
+ float8 h(_mm256_hadd_ps(b.m256, b.m256));
+ return h[0] + h[4];
+#else
+ return a.a + a.b + a.c + a.d + a.e + a.f + a.g + a.h;
+#endif
+}
+
+ccl_device_inline float8 saturate(const float8 &a)
+{
+ return clamp(a, make_float8(0.0f), make_float8(1.0f));
+}
+
+ccl_device_inline bool isequal(const float8 a, const float8 b)
+{
+ return a == b;
+}
+
+ccl_device_inline float8 safe_divide(const float8 a, const float b)
+{
+ return (b != 0.0f) ? a / b : make_float8(0.0f);
+}
+
+ccl_device_inline float8 safe_divide(const float8 a, const float8 b)
+{
+ return make_float8((b.a != 0.0f) ? a.a / b.a : 0.0f,
+ (b.b != 0.0f) ? a.b / b.b : 0.0f,
+ (b.c != 0.0f) ? a.c / b.c : 0.0f,
+ (b.d != 0.0f) ? a.d / b.d : 0.0f,
+ (b.e != 0.0f) ? a.e / b.e : 0.0f,
+ (b.f != 0.0f) ? a.f / b.f : 0.0f,
+ (b.g != 0.0f) ? a.g / b.g : 0.0f,
+ (b.h != 0.0f) ? a.h / b.h : 0.0f);
+}
+
+ccl_device_inline float8 ensure_finite(float8 v)
+{
+ v.a = ensure_finite(v.a);
+ v.b = ensure_finite(v.b);
+ v.c = ensure_finite(v.c);
+ v.d = ensure_finite(v.d);
+ v.e = ensure_finite(v.e);
+ v.f = ensure_finite(v.f);
+ v.g = ensure_finite(v.g);
+ v.h = ensure_finite(v.h);
+
+ return v;
+}
+
+ccl_device_inline bool isfinite_safe(float8 v)
+{
+ return isfinite_safe(v.a) && isfinite_safe(v.b) && isfinite_safe(v.c) && isfinite_safe(v.d) &&
+ isfinite_safe(v.e) && isfinite_safe(v.f) && isfinite_safe(v.g) && isfinite_safe(v.h);
+}
+
+ccl_device_inline float8 pow(float8 v, float e)
+{
+ return make_float8(powf(v.a, e),
+ powf(v.b, e),
+ powf(v.c, e),
+ powf(v.d, e),
+ powf(v.e, e),
+ powf(v.f, e),
+ powf(v.g, e),
+ powf(v.h, e));
+}
+
+ccl_device_inline float8 exp(float8 v)
+{
+ return make_float8(
+ expf(v.a), expf(v.b), expf(v.c), expf(v.d), expf(v.e), expf(v.f), expf(v.g), expf(v.h));
+}
+
+ccl_device_inline float8 log(float8 v)
+{
+ return make_float8(
+ logf(v.a), logf(v.b), logf(v.c), logf(v.d), logf(v.e), logf(v.f), logf(v.g), logf(v.h));
+}
+
+ccl_device_inline float dot(const float8 &a, const float8 &b)
+{
+#ifdef __KERNEL_AVX2__
+ float8 t(_mm256_dp_ps(a.m256, b.m256, 0xFF));
+ return t[0] + t[4];
+#else
+ return (a.a * b.a) + (a.b * b.b) + (a.c * b.c) + (a.d * b.d) + (a.e * b.e) + (a.f * b.f) +
+ (a.g * b.g) + (a.h * b.h);
+#endif
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_FLOAT8_H__ */
diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h
index b0de0b25a45..3e5891b2507 100644
--- a/intern/cycles/util/math_intersect.h
+++ b/intern/cycles/util/math_intersect.h
@@ -10,7 +10,8 @@ CCL_NAMESPACE_BEGIN
ccl_device bool ray_sphere_intersect(float3 ray_P,
float3 ray_D,
- float ray_t,
+ float ray_tmin,
+ float ray_tmax,
float3 sphere_P,
float sphere_radius,
ccl_private float3 *isect_P,
@@ -33,7 +34,7 @@ ccl_device bool ray_sphere_intersect(float3 ray_P,
return false;
}
const float t = tp - sqrtf(radiussq - dsq); /* pythagoras */
- if (t < ray_t) {
+ if (t > ray_tmin && t < ray_tmax) {
*isect_t = t;
*isect_P = ray_P + ray_D * t;
return true;
@@ -44,7 +45,8 @@ ccl_device bool ray_sphere_intersect(float3 ray_P,
ccl_device bool ray_aligned_disk_intersect(float3 ray_P,
float3 ray_D,
- float ray_t,
+ float ray_tmin,
+ float ray_tmax,
float3 disk_P,
float disk_radius,
ccl_private float3 *isect_P,
@@ -59,7 +61,7 @@ ccl_device bool ray_aligned_disk_intersect(float3 ray_P,
}
/* Compute t to intersection point. */
const float t = -disk_t / div;
- if (t < 0.0f || t > ray_t) {
+ if (!(t > ray_tmin && t < ray_tmax)) {
return false;
}
/* Test if within radius. */
@@ -74,7 +76,8 @@ ccl_device bool ray_aligned_disk_intersect(float3 ray_P,
ccl_device bool ray_disk_intersect(float3 ray_P,
float3 ray_D,
- float ray_t,
+ float ray_tmin,
+ float ray_tmax,
float3 disk_P,
float3 disk_N,
float disk_radius,
@@ -92,7 +95,8 @@ ccl_device bool ray_disk_intersect(float3 ray_P,
}
float3 P = ray_P + t * ray_D;
float3 T = P - disk_P;
- if (dot(T, T) < sqr(disk_radius) /*&& t > 0.f*/ && t <= ray_t) {
+
+ if (dot(T, T) < sqr(disk_radius) && (t > ray_tmin && t < ray_tmax)) {
*isect_P = ray_P + t * ray_D;
*isect_t = t;
return true;
@@ -101,9 +105,10 @@ ccl_device bool ray_disk_intersect(float3 ray_P,
return false;
}
-ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
- float3 ray_dir,
- float ray_t,
+ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
+ const float3 ray_D,
+ const float ray_tmin,
+ const float ray_tmax,
const float3 tri_a,
const float3 tri_b,
const float3 tri_c,
@@ -111,14 +116,13 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
ccl_private float *isect_v,
ccl_private float *isect_t)
{
-#define dot3(a, b) dot(a, b)
- const float3 P = ray_P;
- const float3 dir = ray_dir;
+ /* This implementation matches the Plücker coordinates triangle intersection
+ * in Embree. */
/* Calculate vertices relative to ray origin. */
- const float3 v0 = tri_c - P;
- const float3 v1 = tri_a - P;
- const float3 v2 = tri_b - P;
+ const float3 v0 = tri_c - ray_P;
+ const float3 v1 = tri_a - ray_P;
+ const float3 v2 = tri_b - ray_P;
/* Calculate triangle edges. */
const float3 e0 = v2 - v0;
@@ -126,42 +130,38 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
const float3 e2 = v1 - v2;
/* Perform edge tests. */
- const float U = dot(cross(v2 + v0, e0), ray_dir);
- const float V = dot(cross(v0 + v1, e1), ray_dir);
- const float W = dot(cross(v1 + v2, e2), ray_dir);
+ const float U = dot(cross(v2 + v0, e0), ray_D);
+ const float V = dot(cross(v0 + v1, e1), ray_D);
+ const float W = dot(cross(v1 + v2, e2), ray_D);
+ const float eps = FLT_EPSILON * fabsf(U + V + W);
const float minUVW = min(U, min(V, W));
const float maxUVW = max(U, max(V, W));
- if (minUVW < 0.0f && maxUVW > 0.0f) {
+ if (!(minUVW >= -eps || maxUVW <= eps)) {
return false;
}
/* Calculate geometry normal and denominator. */
const float3 Ng1 = cross(e1, e0);
- // const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0);
const float3 Ng = Ng1 + Ng1;
- const float den = dot3(Ng, dir);
+ const float den = dot(Ng, ray_D);
/* Avoid division by 0. */
if (UNLIKELY(den == 0.0f)) {
return false;
}
/* Perform depth test. */
- const float T = dot3(v0, Ng);
- const int sign_den = (__float_as_int(den) & 0x80000000);
- const float sign_T = xor_signmask(T, sign_den);
- if ((sign_T < 0.0f) || (sign_T > ray_t * xor_signmask(den, sign_den))) {
+ const float T = dot(v0, Ng);
+ const float t = T / den;
+ if (!(t >= ray_tmin && t <= ray_tmax)) {
return false;
}
- const float inv_den = 1.0f / den;
- *isect_u = U * inv_den;
- *isect_v = V * inv_den;
- *isect_t = T * inv_den;
+ *isect_u = U / den;
+ *isect_v = V / den;
+ *isect_t = t;
return true;
-
-#undef dot3
}
/* Tests for an intersection between a ray and a quad defined by
@@ -171,8 +171,8 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
*/
ccl_device bool ray_quad_intersect(float3 ray_P,
float3 ray_D,
- float ray_mint,
- float ray_maxt,
+ float ray_tmin,
+ float ray_tmax,
float3 quad_P,
float3 quad_u,
float3 quad_v,
@@ -185,7 +185,7 @@ ccl_device bool ray_quad_intersect(float3 ray_P,
{
/* Perform intersection test. */
float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n);
- if (t < ray_mint || t > ray_maxt) {
+ if (!(t > ray_tmin && t < ray_tmax)) {
return false;
}
const float3 hit = ray_P + t * ray_D;
diff --git a/intern/cycles/util/string.cpp b/intern/cycles/util/string.cpp
index 66ff866ee10..0c318cea44a 100644
--- a/intern/cycles/util/string.cpp
+++ b/intern/cycles/util/string.cpp
@@ -136,6 +136,19 @@ void string_replace(string &haystack, const string &needle, const string &other)
}
}
+void string_replace_same_length(string &haystack, const string &needle, const string &other)
+{
+ assert(needle.size() == other.size());
+ size_t pos = 0;
+ while (pos != string::npos) {
+ pos = haystack.find(needle, pos);
+ if (pos != string::npos) {
+ memcpy(haystack.data() + pos, other.data(), other.size());
+ pos += other.size();
+ }
+ }
+}
+
string string_remove_trademark(const string &s)
{
string result = s;
@@ -164,6 +177,11 @@ string to_string(const char *str)
return string(str);
}
+string to_string(const float4 &v)
+{
+ return string_printf("%f,%f,%f,%f", v.x, v.y, v.z, v.w);
+}
+
string string_to_lower(const string &s)
{
string r = s;
diff --git a/intern/cycles/util/string.h b/intern/cycles/util/string.h
index a74feee1750..ecbe9e106c6 100644
--- a/intern/cycles/util/string.h
+++ b/intern/cycles/util/string.h
@@ -38,12 +38,14 @@ void string_split(vector<string> &tokens,
const string &separators = "\t ",
bool skip_empty_tokens = true);
void string_replace(string &haystack, const string &needle, const string &other);
+void string_replace_same_length(string &haystack, const string &needle, const string &other);
bool string_startswith(string_view s, string_view start);
bool string_endswith(string_view s, string_view end);
string string_strip(const string &s);
string string_remove_trademark(const string &s);
string string_from_bool(const bool var);
string to_string(const char *str);
+string to_string(const float4 &v);
string string_to_lower(const string &s);
/* Wide char strings are only used on Windows to deal with non-ASCII
diff --git a/intern/cycles/util/transform.cpp b/intern/cycles/util/transform.cpp
index 0bf5de57a20..0b87e88871d 100644
--- a/intern/cycles/util/transform.cpp
+++ b/intern/cycles/util/transform.cpp
@@ -99,15 +99,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
memcpy(M, &tfm, sizeof(M));
if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
- /* matrix is degenerate (e.g. 0 scale on some axis), ideally we should
- * never be in this situation, but try to invert it anyway with tweak */
- M[0][0] += 1e-8f;
- M[1][1] += 1e-8f;
- M[2][2] += 1e-8f;
-
- if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
- return projection_identity();
- }
+ return projection_identity();
}
memcpy(&tfmR, R, sizeof(R));
@@ -115,16 +107,9 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
return tfmR;
}
-Transform transform_inverse(const Transform &tfm)
-{
- ProjectionTransform projection(tfm);
- return projection_to_transform(projection_inverse(projection));
-}
-
Transform transform_transposed_inverse(const Transform &tfm)
{
- ProjectionTransform projection(tfm);
- ProjectionTransform iprojection = projection_inverse(projection);
+ ProjectionTransform iprojection(transform_inverse(tfm));
return projection_to_transform(projection_transpose(iprojection));
}
diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h
index a460581d1f3..71164efbac1 100644
--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@@ -63,10 +63,10 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f
_MM_TRANSPOSE4_PS(x, y, z, w);
- ssef tmp = shuffle<0>(aa) * x;
- tmp = madd(shuffle<1>(aa), y, tmp);
+ ssef tmp = w;
tmp = madd(shuffle<2>(aa), z, tmp);
- tmp += w;
+ tmp = madd(shuffle<1>(aa), y, tmp);
+ tmp = madd(shuffle<0>(aa), x, tmp);
return float3(tmp.m128);
#elif defined(__KERNEL_METAL__)
@@ -93,9 +93,9 @@ ccl_device_inline float3 transform_direction(ccl_private const Transform *t, con
_MM_TRANSPOSE4_PS(x, y, z, w);
- ssef tmp = shuffle<0>(aa) * x;
+ ssef tmp = shuffle<2>(aa) * z;
tmp = madd(shuffle<1>(aa), y, tmp);
- tmp = madd(shuffle<2>(aa), z, tmp);
+ tmp = madd(shuffle<0>(aa), x, tmp);
return float3(tmp.m128);
#elif defined(__KERNEL_METAL__)
@@ -312,7 +312,6 @@ ccl_device_inline void transform_set_column(Transform *t, int column, float3 val
t->z[column] = value.z;
}
-Transform transform_inverse(const Transform &a);
Transform transform_transposed_inverse(const Transform &a);
ccl_device_inline bool transform_uniform_scale(const Transform &tfm, float &scale)
@@ -392,39 +391,47 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t)
#endif /* defined(__KERNEL_GPU_RAYTRACING__) */
}
-ccl_device_inline Transform transform_quick_inverse(Transform M)
+ccl_device_inline Transform transform_inverse(const Transform tfm)
{
- /* possible optimization: can we avoid doing this altogether and construct
- * the inverse matrix directly from negated translation, transposed rotation,
- * scale can be inverted but what about shearing? */
- Transform R;
- float det = M.x.x * (M.z.z * M.y.y - M.z.y * M.y.z) - M.y.x * (M.z.z * M.x.y - M.z.y * M.x.z) +
- M.z.x * (M.y.z * M.x.y - M.y.y * M.x.z);
+ /* This implementation matches the one in Embree exactly, to ensure consistent
+ * results with the ray intersection of instances. */
+ float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
+ float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
+ float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
+ float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
+
+ /* Compute determinant. */
+ float det = dot(x, cross(y, z));
+
if (det == 0.0f) {
- M.x.x += 1e-8f;
- M.y.y += 1e-8f;
- M.z.z += 1e-8f;
- det = M.x.x * (M.z.z * M.y.y - M.z.y * M.y.z) - M.y.x * (M.z.z * M.x.y - M.z.y * M.x.z) +
- M.z.x * (M.y.z * M.x.y - M.y.y * M.x.z);
+ /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
+ * never be in this situation, but try to invert it anyway with tweak.
+ *
+ * This logic does not match Embree which would just give an invalid
+ * matrix. A better solution would be to remove this and ensure any object
+ * matrix is valid. */
+ x.x += 1e-8f;
+ y.y += 1e-8f;
+ z.z += 1e-8f;
+
+ det = dot(x, cross(y, z));
+ if (det == 0.0f) {
+ det = FLT_MAX;
+ }
}
- det = (det != 0.0f) ? 1.0f / det : 0.0f;
-
- float3 Rx = det * make_float3(M.z.z * M.y.y - M.z.y * M.y.z,
- M.z.y * M.x.z - M.z.z * M.x.y,
- M.y.z * M.x.y - M.y.y * M.x.z);
- float3 Ry = det * make_float3(M.z.x * M.y.z - M.z.z * M.y.x,
- M.z.z * M.x.x - M.z.x * M.x.z,
- M.y.x * M.x.z - M.y.z * M.x.x);
- float3 Rz = det * make_float3(M.z.y * M.y.x - M.z.x * M.y.y,
- M.z.x * M.x.y - M.z.y * M.x.x,
- M.y.y * M.x.x - M.y.x * M.x.y);
- float3 T = -make_float3(M.x.w, M.y.w, M.z.w);
-
- R.x = make_float4(Rx.x, Rx.y, Rx.z, dot(Rx, T));
- R.y = make_float4(Ry.x, Ry.y, Ry.z, dot(Ry, T));
- R.z = make_float4(Rz.x, Rz.y, Rz.z, dot(Rz, T));
-
- return R;
+
+ /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
+ const float3 inverse_x = cross(y, z) / det;
+ const float3 inverse_y = cross(z, x) / det;
+ const float3 inverse_z = cross(x, y) / det;
+
+ /* Compute translation and fill transform. */
+ Transform itfm;
+ itfm.x = float3_to_float4(inverse_x, -dot(inverse_x, w));
+ itfm.y = float3_to_float4(inverse_y, -dot(inverse_y, w));
+ itfm.z = float3_to_float4(inverse_z, -dot(inverse_z, w));
+
+ return itfm;
}
ccl_device_inline void transform_compose(ccl_private Transform *tfm,
diff --git a/intern/cycles/util/types_float8.h b/intern/cycles/util/types_float8.h
index d71149946f7..f04dc675c84 100644
--- a/intern/cycles/util/types_float8.h
+++ b/intern/cycles/util/types_float8.h
@@ -11,11 +11,13 @@
CCL_NAMESPACE_BEGIN
-#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
-
+#ifdef __KERNEL_GPU__
+struct float8
+#else
struct ccl_try_align(32) float8
+#endif
{
-# ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
union {
__m256 m256;
struct {
@@ -32,18 +34,19 @@ struct ccl_try_align(32) float8
__forceinline float8 &operator=(const float8 &a);
-# else /* __KERNEL_AVX2__ */
+#else /* __KERNEL_AVX2__ */
float a, b, c, d, e, f, g, h;
-# endif /* __KERNEL_AVX2__ */
+#endif /* __KERNEL_AVX2__ */
+#ifndef __KERNEL_GPU__
__forceinline float operator[](int i) const;
__forceinline float &operator[](int i);
+#endif
};
ccl_device_inline float8 make_float8(float f);
ccl_device_inline float8
make_float8(float a, float b, float c, float d, float e, float f, float g, float h);
-#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types_float8_impl.h b/intern/cycles/util/types_float8_impl.h
index 0694f5205a5..21931c55071 100644
--- a/intern/cycles/util/types_float8_impl.h
+++ b/intern/cycles/util/types_float8_impl.h
@@ -15,8 +15,7 @@
CCL_NAMESPACE_BEGIN
-#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
-# ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
__forceinline float8::float8()
{
}
@@ -44,8 +43,9 @@ __forceinline float8 &float8::operator=(const float8 &f)
m256 = f.m256;
return *this;
}
-# endif /* __KERNEL_AVX2__ */
+#endif /* __KERNEL_AVX2__ */
+#ifndef __KERNEL_GPU__
__forceinline float float8::operator[](int i) const
{
util_assert(i >= 0);
@@ -59,30 +59,29 @@ __forceinline float &float8::operator[](int i)
util_assert(i < 8);
return *(&a + i);
}
+#endif
ccl_device_inline float8 make_float8(float f)
{
-# ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
float8 r(_mm256_set1_ps(f));
-# else
+#else
float8 r = {f, f, f, f, f, f, f, f};
-# endif
+#endif
return r;
}
ccl_device_inline float8
make_float8(float a, float b, float c, float d, float e, float f, float g, float h)
{
-# ifdef __KERNEL_AVX2__
- float8 r(_mm256_set_ps(a, b, c, d, e, f, g, h));
-# else
+#ifdef __KERNEL_AVX2__
+ float8 r(_mm256_setr_ps(a, b, c, d, e, f, g, h));
+#else
float8 r = {a, b, c, d, e, f, g, h};
-# endif
+#endif
return r;
}
-#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
-
CCL_NAMESPACE_END
#endif /* __UTIL_TYPES_FLOAT8_IMPL_H__ */