diff options
Diffstat (limited to 'intern/cycles/util')
-rw-r--r-- | intern/cycles/util/CMakeLists.txt | 1 | ||||
-rw-r--r-- | intern/cycles/util/defines.h | 2 | ||||
-rw-r--r-- | intern/cycles/util/math.h | 6 | ||||
-rw-r--r-- | intern/cycles/util/math_fast.h | 2 | ||||
-rw-r--r-- | intern/cycles/util/math_float3.h | 15 | ||||
-rw-r--r-- | intern/cycles/util/math_float4.h | 140 | ||||
-rw-r--r-- | intern/cycles/util/math_float8.h | 419 | ||||
-rw-r--r-- | intern/cycles/util/math_intersect.h | 68 | ||||
-rw-r--r-- | intern/cycles/util/string.cpp | 18 | ||||
-rw-r--r-- | intern/cycles/util/string.h | 2 | ||||
-rw-r--r-- | intern/cycles/util/transform.cpp | 19 | ||||
-rw-r--r-- | intern/cycles/util/transform.h | 79 | ||||
-rw-r--r-- | intern/cycles/util/types_float8.h | 15 | ||||
-rw-r--r-- | intern/cycles/util/types_float8_impl.h | 23 |
14 files changed, 648 insertions, 161 deletions
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index fddac1dbbcf..9bc9f00e142 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -63,6 +63,7 @@ set(SRC_HEADERS math_float2.h math_float3.h math_float4.h + math_float8.h math_int2.h math_int3.h math_int4.h diff --git a/intern/cycles/util/defines.h b/intern/cycles/util/defines.h index 115a747cf1c..d0df1a221fc 100644 --- a/intern/cycles/util/defines.h +++ b/intern/cycles/util/defines.h @@ -81,7 +81,7 @@ /* macros */ /* hints for branch prediction, only use in code that runs a _lot_ */ -#if defined(__GNUC__) && defined(__KERNEL_CPU__) +#if defined(__GNUC__) && !defined(__KERNEL_GPU__) # define LIKELY(x) __builtin_expect(!!(x), 1) # define UNLIKELY(x) __builtin_expect(!!(x), 0) #else diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h index af2f1ea092d..2631304c84b 100644 --- a/intern/cycles/util/math.h +++ b/intern/cycles/util/math.h @@ -511,6 +511,11 @@ ccl_device_inline float4 float3_to_float4(const float3 a) return make_float4(a.x, a.y, a.z, 1.0f); } +ccl_device_inline float4 float3_to_float4(const float3 a, const float w) +{ + return make_float4(a.x, a.y, a.z, w); +} + ccl_device_inline float inverse_lerp(float a, float b, float x) { return (x - a) / (b - a); @@ -535,6 +540,7 @@ CCL_NAMESPACE_END #include "util/math_float2.h" #include "util/math_float3.h" #include "util/math_float4.h" +#include "util/math_float8.h" #include "util/rect.h" diff --git a/intern/cycles/util/math_fast.h b/intern/cycles/util/math_fast.h index 2221e7a9835..142a664a1d2 100644 --- a/intern/cycles/util/math_fast.h +++ b/intern/cycles/util/math_fast.h @@ -420,7 +420,7 @@ ccl_device_inline float fast_expf(float x) return fast_exp2f(x / M_LN2_F); } -#if defined(__KERNEL_CPU__) && !defined(_MSC_VER) +#if !defined(__KERNEL_GPU__) && !defined(_MSC_VER) /* MSVC seems to have a code-gen bug here in at least SSE41/AVX, see * T78047 and T78869 for details. Just disable for now, it only makes * a small difference in denoising performance. */ diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h index c02b4cdbf0d..c408eadf195 100644 --- a/intern/cycles/util/math_float3.h +++ b/intern/cycles/util/math_float3.h @@ -147,8 +147,11 @@ ccl_device_inline float3 operator/(const float f, const float3 &a) ccl_device_inline float3 operator/(const float3 &a, const float f) { - float invf = 1.0f / f; - return a * invf; +# if defined(__KERNEL_SSE__) + return float3(_mm_div_ps(a.m128, _mm_set1_ps(f))); +# else + return make_float3(a.x / f, a.y / f, a.z / f); +# endif } ccl_device_inline float3 operator/(const float3 &a, const float3 &b) @@ -284,8 +287,12 @@ ccl_device_inline float dot_xy(const float3 &a, const float3 &b) ccl_device_inline float3 cross(const float3 &a, const float3 &b) { - float3 r = make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); - return r; +# ifdef __KERNEL_SSE__ + return float3(shuffle<1, 2, 0, 3>( + msub(ssef(a), shuffle<1, 2, 0, 3>(ssef(b)), shuffle<1, 2, 0, 3>(ssef(a)) * ssef(b)))); +# else + return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); +# endif } ccl_device_inline float3 normalize(const float3 &a) diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h index 073c65c2d6a..c2721873037 100644 --- a/intern/cycles/util/math_float4.h +++ b/intern/cycles/util/math_float4.h @@ -55,6 +55,7 @@ ccl_device_inline float4 floor(const float4 &a); ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t); #endif /* !__KERNEL_METAL__*/ +ccl_device_inline float4 safe_divide(const float4 a, const float4 b); ccl_device_inline float4 safe_divide(const float4 a, const float b); #ifdef __KERNEL_SSE__ @@ -74,11 +75,14 @@ template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b); # endif #endif /* __KERNEL_SSE__ */ +ccl_device_inline float reduce_min(const float4 a); +ccl_device_inline float reduce_max(const float4 a); +ccl_device_inline float reduce_add(const float4 a); + +ccl_device_inline bool isequal(const float4 a, const float4 b); + #ifndef __KERNEL_GPU__ ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b); -ccl_device_inline float4 reduce_min(const float4 &a); -ccl_device_inline float4 reduce_max(const float4 &a); -ccl_device_inline float4 reduce_add(const float4 &a); #endif /* !__KERNEL_GPU__ */ /******************************************************************************* @@ -303,27 +307,9 @@ ccl_device_inline bool is_zero(const float4 &a) # endif } -ccl_device_inline float4 reduce_add(const float4 &a) -{ -# if defined(__KERNEL_SSE__) -# if defined(__KERNEL_NEON__) - return float4(vdupq_n_f32(vaddvq_f32(a))); -# elif defined(__KERNEL_SSE3__) - float4 h(_mm_hadd_ps(a.m128, a.m128)); - return float4(_mm_hadd_ps(h.m128, h.m128)); -# else - float4 h(shuffle<1, 0, 3, 2>(a) + a); - return shuffle<2, 3, 0, 1>(h) + h; -# endif -# else - float sum = (a.x + a.y) + (a.z + a.w); - return make_float4(sum, sum, sum, sum); -# endif -} - ccl_device_inline float average(const float4 &a) { - return reduce_add(a).x * 0.25f; + return reduce_add(a) * 0.25f; } ccl_device_inline float len(const float4 &a) @@ -392,8 +378,77 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t) return a + t * (b - a); } +ccl_device_inline float4 saturate(const float4 &a) +{ + return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w)); +} + +ccl_device_inline float4 exp(float4 v) +{ + return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.z)); +} + +ccl_device_inline float4 log(float4 v) +{ + return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.z)); +} + #endif /* !__KERNEL_METAL__*/ +ccl_device_inline float reduce_add(const float4 a) +{ +#if defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) + return vaddvq_f32(a); +# elif defined(__KERNEL_SSE3__) + float4 h(_mm_hadd_ps(a.m128, a.m128)); + return _mm_cvtss_f32(_mm_hadd_ps(h.m128, h.m128)); +# else + float4 h(shuffle<1, 0, 3, 2>(a) + a); + return _mm_cvtss_f32(shuffle<2, 3, 0, 1>(h) + h); +# endif +#else + return a.x + a.y + a.z + a.w; +#endif +} + +ccl_device_inline float reduce_min(const float4 a) +{ +#if defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) + return vminvq_f32(a); +# else + float4 h = min(shuffle<1, 0, 3, 2>(a), a); + return _mm_cvtss_f32(min(shuffle<2, 3, 0, 1>(h), h)); +# endif +#else + return min(min(a.x, a.y), min(a.z, a.w)); +#endif +} + +ccl_device_inline float reduce_max(const float4 a) +{ +#if defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) + return vmaxvq_f32(a); +# else + float4 h = max(shuffle<1, 0, 3, 2>(a), a); + return _mm_cvtss_f32(max(shuffle<2, 3, 0, 1>(h), h)); +# endif +#else + return max(max(a.x, a.y), max(a.z, a.w)); +#endif +} + +ccl_device_inline bool isequal(const float4 a, const float4 b) +{ +#if defined(__KERNEL_METAL__) + return all(a == b); +#else + return a == b; +#endif +} + #ifdef __KERNEL_SSE__ template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __forceinline const float4 shuffle(const float4 &b) @@ -461,34 +516,6 @@ ccl_device_inline float4 mask(const int4 &mask, const float4 &a) return select(mask, a, zero_float4()); } -ccl_device_inline float4 reduce_min(const float4 &a) -{ -# if defined(__KERNEL_SSE__) -# if defined(__KERNEL_NEON__) - return float4(vdupq_n_f32(vminvq_f32(a))); -# else - float4 h = min(shuffle<1, 0, 3, 2>(a), a); - return min(shuffle<2, 3, 0, 1>(h), h); -# endif -# else - return make_float4(min(min(a.x, a.y), min(a.z, a.w))); -# endif -} - -ccl_device_inline float4 reduce_max(const float4 &a) -{ -# if defined(__KERNEL_SSE__) -# if defined(__KERNEL_NEON__) - return float4(vdupq_n_f32(vmaxvq_f32(a))); -# else - float4 h = max(shuffle<1, 0, 3, 2>(a), a); - return max(shuffle<2, 3, 0, 1>(h), h); -# endif -# else - return make_float4(max(max(a.x, a.y), max(a.z, a.w))); -# endif -} - ccl_device_inline float4 load_float4(ccl_private const float *v) { # ifdef __KERNEL_SSE__ @@ -505,6 +532,14 @@ ccl_device_inline float4 safe_divide(const float4 a, const float b) return (b != 0.0f) ? a / b : zero_float4(); } +ccl_device_inline float4 safe_divide(const float4 a, const float4 b) +{ + return make_float4((b.x != 0.0f) ? a.x / b.x : 0.0f, + (b.y != 0.0f) ? a.y / b.y : 0.0f, + (b.z != 0.0f) ? a.z / b.z : 0.0f, + (b.w != 0.0f) ? a.w / b.w : 0.0f); +} + ccl_device_inline bool isfinite_safe(float4 v) { return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z) && isfinite_safe(v.w); @@ -523,6 +558,11 @@ ccl_device_inline float4 ensure_finite(float4 v) return v; } +ccl_device_inline float4 pow(float4 v, float e) +{ + return make_float4(powf(v.x, e), powf(v.y, e), powf(v.z, e), powf(v.z, e)); +} + CCL_NAMESPACE_END #endif /* __UTIL_MATH_FLOAT4_H__ */ diff --git a/intern/cycles/util/math_float8.h b/intern/cycles/util/math_float8.h new file mode 100644 index 00000000000..8ed8d56a034 --- /dev/null +++ b/intern/cycles/util/math_float8.h @@ -0,0 +1,419 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2022 Blender Foundation */ + +#ifndef __UTIL_MATH_FLOAT8_H__ +#define __UTIL_MATH_FLOAT8_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util/types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. + */ + +ccl_device_inline float8 operator+(const float8 &a, const float8 &b); +ccl_device_inline float8 operator+(const float8 &a, const float f); +ccl_device_inline float8 operator+(const float f, const float8 &a); + +ccl_device_inline float8 operator-(const float8 &a); +ccl_device_inline float8 operator-(const float8 &a, const float8 &b); +ccl_device_inline float8 operator-(const float8 &a, const float f); +ccl_device_inline float8 operator-(const float f, const float8 &a); + +ccl_device_inline float8 operator*(const float8 &a, const float8 &b); +ccl_device_inline float8 operator*(const float8 &a, const float f); +ccl_device_inline float8 operator*(const float f, const float8 &a); + +ccl_device_inline float8 operator/(const float8 &a, const float8 &b); +ccl_device_inline float8 operator/(const float8 &a, float f); +ccl_device_inline float8 operator/(const float f, const float8 &a); + +ccl_device_inline float8 operator+=(float8 &a, const float8 &b); + +ccl_device_inline float8 operator*=(float8 &a, const float8 &b); +ccl_device_inline float8 operator*=(float8 &a, float f); + +ccl_device_inline float8 operator/=(float8 &a, float f); + +ccl_device_inline bool operator==(const float8 &a, const float8 &b); + +ccl_device_inline float8 rcp(const float8 &a); +ccl_device_inline float8 sqrt(const float8 &a); +ccl_device_inline float8 sqr(const float8 &a); +ccl_device_inline bool is_zero(const float8 &a); +ccl_device_inline float average(const float8 &a); +ccl_device_inline float8 min(const float8 &a, const float8 &b); +ccl_device_inline float8 max(const float8 &a, const float8 &b); +ccl_device_inline float8 clamp(const float8 &a, const float8 &mn, const float8 &mx); +ccl_device_inline float8 fabs(const float8 &a); +ccl_device_inline float8 mix(const float8 &a, const float8 &b, float t); + +ccl_device_inline float8 safe_divide(const float8 a, const float b); +ccl_device_inline float8 safe_divide(const float8 a, const float8 b); + +ccl_device_inline float reduce_min(const float8 &a); +ccl_device_inline float reduce_max(const float8 &a); +ccl_device_inline float reduce_add(const float8 &a); + +ccl_device_inline float8 saturate(const float8 &a); +ccl_device_inline bool isequal(const float8 a, const float8 b); + +/******************************************************************************* + * Definition. + */ + +ccl_device_inline float8 zero_float8() +{ +#ifdef __KERNEL_AVX2__ + return float8(_mm256_setzero_ps()); +#else + return make_float8(0.0f); +#endif +} + +ccl_device_inline float8 one_float8() +{ + return make_float8(1.0f); +} + +ccl_device_inline float8 operator+(const float8 &a, const float8 &b) +{ +#ifdef __KERNEL_AVX2__ + return float8(_mm256_add_ps(a.m256, b.m256)); +#else + return make_float8( + a.a + b.a, a.b + b.b, a.c + b.c, a.d + b.d, a.e + b.e, a.f + b.f, a.g + b.g, a.h + b.h); +#endif +} + +ccl_device_inline float8 operator+(const float8 &a, const float f) +{ + return a + make_float8(f); +} + +ccl_device_inline float8 operator+(const float f, const float8 &a) +{ + return make_float8(f) + a; +} + +ccl_device_inline float8 operator-(const float8 &a) +{ +#ifdef __KERNEL_AVX2__ + __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); + return float8(_mm256_xor_ps(a.m256, mask)); +#else + return make_float8(-a.a, -a.b, -a.c, -a.d, -a.e, -a.f, -a.g, -a.h); +#endif +} + +ccl_device_inline float8 operator-(const float8 &a, const float8 &b) +{ +#ifdef __KERNEL_AVX2__ + return float8(_mm256_sub_ps(a.m256, b.m256)); +#else + return make_float8( + a.a - b.a, a.b - b.b, a.c - b.c, a.d - b.d, a.e - b.e, a.f - b.f, a.g - b.g, a.h - b.h); +#endif +} + +ccl_device_inline float8 operator-(const float8 &a, const float f) +{ + return a - make_float8(f); +} + +ccl_device_inline float8 operator-(const float f, const float8 &a) +{ + return make_float8(f) - a; +} + +ccl_device_inline float8 operator*(const float8 &a, const float8 &b) +{ +#ifdef __KERNEL_AVX2__ + return float8(_mm256_mul_ps(a.m256, b.m256)); +#else + return make_float8( + a.a * b.a, a.b * b.b, a.c * b.c, a.d * b.d, a.e * b.e, a.f * b.f, a.g * b.g, a.h * b.h); +#endif +} + +ccl_device_inline float8 operator*(const float8 &a, const float f) +{ + return a * make_float8(f); +} + +ccl_device_inline float8 operator*(const float f, const float8 &a) +{ + return make_float8(f) * a; +} + +ccl_device_inline float8 operator/(const float8 &a, const float8 &b) +{ +#ifdef __KERNEL_AVX2__ + return float8(_mm256_div_ps(a.m256, b.m256)); +#else + return make_float8( + a.a / b.a, a.b / b.b, a.c / b.c, a.d / b.d, a.e / b.e, a.f / b.f, a.g / b.g, a.h / b.h); +#endif +} + +ccl_device_inline float8 operator/(const float8 &a, const float f) +{ + return a / make_float8(f); +} + +ccl_device_inline float8 operator/(const float f, const float8 &a) +{ + return make_float8(f) / a; +} + +ccl_device_inline float8 operator+=(float8 &a, const float8 &b) +{ + return a = a + b; +} + +ccl_device_inline float8 operator-=(float8 &a, const float8 &b) +{ + return a = a - b; +} + +ccl_device_inline float8 operator*=(float8 &a, const float8 &b) +{ + return a = a * b; +} + +ccl_device_inline float8 operator*=(float8 &a, float f) +{ + return a = a * f; +} + +ccl_device_inline float8 operator/=(float8 &a, float f) +{ + return a = a / f; +} + +ccl_device_inline bool operator==(const float8 &a, const float8 &b) +{ +#ifdef __KERNEL_AVX2__ + return (_mm256_movemask_ps(_mm256_castsi256_ps( + _mm256_cmpeq_epi32(_mm256_castps_si256(a.m256), _mm256_castps_si256(b.m256)))) & + 0b11111111) == 0b11111111; +#else + return (a.a == b.a && a.b == b.b && a.c == b.c && a.d == b.d && a.e == b.e && a.f == b.f && + a.g == b.g && a.h == b.h); +#endif +} + +ccl_device_inline float8 rcp(const float8 &a) +{ +#ifdef __KERNEL_AVX2__ + return float8(_mm256_rcp_ps(a.m256)); +#else + return make_float8(1.0f / a.a, + 1.0f / a.b, + 1.0f / a.c, + 1.0f / a.d, + 1.0f / a.e, + 1.0f / a.f, + 1.0f / a.g, + 1.0f / a.h); +#endif +} + +ccl_device_inline float8 sqrt(const float8 &a) +{ +#ifdef __KERNEL_AVX2__ + return float8(_mm256_sqrt_ps(a.m256)); +#else + return make_float8(sqrtf(a.a), + sqrtf(a.b), + sqrtf(a.c), + sqrtf(a.d), + sqrtf(a.e), + sqrtf(a.f), + sqrtf(a.g), + sqrtf(a.h)); +#endif +} + +ccl_device_inline float8 sqr(const float8 &a) +{ + return a * a; +} + +ccl_device_inline bool is_zero(const float8 &a) +{ + return a == make_float8(0.0f); +} + +ccl_device_inline float average(const float8 &a) +{ + return reduce_add(a) / 8.0f; +} + +ccl_device_inline float8 min(const float8 &a, const float8 &b) +{ +#ifdef __KERNEL_AVX2__ + return float8(_mm256_min_ps(a.m256, b.m256)); +#else + return make_float8(min(a.a, b.a), + min(a.b, b.b), + min(a.c, b.c), + min(a.d, b.d), + min(a.e, b.e), + min(a.f, b.f), + min(a.g, b.g), + min(a.h, b.h)); +#endif +} + +ccl_device_inline float8 max(const float8 &a, const float8 &b) +{ +#ifdef __KERNEL_AVX2__ + return float8(_mm256_max_ps(a.m256, b.m256)); +#else + return make_float8(max(a.a, b.a), + max(a.b, b.b), + max(a.c, b.c), + max(a.d, b.d), + max(a.e, b.e), + max(a.f, b.f), + max(a.g, b.g), + max(a.h, b.h)); +#endif +} + +ccl_device_inline float8 clamp(const float8 &a, const float8 &mn, const float8 &mx) +{ + return min(max(a, mn), mx); +} + +ccl_device_inline float8 fabs(const float8 &a) +{ +#ifdef __KERNEL_AVX2__ + return float8(_mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))); +#else + return make_float8(fabsf(a.a), + fabsf(a.b), + fabsf(a.c), + fabsf(a.d), + fabsf(a.e), + fabsf(a.f), + fabsf(a.g), + fabsf(a.h)); +#endif +} + +ccl_device_inline float8 mix(const float8 &a, const float8 &b, float t) +{ + return a + t * (b - a); +} + +ccl_device_inline float reduce_min(const float8 &a) +{ + return min(min(min(a.a, a.b), min(a.c, a.d)), min(min(a.e, a.f), min(a.g, a.h))); +} + +ccl_device_inline float reduce_max(const float8 &a) +{ + return max(max(max(a.a, a.b), max(a.c, a.d)), max(max(a.e, a.f), max(a.g, a.h))); +} + +ccl_device_inline float reduce_add(const float8 &a) +{ +#ifdef __KERNEL_AVX2__ + float8 b(_mm256_hadd_ps(a.m256, a.m256)); + float8 h(_mm256_hadd_ps(b.m256, b.m256)); + return h[0] + h[4]; +#else + return a.a + a.b + a.c + a.d + a.e + a.f + a.g + a.h; +#endif +} + +ccl_device_inline float8 saturate(const float8 &a) +{ + return clamp(a, make_float8(0.0f), make_float8(1.0f)); +} + +ccl_device_inline bool isequal(const float8 a, const float8 b) +{ + return a == b; +} + +ccl_device_inline float8 safe_divide(const float8 a, const float b) +{ + return (b != 0.0f) ? a / b : make_float8(0.0f); +} + +ccl_device_inline float8 safe_divide(const float8 a, const float8 b) +{ + return make_float8((b.a != 0.0f) ? a.a / b.a : 0.0f, + (b.b != 0.0f) ? a.b / b.b : 0.0f, + (b.c != 0.0f) ? a.c / b.c : 0.0f, + (b.d != 0.0f) ? a.d / b.d : 0.0f, + (b.e != 0.0f) ? a.e / b.e : 0.0f, + (b.f != 0.0f) ? a.f / b.f : 0.0f, + (b.g != 0.0f) ? a.g / b.g : 0.0f, + (b.h != 0.0f) ? a.h / b.h : 0.0f); +} + +ccl_device_inline float8 ensure_finite(float8 v) +{ + v.a = ensure_finite(v.a); + v.b = ensure_finite(v.b); + v.c = ensure_finite(v.c); + v.d = ensure_finite(v.d); + v.e = ensure_finite(v.e); + v.f = ensure_finite(v.f); + v.g = ensure_finite(v.g); + v.h = ensure_finite(v.h); + + return v; +} + +ccl_device_inline bool isfinite_safe(float8 v) +{ + return isfinite_safe(v.a) && isfinite_safe(v.b) && isfinite_safe(v.c) && isfinite_safe(v.d) && + isfinite_safe(v.e) && isfinite_safe(v.f) && isfinite_safe(v.g) && isfinite_safe(v.h); +} + +ccl_device_inline float8 pow(float8 v, float e) +{ + return make_float8(powf(v.a, e), + powf(v.b, e), + powf(v.c, e), + powf(v.d, e), + powf(v.e, e), + powf(v.f, e), + powf(v.g, e), + powf(v.h, e)); +} + +ccl_device_inline float8 exp(float8 v) +{ + return make_float8( + expf(v.a), expf(v.b), expf(v.c), expf(v.d), expf(v.e), expf(v.f), expf(v.g), expf(v.h)); +} + +ccl_device_inline float8 log(float8 v) +{ + return make_float8( + logf(v.a), logf(v.b), logf(v.c), logf(v.d), logf(v.e), logf(v.f), logf(v.g), logf(v.h)); +} + +ccl_device_inline float dot(const float8 &a, const float8 &b) +{ +#ifdef __KERNEL_AVX2__ + float8 t(_mm256_dp_ps(a.m256, b.m256, 0xFF)); + return t[0] + t[4]; +#else + return (a.a * b.a) + (a.b * b.b) + (a.c * b.c) + (a.d * b.d) + (a.e * b.e) + (a.f * b.f) + + (a.g * b.g) + (a.h * b.h); +#endif +} + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_FLOAT8_H__ */ diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h index b0de0b25a45..3e5891b2507 100644 --- a/intern/cycles/util/math_intersect.h +++ b/intern/cycles/util/math_intersect.h @@ -10,7 +10,8 @@ CCL_NAMESPACE_BEGIN ccl_device bool ray_sphere_intersect(float3 ray_P, float3 ray_D, - float ray_t, + float ray_tmin, + float ray_tmax, float3 sphere_P, float sphere_radius, ccl_private float3 *isect_P, @@ -33,7 +34,7 @@ ccl_device bool ray_sphere_intersect(float3 ray_P, return false; } const float t = tp - sqrtf(radiussq - dsq); /* pythagoras */ - if (t < ray_t) { + if (t > ray_tmin && t < ray_tmax) { *isect_t = t; *isect_P = ray_P + ray_D * t; return true; @@ -44,7 +45,8 @@ ccl_device bool ray_sphere_intersect(float3 ray_P, ccl_device bool ray_aligned_disk_intersect(float3 ray_P, float3 ray_D, - float ray_t, + float ray_tmin, + float ray_tmax, float3 disk_P, float disk_radius, ccl_private float3 *isect_P, @@ -59,7 +61,7 @@ ccl_device bool ray_aligned_disk_intersect(float3 ray_P, } /* Compute t to intersection point. */ const float t = -disk_t / div; - if (t < 0.0f || t > ray_t) { + if (!(t > ray_tmin && t < ray_tmax)) { return false; } /* Test if within radius. */ @@ -74,7 +76,8 @@ ccl_device bool ray_aligned_disk_intersect(float3 ray_P, ccl_device bool ray_disk_intersect(float3 ray_P, float3 ray_D, - float ray_t, + float ray_tmin, + float ray_tmax, float3 disk_P, float3 disk_N, float disk_radius, @@ -92,7 +95,8 @@ ccl_device bool ray_disk_intersect(float3 ray_P, } float3 P = ray_P + t * ray_D; float3 T = P - disk_P; - if (dot(T, T) < sqr(disk_radius) /*&& t > 0.f*/ && t <= ray_t) { + + if (dot(T, T) < sqr(disk_radius) && (t > ray_tmin && t < ray_tmax)) { *isect_P = ray_P + t * ray_D; *isect_t = t; return true; @@ -101,9 +105,10 @@ ccl_device bool ray_disk_intersect(float3 ray_P, return false; } -ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P, - float3 ray_dir, - float ray_t, +ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P, + const float3 ray_D, + const float ray_tmin, + const float ray_tmax, const float3 tri_a, const float3 tri_b, const float3 tri_c, @@ -111,14 +116,13 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P, ccl_private float *isect_v, ccl_private float *isect_t) { -#define dot3(a, b) dot(a, b) - const float3 P = ray_P; - const float3 dir = ray_dir; + /* This implementation matches the Plücker coordinates triangle intersection + * in Embree. */ /* Calculate vertices relative to ray origin. */ - const float3 v0 = tri_c - P; - const float3 v1 = tri_a - P; - const float3 v2 = tri_b - P; + const float3 v0 = tri_c - ray_P; + const float3 v1 = tri_a - ray_P; + const float3 v2 = tri_b - ray_P; /* Calculate triangle edges. */ const float3 e0 = v2 - v0; @@ -126,42 +130,38 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P, const float3 e2 = v1 - v2; /* Perform edge tests. */ - const float U = dot(cross(v2 + v0, e0), ray_dir); - const float V = dot(cross(v0 + v1, e1), ray_dir); - const float W = dot(cross(v1 + v2, e2), ray_dir); + const float U = dot(cross(v2 + v0, e0), ray_D); + const float V = dot(cross(v0 + v1, e1), ray_D); + const float W = dot(cross(v1 + v2, e2), ray_D); + const float eps = FLT_EPSILON * fabsf(U + V + W); const float minUVW = min(U, min(V, W)); const float maxUVW = max(U, max(V, W)); - if (minUVW < 0.0f && maxUVW > 0.0f) { + if (!(minUVW >= -eps || maxUVW <= eps)) { return false; } /* Calculate geometry normal and denominator. */ const float3 Ng1 = cross(e1, e0); - // const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0); const float3 Ng = Ng1 + Ng1; - const float den = dot3(Ng, dir); + const float den = dot(Ng, ray_D); /* Avoid division by 0. */ if (UNLIKELY(den == 0.0f)) { return false; } /* Perform depth test. */ - const float T = dot3(v0, Ng); - const int sign_den = (__float_as_int(den) & 0x80000000); - const float sign_T = xor_signmask(T, sign_den); - if ((sign_T < 0.0f) || (sign_T > ray_t * xor_signmask(den, sign_den))) { + const float T = dot(v0, Ng); + const float t = T / den; + if (!(t >= ray_tmin && t <= ray_tmax)) { return false; } - const float inv_den = 1.0f / den; - *isect_u = U * inv_den; - *isect_v = V * inv_den; - *isect_t = T * inv_den; + *isect_u = U / den; + *isect_v = V / den; + *isect_t = t; return true; - -#undef dot3 } /* Tests for an intersection between a ray and a quad defined by @@ -171,8 +171,8 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P, */ ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D, - float ray_mint, - float ray_maxt, + float ray_tmin, + float ray_tmax, float3 quad_P, float3 quad_u, float3 quad_v, @@ -185,7 +185,7 @@ ccl_device bool ray_quad_intersect(float3 ray_P, { /* Perform intersection test. */ float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n); - if (t < ray_mint || t > ray_maxt) { + if (!(t > ray_tmin && t < ray_tmax)) { return false; } const float3 hit = ray_P + t * ray_D; diff --git a/intern/cycles/util/string.cpp b/intern/cycles/util/string.cpp index 66ff866ee10..0c318cea44a 100644 --- a/intern/cycles/util/string.cpp +++ b/intern/cycles/util/string.cpp @@ -136,6 +136,19 @@ void string_replace(string &haystack, const string &needle, const string &other) } } +void string_replace_same_length(string &haystack, const string &needle, const string &other) +{ + assert(needle.size() == other.size()); + size_t pos = 0; + while (pos != string::npos) { + pos = haystack.find(needle, pos); + if (pos != string::npos) { + memcpy(haystack.data() + pos, other.data(), other.size()); + pos += other.size(); + } + } +} + string string_remove_trademark(const string &s) { string result = s; @@ -164,6 +177,11 @@ string to_string(const char *str) return string(str); } +string to_string(const float4 &v) +{ + return string_printf("%f,%f,%f,%f", v.x, v.y, v.z, v.w); +} + string string_to_lower(const string &s) { string r = s; diff --git a/intern/cycles/util/string.h b/intern/cycles/util/string.h index a74feee1750..ecbe9e106c6 100644 --- a/intern/cycles/util/string.h +++ b/intern/cycles/util/string.h @@ -38,12 +38,14 @@ void string_split(vector<string> &tokens, const string &separators = "\t ", bool skip_empty_tokens = true); void string_replace(string &haystack, const string &needle, const string &other); +void string_replace_same_length(string &haystack, const string &needle, const string &other); bool string_startswith(string_view s, string_view start); bool string_endswith(string_view s, string_view end); string string_strip(const string &s); string string_remove_trademark(const string &s); string string_from_bool(const bool var); string to_string(const char *str); +string to_string(const float4 &v); string string_to_lower(const string &s); /* Wide char strings are only used on Windows to deal with non-ASCII diff --git a/intern/cycles/util/transform.cpp b/intern/cycles/util/transform.cpp index 0bf5de57a20..0b87e88871d 100644 --- a/intern/cycles/util/transform.cpp +++ b/intern/cycles/util/transform.cpp @@ -99,15 +99,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm) memcpy(M, &tfm, sizeof(M)); if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) { - /* matrix is degenerate (e.g. 0 scale on some axis), ideally we should - * never be in this situation, but try to invert it anyway with tweak */ - M[0][0] += 1e-8f; - M[1][1] += 1e-8f; - M[2][2] += 1e-8f; - - if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) { - return projection_identity(); - } + return projection_identity(); } memcpy(&tfmR, R, sizeof(R)); @@ -115,16 +107,9 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm) return tfmR; } -Transform transform_inverse(const Transform &tfm) -{ - ProjectionTransform projection(tfm); - return projection_to_transform(projection_inverse(projection)); -} - Transform transform_transposed_inverse(const Transform &tfm) { - ProjectionTransform projection(tfm); - ProjectionTransform iprojection = projection_inverse(projection); + ProjectionTransform iprojection(transform_inverse(tfm)); return projection_to_transform(projection_transpose(iprojection)); } diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h index a460581d1f3..71164efbac1 100644 --- a/intern/cycles/util/transform.h +++ b/intern/cycles/util/transform.h @@ -63,10 +63,10 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f _MM_TRANSPOSE4_PS(x, y, z, w); - ssef tmp = shuffle<0>(aa) * x; - tmp = madd(shuffle<1>(aa), y, tmp); + ssef tmp = w; tmp = madd(shuffle<2>(aa), z, tmp); - tmp += w; + tmp = madd(shuffle<1>(aa), y, tmp); + tmp = madd(shuffle<0>(aa), x, tmp); return float3(tmp.m128); #elif defined(__KERNEL_METAL__) @@ -93,9 +93,9 @@ ccl_device_inline float3 transform_direction(ccl_private const Transform *t, con _MM_TRANSPOSE4_PS(x, y, z, w); - ssef tmp = shuffle<0>(aa) * x; + ssef tmp = shuffle<2>(aa) * z; tmp = madd(shuffle<1>(aa), y, tmp); - tmp = madd(shuffle<2>(aa), z, tmp); + tmp = madd(shuffle<0>(aa), x, tmp); return float3(tmp.m128); #elif defined(__KERNEL_METAL__) @@ -312,7 +312,6 @@ ccl_device_inline void transform_set_column(Transform *t, int column, float3 val t->z[column] = value.z; } -Transform transform_inverse(const Transform &a); Transform transform_transposed_inverse(const Transform &a); ccl_device_inline bool transform_uniform_scale(const Transform &tfm, float &scale) @@ -392,39 +391,47 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t) #endif /* defined(__KERNEL_GPU_RAYTRACING__) */ } -ccl_device_inline Transform transform_quick_inverse(Transform M) +ccl_device_inline Transform transform_inverse(const Transform tfm) { - /* possible optimization: can we avoid doing this altogether and construct - * the inverse matrix directly from negated translation, transposed rotation, - * scale can be inverted but what about shearing? */ - Transform R; - float det = M.x.x * (M.z.z * M.y.y - M.z.y * M.y.z) - M.y.x * (M.z.z * M.x.y - M.z.y * M.x.z) + - M.z.x * (M.y.z * M.x.y - M.y.y * M.x.z); + /* This implementation matches the one in Embree exactly, to ensure consistent + * results with the ray intersection of instances. */ + float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x); + float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y); + float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z); + float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w); + + /* Compute determinant. */ + float det = dot(x, cross(y, z)); + if (det == 0.0f) { - M.x.x += 1e-8f; - M.y.y += 1e-8f; - M.z.z += 1e-8f; - det = M.x.x * (M.z.z * M.y.y - M.z.y * M.y.z) - M.y.x * (M.z.z * M.x.y - M.z.y * M.x.z) + - M.z.x * (M.y.z * M.x.y - M.y.y * M.x.z); + /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should + * never be in this situation, but try to invert it anyway with tweak. + * + * This logic does not match Embree which would just give an invalid + * matrix. A better solution would be to remove this and ensure any object + * matrix is valid. */ + x.x += 1e-8f; + y.y += 1e-8f; + z.z += 1e-8f; + + det = dot(x, cross(y, z)); + if (det == 0.0f) { + det = FLT_MAX; + } } - det = (det != 0.0f) ? 1.0f / det : 0.0f; - - float3 Rx = det * make_float3(M.z.z * M.y.y - M.z.y * M.y.z, - M.z.y * M.x.z - M.z.z * M.x.y, - M.y.z * M.x.y - M.y.y * M.x.z); - float3 Ry = det * make_float3(M.z.x * M.y.z - M.z.z * M.y.x, - M.z.z * M.x.x - M.z.x * M.x.z, - M.y.x * M.x.z - M.y.z * M.x.x); - float3 Rz = det * make_float3(M.z.y * M.y.x - M.z.x * M.y.y, - M.z.x * M.x.y - M.z.y * M.x.x, - M.y.y * M.x.x - M.y.x * M.x.y); - float3 T = -make_float3(M.x.w, M.y.w, M.z.w); - - R.x = make_float4(Rx.x, Rx.y, Rx.z, dot(Rx, T)); - R.y = make_float4(Ry.x, Ry.y, Ry.z, dot(Ry, T)); - R.z = make_float4(Rz.x, Rz.y, Rz.z, dot(Rz, T)); - - return R; + + /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */ + const float3 inverse_x = cross(y, z) / det; + const float3 inverse_y = cross(z, x) / det; + const float3 inverse_z = cross(x, y) / det; + + /* Compute translation and fill transform. */ + Transform itfm; + itfm.x = float3_to_float4(inverse_x, -dot(inverse_x, w)); + itfm.y = float3_to_float4(inverse_y, -dot(inverse_y, w)); + itfm.z = float3_to_float4(inverse_z, -dot(inverse_z, w)); + + return itfm; } ccl_device_inline void transform_compose(ccl_private Transform *tfm, diff --git a/intern/cycles/util/types_float8.h b/intern/cycles/util/types_float8.h index d71149946f7..f04dc675c84 100644 --- a/intern/cycles/util/types_float8.h +++ b/intern/cycles/util/types_float8.h @@ -11,11 +11,13 @@ CCL_NAMESPACE_BEGIN -#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) - +#ifdef __KERNEL_GPU__ +struct float8 +#else struct ccl_try_align(32) float8 +#endif { -# ifdef __KERNEL_AVX2__ +#ifdef __KERNEL_AVX2__ union { __m256 m256; struct { @@ -32,18 +34,19 @@ struct ccl_try_align(32) float8 __forceinline float8 &operator=(const float8 &a); -# else /* __KERNEL_AVX2__ */ +#else /* __KERNEL_AVX2__ */ float a, b, c, d, e, f, g, h; -# endif /* __KERNEL_AVX2__ */ +#endif /* __KERNEL_AVX2__ */ +#ifndef __KERNEL_GPU__ __forceinline float operator[](int i) const; __forceinline float &operator[](int i); +#endif }; ccl_device_inline float8 make_float8(float f); ccl_device_inline float8 make_float8(float a, float b, float c, float d, float e, float f, float g, float h); -#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_float8_impl.h b/intern/cycles/util/types_float8_impl.h index 0694f5205a5..21931c55071 100644 --- a/intern/cycles/util/types_float8_impl.h +++ b/intern/cycles/util/types_float8_impl.h @@ -15,8 +15,7 @@ CCL_NAMESPACE_BEGIN -#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) -# ifdef __KERNEL_AVX2__ +#ifdef __KERNEL_AVX2__ __forceinline float8::float8() { } @@ -44,8 +43,9 @@ __forceinline float8 &float8::operator=(const float8 &f) m256 = f.m256; return *this; } -# endif /* __KERNEL_AVX2__ */ +#endif /* __KERNEL_AVX2__ */ +#ifndef __KERNEL_GPU__ __forceinline float float8::operator[](int i) const { util_assert(i >= 0); @@ -59,30 +59,29 @@ __forceinline float &float8::operator[](int i) util_assert(i < 8); return *(&a + i); } +#endif ccl_device_inline float8 make_float8(float f) { -# ifdef __KERNEL_AVX2__ +#ifdef __KERNEL_AVX2__ float8 r(_mm256_set1_ps(f)); -# else +#else float8 r = {f, f, f, f, f, f, f, f}; -# endif +#endif return r; } ccl_device_inline float8 make_float8(float a, float b, float c, float d, float e, float f, float g, float h) { -# ifdef __KERNEL_AVX2__ - float8 r(_mm256_set_ps(a, b, c, d, e, f, g, h)); -# else +#ifdef __KERNEL_AVX2__ + float8 r(_mm256_setr_ps(a, b, c, d, e, f, g, h)); +#else float8 r = {a, b, c, d, e, f, g, h}; -# endif +#endif return r; } -#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ - CCL_NAMESPACE_END #endif /* __UTIL_TYPES_FLOAT8_IMPL_H__ */ |