Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/util')
-rw-r--r--intern/cycles/util/CMakeLists.txt16
-rw-r--r--intern/cycles/util/atomic.h110
-rw-r--r--intern/cycles/util/color.h6
-rw-r--r--intern/cycles/util/debug.cpp12
-rw-r--r--intern/cycles/util/debug.h31
-rw-r--r--intern/cycles/util/defines.h45
-rw-r--r--intern/cycles/util/half.h8
-rw-r--r--intern/cycles/util/hash.h143
-rw-r--r--intern/cycles/util/log.h19
-rw-r--r--intern/cycles/util/math.h82
-rw-r--r--intern/cycles/util/math_fast.h2
-rw-r--r--intern/cycles/util/math_float3.h77
-rw-r--r--intern/cycles/util/math_float4.h148
-rw-r--r--intern/cycles/util/math_float8.h419
-rw-r--r--intern/cycles/util/math_intersect.h155
-rw-r--r--intern/cycles/util/opengl.h2
-rw-r--r--intern/cycles/util/progress.h20
-rw-r--r--intern/cycles/util/string.cpp18
-rw-r--r--intern/cycles/util/string.h2
-rw-r--r--intern/cycles/util/system.cpp89
-rw-r--r--intern/cycles/util/system.h11
-rw-r--r--intern/cycles/util/task.cpp2
-rw-r--r--intern/cycles/util/time.cpp2
-rw-r--r--intern/cycles/util/transform.cpp29
-rw-r--r--intern/cycles/util/transform.h78
-rw-r--r--intern/cycles/util/transform_avx2.cpp13
-rw-r--r--intern/cycles/util/transform_inverse.h76
-rw-r--r--intern/cycles/util/transform_sse41.cpp13
-rw-r--r--intern/cycles/util/types.h23
-rw-r--r--intern/cycles/util/types_float2.h14
-rw-r--r--intern/cycles/util/types_float2_impl.h19
-rw-r--r--intern/cycles/util/types_float3.h33
-rw-r--r--intern/cycles/util/types_float3_impl.h46
-rw-r--r--intern/cycles/util/types_float4.h18
-rw-r--r--intern/cycles/util/types_float4_impl.h53
-rw-r--r--intern/cycles/util/types_float8.h37
-rw-r--r--intern/cycles/util/types_float8_impl.h60
-rw-r--r--intern/cycles/util/types_int2.h11
-rw-r--r--intern/cycles/util/types_int2_impl.h11
-rw-r--r--intern/cycles/util/types_int3.h28
-rw-r--r--intern/cycles/util/types_int3_impl.h47
-rw-r--r--intern/cycles/util/types_int4.h20
-rw-r--r--intern/cycles/util/types_int4_impl.h68
-rw-r--r--intern/cycles/util/types_spectrum.h34
-rw-r--r--intern/cycles/util/types_uchar2.h11
-rw-r--r--intern/cycles/util/types_uchar2_impl.h6
-rw-r--r--intern/cycles/util/types_uchar3.h6
-rw-r--r--intern/cycles/util/types_uchar3_impl.h6
-rw-r--r--intern/cycles/util/types_uchar4.h11
-rw-r--r--intern/cycles/util/types_uchar4_impl.h6
-rw-r--r--intern/cycles/util/types_uint2.h11
-rw-r--r--intern/cycles/util/types_uint2_impl.h11
-rw-r--r--intern/cycles/util/types_uint3.h11
-rw-r--r--intern/cycles/util/types_uint3_impl.h11
-rw-r--r--intern/cycles/util/types_uint4.h11
-rw-r--r--intern/cycles/util/types_uint4_impl.h11
-rw-r--r--intern/cycles/util/types_ushort4.h2
-rw-r--r--intern/cycles/util/types_vector3.h26
-rw-r--r--intern/cycles/util/types_vector3_impl.h30
-rw-r--r--intern/cycles/util/vector.h1
60 files changed, 1596 insertions, 725 deletions
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index fddac1dbbcf..997d574a3b0 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -3,7 +3,6 @@
set(INC
..
- ../../glew-mx
)
set(INC_SYS
@@ -26,6 +25,8 @@ set(SRC
thread.cpp
time.cpp
transform.cpp
+ transform_avx2.cpp
+ transform_sse41.cpp
windows.cpp
)
@@ -63,6 +64,7 @@ set(SRC_HEADERS
math_float2.h
math_float3.h
math_float4.h
+ math_float8.h
math_int2.h
math_int3.h
math_int4.h
@@ -115,6 +117,7 @@ set(SRC_HEADERS
types_int3_impl.h
types_int4.h
types_int4_impl.h
+ types_spectrum.h
types_uchar2.h
types_uchar2_impl.h
types_uchar3.h
@@ -128,8 +131,6 @@ set(SRC_HEADERS
types_uint4.h
types_uint4_impl.h
types_ushort4.h
- types_vector3.h
- types_vector3_impl.h
unique_ptr.h
vector.h
version.h
@@ -137,9 +138,14 @@ set(SRC_HEADERS
xml.h
)
+if(CXX_HAS_SSE)
+ set_source_files_properties(transform_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+endif()
+if(CXX_HAS_AVX2)
+ set_source_files_properties(transform_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+endif()
+
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
-add_definitions(${GL_DEFINITIONS})
-
cycles_add_library(cycles_util "${LIB}" ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/util/atomic.h b/intern/cycles/util/atomic.h
index f89eb28b0b7..1ebf085ae13 100644
--- a/intern/cycles/util/atomic.h
+++ b/intern/cycles/util/atomic.h
@@ -106,6 +106,116 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float
# endif /* __KERNEL_METAL__ */
+# ifdef __KERNEL_ONEAPI__
+
+ccl_device_inline float atomic_add_and_fetch_float(ccl_global float *p, float x)
+{
+ sycl::atomic_ref<float,
+ sycl::memory_order::relaxed,
+ sycl::memory_scope::device,
+ sycl::access::address_space::ext_intel_global_device_space>
+ atomic(*p);
+ return atomic.fetch_add(x);
+}
+
+ccl_device_inline float atomic_compare_and_swap_float(ccl_global float *source,
+ float old_val,
+ float new_val)
+{
+ sycl::atomic_ref<float,
+ sycl::memory_order::relaxed,
+ sycl::memory_scope::device,
+ sycl::access::address_space::ext_intel_global_device_space>
+ atomic(*source);
+ atomic.compare_exchange_weak(old_val, new_val);
+ return old_val;
+}
+
+ccl_device_inline unsigned int atomic_fetch_and_add_uint32(ccl_global unsigned int *p,
+ unsigned int x)
+{
+ sycl::atomic_ref<unsigned int,
+ sycl::memory_order::relaxed,
+ sycl::memory_scope::device,
+ sycl::access::address_space::ext_intel_global_device_space>
+ atomic(*p);
+ return atomic.fetch_add(x);
+}
+
+ccl_device_inline int atomic_fetch_and_add_uint32(ccl_global int *p, int x)
+{
+ sycl::atomic_ref<int,
+ sycl::memory_order::relaxed,
+ sycl::memory_scope::device,
+ sycl::access::address_space::ext_intel_global_device_space>
+ atomic(*p);
+ return atomic.fetch_add(x);
+}
+
+ccl_device_inline unsigned int atomic_fetch_and_sub_uint32(ccl_global unsigned int *p,
+ unsigned int x)
+{
+ sycl::atomic_ref<unsigned int,
+ sycl::memory_order::relaxed,
+ sycl::memory_scope::device,
+ sycl::access::address_space::ext_intel_global_device_space>
+ atomic(*p);
+ return atomic.fetch_sub(x);
+}
+
+ccl_device_inline int atomic_fetch_and_sub_uint32(ccl_global int *p, int x)
+{
+ sycl::atomic_ref<int,
+ sycl::memory_order::relaxed,
+ sycl::memory_scope::device,
+ sycl::access::address_space::ext_intel_global_device_space>
+ atomic(*p);
+ return atomic.fetch_sub(x);
+}
+
+ccl_device_inline unsigned int atomic_fetch_and_inc_uint32(ccl_global unsigned int *p)
+{
+ return atomic_fetch_and_add_uint32(p, 1);
+}
+
+ccl_device_inline int atomic_fetch_and_inc_uint32(ccl_global int *p)
+{
+ return atomic_fetch_and_add_uint32(p, 1);
+}
+
+ccl_device_inline unsigned int atomic_fetch_and_dec_uint32(ccl_global unsigned int *p)
+{
+ return atomic_fetch_and_sub_uint32(p, 1);
+}
+
+ccl_device_inline int atomic_fetch_and_dec_uint32(ccl_global int *p)
+{
+ return atomic_fetch_and_sub_uint32(p, 1);
+}
+
+ccl_device_inline unsigned int atomic_fetch_and_or_uint32(ccl_global unsigned int *p,
+ unsigned int x)
+{
+ sycl::atomic_ref<unsigned int,
+ sycl::memory_order::relaxed,
+ sycl::memory_scope::device,
+ sycl::access::address_space::ext_intel_global_device_space>
+ atomic(*p);
+ return atomic.fetch_or(x);
+}
+
+ccl_device_inline int atomic_fetch_and_or_uint32(ccl_global int *p, int x)
+{
+ sycl::atomic_ref<int,
+ sycl::memory_order::relaxed,
+ sycl::memory_scope::device,
+ sycl::access::address_space::ext_intel_global_device_space>
+ atomic(*p);
+ return atomic.fetch_or(x);
+}
+
+# endif /* __KERNEL_ONEAPI__ */
+
#endif /* __KERNEL_GPU__ */
#endif /* __UTIL_ATOMIC_H__ */
diff --git a/intern/cycles/util/color.h b/intern/cycles/util/color.h
index 795c3754976..537f8ab6771 100644
--- a/intern/cycles/util/color.h
+++ b/intern/cycles/util/color.h
@@ -318,14 +318,14 @@ ccl_device float3 color_highlight_compress(float3 color, ccl_private float3 *var
{
color += one_float3();
if (variance) {
- *variance *= sqr3(one_float3() / color);
+ *variance *= sqr(one_float3() / color);
}
- return log3(color);
+ return log(color);
}
ccl_device float3 color_highlight_uncompress(float3 color)
{
- return exp3(color) - one_float3();
+ return exp(color) - one_float3();
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/debug.cpp b/intern/cycles/util/debug.cpp
index 65d108bb9d1..8210e21f951 100644
--- a/intern/cycles/util/debug.cpp
+++ b/intern/cycles/util/debug.cpp
@@ -13,7 +13,6 @@
CCL_NAMESPACE_BEGIN
DebugFlags::CPU::CPU()
- : avx2(true), avx(true), sse41(true), sse3(true), sse2(true), bvh_layout(BVH_LAYOUT_AUTO)
{
reset();
}
@@ -25,7 +24,7 @@ void DebugFlags::CPU::reset()
do { \
flag = (getenv(env) == NULL); \
if (!flag) { \
- VLOG(1) << "Disabling " << STRINGIFY(flag) << " instruction set."; \
+ VLOG_INFO << "Disabling " << STRINGIFY(flag) << " instruction set."; \
} \
} while (0)
@@ -41,17 +40,17 @@ void DebugFlags::CPU::reset()
bvh_layout = BVH_LAYOUT_AUTO;
}
-DebugFlags::CUDA::CUDA() : adaptive_compile(false)
+DebugFlags::CUDA::CUDA()
{
reset();
}
-DebugFlags::HIP::HIP() : adaptive_compile(false)
+DebugFlags::HIP::HIP()
{
reset();
}
-DebugFlags::Metal::Metal() : adaptive_compile(false)
+DebugFlags::Metal::Metal()
{
reset();
}
@@ -84,14 +83,13 @@ void DebugFlags::OptiX::reset()
use_debug = false;
}
-DebugFlags::DebugFlags() : viewport_static_bvh(false), running_inside_blender(false)
+DebugFlags::DebugFlags()
{
/* Nothing for now. */
}
void DebugFlags::reset()
{
- viewport_static_bvh = false;
cpu.reset();
cuda.reset();
optix.reset();
diff --git a/intern/cycles/util/debug.h b/intern/cycles/util/debug.h
index 3565fdea17f..ab200649f59 100644
--- a/intern/cycles/util/debug.h
+++ b/intern/cycles/util/debug.h
@@ -17,11 +17,6 @@ CCL_NAMESPACE_BEGIN
*/
class DebugFlags {
public:
- /* Use static BVH in viewport, to match final render exactly. */
- bool viewport_static_bvh;
-
- bool running_inside_blender;
-
/* Descriptor of CPU feature-set to be used. */
struct CPU {
CPU();
@@ -30,11 +25,11 @@ class DebugFlags {
void reset();
/* Flags describing which instructions sets are allowed for use. */
- bool avx2;
- bool avx;
- bool sse41;
- bool sse3;
- bool sse2;
+ bool avx2 = true;
+ bool avx = true;
+ bool sse41 = true;
+ bool sse3 = true;
+ bool sse2 = true;
/* Check functions to see whether instructions up to the given one
* are allowed for use.
@@ -65,7 +60,7 @@ class DebugFlags {
* By default the fastest will be used. For debugging the BVH used by other
* CPUs and GPUs can be selected here instead.
*/
- BVHLayout bvh_layout;
+ BVHLayout bvh_layout = BVH_LAYOUT_AUTO;
};
/* Descriptor of CUDA feature-set to be used. */
@@ -77,7 +72,7 @@ class DebugFlags {
/* Whether adaptive feature based runtime compile is enabled or not.
* Requires the CUDA Toolkit and only works on Linux at the moment. */
- bool adaptive_compile;
+ bool adaptive_compile = false;
};
/* Descriptor of HIP feature-set to be used. */
@@ -88,7 +83,7 @@ class DebugFlags {
void reset();
/* Whether adaptive feature based runtime compile is enabled or not. */
- bool adaptive_compile;
+ bool adaptive_compile = false;
};
/* Descriptor of OptiX feature-set to be used. */
@@ -100,7 +95,7 @@ class DebugFlags {
/* Load OptiX module with debug capabilities. Will lower logging verbosity level, enable
* validations, and lower optimization level. */
- bool use_debug;
+ bool use_debug = false;
};
/* Descriptor of Metal feature-set to be used. */
@@ -111,7 +106,7 @@ class DebugFlags {
void reset();
/* Whether adaptive feature based runtime compile is enabled or not. */
- bool adaptive_compile;
+ bool adaptive_compile = false;
};
/* Get instance of debug flags registry. */
@@ -142,15 +137,9 @@ class DebugFlags {
private:
DebugFlags();
-#if (__cplusplus > 199711L)
public:
explicit DebugFlags(DebugFlags const & /*other*/) = delete;
void operator=(DebugFlags const & /*other*/) = delete;
-#else
- private:
- explicit DebugFlags(DebugFlags const & /*other*/);
- void operator=(DebugFlags const & /*other*/);
-#endif
};
typedef DebugFlags &DebugFlagsRef;
diff --git a/intern/cycles/util/defines.h b/intern/cycles/util/defines.h
index 115a747cf1c..1969529eff0 100644
--- a/intern/cycles/util/defines.h
+++ b/intern/cycles/util/defines.h
@@ -81,7 +81,7 @@
/* macros */
/* hints for branch prediction, only use in code that runs a _lot_ */
-#if defined(__GNUC__) && defined(__KERNEL_CPU__)
+#if defined(__GNUC__) && !defined(__KERNEL_GPU__)
# define LIKELY(x) __builtin_expect(!!(x), 1)
# define UNLIKELY(x) __builtin_expect(!!(x), 0)
#else
@@ -89,46 +89,6 @@
# define UNLIKELY(x) (x)
#endif
-#if defined(__GNUC__) || defined(__clang__)
-# if defined(__cplusplus)
-/* Some magic to be sure we don't have reference in the type. */
-template<typename T> static inline T decltype_helper(T x)
-{
- return x;
-}
-# define TYPEOF(x) decltype(decltype_helper(x))
-# else
-# define TYPEOF(x) typeof(x)
-# endif
-#endif
-
-/* Causes warning:
- * incompatible types when assigning to type 'Foo' from type 'Bar'
- * ... the compiler optimizes away the temp var */
-#ifdef __GNUC__
-# define CHECK_TYPE(var, type) \
- { \
- TYPEOF(var) * __tmp; \
- __tmp = (type *)NULL; \
- (void)__tmp; \
- } \
- (void)0
-
-# define CHECK_TYPE_PAIR(var_a, var_b) \
- { \
- TYPEOF(var_a) * __tmp; \
- __tmp = (typeof(var_b) *)NULL; \
- (void)__tmp; \
- } \
- (void)0
-#else
-# define CHECK_TYPE(var, type)
-# define CHECK_TYPE_PAIR(var_a, var_b)
-#endif
-
-/* can be used in simple macros */
-#define CHECK_TYPE_INLINE(val, type) ((void)(((type)0) != (val)))
-
#ifndef __KERNEL_GPU__
# include <cassert>
# define util_assert(statement) assert(statement)
@@ -136,4 +96,7 @@ template<typename T> static inline T decltype_helper(T x)
# define util_assert(statement)
#endif
+#define CONCAT_HELPER(a, ...) a##__VA_ARGS__
+#define CONCAT(a, ...) CONCAT_HELPER(a, __VA_ARGS__)
+
#endif /* __UTIL_DEFINES_H__ */
diff --git a/intern/cycles/util/half.h b/intern/cycles/util/half.h
index 434bc12d670..c668638eb02 100644
--- a/intern/cycles/util/half.h
+++ b/intern/cycles/util/half.h
@@ -35,7 +35,7 @@ ccl_device_inline float half_to_float(half h_in)
#else
/* CUDA has its own half data type, no need to define then */
-# if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__)
+# if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__) && !defined(__KERNEL_ONEAPI__)
/* Implementing this as a class rather than a typedef so that the compiler can tell it apart from
* unsigned shorts. */
class half {
@@ -73,7 +73,7 @@ struct half4 {
ccl_device_inline half float_to_half_image(float f)
{
-#if defined(__KERNEL_METAL__)
+#if defined(__KERNEL_METAL__) || defined(__KERNEL_ONEAPI__)
return half(min(f, 65504.0f));
#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
return __float2half(min(f, 65504.0f));
@@ -103,6 +103,8 @@ ccl_device_inline float half_to_float_image(half h)
{
#if defined(__KERNEL_METAL__)
return half_to_float(h);
+#elif defined(__KERNEL_ONEAPI__)
+ return float(h);
#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
return __half2float(h);
#else
@@ -136,7 +138,7 @@ ccl_device_inline float4 half4_to_float4_image(const half4 h)
ccl_device_inline half float_to_half_display(const float f)
{
-#if defined(__KERNEL_METAL__)
+#if defined(__KERNEL_METAL__) || defined(__KERNEL_ONEAPI__)
return half(min(f, 65504.0f));
#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
return __float2half(min(f, 65504.0f));
diff --git a/intern/cycles/util/hash.h b/intern/cycles/util/hash.h
index 081b33025d8..4f83f331229 100644
--- a/intern/cycles/util/hash.h
+++ b/intern/cycles/util/hash.h
@@ -4,10 +4,28 @@
#ifndef __UTIL_HASH_H__
#define __UTIL_HASH_H__
+#include "util/math.h"
#include "util/types.h"
CCL_NAMESPACE_BEGIN
+/* [0, uint_max] -> [0.0, 1.0) */
+ccl_device_forceinline float uint_to_float_excl(uint n)
+{
+ // Note: we divide by 4294967808 instead of 2^32 because the latter
+ // leads to a [0.0, 1.0] mapping instead of [0.0, 1.0) due to floating
+ // point rounding error. 4294967808 unfortunately leaves (precisely)
+ // one unused ulp between the max number this outputs and 1.0, but
+ // that's the best you can do with this construction.
+ return (float)n * (1.0f / 4294967808.0f);
+}
+
+/* [0, uint_max] -> [0.0, 1.0] */
+ccl_device_forceinline float uint_to_float_incl(uint n)
+{
+ return (float)n * (1.0f / (float)0xFFFFFFFFu);
+}
+
/* ***** Jenkins Lookup3 Hash Functions ***** */
/* Source: http://burtleburtle.net/bob/c/lookup3.c */
@@ -116,22 +134,22 @@ ccl_device_inline uint hash_uint4(uint kx, uint ky, uint kz, uint kw)
ccl_device_inline float hash_uint_to_float(uint kx)
{
- return (float)hash_uint(kx) / (float)0xFFFFFFFFu;
+ return uint_to_float_incl(hash_uint(kx));
}
ccl_device_inline float hash_uint2_to_float(uint kx, uint ky)
{
- return (float)hash_uint2(kx, ky) / (float)0xFFFFFFFFu;
+ return uint_to_float_incl(hash_uint2(kx, ky));
}
ccl_device_inline float hash_uint3_to_float(uint kx, uint ky, uint kz)
{
- return (float)hash_uint3(kx, ky, kz) / (float)0xFFFFFFFFu;
+ return uint_to_float_incl(hash_uint3(kx, ky, kz));
}
ccl_device_inline float hash_uint4_to_float(uint kx, uint ky, uint kz, uint kw)
{
- return (float)hash_uint4(kx, ky, kz, kw) / (float)0xFFFFFFFFu;
+ return uint_to_float_incl(hash_uint4(kx, ky, kz, kw));
}
/* Hashing float or float[234] into a float in the range [0, 1]. */
@@ -359,6 +377,123 @@ ccl_device_inline avxi hash_avxi4(avxi kx, avxi ky, avxi kz, avxi kw)
#endif
+/* ***** Hash Prospector Hash Functions *****
+ *
+ * These are based on the high-quality 32-bit hash/mixing functions from
+ * https://github.com/skeeto/hash-prospector
+ */
+
+ccl_device_inline uint hash_hp_uint(uint i)
+{
+ // The actual mixing function from Hash Prospector.
+ i ^= i >> 16;
+ i *= 0x21f0aaad;
+ i ^= i >> 15;
+ i *= 0xd35a2d97;
+ i ^= i >> 15;
+
+ // The xor is just to make input zero not map to output zero.
+ // The number is randomly selected and isn't special.
+ return i ^ 0xe6fe3beb;
+}
+
+/* Seedable version of hash_hp_uint() above. */
+ccl_device_inline uint hash_hp_seeded_uint(uint i, uint seed)
+{
+ // Manipulate the seed so it doesn't interact poorly with n when they
+ // are both e.g. incrementing. This isn't fool-proof, but is good
+ // enough for practical use.
+ seed ^= seed << 19;
+
+ return hash_hp_uint(i ^ seed);
+}
+
+/* Outputs [0.0, 1.0). */
+ccl_device_inline float hash_hp_float(uint i)
+{
+ return uint_to_float_excl(hash_hp_uint(i));
+}
+
+/* Outputs [0.0, 1.0). */
+ccl_device_inline float hash_hp_seeded_float(uint i, uint seed)
+{
+ return uint_to_float_excl(hash_hp_seeded_uint(i, seed));
+}
+
+/* ***** Modified Wang Hash Functions *****
+ *
+ * These are based on a bespoke modified version of the Wang hash, and
+ * can serve as a faster hash when quality isn't critical.
+ *
+ * The original Wang hash is documented here:
+ * https://www.burtleburtle.net/bob/hash/integer.html
+ */
+
+ccl_device_inline uint hash_wang_seeded_uint(uint i, uint seed)
+{
+ i = (i ^ 61) ^ seed;
+ i += i << 3;
+ i ^= i >> 4;
+ i *= 0x27d4eb2d;
+ return i;
+}
+
+/* Outputs [0.0, 1.0). */
+ccl_device_inline float hash_wang_seeded_float(uint i, uint seed)
+{
+ return uint_to_float_excl(hash_wang_seeded_uint(i, seed));
+}
+
+/* ***** Index Shuffling Hash Function *****
+ *
+ * This function takes an index, the length of the thing the index points
+ * into, and returns a shuffled index. For example, if you pass indices
+ * 0 through 19 to this function with a length parameter of 20, it will
+ * return the indices in a shuffled order with no repeats. Indices
+ * larger than the length parameter will simply repeat the same shuffled
+ * pattern over and over.
+ *
+ * This is useful for iterating over an array in random shuffled order
+ * without having to shuffle the array itself.
+ *
+ * Passing different seeds results in different random shuffles.
+ *
+ * This function runs in average O(1) time.
+ *
+ * See https://andrew-helmer.github.io/permute/ for details on how this
+ * works.
+ */
+ccl_device_inline uint hash_shuffle_uint(uint i, uint length, uint seed)
+{
+ i = i % length;
+ uint mask = (1 << (32 - count_leading_zeros(length - 1))) - 1;
+
+ do {
+ i ^= seed;
+ i *= 0xe170893d;
+ i ^= seed >> 16;
+ i ^= (i & mask) >> 4;
+ i ^= seed >> 8;
+ i *= 0x0929eb3f;
+ i ^= seed >> 23;
+ i ^= (i & mask) >> 1;
+ i *= 1 | seed >> 27;
+ i *= 0x6935fa69;
+ i ^= (i & mask) >> 11;
+ i *= 0x74dcb303;
+ i ^= (i & mask) >> 2;
+ i *= 0x9e501cc3;
+ i ^= (i & mask) >> 2;
+ i *= 0xc860a3df;
+ i &= mask;
+ i ^= i >> 5;
+ } while (i >= length);
+
+ return i;
+}
+
+/* ********** */
+
#ifndef __KERNEL_GPU__
static inline uint hash_string(const char *str)
{
diff --git a/intern/cycles/util/log.h b/intern/cycles/util/log.h
index b33c826d6f5..3780d03c0d1 100644
--- a/intern/cycles/util/log.h
+++ b/intern/cycles/util/log.h
@@ -69,9 +69,22 @@ class LogMessageVoidify {
# define LOG_ASSERT(expression) LOG_SUPPRESS()
#endif
-#define VLOG_ONCE(level, flag) \
- if (!flag) \
- flag = true, VLOG(level)
+/* Verbose logging categories. */
+
+/* Warnings. */
+#define VLOG_WARNING VLOG(1)
+/* Info about devices, scene contents and features used. */
+#define VLOG_INFO VLOG(2)
+#define VLOG_INFO_IS_ON VLOG_IS_ON(2)
+/* Work being performed and timing/memory stats about that work. */
+#define VLOG_WORK VLOG(3)
+#define VLOG_WORK_IS_ON VLOG_IS_ON(3)
+/* Detailed device timing stats. */
+#define VLOG_DEVICE_STATS VLOG(4)
+#define VLOG_DEVICE_STATS_IS_ON VLOG_IS_ON(4)
+/* Verbose debug messages. */
+#define VLOG_DEBUG VLOG(5)
+#define VLOG_DEBUG_IS_ON VLOG_IS_ON(5)
struct int2;
struct float3;
diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h
index 555a5304764..0905b3ec5c9 100644
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@@ -79,7 +79,7 @@ CCL_NAMESPACE_BEGIN
/* Scalar */
-#ifndef __HIP__
+#if !defined(__HIP__) && !defined(__KERNEL_ONEAPI__)
# ifdef _WIN32
ccl_device_inline float fmaxf(float a, float b)
{
@@ -92,12 +92,18 @@ ccl_device_inline float fminf(float a, float b)
}
# endif /* _WIN32 */
-#endif /* __HIP__ */
+#endif /* __HIP__, __KERNEL_ONEAPI__ */
-#ifndef __KERNEL_GPU__
+#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
+# ifndef __KERNEL_ONEAPI__
using std::isfinite;
using std::isnan;
using std::sqrt;
+# else
+using sycl::sqrt;
+# define isfinite(x) sycl::isfinite((x))
+# define isnan(x) sycl::isnan((x))
+# endif
ccl_device_inline int abs(int x)
{
@@ -297,8 +303,15 @@ ccl_device_inline float4 __int4_as_float4(int4 i)
#endif /* !defined(__KERNEL_METAL__) */
#if defined(__KERNEL_METAL__)
-# define isnan_safe(v) isnan(v)
-# define isfinite_safe(v) isfinite(v)
+ccl_device_forceinline bool isnan_safe(float f)
+{
+ return isnan(f);
+}
+
+ccl_device_forceinline bool isfinite_safe(float f)
+{
+ return isfinite(f);
+}
#else
template<typename T> ccl_device_inline uint pointer_pack_to_uint_0(T *ptr)
{
@@ -498,6 +511,11 @@ ccl_device_inline float4 float3_to_float4(const float3 a)
return make_float4(a.x, a.y, a.z, 1.0f);
}
+ccl_device_inline float4 float3_to_float4(const float3 a, const float w)
+{
+ return make_float4(a.x, a.y, a.z, w);
+}
+
ccl_device_inline float inverse_lerp(float a, float b, float x)
{
return (x - a) / (b - a);
@@ -522,6 +540,7 @@ CCL_NAMESPACE_END
#include "util/math_float2.h"
#include "util/math_float3.h"
#include "util/math_float4.h"
+#include "util/math_float8.h"
#include "util/rect.h"
@@ -576,26 +595,26 @@ ccl_device_inline void make_orthonormals(const float3 N,
/* Color division */
-ccl_device_inline float3 safe_invert_color(float3 a)
+ccl_device_inline Spectrum safe_invert_color(Spectrum a)
{
- float x, y, z;
-
- x = (a.x != 0.0f) ? 1.0f / a.x : 0.0f;
- y = (a.y != 0.0f) ? 1.0f / a.y : 0.0f;
- z = (a.z != 0.0f) ? 1.0f / a.z : 0.0f;
+ FOREACH_SPECTRUM_CHANNEL (i) {
+ GET_SPECTRUM_CHANNEL(a, i) = (GET_SPECTRUM_CHANNEL(a, i) != 0.0f) ?
+ 1.0f / GET_SPECTRUM_CHANNEL(a, i) :
+ 0.0f;
+ }
- return make_float3(x, y, z);
+ return a;
}
-ccl_device_inline float3 safe_divide_color(float3 a, float3 b)
+ccl_device_inline Spectrum safe_divide_color(Spectrum a, Spectrum b)
{
- float x, y, z;
-
- x = (b.x != 0.0f) ? a.x / b.x : 0.0f;
- y = (b.y != 0.0f) ? a.y / b.y : 0.0f;
- z = (b.z != 0.0f) ? a.z / b.z : 0.0f;
+ FOREACH_SPECTRUM_CHANNEL (i) {
+ GET_SPECTRUM_CHANNEL(a, i) = (GET_SPECTRUM_CHANNEL(b, i) != 0.0f) ?
+ GET_SPECTRUM_CHANNEL(a, i) / GET_SPECTRUM_CHANNEL(b, i) :
+ 0.0f;
+ }
- return make_float3(x, y, z);
+ return a;
}
ccl_device_inline float3 safe_divide_even_color(float3 a, float3 b)
@@ -786,6 +805,11 @@ ccl_device_inline uint popcount(uint x)
return i & 1;
}
# endif
+#elif defined(__KERNEL_ONEAPI__)
+# define popcount(x) sycl::popcount(x)
+#elif defined(__KERNEL_HIP__)
+/* Use popcll to support 64-bit wave for pre-RDNA AMD GPUs */
+# define popcount(x) __popcll(x)
#elif !defined(__KERNEL_METAL__)
# define popcount(x) __popc(x)
#endif
@@ -796,6 +820,8 @@ ccl_device_inline uint count_leading_zeros(uint x)
return __clz(x);
#elif defined(__KERNEL_METAL__)
return clz(x);
+#elif defined(__KERNEL_ONEAPI__)
+ return sycl::clz(x);
#else
assert(x != 0);
# ifdef _MSC_VER
@@ -814,6 +840,8 @@ ccl_device_inline uint count_trailing_zeros(uint x)
return (__ffs(x) - 1);
#elif defined(__KERNEL_METAL__)
return ctz(x);
+#elif defined(__KERNEL_ONEAPI__)
+ return sycl::ctz(x);
#else
assert(x != 0);
# ifdef _MSC_VER
@@ -858,16 +886,16 @@ ccl_device_inline float2 map_to_tube(const float3 co)
ccl_device_inline float2 map_to_sphere(const float3 co)
{
- float l = len(co);
+ float l = dot(co, co);
float u, v;
if (l > 0.0f) {
if (UNLIKELY(co.x == 0.0f && co.y == 0.0f)) {
u = 0.0f; /* Otherwise domain error. */
}
else {
- u = (1.0f - atan2f(co.x, co.y) / M_PI_F) / 2.0f;
+ u = (0.5f - atan2f(co.x, co.y) * M_1_2PI_F);
}
- v = 1.0f - safe_acosf(co.z / l) / M_PI_F;
+ v = 1.0f - safe_acosf(co.z / sqrtf(l)) * M_1_PI_F;
}
else {
u = v = 0.0f;
@@ -925,7 +953,11 @@ ccl_device_inline uint prev_power_of_two(uint x)
ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
{
/* Use a native instruction if it exists. */
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__KERNEL_CUDA__)
+ return __brev(x);
+#elif defined(__KERNEL_METAL__)
+ return reverse_bits(x);
+#elif defined(__aarch64__) || defined(_M_ARM64)
/* Assume the rbit is always available on 64bit ARM architecture. */
__asm__("rbit %w0, %w1" : "=r"(x) : "r"(x));
return x;
@@ -934,10 +966,6 @@ ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
* This 32-bit Thumb instruction is available in ARMv6T2 and above. */
__asm__("rbit %0, %1" : "=r"(x) : "r"(x));
return x;
-#elif defined(__KERNEL_CUDA__)
- return __brev(x);
-#elif defined(__KERNEL_METAL__)
- return reverse_bits(x);
#elif __has_builtin(__builtin_bitreverse32)
return __builtin_bitreverse32(x);
#else
diff --git a/intern/cycles/util/math_fast.h b/intern/cycles/util/math_fast.h
index 2221e7a9835..142a664a1d2 100644
--- a/intern/cycles/util/math_fast.h
+++ b/intern/cycles/util/math_fast.h
@@ -420,7 +420,7 @@ ccl_device_inline float fast_expf(float x)
return fast_exp2f(x / M_LN2_F);
}
-#if defined(__KERNEL_CPU__) && !defined(_MSC_VER)
+#if !defined(__KERNEL_GPU__) && !defined(_MSC_VER)
/* MSVC seems to have a code-gen bug here in at least SSE41/AVX, see
* T78047 and T78869 for details. Just disable for now, it only makes
* a small difference in denoising performance. */
diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h
index 365c322dd7e..c408eadf195 100644
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -53,26 +53,25 @@ ccl_device_inline float3 ceil(const float3 &a);
ccl_device_inline float3 reflect(const float3 incident, const float3 normal);
#endif /* !defined(__KERNEL_METAL__) */
-ccl_device_inline float min3(float3 a);
-ccl_device_inline float max3(float3 a);
+ccl_device_inline float reduce_min(float3 a);
+ccl_device_inline float reduce_max(float3 a);
ccl_device_inline float len(const float3 a);
ccl_device_inline float len_squared(const float3 a);
ccl_device_inline float3 project(const float3 v, const float3 v_proj);
-ccl_device_inline float3 saturate3(float3 a);
ccl_device_inline float3 safe_normalize(const float3 a);
ccl_device_inline float3 normalize_len(const float3 a, ccl_private float *t);
ccl_device_inline float3 safe_normalize_len(const float3 a, ccl_private float *t);
-ccl_device_inline float3 safe_divide_float3_float3(const float3 a, const float3 b);
-ccl_device_inline float3 safe_divide_float3_float(const float3 a, const float b);
+ccl_device_inline float3 safe_divide(const float3 a, const float3 b);
+ccl_device_inline float3 safe_divide(const float3 a, const float b);
ccl_device_inline float3 interp(float3 a, float3 b, float t);
-ccl_device_inline float3 sqr3(float3 a);
+ccl_device_inline float3 sqr(float3 a);
ccl_device_inline bool is_zero(const float3 a);
ccl_device_inline float reduce_add(const float3 a);
ccl_device_inline float average(const float3 a);
-ccl_device_inline bool isequal_float3(const float3 a, const float3 b);
+ccl_device_inline bool isequal(const float3 a, const float3 b);
/*******************************************************************************
* Definition.
@@ -148,8 +147,11 @@ ccl_device_inline float3 operator/(const float f, const float3 &a)
ccl_device_inline float3 operator/(const float3 &a, const float f)
{
- float invf = 1.0f / f;
- return a * invf;
+# if defined(__KERNEL_SSE__)
+ return float3(_mm_div_ps(a.m128, _mm_set1_ps(f)));
+# else
+ return make_float3(a.x / f, a.y / f, a.z / f);
+# endif
}
ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
@@ -285,8 +287,12 @@ ccl_device_inline float dot_xy(const float3 &a, const float3 &b)
ccl_device_inline float3 cross(const float3 &a, const float3 &b)
{
- float3 r = make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
- return r;
+# ifdef __KERNEL_SSE__
+ return float3(shuffle<1, 2, 0, 3>(
+ msub(ssef(a), shuffle<1, 2, 0, 3>(ssef(b)), shuffle<1, 2, 0, 3>(ssef(a)) * ssef(b))));
+# else
+ return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+# endif
}
ccl_device_inline float3 normalize(const float3 &a)
@@ -377,14 +383,30 @@ ccl_device_inline float3 rcp(const float3 &a)
return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z);
# endif
}
+
+ccl_device_inline float3 saturate(float3 a)
+{
+ return make_float3(saturatef(a.x), saturatef(a.y), saturatef(a.z));
+}
+
+ccl_device_inline float3 exp(float3 v)
+{
+ return make_float3(expf(v.x), expf(v.y), expf(v.z));
+}
+
+ccl_device_inline float3 log(float3 v)
+{
+ return make_float3(logf(v.x), logf(v.y), logf(v.z));
+}
+
#endif /* !__KERNEL_METAL__ */
-ccl_device_inline float min3(float3 a)
+ccl_device_inline float reduce_min(float3 a)
{
return min(min(a.x, a.y), a.z);
}
-ccl_device_inline float max3(float3 a)
+ccl_device_inline float reduce_max(float3 a)
{
return max(max(a.x, a.y), a.z);
}
@@ -433,11 +455,6 @@ ccl_device_inline float3 project(const float3 v, const float3 v_proj)
return (len_squared != 0.0f) ? (dot(v, v_proj) / len_squared) * v_proj : zero_float3();
}
-ccl_device_inline float3 saturate3(float3 a)
-{
- return make_float3(saturatef(a.x), saturatef(a.y), saturatef(a.z));
-}
-
ccl_device_inline float3 normalize_len(const float3 a, ccl_private float *t)
{
*t = len(a);
@@ -457,14 +474,14 @@ ccl_device_inline float3 safe_normalize_len(const float3 a, ccl_private float *t
return (*t != 0.0f) ? a / (*t) : a;
}
-ccl_device_inline float3 safe_divide_float3_float3(const float3 a, const float3 b)
+ccl_device_inline float3 safe_divide(const float3 a, const float3 b)
{
return make_float3((b.x != 0.0f) ? a.x / b.x : 0.0f,
(b.y != 0.0f) ? a.y / b.y : 0.0f,
(b.z != 0.0f) ? a.z / b.z : 0.0f);
}
-ccl_device_inline float3 safe_divide_float3_float(const float3 a, const float b)
+ccl_device_inline float3 safe_divide(const float3 a, const float b)
{
return (b != 0.0f) ? a / b : zero_float3();
}
@@ -474,7 +491,7 @@ ccl_device_inline float3 interp(float3 a, float3 b, float t)
return a + t * (b - a);
}
-ccl_device_inline float3 sqr3(float3 a)
+ccl_device_inline float3 sqr(float3 a)
{
return a * a;
}
@@ -504,7 +521,7 @@ ccl_device_inline float average(const float3 a)
return reduce_add(a) * (1.0f / 3.0f);
}
-ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
+ccl_device_inline bool isequal(const float3 a, const float3 b)
{
#if defined(__KERNEL_METAL__)
return all(a == b);
@@ -513,21 +530,11 @@ ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
#endif
}
-ccl_device_inline float3 pow3(float3 v, float e)
+ccl_device_inline float3 pow(float3 v, float e)
{
return make_float3(powf(v.x, e), powf(v.y, e), powf(v.z, e));
}
-ccl_device_inline float3 exp3(float3 v)
-{
- return make_float3(expf(v.x), expf(v.y), expf(v.z));
-}
-
-ccl_device_inline float3 log3(float3 v)
-{
- return make_float3(logf(v.x), logf(v.y), logf(v.z));
-}
-
ccl_device_inline int3 quick_floor_to_int3(const float3 a)
{
#ifdef __KERNEL_SSE__
@@ -540,12 +547,12 @@ ccl_device_inline int3 quick_floor_to_int3(const float3 a)
#endif
}
-ccl_device_inline bool isfinite3_safe(float3 v)
+ccl_device_inline bool isfinite_safe(float3 v)
{
return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z);
}
-ccl_device_inline float3 ensure_finite3(float3 v)
+ccl_device_inline float3 ensure_finite(float3 v)
{
if (!isfinite_safe(v.x))
v.x = 0.0f;
diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h
index ae9dfe75a9c..c2721873037 100644
--- a/intern/cycles/util/math_float4.h
+++ b/intern/cycles/util/math_float4.h
@@ -55,7 +55,8 @@ ccl_device_inline float4 floor(const float4 &a);
ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
#endif /* !__KERNEL_METAL__*/
-ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b);
+ccl_device_inline float4 safe_divide(const float4 a, const float4 b);
+ccl_device_inline float4 safe_divide(const float4 a, const float b);
#ifdef __KERNEL_SSE__
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
@@ -74,11 +75,14 @@ template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b);
# endif
#endif /* __KERNEL_SSE__ */
+ccl_device_inline float reduce_min(const float4 a);
+ccl_device_inline float reduce_max(const float4 a);
+ccl_device_inline float reduce_add(const float4 a);
+
+ccl_device_inline bool isequal(const float4 a, const float4 b);
+
#ifndef __KERNEL_GPU__
ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b);
-ccl_device_inline float4 reduce_min(const float4 &a);
-ccl_device_inline float4 reduce_max(const float4 &a);
-ccl_device_inline float4 reduce_add(const float4 &a);
#endif /* !__KERNEL_GPU__ */
/*******************************************************************************
@@ -303,27 +307,9 @@ ccl_device_inline bool is_zero(const float4 &a)
# endif
}
-ccl_device_inline float4 reduce_add(const float4 &a)
-{
-# if defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
- return float4(vdupq_n_f32(vaddvq_f32(a)));
-# elif defined(__KERNEL_SSE3__)
- float4 h(_mm_hadd_ps(a.m128, a.m128));
- return float4(_mm_hadd_ps(h.m128, h.m128));
-# else
- float4 h(shuffle<1, 0, 3, 2>(a) + a);
- return shuffle<2, 3, 0, 1>(h) + h;
-# endif
-# else
- float sum = (a.x + a.y) + (a.z + a.w);
- return make_float4(sum, sum, sum, sum);
-# endif
-}
-
ccl_device_inline float average(const float4 &a)
{
- return reduce_add(a).x * 0.25f;
+ return reduce_add(a) * 0.25f;
}
ccl_device_inline float len(const float4 &a)
@@ -392,8 +378,77 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
return a + t * (b - a);
}
+ccl_device_inline float4 saturate(const float4 &a)
+{
+ return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w));
+}
+
+ccl_device_inline float4 exp(float4 v)
+{
+ return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.z));
+}
+
+ccl_device_inline float4 log(float4 v)
+{
+ return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.z));
+}
+
#endif /* !__KERNEL_METAL__*/
+ccl_device_inline float reduce_add(const float4 a)
+{
+#if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ return vaddvq_f32(a);
+# elif defined(__KERNEL_SSE3__)
+ float4 h(_mm_hadd_ps(a.m128, a.m128));
+ return _mm_cvtss_f32(_mm_hadd_ps(h.m128, h.m128));
+# else
+ float4 h(shuffle<1, 0, 3, 2>(a) + a);
+ return _mm_cvtss_f32(shuffle<2, 3, 0, 1>(h) + h);
+# endif
+#else
+ return a.x + a.y + a.z + a.w;
+#endif
+}
+
+ccl_device_inline float reduce_min(const float4 a)
+{
+#if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ return vminvq_f32(a);
+# else
+ float4 h = min(shuffle<1, 0, 3, 2>(a), a);
+ return _mm_cvtss_f32(min(shuffle<2, 3, 0, 1>(h), h));
+# endif
+#else
+ return min(min(a.x, a.y), min(a.z, a.w));
+#endif
+}
+
+ccl_device_inline float reduce_max(const float4 a)
+{
+#if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ return vmaxvq_f32(a);
+# else
+ float4 h = max(shuffle<1, 0, 3, 2>(a), a);
+ return _mm_cvtss_f32(max(shuffle<2, 3, 0, 1>(h), h));
+# endif
+#else
+ return max(max(a.x, a.y), max(a.z, a.w));
+#endif
+}
+
+ccl_device_inline bool isequal(const float4 a, const float4 b)
+{
+#if defined(__KERNEL_METAL__)
+ return all(a == b);
+#else
+ return a == b;
+#endif
+}
+
#ifdef __KERNEL_SSE__
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4 &b)
@@ -461,34 +516,6 @@ ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
return select(mask, a, zero_float4());
}
-ccl_device_inline float4 reduce_min(const float4 &a)
-{
-# if defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
- return float4(vdupq_n_f32(vminvq_f32(a)));
-# else
- float4 h = min(shuffle<1, 0, 3, 2>(a), a);
- return min(shuffle<2, 3, 0, 1>(h), h);
-# endif
-# else
- return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
-# endif
-}
-
-ccl_device_inline float4 reduce_max(const float4 &a)
-{
-# if defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
- return float4(vdupq_n_f32(vmaxvq_f32(a)));
-# else
- float4 h = max(shuffle<1, 0, 3, 2>(a), a);
- return max(shuffle<2, 3, 0, 1>(h), h);
-# endif
-# else
- return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
-# endif
-}
-
ccl_device_inline float4 load_float4(ccl_private const float *v)
{
# ifdef __KERNEL_SSE__
@@ -500,17 +527,25 @@ ccl_device_inline float4 load_float4(ccl_private const float *v)
#endif /* !__KERNEL_GPU__ */
-ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b)
+ccl_device_inline float4 safe_divide(const float4 a, const float b)
{
return (b != 0.0f) ? a / b : zero_float4();
}
-ccl_device_inline bool isfinite4_safe(float4 v)
+ccl_device_inline float4 safe_divide(const float4 a, const float4 b)
+{
+ return make_float4((b.x != 0.0f) ? a.x / b.x : 0.0f,
+ (b.y != 0.0f) ? a.y / b.y : 0.0f,
+ (b.z != 0.0f) ? a.z / b.z : 0.0f,
+ (b.w != 0.0f) ? a.w / b.w : 0.0f);
+}
+
+ccl_device_inline bool isfinite_safe(float4 v)
{
return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z) && isfinite_safe(v.w);
}
-ccl_device_inline float4 ensure_finite4(float4 v)
+ccl_device_inline float4 ensure_finite(float4 v)
{
if (!isfinite_safe(v.x))
v.x = 0.0f;
@@ -523,6 +558,11 @@ ccl_device_inline float4 ensure_finite4(float4 v)
return v;
}
+ccl_device_inline float4 pow(float4 v, float e)
+{
+ return make_float4(powf(v.x, e), powf(v.y, e), powf(v.z, e), powf(v.z, e));
+}
+
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_FLOAT4_H__ */
diff --git a/intern/cycles/util/math_float8.h b/intern/cycles/util/math_float8.h
new file mode 100644
index 00000000000..b538cfbe70b
--- /dev/null
+++ b/intern/cycles/util/math_float8.h
@@ -0,0 +1,419 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2022 Blender Foundation */
+
+#ifndef __UTIL_MATH_FLOAT8_H__
+#define __UTIL_MATH_FLOAT8_H__
+
+#ifndef __UTIL_MATH_H__
+# error "Do not include this file directly, include util/types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+ccl_device_inline float8_t operator+(const float8_t a, const float8_t b);
+ccl_device_inline float8_t operator+(const float8_t a, const float f);
+ccl_device_inline float8_t operator+(const float f, const float8_t a);
+
+ccl_device_inline float8_t operator-(const float8_t a);
+ccl_device_inline float8_t operator-(const float8_t a, const float8_t b);
+ccl_device_inline float8_t operator-(const float8_t a, const float f);
+ccl_device_inline float8_t operator-(const float f, const float8_t a);
+
+ccl_device_inline float8_t operator*(const float8_t a, const float8_t b);
+ccl_device_inline float8_t operator*(const float8_t a, const float f);
+ccl_device_inline float8_t operator*(const float f, const float8_t a);
+
+ccl_device_inline float8_t operator/(const float8_t a, const float8_t b);
+ccl_device_inline float8_t operator/(const float8_t a, float f);
+ccl_device_inline float8_t operator/(const float f, const float8_t a);
+
+ccl_device_inline float8_t operator+=(float8_t a, const float8_t b);
+
+ccl_device_inline float8_t operator*=(float8_t a, const float8_t b);
+ccl_device_inline float8_t operator*=(float8_t a, float f);
+
+ccl_device_inline float8_t operator/=(float8_t a, float f);
+
+ccl_device_inline bool operator==(const float8_t a, const float8_t b);
+
+ccl_device_inline float8_t rcp(const float8_t a);
+ccl_device_inline float8_t sqrt(const float8_t a);
+ccl_device_inline float8_t sqr(const float8_t a);
+ccl_device_inline bool is_zero(const float8_t a);
+ccl_device_inline float average(const float8_t a);
+ccl_device_inline float8_t min(const float8_t a, const float8_t b);
+ccl_device_inline float8_t max(const float8_t a, const float8_t b);
+ccl_device_inline float8_t clamp(const float8_t a, const float8_t mn, const float8_t mx);
+ccl_device_inline float8_t fabs(const float8_t a);
+ccl_device_inline float8_t mix(const float8_t a, const float8_t b, float t);
+ccl_device_inline float8_t saturate(const float8_t a);
+
+ccl_device_inline float8_t safe_divide(const float8_t a, const float b);
+ccl_device_inline float8_t safe_divide(const float8_t a, const float8_t b);
+
+ccl_device_inline float reduce_min(const float8_t a);
+ccl_device_inline float reduce_max(const float8_t a);
+ccl_device_inline float reduce_add(const float8_t a);
+
+ccl_device_inline bool isequal(const float8_t a, const float8_t b);
+
+/*******************************************************************************
+ * Definition.
+ */
+
+ccl_device_inline float8_t zero_float8_t()
+{
+#ifdef __KERNEL_AVX2__
+ return float8_t(_mm256_setzero_ps());
+#else
+ return make_float8_t(0.0f);
+#endif
+}
+
+ccl_device_inline float8_t one_float8_t()
+{
+ return make_float8_t(1.0f);
+}
+
+ccl_device_inline float8_t operator+(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+ return float8_t(_mm256_add_ps(a.m256, b.m256));
+#else
+ return make_float8_t(
+ a.a + b.a, a.b + b.b, a.c + b.c, a.d + b.d, a.e + b.e, a.f + b.f, a.g + b.g, a.h + b.h);
+#endif
+}
+
+ccl_device_inline float8_t operator+(const float8_t a, const float f)
+{
+ return a + make_float8_t(f);
+}
+
+ccl_device_inline float8_t operator+(const float f, const float8_t a)
+{
+ return make_float8_t(f) + a;
+}
+
+ccl_device_inline float8_t operator-(const float8_t a)
+{
+#ifdef __KERNEL_AVX2__
+ __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
+ return float8_t(_mm256_xor_ps(a.m256, mask));
+#else
+ return make_float8_t(-a.a, -a.b, -a.c, -a.d, -a.e, -a.f, -a.g, -a.h);
+#endif
+}
+
+ccl_device_inline float8_t operator-(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+ return float8_t(_mm256_sub_ps(a.m256, b.m256));
+#else
+ return make_float8_t(
+ a.a - b.a, a.b - b.b, a.c - b.c, a.d - b.d, a.e - b.e, a.f - b.f, a.g - b.g, a.h - b.h);
+#endif
+}
+
+ccl_device_inline float8_t operator-(const float8_t a, const float f)
+{
+ return a - make_float8_t(f);
+}
+
+ccl_device_inline float8_t operator-(const float f, const float8_t a)
+{
+ return make_float8_t(f) - a;
+}
+
+ccl_device_inline float8_t operator*(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+ return float8_t(_mm256_mul_ps(a.m256, b.m256));
+#else
+ return make_float8_t(
+ a.a * b.a, a.b * b.b, a.c * b.c, a.d * b.d, a.e * b.e, a.f * b.f, a.g * b.g, a.h * b.h);
+#endif
+}
+
+ccl_device_inline float8_t operator*(const float8_t a, const float f)
+{
+ return a * make_float8_t(f);
+}
+
+ccl_device_inline float8_t operator*(const float f, const float8_t a)
+{
+ return make_float8_t(f) * a;
+}
+
+ccl_device_inline float8_t operator/(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+ return float8_t(_mm256_div_ps(a.m256, b.m256));
+#else
+ return make_float8_t(
+ a.a / b.a, a.b / b.b, a.c / b.c, a.d / b.d, a.e / b.e, a.f / b.f, a.g / b.g, a.h / b.h);
+#endif
+}
+
+ccl_device_inline float8_t operator/(const float8_t a, const float f)
+{
+ return a / make_float8_t(f);
+}
+
+ccl_device_inline float8_t operator/(const float f, const float8_t a)
+{
+ return make_float8_t(f) / a;
+}
+
+ccl_device_inline float8_t operator+=(float8_t a, const float8_t b)
+{
+ return a = a + b;
+}
+
+ccl_device_inline float8_t operator-=(float8_t a, const float8_t b)
+{
+ return a = a - b;
+}
+
+ccl_device_inline float8_t operator*=(float8_t a, const float8_t b)
+{
+ return a = a * b;
+}
+
+ccl_device_inline float8_t operator*=(float8_t a, float f)
+{
+ return a = a * f;
+}
+
+ccl_device_inline float8_t operator/=(float8_t a, float f)
+{
+ return a = a / f;
+}
+
+ccl_device_inline bool operator==(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+ return (_mm256_movemask_ps(_mm256_castsi256_ps(
+ _mm256_cmpeq_epi32(_mm256_castps_si256(a.m256), _mm256_castps_si256(b.m256)))) &
+ 0b11111111) == 0b11111111;
+#else
+ return (a.a == b.a && a.b == b.b && a.c == b.c && a.d == b.d && a.e == b.e && a.f == b.f &&
+ a.g == b.g && a.h == b.h);
+#endif
+}
+
+ccl_device_inline float8_t rcp(const float8_t a)
+{
+#ifdef __KERNEL_AVX2__
+ return float8_t(_mm256_rcp_ps(a.m256));
+#else
+ return make_float8_t(1.0f / a.a,
+ 1.0f / a.b,
+ 1.0f / a.c,
+ 1.0f / a.d,
+ 1.0f / a.e,
+ 1.0f / a.f,
+ 1.0f / a.g,
+ 1.0f / a.h);
+#endif
+}
+
+ccl_device_inline float8_t sqrt(const float8_t a)
+{
+#ifdef __KERNEL_AVX2__
+ return float8_t(_mm256_sqrt_ps(a.m256));
+#else
+ return make_float8_t(sqrtf(a.a),
+ sqrtf(a.b),
+ sqrtf(a.c),
+ sqrtf(a.d),
+ sqrtf(a.e),
+ sqrtf(a.f),
+ sqrtf(a.g),
+ sqrtf(a.h));
+#endif
+}
+
+ccl_device_inline float8_t sqr(const float8_t a)
+{
+ return a * a;
+}
+
+ccl_device_inline bool is_zero(const float8_t a)
+{
+ return a == make_float8_t(0.0f);
+}
+
+ccl_device_inline float average(const float8_t a)
+{
+ return reduce_add(a) / 8.0f;
+}
+
+ccl_device_inline float8_t min(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+ return float8_t(_mm256_min_ps(a.m256, b.m256));
+#else
+ return make_float8_t(min(a.a, b.a),
+ min(a.b, b.b),
+ min(a.c, b.c),
+ min(a.d, b.d),
+ min(a.e, b.e),
+ min(a.f, b.f),
+ min(a.g, b.g),
+ min(a.h, b.h));
+#endif
+}
+
+ccl_device_inline float8_t max(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+ return float8_t(_mm256_max_ps(a.m256, b.m256));
+#else
+ return make_float8_t(max(a.a, b.a),
+ max(a.b, b.b),
+ max(a.c, b.c),
+ max(a.d, b.d),
+ max(a.e, b.e),
+ max(a.f, b.f),
+ max(a.g, b.g),
+ max(a.h, b.h));
+#endif
+}
+
+ccl_device_inline float8_t clamp(const float8_t a, const float8_t mn, const float8_t mx)
+{
+ return min(max(a, mn), mx);
+}
+
+ccl_device_inline float8_t fabs(const float8_t a)
+{
+#ifdef __KERNEL_AVX2__
+ return float8_t(_mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
+#else
+ return make_float8_t(fabsf(a.a),
+ fabsf(a.b),
+ fabsf(a.c),
+ fabsf(a.d),
+ fabsf(a.e),
+ fabsf(a.f),
+ fabsf(a.g),
+ fabsf(a.h));
+#endif
+}
+
+ccl_device_inline float8_t mix(const float8_t a, const float8_t b, float t)
+{
+ return a + t * (b - a);
+}
+
+ccl_device_inline float8_t saturate(const float8_t a)
+{
+ return clamp(a, make_float8_t(0.0f), make_float8_t(1.0f));
+}
+
+ccl_device_inline float8_t exp(float8_t v)
+{
+ return make_float8_t(
+ expf(v.a), expf(v.b), expf(v.c), expf(v.d), expf(v.e), expf(v.f), expf(v.g), expf(v.h));
+}
+
+ccl_device_inline float8_t log(float8_t v)
+{
+ return make_float8_t(
+ logf(v.a), logf(v.b), logf(v.c), logf(v.d), logf(v.e), logf(v.f), logf(v.g), logf(v.h));
+}
+
+ccl_device_inline float dot(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+ float8_t t(_mm256_dp_ps(a.m256, b.m256, 0xFF));
+ return t[0] + t[4];
+#else
+ return (a.a * b.a) + (a.b * b.b) + (a.c * b.c) + (a.d * b.d) + (a.e * b.e) + (a.f * b.f) +
+ (a.g * b.g) + (a.h * b.h);
+#endif
+}
+
+ccl_device_inline float8_t pow(float8_t v, float e)
+{
+ return make_float8_t(powf(v.a, e),
+ powf(v.b, e),
+ powf(v.c, e),
+ powf(v.d, e),
+ powf(v.e, e),
+ powf(v.f, e),
+ powf(v.g, e),
+ powf(v.h, e));
+}
+
+ccl_device_inline float reduce_min(const float8_t a)
+{
+ return min(min(min(a.a, a.b), min(a.c, a.d)), min(min(a.e, a.f), min(a.g, a.h)));
+}
+
+ccl_device_inline float reduce_max(const float8_t a)
+{
+ return max(max(max(a.a, a.b), max(a.c, a.d)), max(max(a.e, a.f), max(a.g, a.h)));
+}
+
+ccl_device_inline float reduce_add(const float8_t a)
+{
+#ifdef __KERNEL_AVX2__
+ float8_t b(_mm256_hadd_ps(a.m256, a.m256));
+ float8_t h(_mm256_hadd_ps(b.m256, b.m256));
+ return h[0] + h[4];
+#else
+ return a.a + a.b + a.c + a.d + a.e + a.f + a.g + a.h;
+#endif
+}
+
+ccl_device_inline bool isequal(const float8_t a, const float8_t b)
+{
+ return a == b;
+}
+
+ccl_device_inline float8_t safe_divide(const float8_t a, const float b)
+{
+ return (b != 0.0f) ? a / b : make_float8_t(0.0f);
+}
+
+ccl_device_inline float8_t safe_divide(const float8_t a, const float8_t b)
+{
+ return make_float8_t((b.a != 0.0f) ? a.a / b.a : 0.0f,
+ (b.b != 0.0f) ? a.b / b.b : 0.0f,
+ (b.c != 0.0f) ? a.c / b.c : 0.0f,
+ (b.d != 0.0f) ? a.d / b.d : 0.0f,
+ (b.e != 0.0f) ? a.e / b.e : 0.0f,
+ (b.f != 0.0f) ? a.f / b.f : 0.0f,
+ (b.g != 0.0f) ? a.g / b.g : 0.0f,
+ (b.h != 0.0f) ? a.h / b.h : 0.0f);
+}
+
+ccl_device_inline float8_t ensure_finite(float8_t v)
+{
+ v.a = ensure_finite(v.a);
+ v.b = ensure_finite(v.b);
+ v.c = ensure_finite(v.c);
+ v.d = ensure_finite(v.d);
+ v.e = ensure_finite(v.e);
+ v.f = ensure_finite(v.f);
+ v.g = ensure_finite(v.g);
+ v.h = ensure_finite(v.h);
+
+ return v;
+}
+
+ccl_device_inline bool isfinite_safe(float8_t v)
+{
+ return isfinite_safe(v.a) && isfinite_safe(v.b) && isfinite_safe(v.c) && isfinite_safe(v.d) &&
+ isfinite_safe(v.e) && isfinite_safe(v.f) && isfinite_safe(v.g) && isfinite_safe(v.h);
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_FLOAT8_H__ */
diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h
index b0de0b25a45..aa28682f8c1 100644
--- a/intern/cycles/util/math_intersect.h
+++ b/intern/cycles/util/math_intersect.h
@@ -10,7 +10,8 @@ CCL_NAMESPACE_BEGIN
ccl_device bool ray_sphere_intersect(float3 ray_P,
float3 ray_D,
- float ray_t,
+ float ray_tmin,
+ float ray_tmax,
float3 sphere_P,
float sphere_radius,
ccl_private float3 *isect_P,
@@ -33,7 +34,7 @@ ccl_device bool ray_sphere_intersect(float3 ray_P,
return false;
}
const float t = tp - sqrtf(radiussq - dsq); /* pythagoras */
- if (t < ray_t) {
+ if (t > ray_tmin && t < ray_tmax) {
*isect_t = t;
*isect_P = ray_P + ray_D * t;
return true;
@@ -44,7 +45,8 @@ ccl_device bool ray_sphere_intersect(float3 ray_P,
ccl_device bool ray_aligned_disk_intersect(float3 ray_P,
float3 ray_D,
- float ray_t,
+ float ray_tmin,
+ float ray_tmax,
float3 disk_P,
float disk_radius,
ccl_private float3 *isect_P,
@@ -59,7 +61,7 @@ ccl_device bool ray_aligned_disk_intersect(float3 ray_P,
}
/* Compute t to intersection point. */
const float t = -disk_t / div;
- if (t < 0.0f || t > ray_t) {
+ if (!(t > ray_tmin && t < ray_tmax)) {
return false;
}
/* Test if within radius. */
@@ -74,7 +76,8 @@ ccl_device bool ray_aligned_disk_intersect(float3 ray_P,
ccl_device bool ray_disk_intersect(float3 ray_P,
float3 ray_D,
- float ray_t,
+ float ray_tmin,
+ float ray_tmax,
float3 disk_P,
float3 disk_N,
float disk_radius,
@@ -92,7 +95,8 @@ ccl_device bool ray_disk_intersect(float3 ray_P,
}
float3 P = ray_P + t * ray_D;
float3 T = P - disk_P;
- if (dot(T, T) < sqr(disk_radius) /*&& t > 0.f*/ && t <= ray_t) {
+
+ if (dot(T, T) < sqr(disk_radius) && (t > ray_tmin && t < ray_tmax)) {
*isect_P = ray_P + t * ray_D;
*isect_t = t;
return true;
@@ -101,9 +105,55 @@ ccl_device bool ray_disk_intersect(float3 ray_P,
return false;
}
-ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
- float3 ray_dir,
- float ray_t,
+/* Custom rcp, cross and dot implementations that match Embree bit for bit. */
+ccl_device_forceinline float ray_triangle_rcp(const float x)
+{
+#ifdef __KERNEL_NEON__
+ /* Move scalar to vector register and do rcp. */
+ __m128 a;
+ a[0] = x;
+ float32x4_t reciprocal = vrecpeq_f32(a);
+ reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+ reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+ return reciprocal[0];
+#elif defined(__KERNEL_SSE__)
+ const __m128 a = _mm_set_ss(x);
+ const __m128 r = _mm_rcp_ss(a);
+
+# ifdef __KERNEL_AVX2_
+ return _mm_cvtss_f32(_mm_mul_ss(r, _mm_fnmadd_ss(r, a, _mm_set_ss(2.0f))));
+# else
+ return _mm_cvtss_f32(_mm_mul_ss(r, _mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
+# endif
+#else
+ return 1.0f / x;
+#endif
+}
+
+ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+ return madd(ssef(a.x), ssef(b.x), madd(ssef(a.y), ssef(b.y), ssef(a.z) * ssef(b.z)))[0];
+#else
+ return a.x * b.x + a.y * b.y + a.z * b.z;
+#endif
+}
+
+ccl_device_inline float3 ray_triangle_cross(const float3 a, const float3 b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+ return make_float3(msub(ssef(a.y), ssef(b.z), ssef(a.z) * ssef(b.y))[0],
+ msub(ssef(a.z), ssef(b.x), ssef(a.x) * ssef(b.z))[0],
+ msub(ssef(a.x), ssef(b.y), ssef(a.y) * ssef(b.x))[0]);
+#else
+ return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+#endif
+}
+
+ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
+ const float3 ray_D,
+ const float ray_tmin,
+ const float ray_tmax,
const float3 tri_a,
const float3 tri_b,
const float3 tri_c,
@@ -111,14 +161,13 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
ccl_private float *isect_v,
ccl_private float *isect_t)
{
-#define dot3(a, b) dot(a, b)
- const float3 P = ray_P;
- const float3 dir = ray_dir;
+ /* This implementation matches the Plücker coordinates triangle intersection
+ * in Embree. */
/* Calculate vertices relative to ray origin. */
- const float3 v0 = tri_c - P;
- const float3 v1 = tri_a - P;
- const float3 v2 = tri_b - P;
+ const float3 v0 = tri_a - ray_P;
+ const float3 v1 = tri_b - ray_P;
+ const float3 v2 = tri_c - ray_P;
/* Calculate triangle edges. */
const float3 e0 = v2 - v0;
@@ -126,42 +175,73 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
const float3 e2 = v1 - v2;
/* Perform edge tests. */
- const float U = dot(cross(v2 + v0, e0), ray_dir);
- const float V = dot(cross(v0 + v1, e1), ray_dir);
- const float W = dot(cross(v1 + v2, e2), ray_dir);
+ const float U = ray_triangle_dot(ray_triangle_cross(e0, v2 + v0), ray_D);
+ const float V = ray_triangle_dot(ray_triangle_cross(e1, v0 + v1), ray_D);
+ const float W = ray_triangle_dot(ray_triangle_cross(e2, v1 + v2), ray_D);
+ const float UVW = U + V + W;
+ const float eps = FLT_EPSILON * fabsf(UVW);
const float minUVW = min(U, min(V, W));
const float maxUVW = max(U, max(V, W));
- if (minUVW < 0.0f && maxUVW > 0.0f) {
+ if (!(minUVW >= -eps || maxUVW <= eps)) {
return false;
}
/* Calculate geometry normal and denominator. */
- const float3 Ng1 = cross(e1, e0);
- // const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0);
+ const float3 Ng1 = ray_triangle_cross(e1, e0);
const float3 Ng = Ng1 + Ng1;
- const float den = dot3(Ng, dir);
+ const float den = dot(Ng, ray_D);
/* Avoid division by 0. */
if (UNLIKELY(den == 0.0f)) {
return false;
}
/* Perform depth test. */
- const float T = dot3(v0, Ng);
- const int sign_den = (__float_as_int(den) & 0x80000000);
- const float sign_T = xor_signmask(T, sign_den);
- if ((sign_T < 0.0f) || (sign_T > ray_t * xor_signmask(den, sign_den))) {
+ const float T = dot(v0, Ng);
+ const float t = T / den;
+ if (!(t >= ray_tmin && t <= ray_tmax)) {
return false;
}
- const float inv_den = 1.0f / den;
- *isect_u = U * inv_den;
- *isect_v = V * inv_den;
- *isect_t = T * inv_den;
+ const float rcp_uvw = (fabsf(UVW) < 1e-18f) ? 0.0f : ray_triangle_rcp(UVW);
+ *isect_u = min(U * rcp_uvw, 1.0f);
+ *isect_v = min(V * rcp_uvw, 1.0f);
+ *isect_t = t;
return true;
+}
+
+ccl_device_forceinline bool ray_triangle_intersect_self(const float3 ray_P,
+ const float3 ray_D,
+ const float3 tri_a,
+ const float3 tri_b,
+ const float3 tri_c)
+{
+ /* Matches logic in ray_triangle_intersect, self intersection test to validate
+ * if a ray is going to hit self or might incorrectly hit a neighboring triangle. */
-#undef dot3
+ /* Calculate vertices relative to ray origin. */
+ const float3 v0 = tri_a - ray_P;
+ const float3 v1 = tri_b - ray_P;
+ const float3 v2 = tri_c - ray_P;
+
+ /* Calculate triangle edges. */
+ const float3 e0 = v2 - v0;
+ const float3 e1 = v0 - v1;
+ const float3 e2 = v1 - v2;
+
+ /* Perform edge tests. */
+ const float U = ray_triangle_dot(ray_triangle_cross(v2 + v0, e0), ray_D);
+ const float V = ray_triangle_dot(ray_triangle_cross(v0 + v1, e1), ray_D);
+ const float W = ray_triangle_dot(ray_triangle_cross(v1 + v2, e2), ray_D);
+
+ const float eps = FLT_EPSILON * fabsf(U + V + W);
+ const float minUVW = min(U, min(V, W));
+ const float maxUVW = max(U, max(V, W));
+
+ /* Note the extended epsilon compared to ray_triangle_intersect, to account
+ * for intersections with neighboring triangles that have an epsilon. */
+ return (minUVW >= eps || maxUVW <= -eps);
}
/* Tests for an intersection between a ray and a quad defined by
@@ -171,8 +251,8 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
*/
ccl_device bool ray_quad_intersect(float3 ray_P,
float3 ray_D,
- float ray_mint,
- float ray_maxt,
+ float ray_tmin,
+ float ray_tmax,
float3 quad_P,
float3 quad_u,
float3 quad_v,
@@ -185,7 +265,7 @@ ccl_device bool ray_quad_intersect(float3 ray_P,
{
/* Perform intersection test. */
float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n);
- if (t < ray_mint || t > ray_maxt) {
+ if (!(t > ray_tmin && t < ray_tmax)) {
return false;
}
const float3 hit = ray_P + t * ray_D;
@@ -207,10 +287,13 @@ ccl_device bool ray_quad_intersect(float3 ray_P,
*isect_P = hit;
if (isect_t != NULL)
*isect_t = t;
+
+ /* NOTE: Return barycentric coordinates in the same notation as Embree and OptiX. */
if (isect_u != NULL)
- *isect_u = u + 0.5f;
+ *isect_u = v + 0.5f;
if (isect_v != NULL)
- *isect_v = v + 0.5f;
+ *isect_v = -u - v;
+
return true;
}
diff --git a/intern/cycles/util/opengl.h b/intern/cycles/util/opengl.h
index 090deb861c4..fefee4ec022 100644
--- a/intern/cycles/util/opengl.h
+++ b/intern/cycles/util/opengl.h
@@ -7,6 +7,6 @@
/* OpenGL header includes, used everywhere we use OpenGL, to deal with
* platform differences in one central place. */
-#include <GL/glew.h>
+#include <epoxy/gl.h>
#endif /* __UTIL_OPENGL_H__ */
diff --git a/intern/cycles/util/progress.h b/intern/cycles/util/progress.h
index 37eafd57491..586979d2021 100644
--- a/intern/cycles/util/progress.h
+++ b/intern/cycles/util/progress.h
@@ -28,6 +28,7 @@ class Progress {
denoised_tiles = 0;
start_time = time_dt();
render_start_time = time_dt();
+ time_limit = 0.0;
end_time = 0.0;
status = "Initializing";
substatus = "";
@@ -68,6 +69,7 @@ class Progress {
denoised_tiles = 0;
start_time = time_dt();
render_start_time = time_dt();
+ time_limit = 0.0;
end_time = 0.0;
status = "Initializing";
substatus = "";
@@ -145,6 +147,13 @@ class Progress {
render_start_time = time_dt();
}
+ void set_time_limit(double time_limit_)
+ {
+ thread_scoped_lock lock(progress_mutex);
+
+ time_limit = time_limit_;
+ }
+
void add_skip_time(const scoped_timer &start_timer, bool only_render)
{
double skip_time = time_dt() - start_timer.get_start();
@@ -191,8 +200,13 @@ class Progress {
{
thread_scoped_lock lock(progress_mutex);
- if (total_pixel_samples > 0) {
- return ((double)pixel_samples) / (double)total_pixel_samples;
+ if (pixel_samples > 0) {
+ double progress_percent = (double)pixel_samples / (double)total_pixel_samples;
+ if (time_limit != 0.0) {
+ double time_since_render_start = time_dt() - render_start_time;
+ progress_percent = max(progress_percent, time_since_render_start / time_limit);
+ }
+ return min(1.0, progress_percent);
}
return 0.0;
}
@@ -335,7 +349,7 @@ class Progress {
* in which case the current_tile_sample is displayed. */
int rendered_tiles, denoised_tiles;
- double start_time, render_start_time;
+ double start_time, render_start_time, time_limit;
/* End time written when render is done, so it doesn't keep increasing on redraws. */
double end_time;
diff --git a/intern/cycles/util/string.cpp b/intern/cycles/util/string.cpp
index 66ff866ee10..0c318cea44a 100644
--- a/intern/cycles/util/string.cpp
+++ b/intern/cycles/util/string.cpp
@@ -136,6 +136,19 @@ void string_replace(string &haystack, const string &needle, const string &other)
}
}
+void string_replace_same_length(string &haystack, const string &needle, const string &other)
+{
+ assert(needle.size() == other.size());
+ size_t pos = 0;
+ while (pos != string::npos) {
+ pos = haystack.find(needle, pos);
+ if (pos != string::npos) {
+ memcpy(haystack.data() + pos, other.data(), other.size());
+ pos += other.size();
+ }
+ }
+}
+
string string_remove_trademark(const string &s)
{
string result = s;
@@ -164,6 +177,11 @@ string to_string(const char *str)
return string(str);
}
+string to_string(const float4 &v)
+{
+ return string_printf("%f,%f,%f,%f", v.x, v.y, v.z, v.w);
+}
+
string string_to_lower(const string &s)
{
string r = s;
diff --git a/intern/cycles/util/string.h b/intern/cycles/util/string.h
index a74feee1750..ecbe9e106c6 100644
--- a/intern/cycles/util/string.h
+++ b/intern/cycles/util/string.h
@@ -38,12 +38,14 @@ void string_split(vector<string> &tokens,
const string &separators = "\t ",
bool skip_empty_tokens = true);
void string_replace(string &haystack, const string &needle, const string &other);
+void string_replace_same_length(string &haystack, const string &needle, const string &other);
bool string_startswith(string_view s, string_view start);
bool string_endswith(string_view s, string_view end);
string string_strip(const string &s);
string string_remove_trademark(const string &s);
string string_from_bool(const bool var);
string to_string(const char *str);
+string to_string(const float4 &v);
string string_to_lower(const string &s);
/* Wide char strings are only used on Windows to deal with non-ASCII
diff --git a/intern/cycles/util/system.cpp b/intern/cycles/util/system.cpp
index a13ad95b9fe..3183ac06f26 100644
--- a/intern/cycles/util/system.cpp
+++ b/intern/cycles/util/system.cpp
@@ -128,53 +128,42 @@ int system_cpu_bits()
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
struct CPUCapabilities {
- bool x64;
- bool mmx;
- bool sse;
bool sse2;
bool sse3;
- bool ssse3;
bool sse41;
- bool sse42;
- bool sse4a;
bool avx;
- bool f16c;
bool avx2;
- bool xop;
- bool fma3;
- bool fma4;
- bool bmi1;
- bool bmi2;
};
static CPUCapabilities &system_cpu_capabilities()
{
- static CPUCapabilities caps;
+ static CPUCapabilities caps = {};
static bool caps_init = false;
if (!caps_init) {
int result[4], num;
- memset(&caps, 0, sizeof(caps));
-
__cpuid(result, 0);
num = result[0];
if (num >= 1) {
__cpuid(result, 0x00000001);
- caps.mmx = (result[3] & ((int)1 << 23)) != 0;
- caps.sse = (result[3] & ((int)1 << 25)) != 0;
- caps.sse2 = (result[3] & ((int)1 << 26)) != 0;
- caps.sse3 = (result[2] & ((int)1 << 0)) != 0;
+ const bool sse = (result[3] & ((int)1 << 25)) != 0;
+ const bool sse2 = (result[3] & ((int)1 << 26)) != 0;
+ const bool sse3 = (result[2] & ((int)1 << 0)) != 0;
+
+ const bool ssse3 = (result[2] & ((int)1 << 9)) != 0;
+ const bool sse41 = (result[2] & ((int)1 << 19)) != 0;
+ /* const bool sse42 = (result[2] & ((int)1 << 20)) != 0; */
- caps.ssse3 = (result[2] & ((int)1 << 9)) != 0;
- caps.sse41 = (result[2] & ((int)1 << 19)) != 0;
- caps.sse42 = (result[2] & ((int)1 << 20)) != 0;
+ const bool fma3 = (result[2] & ((int)1 << 12)) != 0;
+ const bool os_uses_xsave_xrestore = (result[2] & ((int)1 << 27)) != 0;
+ const bool cpu_avx_support = (result[2] & ((int)1 << 28)) != 0;
- caps.fma3 = (result[2] & ((int)1 << 12)) != 0;
- caps.avx = false;
- bool os_uses_xsave_xrestore = (result[2] & ((int)1 << 27)) != 0;
- bool cpu_avx_support = (result[2] & ((int)1 << 28)) != 0;
+ /* Simplify to combined capabilities for which we specialize kernels. */
+ caps.sse2 = sse && sse2;
+ caps.sse3 = sse && sse2 && sse3 && ssse3;
+ caps.sse41 = sse && sse2 && sse3 && ssse3 && sse41;
if (os_uses_xsave_xrestore && cpu_avx_support) {
// Check if the OS will save the YMM registers
@@ -189,15 +178,18 @@ static CPUCapabilities &system_cpu_capabilities()
# else
xcr_feature_mask = 0;
# endif
- caps.avx = (xcr_feature_mask & 0x6) == 0x6;
- }
+ const bool avx = (xcr_feature_mask & 0x6) == 0x6;
+ const bool f16c = (result[2] & ((int)1 << 29)) != 0;
- caps.f16c = (result[2] & ((int)1 << 29)) != 0;
+ __cpuid(result, 0x00000007);
+ bool bmi1 = (result[1] & ((int)1 << 3)) != 0;
+ bool bmi2 = (result[1] & ((int)1 << 8)) != 0;
+ bool avx2 = (result[1] & ((int)1 << 5)) != 0;
- __cpuid(result, 0x00000007);
- caps.bmi1 = (result[1] & ((int)1 << 3)) != 0;
- caps.bmi2 = (result[1] & ((int)1 << 8)) != 0;
- caps.avx2 = (result[1] & ((int)1 << 5)) != 0;
+ caps.avx = sse && sse2 && sse3 && ssse3 && sse41 && avx;
+ caps.avx2 = sse && sse2 && sse3 && ssse3 && sse41 && avx && f16c && avx2 && fma3 && bmi1 &&
+ bmi2;
+ }
}
caps_init = true;
@@ -209,32 +201,31 @@ static CPUCapabilities &system_cpu_capabilities()
bool system_cpu_support_sse2()
{
CPUCapabilities &caps = system_cpu_capabilities();
- return caps.sse && caps.sse2;
+ return caps.sse2;
}
bool system_cpu_support_sse3()
{
CPUCapabilities &caps = system_cpu_capabilities();
- return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3;
+ return caps.sse3;
}
bool system_cpu_support_sse41()
{
CPUCapabilities &caps = system_cpu_capabilities();
- return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41;
+ return caps.sse41;
}
bool system_cpu_support_avx()
{
CPUCapabilities &caps = system_cpu_capabilities();
- return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx;
+ return caps.avx;
}
bool system_cpu_support_avx2()
{
CPUCapabilities &caps = system_cpu_capabilities();
- return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx && caps.f16c &&
- caps.avx2 && caps.fma3 && caps.bmi1 && caps.bmi2;
+ return caps.avx2;
}
#else
@@ -264,26 +255,6 @@ bool system_cpu_support_avx2()
#endif
-bool system_call_self(const vector<string> &args)
-{
- /* Escape program and arguments in case they contain spaces. */
- string cmd = "\"" + Sysutil::this_program_path() + "\"";
-
- for (int i = 0; i < args.size(); i++) {
- cmd += " \"" + args[i] + "\"";
- }
-
-#ifdef _WIN32
- /* Use cmd /S to avoid issues with spaces in arguments. */
- cmd = "cmd /S /C \"" + cmd + " > nul \"";
-#else
- /* Quiet output. */
- cmd += " > /dev/null";
-#endif
-
- return (system(cmd.c_str()) == 0);
-}
-
size_t system_physical_ram()
{
#ifdef _WIN32
diff --git a/intern/cycles/util/system.h b/intern/cycles/util/system.h
index 23dcfdd303a..2152b89ed24 100644
--- a/intern/cycles/util/system.h
+++ b/intern/cycles/util/system.h
@@ -4,15 +4,17 @@
#ifndef __UTIL_SYSTEM_H__
#define __UTIL_SYSTEM_H__
-#include "util/string.h"
-#include "util/vector.h"
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <string>
CCL_NAMESPACE_BEGIN
/* Get width in characters of the current console output. */
int system_console_width();
-string system_cpu_brand_string();
+std::string system_cpu_brand_string();
int system_cpu_bits();
bool system_cpu_support_sse2();
bool system_cpu_support_sse3();
@@ -22,9 +24,6 @@ bool system_cpu_support_avx2();
size_t system_physical_ram();
-/* Start a new process of the current application with the given arguments. */
-bool system_call_self(const vector<string> &args);
-
/* Get identifier of the currently running process. */
uint64_t system_self_process_id();
diff --git a/intern/cycles/util/task.cpp b/intern/cycles/util/task.cpp
index 2edc82eb7c3..12f661f752d 100644
--- a/intern/cycles/util/task.cpp
+++ b/intern/cycles/util/task.cpp
@@ -70,7 +70,7 @@ void TaskScheduler::init(int num_threads)
}
if (num_threads > 0) {
/* Automatic number of threads. */
- VLOG(1) << "Overriding number of TBB threads to " << num_threads << ".";
+ VLOG_INFO << "Overriding number of TBB threads to " << num_threads << ".";
global_control = new tbb::global_control(tbb::global_control::max_allowed_parallelism,
num_threads);
active_num_threads = num_threads;
diff --git a/intern/cycles/util/time.cpp b/intern/cycles/util/time.cpp
index d27a0415106..0295a071f39 100644
--- a/intern/cycles/util/time.cpp
+++ b/intern/cycles/util/time.cpp
@@ -102,7 +102,7 @@ double time_human_readable_to_seconds(const string &time_string)
}
else if (fraction_tokens.size() == 2) {
result = atof(fraction_tokens[1].c_str());
- result *= pow(0.1, fraction_tokens[1].length());
+ result *= ::pow(0.1, fraction_tokens[1].length());
}
else {
/* This is not a valid string, the result can not be reliable. */
diff --git a/intern/cycles/util/transform.cpp b/intern/cycles/util/transform.cpp
index fa50e1db063..cb985c65dd8 100644
--- a/intern/cycles/util/transform.cpp
+++ b/intern/cycles/util/transform.cpp
@@ -11,7 +11,7 @@ CCL_NAMESPACE_BEGIN
/* Transform Inverse */
-static bool transform_matrix4_gj_inverse(float R[][4], float M[][4])
+static bool projection_matrix4_inverse(float R[][4], float M[][4])
{
/* SPDX-License-Identifier: BSD-3-Clause
* Adapted from code:
@@ -98,16 +98,8 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
memcpy(R, &tfmR, sizeof(R));
memcpy(M, &tfm, sizeof(M));
- if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
- /* matrix is degenerate (e.g. 0 scale on some axis), ideally we should
- * never be in this situation, but try to invert it anyway with tweak */
- M[0][0] += 1e-8f;
- M[1][1] += 1e-8f;
- M[2][2] += 1e-8f;
-
- if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
- return projection_identity();
- }
+ if (UNLIKELY(!projection_matrix4_inverse(R, M))) {
+ return projection_identity();
}
memcpy(&tfmR, R, sizeof(R));
@@ -115,16 +107,9 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
return tfmR;
}
-Transform transform_inverse(const Transform &tfm)
-{
- ProjectionTransform projection(tfm);
- return projection_to_transform(projection_inverse(projection));
-}
-
Transform transform_transposed_inverse(const Transform &tfm)
{
- ProjectionTransform projection(tfm);
- ProjectionTransform iprojection = projection_inverse(projection);
+ ProjectionTransform iprojection(transform_inverse(tfm));
return projection_to_transform(projection_transpose(iprojection));
}
@@ -229,17 +214,17 @@ static void transform_decompose(DecomposedTransform *decomp, const Transform *tf
/* extract scale and shear first */
float3 scale, shear;
scale.x = len(colx);
- colx = safe_divide_float3_float(colx, scale.x);
+ colx = safe_divide(colx, scale.x);
shear.z = dot(colx, coly);
coly -= shear.z * colx;
scale.y = len(coly);
- coly = safe_divide_float3_float(coly, scale.y);
+ coly = safe_divide(coly, scale.y);
shear.y = dot(colx, colz);
colz -= shear.y * colx;
shear.x = dot(coly, colz);
colz -= shear.x * coly;
scale.z = len(colz);
- colz = safe_divide_float3_float(colz, scale.z);
+ colz = safe_divide(colz, scale.z);
transform_set_column(&M, 0, colx);
transform_set_column(&M, 1, coly);
diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h
index 477272f0ba6..24184dc7074 100644
--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@@ -11,6 +11,10 @@
#include "util/math.h"
#include "util/types.h"
+#ifndef __KERNEL_GPU__
+# include "util/system.h"
+#endif
+
CCL_NAMESPACE_BEGIN
/* Affine transformation, stored as 4x3 matrix. */
@@ -38,6 +42,12 @@ typedef struct DecomposedTransform {
float4 x, y, z, w;
} DecomposedTransform;
+CCL_NAMESPACE_END
+
+#include "util/transform_inverse.h"
+
+CCL_NAMESPACE_BEGIN
+
/* Functions */
#ifdef __KERNEL_METAL__
@@ -63,10 +73,10 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f
_MM_TRANSPOSE4_PS(x, y, z, w);
- ssef tmp = shuffle<0>(aa) * x;
- tmp = madd(shuffle<1>(aa), y, tmp);
+ ssef tmp = w;
tmp = madd(shuffle<2>(aa), z, tmp);
- tmp += w;
+ tmp = madd(shuffle<1>(aa), y, tmp);
+ tmp = madd(shuffle<0>(aa), x, tmp);
return float3(tmp.m128);
#elif defined(__KERNEL_METAL__)
@@ -93,9 +103,9 @@ ccl_device_inline float3 transform_direction(ccl_private const Transform *t, con
_MM_TRANSPOSE4_PS(x, y, z, w);
- ssef tmp = shuffle<0>(aa) * x;
+ ssef tmp = shuffle<2>(aa) * z;
tmp = madd(shuffle<1>(aa), y, tmp);
- tmp = madd(shuffle<2>(aa), z, tmp);
+ tmp = madd(shuffle<0>(aa), x, tmp);
return float3(tmp.m128);
#elif defined(__KERNEL_METAL__)
@@ -312,7 +322,6 @@ ccl_device_inline void transform_set_column(Transform *t, int column, float3 val
t->z[column] = value.z;
}
-Transform transform_inverse(const Transform &a);
Transform transform_transposed_inverse(const Transform &a);
ccl_device_inline bool transform_uniform_scale(const Transform &tfm, float &scale)
@@ -392,39 +401,28 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t)
#endif /* defined(__KERNEL_GPU_RAYTRACING__) */
}
-ccl_device_inline Transform transform_quick_inverse(Transform M)
-{
- /* possible optimization: can we avoid doing this altogether and construct
- * the inverse matrix directly from negated translation, transposed rotation,
- * scale can be inverted but what about shearing? */
- Transform R;
- float det = M.x.x * (M.z.z * M.y.y - M.z.y * M.y.z) - M.y.x * (M.z.z * M.x.y - M.z.y * M.x.z) +
- M.z.x * (M.y.z * M.x.y - M.y.y * M.x.z);
- if (det == 0.0f) {
- M.x.x += 1e-8f;
- M.y.y += 1e-8f;
- M.z.z += 1e-8f;
- det = M.x.x * (M.z.z * M.y.y - M.z.y * M.y.z) - M.y.x * (M.z.z * M.x.y - M.z.y * M.x.z) +
- M.z.x * (M.y.z * M.x.y - M.y.y * M.x.z);
- }
- det = (det != 0.0f) ? 1.0f / det : 0.0f;
-
- float3 Rx = det * make_float3(M.z.z * M.y.y - M.z.y * M.y.z,
- M.z.y * M.x.z - M.z.z * M.x.y,
- M.y.z * M.x.y - M.y.y * M.x.z);
- float3 Ry = det * make_float3(M.z.x * M.y.z - M.z.z * M.y.x,
- M.z.z * M.x.x - M.z.x * M.x.z,
- M.y.x * M.x.z - M.y.z * M.x.x);
- float3 Rz = det * make_float3(M.z.y * M.y.x - M.z.x * M.y.y,
- M.z.x * M.x.y - M.z.y * M.x.x,
- M.y.y * M.x.x - M.y.x * M.x.y);
- float3 T = -make_float3(M.x.w, M.y.w, M.z.w);
+#ifndef __KERNEL_GPU__
+void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm);
+void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm);
+#endif
- R.x = make_float4(Rx.x, Rx.y, Rx.z, dot(Rx, T));
- R.y = make_float4(Ry.x, Ry.y, Ry.z, dot(Ry, T));
- R.z = make_float4(Rz.x, Rz.y, Rz.z, dot(Rz, T));
+ccl_device_inline Transform transform_inverse(const Transform tfm)
+{
+ /* Optimized transform implementations. */
+#ifndef __KERNEL_GPU__
+ if (system_cpu_support_avx2()) {
+ Transform itfm;
+ transform_inverse_cpu_avx2(tfm, itfm);
+ return itfm;
+ }
+ else if (system_cpu_support_sse41()) {
+ Transform itfm;
+ transform_inverse_cpu_sse41(tfm, itfm);
+ return itfm;
+ }
+#endif
- return R;
+ return transform_inverse_impl(tfm);
}
ccl_device_inline void transform_compose(ccl_private Transform *tfm,
@@ -493,13 +491,13 @@ ccl_device void transform_motion_array_interpolate(ccl_private Transform *tfm,
ccl_device_inline bool transform_isfinite_safe(ccl_private Transform *tfm)
{
- return isfinite4_safe(tfm->x) && isfinite4_safe(tfm->y) && isfinite4_safe(tfm->z);
+ return isfinite_safe(tfm->x) && isfinite_safe(tfm->y) && isfinite_safe(tfm->z);
}
ccl_device_inline bool transform_decomposed_isfinite_safe(ccl_private DecomposedTransform *decomp)
{
- return isfinite4_safe(decomp->x) && isfinite4_safe(decomp->y) && isfinite4_safe(decomp->z) &&
- isfinite4_safe(decomp->w);
+ return isfinite_safe(decomp->x) && isfinite_safe(decomp->y) && isfinite_safe(decomp->z) &&
+ isfinite_safe(decomp->w);
}
#ifndef __KERNEL_GPU__
diff --git a/intern/cycles/util/transform_avx2.cpp b/intern/cycles/util/transform_avx2.cpp
new file mode 100644
index 00000000000..57c160388e2
--- /dev/null
+++ b/intern/cycles/util/transform_avx2.cpp
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "util/transform.h"
+
+CCL_NAMESPACE_BEGIN
+
+void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm)
+{
+ itfm = transform_inverse_impl(tfm);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/transform_inverse.h b/intern/cycles/util/transform_inverse.h
new file mode 100644
index 00000000000..07fd06c1467
--- /dev/null
+++ b/intern/cycles/util/transform_inverse.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Custom cross and dot implementations that match Embree bit for bit.
+ * Normally we don't use SSE41/AVX outside the kernel, but for this it's
+ * important to match exactly for ray tracing precision. */
+
+ccl_device_forceinline float3 transform_inverse_cross(const float3 a, const float3 b)
+{
+#ifdef __AVX2__
+ const ssef sse_a = (const __m128 &)a;
+ const ssef sse_b = (const __m128 &)b;
+ const ssef r = shuffle<1, 2, 0, 3>(
+ ssef(_mm_fmsub_ps(sse_a, shuffle<1, 2, 0, 3>(sse_b), shuffle<1, 2, 0, 3>(sse_a) * sse_b)));
+ return (const float3 &)r;
+#endif
+
+ return cross(a, b);
+}
+
+ccl_device_forceinline float transform_inverse_dot(const float3 a, const float3 b)
+{
+#ifdef __SSE4_1__
+ return _mm_cvtss_f32(_mm_dp_ps((const __m128 &)a, (const __m128 &)b, 0x7F));
+#endif
+
+ return dot(a, b);
+}
+
+ccl_device_inline Transform transform_inverse_impl(const Transform tfm)
+{
+ /* This implementation matches the one in Embree exactly, to ensure consistent
+ * results with the ray intersection of instances. */
+ float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
+ float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
+ float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
+ float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
+
+ /* Compute determinant. */
+ float det = transform_inverse_dot(x, transform_inverse_cross(y, z));
+
+ if (det == 0.0f) {
+ /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
+ * never be in this situation, but try to invert it anyway with tweak.
+ *
+ * This logic does not match Embree which would just give an invalid
+ * matrix. A better solution would be to remove this and ensure any object
+ * matrix is valid. */
+ x.x += 1e-8f;
+ y.y += 1e-8f;
+ z.z += 1e-8f;
+
+ det = transform_inverse_dot(x, cross(y, z));
+ if (det == 0.0f) {
+ det = FLT_MAX;
+ }
+ }
+
+ /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
+ const float3 inverse_x = transform_inverse_cross(y, z) / det;
+ const float3 inverse_y = transform_inverse_cross(z, x) / det;
+ const float3 inverse_z = transform_inverse_cross(x, y) / det;
+
+ /* Compute translation and fill transform. */
+ Transform itfm;
+ itfm.x = float3_to_float4(inverse_x, -transform_inverse_dot(inverse_x, w));
+ itfm.y = float3_to_float4(inverse_y, -transform_inverse_dot(inverse_y, w));
+ itfm.z = float3_to_float4(inverse_z, -transform_inverse_dot(inverse_z, w));
+
+ return itfm;
+}
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/transform_sse41.cpp b/intern/cycles/util/transform_sse41.cpp
new file mode 100644
index 00000000000..8a698807a9c
--- /dev/null
+++ b/intern/cycles/util/transform_sse41.cpp
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "util/transform.h"
+
+CCL_NAMESPACE_BEGIN
+
+void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm)
+{
+ itfm = transform_inverse_impl(tfm);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types.h b/intern/cycles/util/types.h
index 031c2f7c4c1..1ab6f76f9bc 100644
--- a/intern/cycles/util/types.h
+++ b/intern/cycles/util/types.h
@@ -12,6 +12,7 @@
#if !defined(__KERNEL_GPU__)
# include <stdint.h>
+# include <stdio.h>
#endif
#include "util/defines.h"
@@ -70,6 +71,24 @@ ccl_device_inline bool is_power_of_two(size_t x)
CCL_NAMESPACE_END
+/* Device side printf only tested on CUDA, may work on more GPU devices. */
+#if !defined(__KERNEL_GPU__) || defined(__KERNEL_CUDA__)
+# define __KERNEL_PRINTF__
+#endif
+
+ccl_device_inline void print_float(ccl_private const char *label, const float a)
+{
+#ifdef __KERNEL_PRINTF__
+ printf("%s: %.8f\n", label, (double)a);
+#endif
+}
+
+/* Most GPU APIs matching native vector types, so we only need to implement them for
+ * CPU and oneAPI. */
+#if defined(__KERNEL_GPU__) && !defined(__KERNEL_ONEAPI__)
+# define __KERNEL_NATIVE_VECTOR_TYPES__
+#endif
+
/* Vectorized types declaration. */
#include "util/types_uchar2.h"
#include "util/types_uchar3.h"
@@ -90,7 +109,7 @@ CCL_NAMESPACE_END
#include "util/types_float4.h"
#include "util/types_float8.h"
-#include "util/types_vector3.h"
+#include "util/types_spectrum.h"
/* Vectorized types implementation. */
#include "util/types_uchar2_impl.h"
@@ -110,8 +129,6 @@ CCL_NAMESPACE_END
#include "util/types_float4_impl.h"
#include "util/types_float8_impl.h"
-#include "util/types_vector3_impl.h"
-
/* SSE types. */
#ifndef __KERNEL_GPU__
# include "util/sseb.h"
diff --git a/intern/cycles/util/types_float2.h b/intern/cycles/util/types_float2.h
index d8b2efb7b4b..ea510ef832c 100644
--- a/intern/cycles/util/types_float2.h
+++ b/intern/cycles/util/types_float2.h
@@ -1,8 +1,7 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
-#ifndef __UTIL_TYPES_FLOAT2_H__
-#define __UTIL_TYPES_FLOAT2_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
@@ -10,18 +9,19 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
struct float2 {
float x, y;
+# ifndef __KERNEL_GPU__
__forceinline float operator[](int i) const;
__forceinline float &operator[](int i);
+# endif
};
ccl_device_inline float2 make_float2(float x, float y);
-ccl_device_inline void print_float2(const char *label, const float2 &a);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
-CCL_NAMESPACE_END
+ccl_device_inline void print_float2(ccl_private const char *label, const float2 a);
-#endif /* __UTIL_TYPES_FLOAT2_H__ */
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types_float2_impl.h b/intern/cycles/util/types_float2_impl.h
index d67ec946b79..7ba7dee2e3a 100644
--- a/intern/cycles/util/types_float2_impl.h
+++ b/intern/cycles/util/types_float2_impl.h
@@ -1,20 +1,16 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
-#ifndef __UTIL_TYPES_FLOAT2_IMPL_H__
-#define __UTIL_TYPES_FLOAT2_IMPL_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
-#ifndef __KERNEL_GPU__
-# include <cstdio>
-#endif
-
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
+# ifndef __KERNEL_GPU__
__forceinline float float2::operator[](int i) const
{
util_assert(i >= 0);
@@ -28,19 +24,20 @@ __forceinline float &float2::operator[](int i)
util_assert(i < 2);
return *(&x + i);
}
+# endif
ccl_device_inline float2 make_float2(float x, float y)
{
float2 a = {x, y};
return a;
}
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
-ccl_device_inline void print_float2(const char *label, const float2 &a)
+ccl_device_inline void print_float2(ccl_private const char *label, const float2 a)
{
+#ifdef __KERNEL_PRINTF__
printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y);
+#endif
}
-#endif /* __KERNEL_GPU__ */
CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_FLOAT2_IMPL_H__ */
diff --git a/intern/cycles/util/types_float3.h b/intern/cycles/util/types_float3.h
index 060c2ac4152..87c6b1d3654 100644
--- a/intern/cycles/util/types_float3.h
+++ b/intern/cycles/util/types_float3.h
@@ -1,8 +1,7 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
-#ifndef __UTIL_TYPES_FLOAT3_H__
-#define __UTIL_TYPES_FLOAT3_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
@@ -10,17 +9,28 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
struct ccl_try_align(16) float3
{
-# ifdef __KERNEL_SSE__
+# ifdef __KERNEL_GPU__
+ /* Compact structure for GPU. */
+ float x, y, z;
+# else
+ /* SIMD aligned structure for CPU. */
+# ifdef __KERNEL_SSE__
union {
__m128 m128;
struct {
float x, y, z, w;
};
};
+# else
+ float x, y, z, w;
+# endif
+# endif
+# ifdef __KERNEL_SSE__
+ /* Convenient constructors and operators for SIMD, otherwise default is enough. */
__forceinline float3();
__forceinline float3(const float3 &a);
__forceinline explicit float3(const __m128 &a);
@@ -29,18 +39,19 @@ struct ccl_try_align(16) float3
__forceinline operator __m128 &();
__forceinline float3 &operator=(const float3 &a);
-# else /* __KERNEL_SSE__ */
- float x, y, z, w;
-# endif /* __KERNEL_SSE__ */
+# endif
+# ifndef __KERNEL_GPU__
__forceinline float operator[](int i) const;
__forceinline float &operator[](int i);
+# endif
};
-ccl_device_inline float3 make_float3(float f);
ccl_device_inline float3 make_float3(float x, float y, float z);
-ccl_device_inline void print_float3(const char *label, const float3 &a);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
+
+ccl_device_inline float3 make_float3(float f);
+ccl_device_inline void print_float3(ccl_private const char *label, const float3 a);
/* Smaller float3 for storage. For math operations this must be converted to float3, so that on the
* CPU SIMD instructions can be used. */
@@ -78,5 +89,3 @@ struct packed_float3 {
static_assert(sizeof(packed_float3) == 12, "packed_float3 expected to be exactly 12 bytes");
CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_FLOAT3_H__ */
diff --git a/intern/cycles/util/types_float3_impl.h b/intern/cycles/util/types_float3_impl.h
index f5ffc48c1be..da76ab2ab2a 100644
--- a/intern/cycles/util/types_float3_impl.h
+++ b/intern/cycles/util/types_float3_impl.h
@@ -1,20 +1,15 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
-#ifndef __UTIL_TYPES_FLOAT3_IMPL_H__
-#define __UTIL_TYPES_FLOAT3_IMPL_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
-#ifndef __KERNEL_GPU__
-# include <cstdio>
-#endif
-
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
# ifdef __KERNEL_SSE__
__forceinline float3::float3()
{
@@ -45,6 +40,7 @@ __forceinline float3 &float3::operator=(const float3 &a)
}
# endif /* __KERNEL_SSE__ */
+# ifndef __KERNEL_GPU__
__forceinline float float3::operator[](int i) const
{
util_assert(i >= 0);
@@ -58,33 +54,37 @@ __forceinline float &float3::operator[](int i)
util_assert(i < 3);
return *(&x + i);
}
+# endif
-ccl_device_inline float3 make_float3(float f)
+ccl_device_inline float3 make_float3(float x, float y, float z)
{
-# ifdef __KERNEL_SSE__
- float3 a(_mm_set1_ps(f));
+# if defined(__KERNEL_GPU__)
+ return {x, y, z};
+# elif defined(__KERNEL_SSE__)
+ return float3(_mm_set_ps(0.0f, z, y, x));
# else
- float3 a = {f, f, f, f};
+ return {x, y, z, 0.0f};
# endif
- return a;
}
-ccl_device_inline float3 make_float3(float x, float y, float z)
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
+
+ccl_device_inline float3 make_float3(float f)
{
-# ifdef __KERNEL_SSE__
- float3 a(_mm_set_ps(0.0f, z, y, x));
-# else
- float3 a = {x, y, z, 0.0f};
-# endif
- return a;
+#if defined(__KERNEL_GPU__)
+ return make_float3(f, f, f);
+#elif defined(__KERNEL_SSE__)
+ return float3(_mm_set1_ps(f));
+#else
+ return {f, f, f, f};
+#endif
}
-ccl_device_inline void print_float3(const char *label, const float3 &a)
+ccl_device_inline void print_float3(ccl_private const char *label, const float3 a)
{
+#ifdef __KERNEL_PRINTF__
printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z);
+#endif
}
-#endif /* __KERNEL_GPU__ */
CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_FLOAT3_IMPL_H__ */
diff --git a/intern/cycles/util/types_float4.h b/intern/cycles/util/types_float4.h
index 68ba787dac0..a347cfce9a1 100644
--- a/intern/cycles/util/types_float4.h
+++ b/intern/cycles/util/types_float4.h
@@ -1,8 +1,7 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
-#ifndef __UTIL_TYPES_FLOAT4_H__
-#define __UTIL_TYPES_FLOAT4_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
@@ -10,7 +9,7 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
struct int4;
struct ccl_try_align(16) float4
@@ -35,16 +34,17 @@ struct ccl_try_align(16) float4
float x, y, z, w;
# endif /* __KERNEL_SSE__ */
+# ifndef __KERNEL_GPU__
__forceinline float operator[](int i) const;
__forceinline float &operator[](int i);
+# endif
};
-ccl_device_inline float4 make_float4(float f);
ccl_device_inline float4 make_float4(float x, float y, float z, float w);
-ccl_device_inline float4 make_float4(const int4 &i);
-ccl_device_inline void print_float4(const char *label, const float4 &a);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
-CCL_NAMESPACE_END
+ccl_device_inline float4 make_float4(float f);
+ccl_device_inline float4 make_float4(const int4 i);
+ccl_device_inline void print_float4(ccl_private const char *label, const float4 a);
-#endif /* __UTIL_TYPES_FLOAT4_H__ */
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types_float4_impl.h b/intern/cycles/util/types_float4_impl.h
index de2e7cb7061..420d9316926 100644
--- a/intern/cycles/util/types_float4_impl.h
+++ b/intern/cycles/util/types_float4_impl.h
@@ -1,20 +1,15 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
-#ifndef __UTIL_TYPES_FLOAT4_IMPL_H__
-#define __UTIL_TYPES_FLOAT4_IMPL_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
-#ifndef __KERNEL_GPU__
-# include <cstdio>
-#endif
-
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
# ifdef __KERNEL_SSE__
__forceinline float4::float4()
{
@@ -41,6 +36,7 @@ __forceinline float4 &float4::operator=(const float4 &a)
}
# endif /* __KERNEL_SSE__ */
+# ifndef __KERNEL_GPU__
__forceinline float float4::operator[](int i) const
{
util_assert(i >= 0);
@@ -54,43 +50,42 @@ __forceinline float &float4::operator[](int i)
util_assert(i < 4);
return *(&x + i);
}
+# endif
-ccl_device_inline float4 make_float4(float f)
+ccl_device_inline float4 make_float4(float x, float y, float z, float w)
{
# ifdef __KERNEL_SSE__
- float4 a(_mm_set1_ps(f));
+ return float4(_mm_set_ps(w, z, y, x));
# else
- float4 a = {f, f, f, f};
+ return {x, y, z, w};
# endif
- return a;
}
-ccl_device_inline float4 make_float4(float x, float y, float z, float w)
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
+
+ccl_device_inline float4 make_float4(float f)
{
-# ifdef __KERNEL_SSE__
- float4 a(_mm_set_ps(w, z, y, x));
-# else
- float4 a = {x, y, z, w};
-# endif
- return a;
+#ifdef __KERNEL_SSE__
+ return float4(_mm_set1_ps(f));
+#else
+ return make_float4(f, f, f, f);
+#endif
}
-ccl_device_inline float4 make_float4(const int4 &i)
+ccl_device_inline float4 make_float4(const int4 i)
{
-# ifdef __KERNEL_SSE__
- float4 a(_mm_cvtepi32_ps(i.m128));
-# else
- float4 a = {(float)i.x, (float)i.y, (float)i.z, (float)i.w};
-# endif
- return a;
+#ifdef __KERNEL_SSE__
+ return float4(_mm_cvtepi32_ps(i.m128));
+#else
+ return make_float4((float)i.x, (float)i.y, (float)i.z, (float)i.w);
+#endif
}
-ccl_device_inline void print_float4(const char *label, const float4 &a)
+ccl_device_inline void print_float4(ccl_private const char *label, const float4 a)
{
+#ifdef __KERNEL_PRINTF__
printf("%s: %.8f %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z, (double)a.w);
+#endif
}
-#endif /* __KERNEL_GPU__ */
CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_FLOAT4_IMPL_H__ */
diff --git a/intern/cycles/util/types_float8.h b/intern/cycles/util/types_float8.h
index 99f9ec9b867..29fd632f08e 100644
--- a/intern/cycles/util/types_float8.h
+++ b/intern/cycles/util/types_float8.h
@@ -2,8 +2,7 @@
* Original code Copyright 2017, Intel Corporation
* Modifications Copyright 2018-2022 Blender Foundation. */
-#ifndef __UTIL_TYPES_FLOAT8_H__
-#define __UTIL_TYPES_FLOAT8_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
@@ -11,11 +10,16 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+/* float8 is a reserved type in Metal that has not been implemented. For
+ * that reason this is named float8_t and not using native vector types. */
-struct ccl_try_align(32) float8
+#ifdef __KERNEL_GPU__
+struct float8_t
+#else
+struct ccl_try_align(32) float8_t
+#endif
{
-# ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
union {
__m256 m256;
struct {
@@ -23,28 +27,27 @@ struct ccl_try_align(32) float8
};
};
- __forceinline float8();
- __forceinline float8(const float8 &a);
- __forceinline explicit float8(const __m256 &a);
+ __forceinline float8_t();
+ __forceinline float8_t(const float8_t &a);
+ __forceinline explicit float8_t(const __m256 &a);
__forceinline operator const __m256 &() const;
__forceinline operator __m256 &();
- __forceinline float8 &operator=(const float8 &a);
+ __forceinline float8_t &operator=(const float8_t &a);
-# else /* __KERNEL_AVX2__ */
+#else /* __KERNEL_AVX2__ */
float a, b, c, d, e, f, g, h;
-# endif /* __KERNEL_AVX2__ */
+#endif /* __KERNEL_AVX2__ */
+#ifndef __KERNEL_GPU__
__forceinline float operator[](int i) const;
__forceinline float &operator[](int i);
+#endif
};
-ccl_device_inline float8 make_float8(float f);
-ccl_device_inline float8
-make_float8(float a, float b, float c, float d, float e, float f, float g, float h);
-#endif /* __KERNEL_GPU__ */
+ccl_device_inline float8_t make_float8_t(float f);
+ccl_device_inline float8_t
+make_float8_t(float a, float b, float c, float d, float e, float f, float g, float h);
CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_FLOAT8_H__ */
diff --git a/intern/cycles/util/types_float8_impl.h b/intern/cycles/util/types_float8_impl.h
index 19818976b50..e8576cdaf70 100644
--- a/intern/cycles/util/types_float8_impl.h
+++ b/intern/cycles/util/types_float8_impl.h
@@ -2,87 +2,79 @@
* Original code Copyright 2017, Intel Corporation
* Modifications Copyright 2018-2022 Blender Foundation. */
-#ifndef __UTIL_TYPES_FLOAT8_IMPL_H__
-#define __UTIL_TYPES_FLOAT8_IMPL_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
-#ifndef __KERNEL_GPU__
-# include <cstdio>
-#endif
-
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
-# ifdef __KERNEL_AVX2__
-__forceinline float8::float8()
+#ifdef __KERNEL_AVX2__
+__forceinline float8_t::float8_t()
{
}
-__forceinline float8::float8(const float8 &f) : m256(f.m256)
+__forceinline float8_t::float8_t(const float8_t &f) : m256(f.m256)
{
}
-__forceinline float8::float8(const __m256 &f) : m256(f)
+__forceinline float8_t::float8_t(const __m256 &f) : m256(f)
{
}
-__forceinline float8::operator const __m256 &() const
+__forceinline float8_t::operator const __m256 &() const
{
return m256;
}
-__forceinline float8::operator __m256 &()
+__forceinline float8_t::operator __m256 &()
{
return m256;
}
-__forceinline float8 &float8::operator=(const float8 &f)
+__forceinline float8_t &float8_t::operator=(const float8_t &f)
{
m256 = f.m256;
return *this;
}
-# endif /* __KERNEL_AVX2__ */
+#endif /* __KERNEL_AVX2__ */
-__forceinline float float8::operator[](int i) const
+#ifndef __KERNEL_GPU__
+__forceinline float float8_t::operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 8);
return *(&a + i);
}
-__forceinline float &float8::operator[](int i)
+__forceinline float &float8_t::operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 8);
return *(&a + i);
}
+#endif
-ccl_device_inline float8 make_float8(float f)
+ccl_device_inline float8_t make_float8_t(float f)
{
-# ifdef __KERNEL_AVX2__
- float8 r(_mm256_set1_ps(f));
-# else
- float8 r = {f, f, f, f, f, f, f, f};
-# endif
+#ifdef __KERNEL_AVX2__
+ float8_t r(_mm256_set1_ps(f));
+#else
+ float8_t r = {f, f, f, f, f, f, f, f};
+#endif
return r;
}
-ccl_device_inline float8
-make_float8(float a, float b, float c, float d, float e, float f, float g, float h)
+ccl_device_inline float8_t
+make_float8_t(float a, float b, float c, float d, float e, float f, float g, float h)
{
-# ifdef __KERNEL_AVX2__
- float8 r(_mm256_set_ps(a, b, c, d, e, f, g, h));
-# else
- float8 r = {a, b, c, d, e, f, g, h};
-# endif
+#ifdef __KERNEL_AVX2__
+ float8_t r(_mm256_setr_ps(a, b, c, d, e, f, g, h));
+#else
+ float8_t r = {a, b, c, d, e, f, g, h};
+#endif
return r;
}
-#endif /* __KERNEL_GPU__ */
-
CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_FLOAT8_IMPL_H__ */
diff --git a/intern/cycles/util/types_int2.h b/intern/cycles/util/types_int2.h
index 4daf387d9cf..604713dffcd 100644
--- a/intern/cycles/util/types_int2.h
+++ b/intern/cycles/util/types_int2.h
@@ -1,8 +1,7 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
-#ifndef __UTIL_TYPES_INT2_H__
-#define __UTIL_TYPES_INT2_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
@@ -10,17 +9,17 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
struct int2 {
int x, y;
+# ifndef __KERNEL_GPU__
__forceinline int operator[](int i) const;
__forceinline int &operator[](int i);
+# endif
};
ccl_device_inline int2 make_int2(int x, int y);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_INT2_H__ */
diff --git a/intern/cycles/util/types_int2_impl.h b/intern/cycles/util/types_int2_impl.h
index 7989c4d5506..f48c6f46729 100644
--- a/intern/cycles/util/types_int2_impl.h
+++ b/intern/cycles/util/types_int2_impl.h
@@ -1,8 +1,7 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
-#ifndef __UTIL_TYPES_INT2_IMPL_H__
-#define __UTIL_TYPES_INT2_IMPL_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
@@ -10,7 +9,8 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
+# ifndef __KERNEL_GPU__
int int2::operator[](int i) const
{
util_assert(i >= 0);
@@ -24,14 +24,13 @@ int &int2::operator[](int i)
util_assert(i < 2);
return *(&x + i);
}
+# endif
ccl_device_inline int2 make_int2(int x, int y)
{
int2 a = {x, y};
return a;
}
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_INT2_IMPL_H__ */
diff --git a/intern/cycles/util/types_int3.h b/intern/cycles/util/types_int3.h
index ad9bcb39bbe..e059ddd3660 100644
--- a/intern/cycles/util/types_int3.h
+++ b/intern/cycles/util/types_int3.h
@@ -1,8 +1,7 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
-#ifndef __UTIL_TYPES_INT3_H__
-#define __UTIL_TYPES_INT3_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
@@ -10,10 +9,15 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
struct ccl_try_align(16) int3
{
-# ifdef __KERNEL_SSE__
+# ifdef __KERNEL_GPU__
+ /* Compact structure on the GPU. */
+ int x, y, z;
+# else
+ /* SIMD aligned structure for CPU. */
+# ifdef __KERNEL_SSE__
union {
__m128i m128;
struct {
@@ -29,19 +33,21 @@ struct ccl_try_align(16) int3
__forceinline operator __m128i &();
__forceinline int3 &operator=(const int3 &a);
-# else /* __KERNEL_SSE__ */
+# else /* __KERNEL_SSE__ */
int x, y, z, w;
-# endif /* __KERNEL_SSE__ */
+# endif /* __KERNEL_SSE__ */
+# endif
+# ifndef __KERNEL_GPU__
__forceinline int operator[](int i) const;
__forceinline int &operator[](int i);
+# endif
};
-ccl_device_inline int3 make_int3(int i);
ccl_device_inline int3 make_int3(int x, int y, int z);
-ccl_device_inline void print_int3(const char *label, const int3 &a);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
-CCL_NAMESPACE_END
+ccl_device_inline int3 make_int3(int i);
+ccl_device_inline void print_int3(ccl_private const char *label, const int3 a);
-#endif /* __UTIL_TYPES_INT3_H__ */
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types_int3_impl.h b/intern/cycles/util/types_int3_impl.h
index 4cfc1cf2987..830dfa3c658 100644
--- a/intern/cycles/util/types_int3_impl.h
+++ b/intern/cycles/util/types_int3_impl.h
@@ -1,20 +1,15 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
-#ifndef __UTIL_TYPES_INT3_IMPL_H__
-#define __UTIL_TYPES_INT3_IMPL_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
-#ifndef __KERNEL_GPU__
-# include <cstdio>
-#endif
-
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
# ifdef __KERNEL_SSE__
__forceinline int3::int3()
{
@@ -45,6 +40,7 @@ __forceinline int3 &int3::operator=(const int3 &a)
}
# endif /* __KERNEL_SSE__ */
+# ifndef __KERNEL_GPU__
__forceinline int int3::operator[](int i) const
{
util_assert(i >= 0);
@@ -58,34 +54,37 @@ __forceinline int &int3::operator[](int i)
util_assert(i < 3);
return *(&x + i);
}
-
-ccl_device_inline int3 make_int3(int i)
-{
-# ifdef __KERNEL_SSE__
- int3 a(_mm_set1_epi32(i));
-# else
- int3 a = {i, i, i, i};
# endif
- return a;
-}
ccl_device_inline int3 make_int3(int x, int y, int z)
{
-# ifdef __KERNEL_SSE__
- int3 a(_mm_set_epi32(0, z, y, x));
+# if defined(__KERNEL_GPU__)
+ return {x, y, z};
+# elif defined(__KERNEL_SSE__)
+ return int3(_mm_set_epi32(0, z, y, x));
# else
- int3 a = {x, y, z, 0};
+ return {x, y, z, 0};
# endif
+}
- return a;
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
+
+ccl_device_inline int3 make_int3(int i)
+{
+#if defined(__KERNEL_GPU__)
+ return make_int3(i, i, i);
+#elif defined(__KERNEL_SSE__)
+ return int3(_mm_set1_epi32(i));
+#else
+ return {i, i, i, i};
+#endif
}
-ccl_device_inline void print_int3(const char *label, const int3 &a)
+ccl_device_inline void print_int3(ccl_private const char *label, const int3 a)
{
+#ifdef __KERNEL_PRINTF__
printf("%s: %d %d %d\n", label, a.x, a.y, a.z);
+#endif
}
-#endif /* __KERNEL_GPU__ */
CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_INT3_IMPL_H__ */
diff --git a/intern/cycles/util/types_int4.h b/intern/cycles/util/types_int4.h
index f35632fb52f..1a13c03e60e 100644
--- a/intern/cycles/util/types_int4.h
+++ b/intern/cycles/util/types_int4.h
@@ -1,8 +1,7 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
-#ifndef __UTIL_TYPES_INT4_H__
-#define __UTIL_TYPES_INT4_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
@@ -10,7 +9,7 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
struct float3;
struct float4;
@@ -37,17 +36,18 @@ struct ccl_try_align(16) int4
int x, y, z, w;
# endif /* __KERNEL_SSE__ */
+# ifndef __KERNEL_GPU__
__forceinline int operator[](int i) const;
__forceinline int &operator[](int i);
+# endif
};
-ccl_device_inline int4 make_int4(int i);
ccl_device_inline int4 make_int4(int x, int y, int z, int w);
-ccl_device_inline int4 make_int4(const float3 &f);
-ccl_device_inline int4 make_int4(const float4 &f);
-ccl_device_inline void print_int4(const char *label, const int4 &a);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
-CCL_NAMESPACE_END
+ccl_device_inline int4 make_int4(int i);
+ccl_device_inline int4 make_int4(const float3 f);
+ccl_device_inline int4 make_int4(const float4 f);
+ccl_device_inline void print_int4(ccl_private const char *label, const int4 a);
-#endif /* __UTIL_TYPES_INT4_H__ */
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types_int4_impl.h b/intern/cycles/util/types_int4_impl.h
index adb4a4cebac..067794e67b4 100644
--- a/intern/cycles/util/types_int4_impl.h
+++ b/intern/cycles/util/types_int4_impl.h
@@ -1,20 +1,15 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
-#ifndef __UTIL_TYPES_INT4_IMPL_H__
-#define __UTIL_TYPES_INT4_IMPL_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
-#ifndef __KERNEL_GPU__
-# include <cstdio>
-#endif
-
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
# ifdef __KERNEL_SSE__
__forceinline int4::int4()
{
@@ -45,6 +40,7 @@ __forceinline int4 &int4::operator=(const int4 &a)
}
# endif /* __KERNEL_SSE__ */
+# ifndef __KERNEL_GPU__
__forceinline int int4::operator[](int i) const
{
util_assert(i >= 0);
@@ -58,53 +54,53 @@ __forceinline int &int4::operator[](int i)
util_assert(i < 4);
return *(&x + i);
}
+# endif
-ccl_device_inline int4 make_int4(int i)
+ccl_device_inline int4 make_int4(int x, int y, int z, int w)
{
# ifdef __KERNEL_SSE__
- int4 a(_mm_set1_epi32(i));
+ return int4(_mm_set_epi32(w, z, y, x));
# else
- int4 a = {i, i, i, i};
+ return {x, y, z, w};
# endif
- return a;
}
-ccl_device_inline int4 make_int4(int x, int y, int z, int w)
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
+
+ccl_device_inline int4 make_int4(int i)
{
-# ifdef __KERNEL_SSE__
- int4 a(_mm_set_epi32(w, z, y, x));
-# else
- int4 a = {x, y, z, w};
-# endif
- return a;
+#ifdef __KERNEL_SSE__
+ return int4(_mm_set1_epi32(i));
+#else
+ return make_int4(i, i, i, i);
+#endif
}
-ccl_device_inline int4 make_int4(const float3 &f)
+ccl_device_inline int4 make_int4(const float3 f)
{
-# ifdef __KERNEL_SSE__
- int4 a(_mm_cvtps_epi32(f.m128));
-# else
- int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w};
-# endif
- return a;
+#if defined(__KERNEL_GPU__)
+ return make_int4((int)f.x, (int)f.y, (int)f.z, 0);
+#elif defined(__KERNEL_SSE__)
+ return int4(_mm_cvtps_epi32(f.m128));
+#else
+ return make_int4((int)f.x, (int)f.y, (int)f.z, (int)f.w);
+#endif
}
-ccl_device_inline int4 make_int4(const float4 &f)
+ccl_device_inline int4 make_int4(const float4 f)
{
-# ifdef __KERNEL_SSE__
- int4 a(_mm_cvtps_epi32(f.m128));
-# else
- int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w};
-# endif
- return a;
+#ifdef __KERNEL_SSE__
+ return int4(_mm_cvtps_epi32(f.m128));
+#else
+ return make_int4((int)f.x, (int)f.y, (int)f.z, (int)f.w);
+#endif
}
-ccl_device_inline void print_int4(const char *label, const int4 &a)
+ccl_device_inline void print_int4(ccl_private const char *label, const int4 a)
{
+#ifdef __KERNEL_PRINTF__
printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w);
+#endif
}
-#endif /* __KERNEL_GPU__ */
CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_INT4_IMPL_H__ */
diff --git a/intern/cycles/util/types_spectrum.h b/intern/cycles/util/types_spectrum.h
new file mode 100644
index 00000000000..c59230b83ae
--- /dev/null
+++ b/intern/cycles/util/types_spectrum.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2022 Blender Foundation */
+
+#ifndef __UTIL_TYPES_SPECTRUM_H__
+#define __UTIL_TYPES_SPECTRUM_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util/types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#define SPECTRUM_CHANNELS 3
+#define SPECTRUM_DATA_TYPE float3
+#define PACKED_SPECTRUM_DATA_TYPE packed_float3
+
+using Spectrum = SPECTRUM_DATA_TYPE;
+using PackedSpectrum = PACKED_SPECTRUM_DATA_TYPE;
+
+#define make_spectrum(f) CONCAT(make_, SPECTRUM_DATA_TYPE(f))
+#define load_spectrum(f) CONCAT(load_, SPECTRUM_DATA_TYPE(f))
+#define store_spectrum(s, f) CONCAT(store_, SPECTRUM_DATA_TYPE((s), (f)))
+
+#define zero_spectrum CONCAT(zero_, SPECTRUM_DATA_TYPE)
+#define one_spectrum CONCAT(one_, SPECTRUM_DATA_TYPE)
+
+#define FOREACH_SPECTRUM_CHANNEL(counter) \
+ for (int counter = 0; counter < SPECTRUM_CHANNELS; counter++)
+
+#define GET_SPECTRUM_CHANNEL(v, i) (((ccl_private float *)(&(v)))[i])
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_SPECTRUM_H__ */
diff --git a/intern/cycles/util/types_uchar2.h b/intern/cycles/util/types_uchar2.h
index 445fa8dd703..ce617248e6e 100644
--- a/intern/cycles/util/types_uchar2.h
+++ b/intern/cycles/util/types_uchar2.h
@@ -1,8 +1,7 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
-#ifndef __UTIL_TYPES_UCHAR2_H__
-#define __UTIL_TYPES_UCHAR2_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
@@ -10,17 +9,17 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
struct uchar2 {
uchar x, y;
+# ifndef __KERNEL_GPU__
__forceinline uchar operator[](int i) const;
__forceinline uchar &operator[](int i);
+# endif
};
ccl_device_inline uchar2 make_uchar2(uchar x, uchar y);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_UCHAR2_H__ */
diff --git a/intern/cycles/util/types_uchar2_impl.h b/intern/cycles/util/types_uchar2_impl.h
index cec1c679050..9f3f3a4efb9 100644
--- a/intern/cycles/util/types_uchar2_impl.h
+++ b/intern/cycles/util/types_uchar2_impl.h
@@ -10,7 +10,8 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
+# ifndef __KERNEL_GPU__
uchar uchar2::operator[](int i) const
{
util_assert(i >= 0);
@@ -24,13 +25,14 @@ uchar &uchar2::operator[](int i)
util_assert(i < 2);
return *(&x + i);
}
+# endif
ccl_device_inline uchar2 make_uchar2(uchar x, uchar y)
{
uchar2 a = {x, y};
return a;
}
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types_uchar3.h b/intern/cycles/util/types_uchar3.h
index 1ebd86441c3..aed04c4775e 100644
--- a/intern/cycles/util/types_uchar3.h
+++ b/intern/cycles/util/types_uchar3.h
@@ -10,16 +10,18 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
struct uchar3 {
uchar x, y, z;
+# ifndef __KERNEL_GPU__
__forceinline uchar operator[](int i) const;
__forceinline uchar &operator[](int i);
+# endif
};
ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types_uchar3_impl.h b/intern/cycles/util/types_uchar3_impl.h
index 0656baa3da4..83eb3c99b3c 100644
--- a/intern/cycles/util/types_uchar3_impl.h
+++ b/intern/cycles/util/types_uchar3_impl.h
@@ -10,7 +10,8 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
+# ifndef __KERNEL_GPU__
uchar uchar3::operator[](int i) const
{
util_assert(i >= 0);
@@ -24,13 +25,14 @@ uchar &uchar3::operator[](int i)
util_assert(i < 3);
return *(&x + i);
}
+# endif
ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z)
{
uchar3 a = {x, y, z};
return a;
}
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types_uchar4.h b/intern/cycles/util/types_uchar4.h
index 2ac4fb56cbb..fb13a98875e 100644
--- a/intern/cycles/util/types_uchar4.h
+++ b/intern/cycles/util/types_uchar4.h
@@ -1,8 +1,7 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
-#ifndef __UTIL_TYPES_UCHAR4_H__
-#define __UTIL_TYPES_UCHAR4_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
@@ -10,17 +9,17 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
struct uchar4 {
uchar x, y, z, w;
+# ifndef __KERNEL_GPU__
__forceinline uchar operator[](int i) const;
__forceinline uchar &operator[](int i);
+# endif
};
ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_UCHAR4_H__ */
diff --git a/intern/cycles/util/types_uchar4_impl.h b/intern/cycles/util/types_uchar4_impl.h
index b3e8abfe873..244bb98f883 100644
--- a/intern/cycles/util/types_uchar4_impl.h
+++ b/intern/cycles/util/types_uchar4_impl.h
@@ -10,7 +10,8 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
+# ifndef __KERNEL_GPU__
uchar uchar4::operator[](int i) const
{
util_assert(i >= 0);
@@ -24,13 +25,14 @@ uchar &uchar4::operator[](int i)
util_assert(i < 4);
return *(&x + i);
}
+# endif
ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w)
{
uchar4 a = {x, y, z, w};
return a;
}
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types_uint2.h b/intern/cycles/util/types_uint2.h
index e3254b9f0e1..4d76b628088 100644
--- a/intern/cycles/util/types_uint2.h
+++ b/intern/cycles/util/types_uint2.h
@@ -1,8 +1,7 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
-#ifndef __UTIL_TYPES_UINT2_H__
-#define __UTIL_TYPES_UINT2_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
@@ -10,17 +9,17 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
struct uint2 {
uint x, y;
+# ifndef __KERNEL_GPU__
__forceinline uint operator[](uint i) const;
__forceinline uint &operator[](uint i);
+# endif
};
ccl_device_inline uint2 make_uint2(uint x, uint y);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_UINT2_H__ */
diff --git a/intern/cycles/util/types_uint2_impl.h b/intern/cycles/util/types_uint2_impl.h
index e67134a011e..b508aaf2543 100644
--- a/intern/cycles/util/types_uint2_impl.h
+++ b/intern/cycles/util/types_uint2_impl.h
@@ -1,8 +1,7 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
-#ifndef __UTIL_TYPES_UINT2_IMPL_H__
-#define __UTIL_TYPES_UINT2_IMPL_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
@@ -10,7 +9,8 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
+# ifndef __KERNEL_GPU__
__forceinline uint uint2::operator[](uint i) const
{
util_assert(i < 2);
@@ -22,14 +22,13 @@ __forceinline uint &uint2::operator[](uint i)
util_assert(i < 2);
return *(&x + i);
}
+# endif
ccl_device_inline uint2 make_uint2(uint x, uint y)
{
uint2 a = {x, y};
return a;
}
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_UINT2_IMPL_H__ */
diff --git a/intern/cycles/util/types_uint3.h b/intern/cycles/util/types_uint3.h
index 885a8fb84ce..b1571716fc7 100644
--- a/intern/cycles/util/types_uint3.h
+++ b/intern/cycles/util/types_uint3.h
@@ -1,8 +1,7 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
-#ifndef __UTIL_TYPES_UINT3_H__
-#define __UTIL_TYPES_UINT3_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
@@ -10,17 +9,17 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
struct uint3 {
uint x, y, z;
+# ifndef __KERNEL_GPU__
__forceinline uint operator[](uint i) const;
__forceinline uint &operator[](uint i);
+# endif
};
ccl_device_inline uint3 make_uint3(uint x, uint y, uint z);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_UINT3_H__ */
diff --git a/intern/cycles/util/types_uint3_impl.h b/intern/cycles/util/types_uint3_impl.h
index f4d3d72469c..d36c9f52de9 100644
--- a/intern/cycles/util/types_uint3_impl.h
+++ b/intern/cycles/util/types_uint3_impl.h
@@ -1,8 +1,7 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
-#ifndef __UTIL_TYPES_UINT3_IMPL_H__
-#define __UTIL_TYPES_UINT3_IMPL_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
@@ -10,7 +9,8 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
+# ifndef __KERNEL_GPU__
__forceinline uint uint3::operator[](uint i) const
{
util_assert(i < 3);
@@ -22,14 +22,13 @@ __forceinline uint &uint3::operator[](uint i)
util_assert(i < 3);
return *(&x + i);
}
+# endif
ccl_device_inline uint3 make_uint3(uint x, uint y, uint z)
{
uint3 a = {x, y, z};
return a;
}
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_UINT3_IMPL_H__ */
diff --git a/intern/cycles/util/types_uint4.h b/intern/cycles/util/types_uint4.h
index d582b91d2a0..4982b30f577 100644
--- a/intern/cycles/util/types_uint4.h
+++ b/intern/cycles/util/types_uint4.h
@@ -1,8 +1,7 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
-#ifndef __UTIL_TYPES_UINT4_H__
-#define __UTIL_TYPES_UINT4_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
@@ -10,17 +9,17 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
struct uint4 {
uint x, y, z, w;
+# ifndef __KERNEL_GPU__
__forceinline uint operator[](uint i) const;
__forceinline uint &operator[](uint i);
+# endif
};
ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_UINT4_H__ */
diff --git a/intern/cycles/util/types_uint4_impl.h b/intern/cycles/util/types_uint4_impl.h
index 98a4c5e9fe9..1cfdb9e0992 100644
--- a/intern/cycles/util/types_uint4_impl.h
+++ b/intern/cycles/util/types_uint4_impl.h
@@ -1,8 +1,7 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
-#ifndef __UTIL_TYPES_UINT4_IMPL_H__
-#define __UTIL_TYPES_UINT4_IMPL_H__
+#pragma once
#ifndef __UTIL_TYPES_H__
# error "Do not include this file directly, include util/types.h instead."
@@ -10,7 +9,8 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
+# ifndef __KERNEL_GPU__
__forceinline uint uint4::operator[](uint i) const
{
util_assert(i < 3);
@@ -22,14 +22,13 @@ __forceinline uint &uint4::operator[](uint i)
util_assert(i < 3);
return *(&x + i);
}
+# endif
ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w)
{
uint4 a = {x, y, z, w};
return a;
}
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_UINT4_IMPL_H__ */
diff --git a/intern/cycles/util/types_ushort4.h b/intern/cycles/util/types_ushort4.h
index 1766c6bf734..aef36f63285 100644
--- a/intern/cycles/util/types_ushort4.h
+++ b/intern/cycles/util/types_ushort4.h
@@ -10,7 +10,7 @@
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
struct ushort4 {
uint16_t x, y, z, w;
diff --git a/intern/cycles/util/types_vector3.h b/intern/cycles/util/types_vector3.h
deleted file mode 100644
index 2e0d68e1bd0..00000000000
--- a/intern/cycles/util/types_vector3.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2022 Blender Foundation */
-
-#ifndef __UTIL_TYPES_VECTOR3_H__
-#define __UTIL_TYPES_VECTOR3_H__
-
-#ifndef __UTIL_TYPES_H__
-# error "Do not include this file directly, include util/types.h instead."
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef __KERNEL_GPU__
-template<typename T> class vector3 {
- public:
- T x, y, z;
-
- __forceinline vector3();
- __forceinline vector3(const T &a);
- __forceinline vector3(const T &x, const T &y, const T &z);
-};
-#endif /* __KERNEL_GPU__ */
-
-CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_VECTOR3_H__ */
diff --git a/intern/cycles/util/types_vector3_impl.h b/intern/cycles/util/types_vector3_impl.h
deleted file mode 100644
index a765780e2d3..00000000000
--- a/intern/cycles/util/types_vector3_impl.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2022 Blender Foundation */
-
-#ifndef __UTIL_TYPES_VECTOR3_IMPL_H__
-#define __UTIL_TYPES_VECTOR3_IMPL_H__
-
-#ifndef __UTIL_TYPES_H__
-# error "Do not include this file directly, include util/types.h instead."
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef __KERNEL_GPU__
-template<typename T> ccl_always_inline vector3<T>::vector3()
-{
-}
-
-template<typename T> ccl_always_inline vector3<T>::vector3(const T &a) : x(a), y(a), z(a)
-{
-}
-
-template<typename T>
-ccl_always_inline vector3<T>::vector3(const T &x, const T &y, const T &z) : x(x), y(y), z(z)
-{
-}
-#endif /* __KERNEL_GPU__ */
-
-CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_VECTOR3_IMPL_H__ */
diff --git a/intern/cycles/util/vector.h b/intern/cycles/util/vector.h
index 0056fb269ae..9e27997cf2c 100644
--- a/intern/cycles/util/vector.h
+++ b/intern/cycles/util/vector.h
@@ -10,7 +10,6 @@
#include "util/aligned_malloc.h"
#include "util/guarded_allocator.h"
-#include "util/types.h"
CCL_NAMESPACE_BEGIN