diff options
Diffstat (limited to 'intern/cycles/util')
60 files changed, 1596 insertions, 725 deletions
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index fddac1dbbcf..997d574a3b0 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -3,7 +3,6 @@ set(INC .. - ../../glew-mx ) set(INC_SYS @@ -26,6 +25,8 @@ set(SRC thread.cpp time.cpp transform.cpp + transform_avx2.cpp + transform_sse41.cpp windows.cpp ) @@ -63,6 +64,7 @@ set(SRC_HEADERS math_float2.h math_float3.h math_float4.h + math_float8.h math_int2.h math_int3.h math_int4.h @@ -115,6 +117,7 @@ set(SRC_HEADERS types_int3_impl.h types_int4.h types_int4_impl.h + types_spectrum.h types_uchar2.h types_uchar2_impl.h types_uchar3.h @@ -128,8 +131,6 @@ set(SRC_HEADERS types_uint4.h types_uint4_impl.h types_ushort4.h - types_vector3.h - types_vector3_impl.h unique_ptr.h vector.h version.h @@ -137,9 +138,14 @@ set(SRC_HEADERS xml.h ) +if(CXX_HAS_SSE) + set_source_files_properties(transform_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") +endif() +if(CXX_HAS_AVX2) + set_source_files_properties(transform_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") +endif() + include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) -add_definitions(${GL_DEFINITIONS}) - cycles_add_library(cycles_util "${LIB}" ${SRC} ${SRC_HEADERS}) diff --git a/intern/cycles/util/atomic.h b/intern/cycles/util/atomic.h index f89eb28b0b7..1ebf085ae13 100644 --- a/intern/cycles/util/atomic.h +++ b/intern/cycles/util/atomic.h @@ -106,6 +106,116 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float # endif /* __KERNEL_METAL__ */ +# ifdef __KERNEL_ONEAPI__ + +ccl_device_inline float atomic_add_and_fetch_float(ccl_global float *p, float x) +{ + sycl::atomic_ref<float, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_add(x); +} + +ccl_device_inline float atomic_compare_and_swap_float(ccl_global float *source, + float old_val, + float new_val) +{ + sycl::atomic_ref<float, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*source); + atomic.compare_exchange_weak(old_val, new_val); + return old_val; +} + +ccl_device_inline unsigned int atomic_fetch_and_add_uint32(ccl_global unsigned int *p, + unsigned int x) +{ + sycl::atomic_ref<unsigned int, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_add(x); +} + +ccl_device_inline int atomic_fetch_and_add_uint32(ccl_global int *p, int x) +{ + sycl::atomic_ref<int, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_add(x); +} + +ccl_device_inline unsigned int atomic_fetch_and_sub_uint32(ccl_global unsigned int *p, + unsigned int x) +{ + sycl::atomic_ref<unsigned int, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_sub(x); +} + +ccl_device_inline int atomic_fetch_and_sub_uint32(ccl_global int *p, int x) +{ + sycl::atomic_ref<int, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_sub(x); +} + +ccl_device_inline unsigned int atomic_fetch_and_inc_uint32(ccl_global unsigned int *p) +{ + return atomic_fetch_and_add_uint32(p, 1); +} + +ccl_device_inline int atomic_fetch_and_inc_uint32(ccl_global int *p) +{ + return atomic_fetch_and_add_uint32(p, 1); +} + +ccl_device_inline unsigned int atomic_fetch_and_dec_uint32(ccl_global unsigned int *p) +{ + return atomic_fetch_and_sub_uint32(p, 1); +} + +ccl_device_inline int atomic_fetch_and_dec_uint32(ccl_global int *p) +{ + return atomic_fetch_and_sub_uint32(p, 1); +} + +ccl_device_inline unsigned int atomic_fetch_and_or_uint32(ccl_global unsigned int *p, + unsigned int x) +{ + sycl::atomic_ref<unsigned int, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_or(x); +} + +ccl_device_inline int atomic_fetch_and_or_uint32(ccl_global int *p, int x) +{ + sycl::atomic_ref<int, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_or(x); +} + +# endif /* __KERNEL_ONEAPI__ */ + #endif /* __KERNEL_GPU__ */ #endif /* __UTIL_ATOMIC_H__ */ diff --git a/intern/cycles/util/color.h b/intern/cycles/util/color.h index 795c3754976..537f8ab6771 100644 --- a/intern/cycles/util/color.h +++ b/intern/cycles/util/color.h @@ -318,14 +318,14 @@ ccl_device float3 color_highlight_compress(float3 color, ccl_private float3 *var { color += one_float3(); if (variance) { - *variance *= sqr3(one_float3() / color); + *variance *= sqr(one_float3() / color); } - return log3(color); + return log(color); } ccl_device float3 color_highlight_uncompress(float3 color) { - return exp3(color) - one_float3(); + return exp(color) - one_float3(); } CCL_NAMESPACE_END diff --git a/intern/cycles/util/debug.cpp b/intern/cycles/util/debug.cpp index 65d108bb9d1..8210e21f951 100644 --- a/intern/cycles/util/debug.cpp +++ b/intern/cycles/util/debug.cpp @@ -13,7 +13,6 @@ CCL_NAMESPACE_BEGIN DebugFlags::CPU::CPU() - : avx2(true), avx(true), sse41(true), sse3(true), sse2(true), bvh_layout(BVH_LAYOUT_AUTO) { reset(); } @@ -25,7 +24,7 @@ void DebugFlags::CPU::reset() do { \ flag = (getenv(env) == NULL); \ if (!flag) { \ - VLOG(1) << "Disabling " << STRINGIFY(flag) << " instruction set."; \ + VLOG_INFO << "Disabling " << STRINGIFY(flag) << " instruction set."; \ } \ } while (0) @@ -41,17 +40,17 @@ void DebugFlags::CPU::reset() bvh_layout = BVH_LAYOUT_AUTO; } -DebugFlags::CUDA::CUDA() : adaptive_compile(false) +DebugFlags::CUDA::CUDA() { reset(); } -DebugFlags::HIP::HIP() : adaptive_compile(false) +DebugFlags::HIP::HIP() { reset(); } -DebugFlags::Metal::Metal() : adaptive_compile(false) +DebugFlags::Metal::Metal() { reset(); } @@ -84,14 +83,13 @@ void DebugFlags::OptiX::reset() use_debug = false; } -DebugFlags::DebugFlags() : viewport_static_bvh(false), running_inside_blender(false) +DebugFlags::DebugFlags() { /* Nothing for now. */ } void DebugFlags::reset() { - viewport_static_bvh = false; cpu.reset(); cuda.reset(); optix.reset(); diff --git a/intern/cycles/util/debug.h b/intern/cycles/util/debug.h index 3565fdea17f..ab200649f59 100644 --- a/intern/cycles/util/debug.h +++ b/intern/cycles/util/debug.h @@ -17,11 +17,6 @@ CCL_NAMESPACE_BEGIN */ class DebugFlags { public: - /* Use static BVH in viewport, to match final render exactly. */ - bool viewport_static_bvh; - - bool running_inside_blender; - /* Descriptor of CPU feature-set to be used. */ struct CPU { CPU(); @@ -30,11 +25,11 @@ class DebugFlags { void reset(); /* Flags describing which instructions sets are allowed for use. */ - bool avx2; - bool avx; - bool sse41; - bool sse3; - bool sse2; + bool avx2 = true; + bool avx = true; + bool sse41 = true; + bool sse3 = true; + bool sse2 = true; /* Check functions to see whether instructions up to the given one * are allowed for use. @@ -65,7 +60,7 @@ class DebugFlags { * By default the fastest will be used. For debugging the BVH used by other * CPUs and GPUs can be selected here instead. */ - BVHLayout bvh_layout; + BVHLayout bvh_layout = BVH_LAYOUT_AUTO; }; /* Descriptor of CUDA feature-set to be used. */ @@ -77,7 +72,7 @@ class DebugFlags { /* Whether adaptive feature based runtime compile is enabled or not. * Requires the CUDA Toolkit and only works on Linux at the moment. */ - bool adaptive_compile; + bool adaptive_compile = false; }; /* Descriptor of HIP feature-set to be used. */ @@ -88,7 +83,7 @@ class DebugFlags { void reset(); /* Whether adaptive feature based runtime compile is enabled or not. */ - bool adaptive_compile; + bool adaptive_compile = false; }; /* Descriptor of OptiX feature-set to be used. */ @@ -100,7 +95,7 @@ class DebugFlags { /* Load OptiX module with debug capabilities. Will lower logging verbosity level, enable * validations, and lower optimization level. */ - bool use_debug; + bool use_debug = false; }; /* Descriptor of Metal feature-set to be used. */ @@ -111,7 +106,7 @@ class DebugFlags { void reset(); /* Whether adaptive feature based runtime compile is enabled or not. */ - bool adaptive_compile; + bool adaptive_compile = false; }; /* Get instance of debug flags registry. */ @@ -142,15 +137,9 @@ class DebugFlags { private: DebugFlags(); -#if (__cplusplus > 199711L) public: explicit DebugFlags(DebugFlags const & /*other*/) = delete; void operator=(DebugFlags const & /*other*/) = delete; -#else - private: - explicit DebugFlags(DebugFlags const & /*other*/); - void operator=(DebugFlags const & /*other*/); -#endif }; typedef DebugFlags &DebugFlagsRef; diff --git a/intern/cycles/util/defines.h b/intern/cycles/util/defines.h index 115a747cf1c..1969529eff0 100644 --- a/intern/cycles/util/defines.h +++ b/intern/cycles/util/defines.h @@ -81,7 +81,7 @@ /* macros */ /* hints for branch prediction, only use in code that runs a _lot_ */ -#if defined(__GNUC__) && defined(__KERNEL_CPU__) +#if defined(__GNUC__) && !defined(__KERNEL_GPU__) # define LIKELY(x) __builtin_expect(!!(x), 1) # define UNLIKELY(x) __builtin_expect(!!(x), 0) #else @@ -89,46 +89,6 @@ # define UNLIKELY(x) (x) #endif -#if defined(__GNUC__) || defined(__clang__) -# if defined(__cplusplus) -/* Some magic to be sure we don't have reference in the type. */ -template<typename T> static inline T decltype_helper(T x) -{ - return x; -} -# define TYPEOF(x) decltype(decltype_helper(x)) -# else -# define TYPEOF(x) typeof(x) -# endif -#endif - -/* Causes warning: - * incompatible types when assigning to type 'Foo' from type 'Bar' - * ... the compiler optimizes away the temp var */ -#ifdef __GNUC__ -# define CHECK_TYPE(var, type) \ - { \ - TYPEOF(var) * __tmp; \ - __tmp = (type *)NULL; \ - (void)__tmp; \ - } \ - (void)0 - -# define CHECK_TYPE_PAIR(var_a, var_b) \ - { \ - TYPEOF(var_a) * __tmp; \ - __tmp = (typeof(var_b) *)NULL; \ - (void)__tmp; \ - } \ - (void)0 -#else -# define CHECK_TYPE(var, type) -# define CHECK_TYPE_PAIR(var_a, var_b) -#endif - -/* can be used in simple macros */ -#define CHECK_TYPE_INLINE(val, type) ((void)(((type)0) != (val))) - #ifndef __KERNEL_GPU__ # include <cassert> # define util_assert(statement) assert(statement) @@ -136,4 +96,7 @@ template<typename T> static inline T decltype_helper(T x) # define util_assert(statement) #endif +#define CONCAT_HELPER(a, ...) a##__VA_ARGS__ +#define CONCAT(a, ...) CONCAT_HELPER(a, __VA_ARGS__) + #endif /* __UTIL_DEFINES_H__ */ diff --git a/intern/cycles/util/half.h b/intern/cycles/util/half.h index 434bc12d670..c668638eb02 100644 --- a/intern/cycles/util/half.h +++ b/intern/cycles/util/half.h @@ -35,7 +35,7 @@ ccl_device_inline float half_to_float(half h_in) #else /* CUDA has its own half data type, no need to define then */ -# if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__) +# if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__) && !defined(__KERNEL_ONEAPI__) /* Implementing this as a class rather than a typedef so that the compiler can tell it apart from * unsigned shorts. */ class half { @@ -73,7 +73,7 @@ struct half4 { ccl_device_inline half float_to_half_image(float f) { -#if defined(__KERNEL_METAL__) +#if defined(__KERNEL_METAL__) || defined(__KERNEL_ONEAPI__) return half(min(f, 65504.0f)); #elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) return __float2half(min(f, 65504.0f)); @@ -103,6 +103,8 @@ ccl_device_inline float half_to_float_image(half h) { #if defined(__KERNEL_METAL__) return half_to_float(h); +#elif defined(__KERNEL_ONEAPI__) + return float(h); #elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) return __half2float(h); #else @@ -136,7 +138,7 @@ ccl_device_inline float4 half4_to_float4_image(const half4 h) ccl_device_inline half float_to_half_display(const float f) { -#if defined(__KERNEL_METAL__) +#if defined(__KERNEL_METAL__) || defined(__KERNEL_ONEAPI__) return half(min(f, 65504.0f)); #elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) return __float2half(min(f, 65504.0f)); diff --git a/intern/cycles/util/hash.h b/intern/cycles/util/hash.h index 081b33025d8..4f83f331229 100644 --- a/intern/cycles/util/hash.h +++ b/intern/cycles/util/hash.h @@ -4,10 +4,28 @@ #ifndef __UTIL_HASH_H__ #define __UTIL_HASH_H__ +#include "util/math.h" #include "util/types.h" CCL_NAMESPACE_BEGIN +/* [0, uint_max] -> [0.0, 1.0) */ +ccl_device_forceinline float uint_to_float_excl(uint n) +{ + // Note: we divide by 4294967808 instead of 2^32 because the latter + // leads to a [0.0, 1.0] mapping instead of [0.0, 1.0) due to floating + // point rounding error. 4294967808 unfortunately leaves (precisely) + // one unused ulp between the max number this outputs and 1.0, but + // that's the best you can do with this construction. + return (float)n * (1.0f / 4294967808.0f); +} + +/* [0, uint_max] -> [0.0, 1.0] */ +ccl_device_forceinline float uint_to_float_incl(uint n) +{ + return (float)n * (1.0f / (float)0xFFFFFFFFu); +} + /* ***** Jenkins Lookup3 Hash Functions ***** */ /* Source: http://burtleburtle.net/bob/c/lookup3.c */ @@ -116,22 +134,22 @@ ccl_device_inline uint hash_uint4(uint kx, uint ky, uint kz, uint kw) ccl_device_inline float hash_uint_to_float(uint kx) { - return (float)hash_uint(kx) / (float)0xFFFFFFFFu; + return uint_to_float_incl(hash_uint(kx)); } ccl_device_inline float hash_uint2_to_float(uint kx, uint ky) { - return (float)hash_uint2(kx, ky) / (float)0xFFFFFFFFu; + return uint_to_float_incl(hash_uint2(kx, ky)); } ccl_device_inline float hash_uint3_to_float(uint kx, uint ky, uint kz) { - return (float)hash_uint3(kx, ky, kz) / (float)0xFFFFFFFFu; + return uint_to_float_incl(hash_uint3(kx, ky, kz)); } ccl_device_inline float hash_uint4_to_float(uint kx, uint ky, uint kz, uint kw) { - return (float)hash_uint4(kx, ky, kz, kw) / (float)0xFFFFFFFFu; + return uint_to_float_incl(hash_uint4(kx, ky, kz, kw)); } /* Hashing float or float[234] into a float in the range [0, 1]. */ @@ -359,6 +377,123 @@ ccl_device_inline avxi hash_avxi4(avxi kx, avxi ky, avxi kz, avxi kw) #endif +/* ***** Hash Prospector Hash Functions ***** + * + * These are based on the high-quality 32-bit hash/mixing functions from + * https://github.com/skeeto/hash-prospector + */ + +ccl_device_inline uint hash_hp_uint(uint i) +{ + // The actual mixing function from Hash Prospector. + i ^= i >> 16; + i *= 0x21f0aaad; + i ^= i >> 15; + i *= 0xd35a2d97; + i ^= i >> 15; + + // The xor is just to make input zero not map to output zero. + // The number is randomly selected and isn't special. + return i ^ 0xe6fe3beb; +} + +/* Seedable version of hash_hp_uint() above. */ +ccl_device_inline uint hash_hp_seeded_uint(uint i, uint seed) +{ + // Manipulate the seed so it doesn't interact poorly with n when they + // are both e.g. incrementing. This isn't fool-proof, but is good + // enough for practical use. + seed ^= seed << 19; + + return hash_hp_uint(i ^ seed); +} + +/* Outputs [0.0, 1.0). */ +ccl_device_inline float hash_hp_float(uint i) +{ + return uint_to_float_excl(hash_hp_uint(i)); +} + +/* Outputs [0.0, 1.0). */ +ccl_device_inline float hash_hp_seeded_float(uint i, uint seed) +{ + return uint_to_float_excl(hash_hp_seeded_uint(i, seed)); +} + +/* ***** Modified Wang Hash Functions ***** + * + * These are based on a bespoke modified version of the Wang hash, and + * can serve as a faster hash when quality isn't critical. + * + * The original Wang hash is documented here: + * https://www.burtleburtle.net/bob/hash/integer.html + */ + +ccl_device_inline uint hash_wang_seeded_uint(uint i, uint seed) +{ + i = (i ^ 61) ^ seed; + i += i << 3; + i ^= i >> 4; + i *= 0x27d4eb2d; + return i; +} + +/* Outputs [0.0, 1.0). */ +ccl_device_inline float hash_wang_seeded_float(uint i, uint seed) +{ + return uint_to_float_excl(hash_wang_seeded_uint(i, seed)); +} + +/* ***** Index Shuffling Hash Function ***** + * + * This function takes an index, the length of the thing the index points + * into, and returns a shuffled index. For example, if you pass indices + * 0 through 19 to this function with a length parameter of 20, it will + * return the indices in a shuffled order with no repeats. Indices + * larger than the length parameter will simply repeat the same shuffled + * pattern over and over. + * + * This is useful for iterating over an array in random shuffled order + * without having to shuffle the array itself. + * + * Passing different seeds results in different random shuffles. + * + * This function runs in average O(1) time. + * + * See https://andrew-helmer.github.io/permute/ for details on how this + * works. + */ +ccl_device_inline uint hash_shuffle_uint(uint i, uint length, uint seed) +{ + i = i % length; + uint mask = (1 << (32 - count_leading_zeros(length - 1))) - 1; + + do { + i ^= seed; + i *= 0xe170893d; + i ^= seed >> 16; + i ^= (i & mask) >> 4; + i ^= seed >> 8; + i *= 0x0929eb3f; + i ^= seed >> 23; + i ^= (i & mask) >> 1; + i *= 1 | seed >> 27; + i *= 0x6935fa69; + i ^= (i & mask) >> 11; + i *= 0x74dcb303; + i ^= (i & mask) >> 2; + i *= 0x9e501cc3; + i ^= (i & mask) >> 2; + i *= 0xc860a3df; + i &= mask; + i ^= i >> 5; + } while (i >= length); + + return i; +} + +/* ********** */ + #ifndef __KERNEL_GPU__ static inline uint hash_string(const char *str) { diff --git a/intern/cycles/util/log.h b/intern/cycles/util/log.h index b33c826d6f5..3780d03c0d1 100644 --- a/intern/cycles/util/log.h +++ b/intern/cycles/util/log.h @@ -69,9 +69,22 @@ class LogMessageVoidify { # define LOG_ASSERT(expression) LOG_SUPPRESS() #endif -#define VLOG_ONCE(level, flag) \ - if (!flag) \ - flag = true, VLOG(level) +/* Verbose logging categories. */ + +/* Warnings. */ +#define VLOG_WARNING VLOG(1) +/* Info about devices, scene contents and features used. */ +#define VLOG_INFO VLOG(2) +#define VLOG_INFO_IS_ON VLOG_IS_ON(2) +/* Work being performed and timing/memory stats about that work. */ +#define VLOG_WORK VLOG(3) +#define VLOG_WORK_IS_ON VLOG_IS_ON(3) +/* Detailed device timing stats. */ +#define VLOG_DEVICE_STATS VLOG(4) +#define VLOG_DEVICE_STATS_IS_ON VLOG_IS_ON(4) +/* Verbose debug messages. */ +#define VLOG_DEBUG VLOG(5) +#define VLOG_DEBUG_IS_ON VLOG_IS_ON(5) struct int2; struct float3; diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h index 555a5304764..0905b3ec5c9 100644 --- a/intern/cycles/util/math.h +++ b/intern/cycles/util/math.h @@ -79,7 +79,7 @@ CCL_NAMESPACE_BEGIN /* Scalar */ -#ifndef __HIP__ +#if !defined(__HIP__) && !defined(__KERNEL_ONEAPI__) # ifdef _WIN32 ccl_device_inline float fmaxf(float a, float b) { @@ -92,12 +92,18 @@ ccl_device_inline float fminf(float a, float b) } # endif /* _WIN32 */ -#endif /* __HIP__ */ +#endif /* __HIP__, __KERNEL_ONEAPI__ */ -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) +# ifndef __KERNEL_ONEAPI__ using std::isfinite; using std::isnan; using std::sqrt; +# else +using sycl::sqrt; +# define isfinite(x) sycl::isfinite((x)) +# define isnan(x) sycl::isnan((x)) +# endif ccl_device_inline int abs(int x) { @@ -297,8 +303,15 @@ ccl_device_inline float4 __int4_as_float4(int4 i) #endif /* !defined(__KERNEL_METAL__) */ #if defined(__KERNEL_METAL__) -# define isnan_safe(v) isnan(v) -# define isfinite_safe(v) isfinite(v) +ccl_device_forceinline bool isnan_safe(float f) +{ + return isnan(f); +} + +ccl_device_forceinline bool isfinite_safe(float f) +{ + return isfinite(f); +} #else template<typename T> ccl_device_inline uint pointer_pack_to_uint_0(T *ptr) { @@ -498,6 +511,11 @@ ccl_device_inline float4 float3_to_float4(const float3 a) return make_float4(a.x, a.y, a.z, 1.0f); } +ccl_device_inline float4 float3_to_float4(const float3 a, const float w) +{ + return make_float4(a.x, a.y, a.z, w); +} + ccl_device_inline float inverse_lerp(float a, float b, float x) { return (x - a) / (b - a); @@ -522,6 +540,7 @@ CCL_NAMESPACE_END #include "util/math_float2.h" #include "util/math_float3.h" #include "util/math_float4.h" +#include "util/math_float8.h" #include "util/rect.h" @@ -576,26 +595,26 @@ ccl_device_inline void make_orthonormals(const float3 N, /* Color division */ -ccl_device_inline float3 safe_invert_color(float3 a) +ccl_device_inline Spectrum safe_invert_color(Spectrum a) { - float x, y, z; - - x = (a.x != 0.0f) ? 1.0f / a.x : 0.0f; - y = (a.y != 0.0f) ? 1.0f / a.y : 0.0f; - z = (a.z != 0.0f) ? 1.0f / a.z : 0.0f; + FOREACH_SPECTRUM_CHANNEL (i) { + GET_SPECTRUM_CHANNEL(a, i) = (GET_SPECTRUM_CHANNEL(a, i) != 0.0f) ? + 1.0f / GET_SPECTRUM_CHANNEL(a, i) : + 0.0f; + } - return make_float3(x, y, z); + return a; } -ccl_device_inline float3 safe_divide_color(float3 a, float3 b) +ccl_device_inline Spectrum safe_divide_color(Spectrum a, Spectrum b) { - float x, y, z; - - x = (b.x != 0.0f) ? a.x / b.x : 0.0f; - y = (b.y != 0.0f) ? a.y / b.y : 0.0f; - z = (b.z != 0.0f) ? a.z / b.z : 0.0f; + FOREACH_SPECTRUM_CHANNEL (i) { + GET_SPECTRUM_CHANNEL(a, i) = (GET_SPECTRUM_CHANNEL(b, i) != 0.0f) ? + GET_SPECTRUM_CHANNEL(a, i) / GET_SPECTRUM_CHANNEL(b, i) : + 0.0f; + } - return make_float3(x, y, z); + return a; } ccl_device_inline float3 safe_divide_even_color(float3 a, float3 b) @@ -786,6 +805,11 @@ ccl_device_inline uint popcount(uint x) return i & 1; } # endif +#elif defined(__KERNEL_ONEAPI__) +# define popcount(x) sycl::popcount(x) +#elif defined(__KERNEL_HIP__) +/* Use popcll to support 64-bit wave for pre-RDNA AMD GPUs */ +# define popcount(x) __popcll(x) #elif !defined(__KERNEL_METAL__) # define popcount(x) __popc(x) #endif @@ -796,6 +820,8 @@ ccl_device_inline uint count_leading_zeros(uint x) return __clz(x); #elif defined(__KERNEL_METAL__) return clz(x); +#elif defined(__KERNEL_ONEAPI__) + return sycl::clz(x); #else assert(x != 0); # ifdef _MSC_VER @@ -814,6 +840,8 @@ ccl_device_inline uint count_trailing_zeros(uint x) return (__ffs(x) - 1); #elif defined(__KERNEL_METAL__) return ctz(x); +#elif defined(__KERNEL_ONEAPI__) + return sycl::ctz(x); #else assert(x != 0); # ifdef _MSC_VER @@ -858,16 +886,16 @@ ccl_device_inline float2 map_to_tube(const float3 co) ccl_device_inline float2 map_to_sphere(const float3 co) { - float l = len(co); + float l = dot(co, co); float u, v; if (l > 0.0f) { if (UNLIKELY(co.x == 0.0f && co.y == 0.0f)) { u = 0.0f; /* Otherwise domain error. */ } else { - u = (1.0f - atan2f(co.x, co.y) / M_PI_F) / 2.0f; + u = (0.5f - atan2f(co.x, co.y) * M_1_2PI_F); } - v = 1.0f - safe_acosf(co.z / l) / M_PI_F; + v = 1.0f - safe_acosf(co.z / sqrtf(l)) * M_1_PI_F; } else { u = v = 0.0f; @@ -925,7 +953,11 @@ ccl_device_inline uint prev_power_of_two(uint x) ccl_device_inline uint32_t reverse_integer_bits(uint32_t x) { /* Use a native instruction if it exists. */ -#if defined(__aarch64__) || defined(_M_ARM64) +#if defined(__KERNEL_CUDA__) + return __brev(x); +#elif defined(__KERNEL_METAL__) + return reverse_bits(x); +#elif defined(__aarch64__) || defined(_M_ARM64) /* Assume the rbit is always available on 64bit ARM architecture. */ __asm__("rbit %w0, %w1" : "=r"(x) : "r"(x)); return x; @@ -934,10 +966,6 @@ ccl_device_inline uint32_t reverse_integer_bits(uint32_t x) * This 32-bit Thumb instruction is available in ARMv6T2 and above. */ __asm__("rbit %0, %1" : "=r"(x) : "r"(x)); return x; -#elif defined(__KERNEL_CUDA__) - return __brev(x); -#elif defined(__KERNEL_METAL__) - return reverse_bits(x); #elif __has_builtin(__builtin_bitreverse32) return __builtin_bitreverse32(x); #else diff --git a/intern/cycles/util/math_fast.h b/intern/cycles/util/math_fast.h index 2221e7a9835..142a664a1d2 100644 --- a/intern/cycles/util/math_fast.h +++ b/intern/cycles/util/math_fast.h @@ -420,7 +420,7 @@ ccl_device_inline float fast_expf(float x) return fast_exp2f(x / M_LN2_F); } -#if defined(__KERNEL_CPU__) && !defined(_MSC_VER) +#if !defined(__KERNEL_GPU__) && !defined(_MSC_VER) /* MSVC seems to have a code-gen bug here in at least SSE41/AVX, see * T78047 and T78869 for details. Just disable for now, it only makes * a small difference in denoising performance. */ diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h index 365c322dd7e..c408eadf195 100644 --- a/intern/cycles/util/math_float3.h +++ b/intern/cycles/util/math_float3.h @@ -53,26 +53,25 @@ ccl_device_inline float3 ceil(const float3 &a); ccl_device_inline float3 reflect(const float3 incident, const float3 normal); #endif /* !defined(__KERNEL_METAL__) */ -ccl_device_inline float min3(float3 a); -ccl_device_inline float max3(float3 a); +ccl_device_inline float reduce_min(float3 a); +ccl_device_inline float reduce_max(float3 a); ccl_device_inline float len(const float3 a); ccl_device_inline float len_squared(const float3 a); ccl_device_inline float3 project(const float3 v, const float3 v_proj); -ccl_device_inline float3 saturate3(float3 a); ccl_device_inline float3 safe_normalize(const float3 a); ccl_device_inline float3 normalize_len(const float3 a, ccl_private float *t); ccl_device_inline float3 safe_normalize_len(const float3 a, ccl_private float *t); -ccl_device_inline float3 safe_divide_float3_float3(const float3 a, const float3 b); -ccl_device_inline float3 safe_divide_float3_float(const float3 a, const float b); +ccl_device_inline float3 safe_divide(const float3 a, const float3 b); +ccl_device_inline float3 safe_divide(const float3 a, const float b); ccl_device_inline float3 interp(float3 a, float3 b, float t); -ccl_device_inline float3 sqr3(float3 a); +ccl_device_inline float3 sqr(float3 a); ccl_device_inline bool is_zero(const float3 a); ccl_device_inline float reduce_add(const float3 a); ccl_device_inline float average(const float3 a); -ccl_device_inline bool isequal_float3(const float3 a, const float3 b); +ccl_device_inline bool isequal(const float3 a, const float3 b); /******************************************************************************* * Definition. @@ -148,8 +147,11 @@ ccl_device_inline float3 operator/(const float f, const float3 &a) ccl_device_inline float3 operator/(const float3 &a, const float f) { - float invf = 1.0f / f; - return a * invf; +# if defined(__KERNEL_SSE__) + return float3(_mm_div_ps(a.m128, _mm_set1_ps(f))); +# else + return make_float3(a.x / f, a.y / f, a.z / f); +# endif } ccl_device_inline float3 operator/(const float3 &a, const float3 &b) @@ -285,8 +287,12 @@ ccl_device_inline float dot_xy(const float3 &a, const float3 &b) ccl_device_inline float3 cross(const float3 &a, const float3 &b) { - float3 r = make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); - return r; +# ifdef __KERNEL_SSE__ + return float3(shuffle<1, 2, 0, 3>( + msub(ssef(a), shuffle<1, 2, 0, 3>(ssef(b)), shuffle<1, 2, 0, 3>(ssef(a)) * ssef(b)))); +# else + return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); +# endif } ccl_device_inline float3 normalize(const float3 &a) @@ -377,14 +383,30 @@ ccl_device_inline float3 rcp(const float3 &a) return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z); # endif } + +ccl_device_inline float3 saturate(float3 a) +{ + return make_float3(saturatef(a.x), saturatef(a.y), saturatef(a.z)); +} + +ccl_device_inline float3 exp(float3 v) +{ + return make_float3(expf(v.x), expf(v.y), expf(v.z)); +} + +ccl_device_inline float3 log(float3 v) +{ + return make_float3(logf(v.x), logf(v.y), logf(v.z)); +} + #endif /* !__KERNEL_METAL__ */ -ccl_device_inline float min3(float3 a) +ccl_device_inline float reduce_min(float3 a) { return min(min(a.x, a.y), a.z); } -ccl_device_inline float max3(float3 a) +ccl_device_inline float reduce_max(float3 a) { return max(max(a.x, a.y), a.z); } @@ -433,11 +455,6 @@ ccl_device_inline float3 project(const float3 v, const float3 v_proj) return (len_squared != 0.0f) ? (dot(v, v_proj) / len_squared) * v_proj : zero_float3(); } -ccl_device_inline float3 saturate3(float3 a) -{ - return make_float3(saturatef(a.x), saturatef(a.y), saturatef(a.z)); -} - ccl_device_inline float3 normalize_len(const float3 a, ccl_private float *t) { *t = len(a); @@ -457,14 +474,14 @@ ccl_device_inline float3 safe_normalize_len(const float3 a, ccl_private float *t return (*t != 0.0f) ? a / (*t) : a; } -ccl_device_inline float3 safe_divide_float3_float3(const float3 a, const float3 b) +ccl_device_inline float3 safe_divide(const float3 a, const float3 b) { return make_float3((b.x != 0.0f) ? a.x / b.x : 0.0f, (b.y != 0.0f) ? a.y / b.y : 0.0f, (b.z != 0.0f) ? a.z / b.z : 0.0f); } -ccl_device_inline float3 safe_divide_float3_float(const float3 a, const float b) +ccl_device_inline float3 safe_divide(const float3 a, const float b) { return (b != 0.0f) ? a / b : zero_float3(); } @@ -474,7 +491,7 @@ ccl_device_inline float3 interp(float3 a, float3 b, float t) return a + t * (b - a); } -ccl_device_inline float3 sqr3(float3 a) +ccl_device_inline float3 sqr(float3 a) { return a * a; } @@ -504,7 +521,7 @@ ccl_device_inline float average(const float3 a) return reduce_add(a) * (1.0f / 3.0f); } -ccl_device_inline bool isequal_float3(const float3 a, const float3 b) +ccl_device_inline bool isequal(const float3 a, const float3 b) { #if defined(__KERNEL_METAL__) return all(a == b); @@ -513,21 +530,11 @@ ccl_device_inline bool isequal_float3(const float3 a, const float3 b) #endif } -ccl_device_inline float3 pow3(float3 v, float e) +ccl_device_inline float3 pow(float3 v, float e) { return make_float3(powf(v.x, e), powf(v.y, e), powf(v.z, e)); } -ccl_device_inline float3 exp3(float3 v) -{ - return make_float3(expf(v.x), expf(v.y), expf(v.z)); -} - -ccl_device_inline float3 log3(float3 v) -{ - return make_float3(logf(v.x), logf(v.y), logf(v.z)); -} - ccl_device_inline int3 quick_floor_to_int3(const float3 a) { #ifdef __KERNEL_SSE__ @@ -540,12 +547,12 @@ ccl_device_inline int3 quick_floor_to_int3(const float3 a) #endif } -ccl_device_inline bool isfinite3_safe(float3 v) +ccl_device_inline bool isfinite_safe(float3 v) { return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z); } -ccl_device_inline float3 ensure_finite3(float3 v) +ccl_device_inline float3 ensure_finite(float3 v) { if (!isfinite_safe(v.x)) v.x = 0.0f; diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h index ae9dfe75a9c..c2721873037 100644 --- a/intern/cycles/util/math_float4.h +++ b/intern/cycles/util/math_float4.h @@ -55,7 +55,8 @@ ccl_device_inline float4 floor(const float4 &a); ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t); #endif /* !__KERNEL_METAL__*/ -ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b); +ccl_device_inline float4 safe_divide(const float4 a, const float4 b); +ccl_device_inline float4 safe_divide(const float4 a, const float b); #ifdef __KERNEL_SSE__ template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> @@ -74,11 +75,14 @@ template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b); # endif #endif /* __KERNEL_SSE__ */ +ccl_device_inline float reduce_min(const float4 a); +ccl_device_inline float reduce_max(const float4 a); +ccl_device_inline float reduce_add(const float4 a); + +ccl_device_inline bool isequal(const float4 a, const float4 b); + #ifndef __KERNEL_GPU__ ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b); -ccl_device_inline float4 reduce_min(const float4 &a); -ccl_device_inline float4 reduce_max(const float4 &a); -ccl_device_inline float4 reduce_add(const float4 &a); #endif /* !__KERNEL_GPU__ */ /******************************************************************************* @@ -303,27 +307,9 @@ ccl_device_inline bool is_zero(const float4 &a) # endif } -ccl_device_inline float4 reduce_add(const float4 &a) -{ -# if defined(__KERNEL_SSE__) -# if defined(__KERNEL_NEON__) - return float4(vdupq_n_f32(vaddvq_f32(a))); -# elif defined(__KERNEL_SSE3__) - float4 h(_mm_hadd_ps(a.m128, a.m128)); - return float4(_mm_hadd_ps(h.m128, h.m128)); -# else - float4 h(shuffle<1, 0, 3, 2>(a) + a); - return shuffle<2, 3, 0, 1>(h) + h; -# endif -# else - float sum = (a.x + a.y) + (a.z + a.w); - return make_float4(sum, sum, sum, sum); -# endif -} - ccl_device_inline float average(const float4 &a) { - return reduce_add(a).x * 0.25f; + return reduce_add(a) * 0.25f; } ccl_device_inline float len(const float4 &a) @@ -392,8 +378,77 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t) return a + t * (b - a); } +ccl_device_inline float4 saturate(const float4 &a) +{ + return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w)); +} + +ccl_device_inline float4 exp(float4 v) +{ + return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.z)); +} + +ccl_device_inline float4 log(float4 v) +{ + return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.z)); +} + #endif /* !__KERNEL_METAL__*/ +ccl_device_inline float reduce_add(const float4 a) +{ +#if defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) + return vaddvq_f32(a); +# elif defined(__KERNEL_SSE3__) + float4 h(_mm_hadd_ps(a.m128, a.m128)); + return _mm_cvtss_f32(_mm_hadd_ps(h.m128, h.m128)); +# else + float4 h(shuffle<1, 0, 3, 2>(a) + a); + return _mm_cvtss_f32(shuffle<2, 3, 0, 1>(h) + h); +# endif +#else + return a.x + a.y + a.z + a.w; +#endif +} + +ccl_device_inline float reduce_min(const float4 a) +{ +#if defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) + return vminvq_f32(a); +# else + float4 h = min(shuffle<1, 0, 3, 2>(a), a); + return _mm_cvtss_f32(min(shuffle<2, 3, 0, 1>(h), h)); +# endif +#else + return min(min(a.x, a.y), min(a.z, a.w)); +#endif +} + +ccl_device_inline float reduce_max(const float4 a) +{ +#if defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) + return vmaxvq_f32(a); +# else + float4 h = max(shuffle<1, 0, 3, 2>(a), a); + return _mm_cvtss_f32(max(shuffle<2, 3, 0, 1>(h), h)); +# endif +#else + return max(max(a.x, a.y), max(a.z, a.w)); +#endif +} + +ccl_device_inline bool isequal(const float4 a, const float4 b) +{ +#if defined(__KERNEL_METAL__) + return all(a == b); +#else + return a == b; +#endif +} + #ifdef __KERNEL_SSE__ template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __forceinline const float4 shuffle(const float4 &b) @@ -461,34 +516,6 @@ ccl_device_inline float4 mask(const int4 &mask, const float4 &a) return select(mask, a, zero_float4()); } -ccl_device_inline float4 reduce_min(const float4 &a) -{ -# if defined(__KERNEL_SSE__) -# if defined(__KERNEL_NEON__) - return float4(vdupq_n_f32(vminvq_f32(a))); -# else - float4 h = min(shuffle<1, 0, 3, 2>(a), a); - return min(shuffle<2, 3, 0, 1>(h), h); -# endif -# else - return make_float4(min(min(a.x, a.y), min(a.z, a.w))); -# endif -} - -ccl_device_inline float4 reduce_max(const float4 &a) -{ -# if defined(__KERNEL_SSE__) -# if defined(__KERNEL_NEON__) - return float4(vdupq_n_f32(vmaxvq_f32(a))); -# else - float4 h = max(shuffle<1, 0, 3, 2>(a), a); - return max(shuffle<2, 3, 0, 1>(h), h); -# endif -# else - return make_float4(max(max(a.x, a.y), max(a.z, a.w))); -# endif -} - ccl_device_inline float4 load_float4(ccl_private const float *v) { # ifdef __KERNEL_SSE__ @@ -500,17 +527,25 @@ ccl_device_inline float4 load_float4(ccl_private const float *v) #endif /* !__KERNEL_GPU__ */ -ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b) +ccl_device_inline float4 safe_divide(const float4 a, const float b) { return (b != 0.0f) ? a / b : zero_float4(); } -ccl_device_inline bool isfinite4_safe(float4 v) +ccl_device_inline float4 safe_divide(const float4 a, const float4 b) +{ + return make_float4((b.x != 0.0f) ? a.x / b.x : 0.0f, + (b.y != 0.0f) ? a.y / b.y : 0.0f, + (b.z != 0.0f) ? a.z / b.z : 0.0f, + (b.w != 0.0f) ? a.w / b.w : 0.0f); +} + +ccl_device_inline bool isfinite_safe(float4 v) { return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z) && isfinite_safe(v.w); } -ccl_device_inline float4 ensure_finite4(float4 v) +ccl_device_inline float4 ensure_finite(float4 v) { if (!isfinite_safe(v.x)) v.x = 0.0f; @@ -523,6 +558,11 @@ ccl_device_inline float4 ensure_finite4(float4 v) return v; } +ccl_device_inline float4 pow(float4 v, float e) +{ + return make_float4(powf(v.x, e), powf(v.y, e), powf(v.z, e), powf(v.z, e)); +} + CCL_NAMESPACE_END #endif /* __UTIL_MATH_FLOAT4_H__ */ diff --git a/intern/cycles/util/math_float8.h b/intern/cycles/util/math_float8.h new file mode 100644 index 00000000000..b538cfbe70b --- /dev/null +++ b/intern/cycles/util/math_float8.h @@ -0,0 +1,419 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2022 Blender Foundation */ + +#ifndef __UTIL_MATH_FLOAT8_H__ +#define __UTIL_MATH_FLOAT8_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util/types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. + */ + +ccl_device_inline float8_t operator+(const float8_t a, const float8_t b); +ccl_device_inline float8_t operator+(const float8_t a, const float f); +ccl_device_inline float8_t operator+(const float f, const float8_t a); + +ccl_device_inline float8_t operator-(const float8_t a); +ccl_device_inline float8_t operator-(const float8_t a, const float8_t b); +ccl_device_inline float8_t operator-(const float8_t a, const float f); +ccl_device_inline float8_t operator-(const float f, const float8_t a); + +ccl_device_inline float8_t operator*(const float8_t a, const float8_t b); +ccl_device_inline float8_t operator*(const float8_t a, const float f); +ccl_device_inline float8_t operator*(const float f, const float8_t a); + +ccl_device_inline float8_t operator/(const float8_t a, const float8_t b); +ccl_device_inline float8_t operator/(const float8_t a, float f); +ccl_device_inline float8_t operator/(const float f, const float8_t a); + +ccl_device_inline float8_t operator+=(float8_t a, const float8_t b); + +ccl_device_inline float8_t operator*=(float8_t a, const float8_t b); +ccl_device_inline float8_t operator*=(float8_t a, float f); + +ccl_device_inline float8_t operator/=(float8_t a, float f); + +ccl_device_inline bool operator==(const float8_t a, const float8_t b); + +ccl_device_inline float8_t rcp(const float8_t a); +ccl_device_inline float8_t sqrt(const float8_t a); +ccl_device_inline float8_t sqr(const float8_t a); +ccl_device_inline bool is_zero(const float8_t a); +ccl_device_inline float average(const float8_t a); +ccl_device_inline float8_t min(const float8_t a, const float8_t b); +ccl_device_inline float8_t max(const float8_t a, const float8_t b); +ccl_device_inline float8_t clamp(const float8_t a, const float8_t mn, const float8_t mx); +ccl_device_inline float8_t fabs(const float8_t a); +ccl_device_inline float8_t mix(const float8_t a, const float8_t b, float t); +ccl_device_inline float8_t saturate(const float8_t a); + +ccl_device_inline float8_t safe_divide(const float8_t a, const float b); +ccl_device_inline float8_t safe_divide(const float8_t a, const float8_t b); + +ccl_device_inline float reduce_min(const float8_t a); +ccl_device_inline float reduce_max(const float8_t a); +ccl_device_inline float reduce_add(const float8_t a); + +ccl_device_inline bool isequal(const float8_t a, const float8_t b); + +/******************************************************************************* + * Definition. + */ + +ccl_device_inline float8_t zero_float8_t() +{ +#ifdef __KERNEL_AVX2__ + return float8_t(_mm256_setzero_ps()); +#else + return make_float8_t(0.0f); +#endif +} + +ccl_device_inline float8_t one_float8_t() +{ + return make_float8_t(1.0f); +} + +ccl_device_inline float8_t operator+(const float8_t a, const float8_t b) +{ +#ifdef __KERNEL_AVX2__ + return float8_t(_mm256_add_ps(a.m256, b.m256)); +#else + return make_float8_t( + a.a + b.a, a.b + b.b, a.c + b.c, a.d + b.d, a.e + b.e, a.f + b.f, a.g + b.g, a.h + b.h); +#endif +} + +ccl_device_inline float8_t operator+(const float8_t a, const float f) +{ + return a + make_float8_t(f); +} + +ccl_device_inline float8_t operator+(const float f, const float8_t a) +{ + return make_float8_t(f) + a; +} + +ccl_device_inline float8_t operator-(const float8_t a) +{ +#ifdef __KERNEL_AVX2__ + __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); + return float8_t(_mm256_xor_ps(a.m256, mask)); +#else + return make_float8_t(-a.a, -a.b, -a.c, -a.d, -a.e, -a.f, -a.g, -a.h); +#endif +} + +ccl_device_inline float8_t operator-(const float8_t a, const float8_t b) +{ +#ifdef __KERNEL_AVX2__ + return float8_t(_mm256_sub_ps(a.m256, b.m256)); +#else + return make_float8_t( + a.a - b.a, a.b - b.b, a.c - b.c, a.d - b.d, a.e - b.e, a.f - b.f, a.g - b.g, a.h - b.h); +#endif +} + +ccl_device_inline float8_t operator-(const float8_t a, const float f) +{ + return a - make_float8_t(f); +} + +ccl_device_inline float8_t operator-(const float f, const float8_t a) +{ + return make_float8_t(f) - a; +} + +ccl_device_inline float8_t operator*(const float8_t a, const float8_t b) +{ +#ifdef __KERNEL_AVX2__ + return float8_t(_mm256_mul_ps(a.m256, b.m256)); +#else + return make_float8_t( + a.a * b.a, a.b * b.b, a.c * b.c, a.d * b.d, a.e * b.e, a.f * b.f, a.g * b.g, a.h * b.h); +#endif +} + +ccl_device_inline float8_t operator*(const float8_t a, const float f) +{ + return a * make_float8_t(f); +} + +ccl_device_inline float8_t operator*(const float f, const float8_t a) +{ + return make_float8_t(f) * a; +} + +ccl_device_inline float8_t operator/(const float8_t a, const float8_t b) +{ +#ifdef __KERNEL_AVX2__ + return float8_t(_mm256_div_ps(a.m256, b.m256)); +#else + return make_float8_t( + a.a / b.a, a.b / b.b, a.c / b.c, a.d / b.d, a.e / b.e, a.f / b.f, a.g / b.g, a.h / b.h); +#endif +} + +ccl_device_inline float8_t operator/(const float8_t a, const float f) +{ + return a / make_float8_t(f); +} + +ccl_device_inline float8_t operator/(const float f, const float8_t a) +{ + return make_float8_t(f) / a; +} + +ccl_device_inline float8_t operator+=(float8_t a, const float8_t b) +{ + return a = a + b; +} + +ccl_device_inline float8_t operator-=(float8_t a, const float8_t b) +{ + return a = a - b; +} + +ccl_device_inline float8_t operator*=(float8_t a, const float8_t b) +{ + return a = a * b; +} + +ccl_device_inline float8_t operator*=(float8_t a, float f) +{ + return a = a * f; +} + +ccl_device_inline float8_t operator/=(float8_t a, float f) +{ + return a = a / f; +} + +ccl_device_inline bool operator==(const float8_t a, const float8_t b) +{ +#ifdef __KERNEL_AVX2__ + return (_mm256_movemask_ps(_mm256_castsi256_ps( + _mm256_cmpeq_epi32(_mm256_castps_si256(a.m256), _mm256_castps_si256(b.m256)))) & + 0b11111111) == 0b11111111; +#else + return (a.a == b.a && a.b == b.b && a.c == b.c && a.d == b.d && a.e == b.e && a.f == b.f && + a.g == b.g && a.h == b.h); +#endif +} + +ccl_device_inline float8_t rcp(const float8_t a) +{ +#ifdef __KERNEL_AVX2__ + return float8_t(_mm256_rcp_ps(a.m256)); +#else + return make_float8_t(1.0f / a.a, + 1.0f / a.b, + 1.0f / a.c, + 1.0f / a.d, + 1.0f / a.e, + 1.0f / a.f, + 1.0f / a.g, + 1.0f / a.h); +#endif +} + +ccl_device_inline float8_t sqrt(const float8_t a) +{ +#ifdef __KERNEL_AVX2__ + return float8_t(_mm256_sqrt_ps(a.m256)); +#else + return make_float8_t(sqrtf(a.a), + sqrtf(a.b), + sqrtf(a.c), + sqrtf(a.d), + sqrtf(a.e), + sqrtf(a.f), + sqrtf(a.g), + sqrtf(a.h)); +#endif +} + +ccl_device_inline float8_t sqr(const float8_t a) +{ + return a * a; +} + +ccl_device_inline bool is_zero(const float8_t a) +{ + return a == make_float8_t(0.0f); +} + +ccl_device_inline float average(const float8_t a) +{ + return reduce_add(a) / 8.0f; +} + +ccl_device_inline float8_t min(const float8_t a, const float8_t b) +{ +#ifdef __KERNEL_AVX2__ + return float8_t(_mm256_min_ps(a.m256, b.m256)); +#else + return make_float8_t(min(a.a, b.a), + min(a.b, b.b), + min(a.c, b.c), + min(a.d, b.d), + min(a.e, b.e), + min(a.f, b.f), + min(a.g, b.g), + min(a.h, b.h)); +#endif +} + +ccl_device_inline float8_t max(const float8_t a, const float8_t b) +{ +#ifdef __KERNEL_AVX2__ + return float8_t(_mm256_max_ps(a.m256, b.m256)); +#else + return make_float8_t(max(a.a, b.a), + max(a.b, b.b), + max(a.c, b.c), + max(a.d, b.d), + max(a.e, b.e), + max(a.f, b.f), + max(a.g, b.g), + max(a.h, b.h)); +#endif +} + +ccl_device_inline float8_t clamp(const float8_t a, const float8_t mn, const float8_t mx) +{ + return min(max(a, mn), mx); +} + +ccl_device_inline float8_t fabs(const float8_t a) +{ +#ifdef __KERNEL_AVX2__ + return float8_t(_mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))); +#else + return make_float8_t(fabsf(a.a), + fabsf(a.b), + fabsf(a.c), + fabsf(a.d), + fabsf(a.e), + fabsf(a.f), + fabsf(a.g), + fabsf(a.h)); +#endif +} + +ccl_device_inline float8_t mix(const float8_t a, const float8_t b, float t) +{ + return a + t * (b - a); +} + +ccl_device_inline float8_t saturate(const float8_t a) +{ + return clamp(a, make_float8_t(0.0f), make_float8_t(1.0f)); +} + +ccl_device_inline float8_t exp(float8_t v) +{ + return make_float8_t( + expf(v.a), expf(v.b), expf(v.c), expf(v.d), expf(v.e), expf(v.f), expf(v.g), expf(v.h)); +} + +ccl_device_inline float8_t log(float8_t v) +{ + return make_float8_t( + logf(v.a), logf(v.b), logf(v.c), logf(v.d), logf(v.e), logf(v.f), logf(v.g), logf(v.h)); +} + +ccl_device_inline float dot(const float8_t a, const float8_t b) +{ +#ifdef __KERNEL_AVX2__ + float8_t t(_mm256_dp_ps(a.m256, b.m256, 0xFF)); + return t[0] + t[4]; +#else + return (a.a * b.a) + (a.b * b.b) + (a.c * b.c) + (a.d * b.d) + (a.e * b.e) + (a.f * b.f) + + (a.g * b.g) + (a.h * b.h); +#endif +} + +ccl_device_inline float8_t pow(float8_t v, float e) +{ + return make_float8_t(powf(v.a, e), + powf(v.b, e), + powf(v.c, e), + powf(v.d, e), + powf(v.e, e), + powf(v.f, e), + powf(v.g, e), + powf(v.h, e)); +} + +ccl_device_inline float reduce_min(const float8_t a) +{ + return min(min(min(a.a, a.b), min(a.c, a.d)), min(min(a.e, a.f), min(a.g, a.h))); +} + +ccl_device_inline float reduce_max(const float8_t a) +{ + return max(max(max(a.a, a.b), max(a.c, a.d)), max(max(a.e, a.f), max(a.g, a.h))); +} + +ccl_device_inline float reduce_add(const float8_t a) +{ +#ifdef __KERNEL_AVX2__ + float8_t b(_mm256_hadd_ps(a.m256, a.m256)); + float8_t h(_mm256_hadd_ps(b.m256, b.m256)); + return h[0] + h[4]; +#else + return a.a + a.b + a.c + a.d + a.e + a.f + a.g + a.h; +#endif +} + +ccl_device_inline bool isequal(const float8_t a, const float8_t b) +{ + return a == b; +} + +ccl_device_inline float8_t safe_divide(const float8_t a, const float b) +{ + return (b != 0.0f) ? a / b : make_float8_t(0.0f); +} + +ccl_device_inline float8_t safe_divide(const float8_t a, const float8_t b) +{ + return make_float8_t((b.a != 0.0f) ? a.a / b.a : 0.0f, + (b.b != 0.0f) ? a.b / b.b : 0.0f, + (b.c != 0.0f) ? a.c / b.c : 0.0f, + (b.d != 0.0f) ? a.d / b.d : 0.0f, + (b.e != 0.0f) ? a.e / b.e : 0.0f, + (b.f != 0.0f) ? a.f / b.f : 0.0f, + (b.g != 0.0f) ? a.g / b.g : 0.0f, + (b.h != 0.0f) ? a.h / b.h : 0.0f); +} + +ccl_device_inline float8_t ensure_finite(float8_t v) +{ + v.a = ensure_finite(v.a); + v.b = ensure_finite(v.b); + v.c = ensure_finite(v.c); + v.d = ensure_finite(v.d); + v.e = ensure_finite(v.e); + v.f = ensure_finite(v.f); + v.g = ensure_finite(v.g); + v.h = ensure_finite(v.h); + + return v; +} + +ccl_device_inline bool isfinite_safe(float8_t v) +{ + return isfinite_safe(v.a) && isfinite_safe(v.b) && isfinite_safe(v.c) && isfinite_safe(v.d) && + isfinite_safe(v.e) && isfinite_safe(v.f) && isfinite_safe(v.g) && isfinite_safe(v.h); +} + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_FLOAT8_H__ */ diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h index b0de0b25a45..aa28682f8c1 100644 --- a/intern/cycles/util/math_intersect.h +++ b/intern/cycles/util/math_intersect.h @@ -10,7 +10,8 @@ CCL_NAMESPACE_BEGIN ccl_device bool ray_sphere_intersect(float3 ray_P, float3 ray_D, - float ray_t, + float ray_tmin, + float ray_tmax, float3 sphere_P, float sphere_radius, ccl_private float3 *isect_P, @@ -33,7 +34,7 @@ ccl_device bool ray_sphere_intersect(float3 ray_P, return false; } const float t = tp - sqrtf(radiussq - dsq); /* pythagoras */ - if (t < ray_t) { + if (t > ray_tmin && t < ray_tmax) { *isect_t = t; *isect_P = ray_P + ray_D * t; return true; @@ -44,7 +45,8 @@ ccl_device bool ray_sphere_intersect(float3 ray_P, ccl_device bool ray_aligned_disk_intersect(float3 ray_P, float3 ray_D, - float ray_t, + float ray_tmin, + float ray_tmax, float3 disk_P, float disk_radius, ccl_private float3 *isect_P, @@ -59,7 +61,7 @@ ccl_device bool ray_aligned_disk_intersect(float3 ray_P, } /* Compute t to intersection point. */ const float t = -disk_t / div; - if (t < 0.0f || t > ray_t) { + if (!(t > ray_tmin && t < ray_tmax)) { return false; } /* Test if within radius. */ @@ -74,7 +76,8 @@ ccl_device bool ray_aligned_disk_intersect(float3 ray_P, ccl_device bool ray_disk_intersect(float3 ray_P, float3 ray_D, - float ray_t, + float ray_tmin, + float ray_tmax, float3 disk_P, float3 disk_N, float disk_radius, @@ -92,7 +95,8 @@ ccl_device bool ray_disk_intersect(float3 ray_P, } float3 P = ray_P + t * ray_D; float3 T = P - disk_P; - if (dot(T, T) < sqr(disk_radius) /*&& t > 0.f*/ && t <= ray_t) { + + if (dot(T, T) < sqr(disk_radius) && (t > ray_tmin && t < ray_tmax)) { *isect_P = ray_P + t * ray_D; *isect_t = t; return true; @@ -101,9 +105,55 @@ ccl_device bool ray_disk_intersect(float3 ray_P, return false; } -ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P, - float3 ray_dir, - float ray_t, +/* Custom rcp, cross and dot implementations that match Embree bit for bit. */ +ccl_device_forceinline float ray_triangle_rcp(const float x) +{ +#ifdef __KERNEL_NEON__ + /* Move scalar to vector register and do rcp. */ + __m128 a; + a[0] = x; + float32x4_t reciprocal = vrecpeq_f32(a); + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + return reciprocal[0]; +#elif defined(__KERNEL_SSE__) + const __m128 a = _mm_set_ss(x); + const __m128 r = _mm_rcp_ss(a); + +# ifdef __KERNEL_AVX2_ + return _mm_cvtss_f32(_mm_mul_ss(r, _mm_fnmadd_ss(r, a, _mm_set_ss(2.0f)))); +# else + return _mm_cvtss_f32(_mm_mul_ss(r, _mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a)))); +# endif +#else + return 1.0f / x; +#endif +} + +ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + return madd(ssef(a.x), ssef(b.x), madd(ssef(a.y), ssef(b.y), ssef(a.z) * ssef(b.z)))[0]; +#else + return a.x * b.x + a.y * b.y + a.z * b.z; +#endif +} + +ccl_device_inline float3 ray_triangle_cross(const float3 a, const float3 b) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + return make_float3(msub(ssef(a.y), ssef(b.z), ssef(a.z) * ssef(b.y))[0], + msub(ssef(a.z), ssef(b.x), ssef(a.x) * ssef(b.z))[0], + msub(ssef(a.x), ssef(b.y), ssef(a.y) * ssef(b.x))[0]); +#else + return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); +#endif +} + +ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P, + const float3 ray_D, + const float ray_tmin, + const float ray_tmax, const float3 tri_a, const float3 tri_b, const float3 tri_c, @@ -111,14 +161,13 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P, ccl_private float *isect_v, ccl_private float *isect_t) { -#define dot3(a, b) dot(a, b) - const float3 P = ray_P; - const float3 dir = ray_dir; + /* This implementation matches the Plücker coordinates triangle intersection + * in Embree. */ /* Calculate vertices relative to ray origin. */ - const float3 v0 = tri_c - P; - const float3 v1 = tri_a - P; - const float3 v2 = tri_b - P; + const float3 v0 = tri_a - ray_P; + const float3 v1 = tri_b - ray_P; + const float3 v2 = tri_c - ray_P; /* Calculate triangle edges. */ const float3 e0 = v2 - v0; @@ -126,42 +175,73 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P, const float3 e2 = v1 - v2; /* Perform edge tests. */ - const float U = dot(cross(v2 + v0, e0), ray_dir); - const float V = dot(cross(v0 + v1, e1), ray_dir); - const float W = dot(cross(v1 + v2, e2), ray_dir); + const float U = ray_triangle_dot(ray_triangle_cross(e0, v2 + v0), ray_D); + const float V = ray_triangle_dot(ray_triangle_cross(e1, v0 + v1), ray_D); + const float W = ray_triangle_dot(ray_triangle_cross(e2, v1 + v2), ray_D); + const float UVW = U + V + W; + const float eps = FLT_EPSILON * fabsf(UVW); const float minUVW = min(U, min(V, W)); const float maxUVW = max(U, max(V, W)); - if (minUVW < 0.0f && maxUVW > 0.0f) { + if (!(minUVW >= -eps || maxUVW <= eps)) { return false; } /* Calculate geometry normal and denominator. */ - const float3 Ng1 = cross(e1, e0); - // const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0); + const float3 Ng1 = ray_triangle_cross(e1, e0); const float3 Ng = Ng1 + Ng1; - const float den = dot3(Ng, dir); + const float den = dot(Ng, ray_D); /* Avoid division by 0. */ if (UNLIKELY(den == 0.0f)) { return false; } /* Perform depth test. */ - const float T = dot3(v0, Ng); - const int sign_den = (__float_as_int(den) & 0x80000000); - const float sign_T = xor_signmask(T, sign_den); - if ((sign_T < 0.0f) || (sign_T > ray_t * xor_signmask(den, sign_den))) { + const float T = dot(v0, Ng); + const float t = T / den; + if (!(t >= ray_tmin && t <= ray_tmax)) { return false; } - const float inv_den = 1.0f / den; - *isect_u = U * inv_den; - *isect_v = V * inv_den; - *isect_t = T * inv_den; + const float rcp_uvw = (fabsf(UVW) < 1e-18f) ? 0.0f : ray_triangle_rcp(UVW); + *isect_u = min(U * rcp_uvw, 1.0f); + *isect_v = min(V * rcp_uvw, 1.0f); + *isect_t = t; return true; +} + +ccl_device_forceinline bool ray_triangle_intersect_self(const float3 ray_P, + const float3 ray_D, + const float3 tri_a, + const float3 tri_b, + const float3 tri_c) +{ + /* Matches logic in ray_triangle_intersect, self intersection test to validate + * if a ray is going to hit self or might incorrectly hit a neighboring triangle. */ -#undef dot3 + /* Calculate vertices relative to ray origin. */ + const float3 v0 = tri_a - ray_P; + const float3 v1 = tri_b - ray_P; + const float3 v2 = tri_c - ray_P; + + /* Calculate triangle edges. */ + const float3 e0 = v2 - v0; + const float3 e1 = v0 - v1; + const float3 e2 = v1 - v2; + + /* Perform edge tests. */ + const float U = ray_triangle_dot(ray_triangle_cross(v2 + v0, e0), ray_D); + const float V = ray_triangle_dot(ray_triangle_cross(v0 + v1, e1), ray_D); + const float W = ray_triangle_dot(ray_triangle_cross(v1 + v2, e2), ray_D); + + const float eps = FLT_EPSILON * fabsf(U + V + W); + const float minUVW = min(U, min(V, W)); + const float maxUVW = max(U, max(V, W)); + + /* Note the extended epsilon compared to ray_triangle_intersect, to account + * for intersections with neighboring triangles that have an epsilon. */ + return (minUVW >= eps || maxUVW <= -eps); } /* Tests for an intersection between a ray and a quad defined by @@ -171,8 +251,8 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P, */ ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D, - float ray_mint, - float ray_maxt, + float ray_tmin, + float ray_tmax, float3 quad_P, float3 quad_u, float3 quad_v, @@ -185,7 +265,7 @@ ccl_device bool ray_quad_intersect(float3 ray_P, { /* Perform intersection test. */ float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n); - if (t < ray_mint || t > ray_maxt) { + if (!(t > ray_tmin && t < ray_tmax)) { return false; } const float3 hit = ray_P + t * ray_D; @@ -207,10 +287,13 @@ ccl_device bool ray_quad_intersect(float3 ray_P, *isect_P = hit; if (isect_t != NULL) *isect_t = t; + + /* NOTE: Return barycentric coordinates in the same notation as Embree and OptiX. */ if (isect_u != NULL) - *isect_u = u + 0.5f; + *isect_u = v + 0.5f; if (isect_v != NULL) - *isect_v = v + 0.5f; + *isect_v = -u - v; + return true; } diff --git a/intern/cycles/util/opengl.h b/intern/cycles/util/opengl.h index 090deb861c4..fefee4ec022 100644 --- a/intern/cycles/util/opengl.h +++ b/intern/cycles/util/opengl.h @@ -7,6 +7,6 @@ /* OpenGL header includes, used everywhere we use OpenGL, to deal with * platform differences in one central place. */ -#include <GL/glew.h> +#include <epoxy/gl.h> #endif /* __UTIL_OPENGL_H__ */ diff --git a/intern/cycles/util/progress.h b/intern/cycles/util/progress.h index 37eafd57491..586979d2021 100644 --- a/intern/cycles/util/progress.h +++ b/intern/cycles/util/progress.h @@ -28,6 +28,7 @@ class Progress { denoised_tiles = 0; start_time = time_dt(); render_start_time = time_dt(); + time_limit = 0.0; end_time = 0.0; status = "Initializing"; substatus = ""; @@ -68,6 +69,7 @@ class Progress { denoised_tiles = 0; start_time = time_dt(); render_start_time = time_dt(); + time_limit = 0.0; end_time = 0.0; status = "Initializing"; substatus = ""; @@ -145,6 +147,13 @@ class Progress { render_start_time = time_dt(); } + void set_time_limit(double time_limit_) + { + thread_scoped_lock lock(progress_mutex); + + time_limit = time_limit_; + } + void add_skip_time(const scoped_timer &start_timer, bool only_render) { double skip_time = time_dt() - start_timer.get_start(); @@ -191,8 +200,13 @@ class Progress { { thread_scoped_lock lock(progress_mutex); - if (total_pixel_samples > 0) { - return ((double)pixel_samples) / (double)total_pixel_samples; + if (pixel_samples > 0) { + double progress_percent = (double)pixel_samples / (double)total_pixel_samples; + if (time_limit != 0.0) { + double time_since_render_start = time_dt() - render_start_time; + progress_percent = max(progress_percent, time_since_render_start / time_limit); + } + return min(1.0, progress_percent); } return 0.0; } @@ -335,7 +349,7 @@ class Progress { * in which case the current_tile_sample is displayed. */ int rendered_tiles, denoised_tiles; - double start_time, render_start_time; + double start_time, render_start_time, time_limit; /* End time written when render is done, so it doesn't keep increasing on redraws. */ double end_time; diff --git a/intern/cycles/util/string.cpp b/intern/cycles/util/string.cpp index 66ff866ee10..0c318cea44a 100644 --- a/intern/cycles/util/string.cpp +++ b/intern/cycles/util/string.cpp @@ -136,6 +136,19 @@ void string_replace(string &haystack, const string &needle, const string &other) } } +void string_replace_same_length(string &haystack, const string &needle, const string &other) +{ + assert(needle.size() == other.size()); + size_t pos = 0; + while (pos != string::npos) { + pos = haystack.find(needle, pos); + if (pos != string::npos) { + memcpy(haystack.data() + pos, other.data(), other.size()); + pos += other.size(); + } + } +} + string string_remove_trademark(const string &s) { string result = s; @@ -164,6 +177,11 @@ string to_string(const char *str) return string(str); } +string to_string(const float4 &v) +{ + return string_printf("%f,%f,%f,%f", v.x, v.y, v.z, v.w); +} + string string_to_lower(const string &s) { string r = s; diff --git a/intern/cycles/util/string.h b/intern/cycles/util/string.h index a74feee1750..ecbe9e106c6 100644 --- a/intern/cycles/util/string.h +++ b/intern/cycles/util/string.h @@ -38,12 +38,14 @@ void string_split(vector<string> &tokens, const string &separators = "\t ", bool skip_empty_tokens = true); void string_replace(string &haystack, const string &needle, const string &other); +void string_replace_same_length(string &haystack, const string &needle, const string &other); bool string_startswith(string_view s, string_view start); bool string_endswith(string_view s, string_view end); string string_strip(const string &s); string string_remove_trademark(const string &s); string string_from_bool(const bool var); string to_string(const char *str); +string to_string(const float4 &v); string string_to_lower(const string &s); /* Wide char strings are only used on Windows to deal with non-ASCII diff --git a/intern/cycles/util/system.cpp b/intern/cycles/util/system.cpp index a13ad95b9fe..3183ac06f26 100644 --- a/intern/cycles/util/system.cpp +++ b/intern/cycles/util/system.cpp @@ -128,53 +128,42 @@ int system_cpu_bits() #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) struct CPUCapabilities { - bool x64; - bool mmx; - bool sse; bool sse2; bool sse3; - bool ssse3; bool sse41; - bool sse42; - bool sse4a; bool avx; - bool f16c; bool avx2; - bool xop; - bool fma3; - bool fma4; - bool bmi1; - bool bmi2; }; static CPUCapabilities &system_cpu_capabilities() { - static CPUCapabilities caps; + static CPUCapabilities caps = {}; static bool caps_init = false; if (!caps_init) { int result[4], num; - memset(&caps, 0, sizeof(caps)); - __cpuid(result, 0); num = result[0]; if (num >= 1) { __cpuid(result, 0x00000001); - caps.mmx = (result[3] & ((int)1 << 23)) != 0; - caps.sse = (result[3] & ((int)1 << 25)) != 0; - caps.sse2 = (result[3] & ((int)1 << 26)) != 0; - caps.sse3 = (result[2] & ((int)1 << 0)) != 0; + const bool sse = (result[3] & ((int)1 << 25)) != 0; + const bool sse2 = (result[3] & ((int)1 << 26)) != 0; + const bool sse3 = (result[2] & ((int)1 << 0)) != 0; + + const bool ssse3 = (result[2] & ((int)1 << 9)) != 0; + const bool sse41 = (result[2] & ((int)1 << 19)) != 0; + /* const bool sse42 = (result[2] & ((int)1 << 20)) != 0; */ - caps.ssse3 = (result[2] & ((int)1 << 9)) != 0; - caps.sse41 = (result[2] & ((int)1 << 19)) != 0; - caps.sse42 = (result[2] & ((int)1 << 20)) != 0; + const bool fma3 = (result[2] & ((int)1 << 12)) != 0; + const bool os_uses_xsave_xrestore = (result[2] & ((int)1 << 27)) != 0; + const bool cpu_avx_support = (result[2] & ((int)1 << 28)) != 0; - caps.fma3 = (result[2] & ((int)1 << 12)) != 0; - caps.avx = false; - bool os_uses_xsave_xrestore = (result[2] & ((int)1 << 27)) != 0; - bool cpu_avx_support = (result[2] & ((int)1 << 28)) != 0; + /* Simplify to combined capabilities for which we specialize kernels. */ + caps.sse2 = sse && sse2; + caps.sse3 = sse && sse2 && sse3 && ssse3; + caps.sse41 = sse && sse2 && sse3 && ssse3 && sse41; if (os_uses_xsave_xrestore && cpu_avx_support) { // Check if the OS will save the YMM registers @@ -189,15 +178,18 @@ static CPUCapabilities &system_cpu_capabilities() # else xcr_feature_mask = 0; # endif - caps.avx = (xcr_feature_mask & 0x6) == 0x6; - } + const bool avx = (xcr_feature_mask & 0x6) == 0x6; + const bool f16c = (result[2] & ((int)1 << 29)) != 0; - caps.f16c = (result[2] & ((int)1 << 29)) != 0; + __cpuid(result, 0x00000007); + bool bmi1 = (result[1] & ((int)1 << 3)) != 0; + bool bmi2 = (result[1] & ((int)1 << 8)) != 0; + bool avx2 = (result[1] & ((int)1 << 5)) != 0; - __cpuid(result, 0x00000007); - caps.bmi1 = (result[1] & ((int)1 << 3)) != 0; - caps.bmi2 = (result[1] & ((int)1 << 8)) != 0; - caps.avx2 = (result[1] & ((int)1 << 5)) != 0; + caps.avx = sse && sse2 && sse3 && ssse3 && sse41 && avx; + caps.avx2 = sse && sse2 && sse3 && ssse3 && sse41 && avx && f16c && avx2 && fma3 && bmi1 && + bmi2; + } } caps_init = true; @@ -209,32 +201,31 @@ static CPUCapabilities &system_cpu_capabilities() bool system_cpu_support_sse2() { CPUCapabilities &caps = system_cpu_capabilities(); - return caps.sse && caps.sse2; + return caps.sse2; } bool system_cpu_support_sse3() { CPUCapabilities &caps = system_cpu_capabilities(); - return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3; + return caps.sse3; } bool system_cpu_support_sse41() { CPUCapabilities &caps = system_cpu_capabilities(); - return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41; + return caps.sse41; } bool system_cpu_support_avx() { CPUCapabilities &caps = system_cpu_capabilities(); - return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx; + return caps.avx; } bool system_cpu_support_avx2() { CPUCapabilities &caps = system_cpu_capabilities(); - return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx && caps.f16c && - caps.avx2 && caps.fma3 && caps.bmi1 && caps.bmi2; + return caps.avx2; } #else @@ -264,26 +255,6 @@ bool system_cpu_support_avx2() #endif -bool system_call_self(const vector<string> &args) -{ - /* Escape program and arguments in case they contain spaces. */ - string cmd = "\"" + Sysutil::this_program_path() + "\""; - - for (int i = 0; i < args.size(); i++) { - cmd += " \"" + args[i] + "\""; - } - -#ifdef _WIN32 - /* Use cmd /S to avoid issues with spaces in arguments. */ - cmd = "cmd /S /C \"" + cmd + " > nul \""; -#else - /* Quiet output. */ - cmd += " > /dev/null"; -#endif - - return (system(cmd.c_str()) == 0); -} - size_t system_physical_ram() { #ifdef _WIN32 diff --git a/intern/cycles/util/system.h b/intern/cycles/util/system.h index 23dcfdd303a..2152b89ed24 100644 --- a/intern/cycles/util/system.h +++ b/intern/cycles/util/system.h @@ -4,15 +4,17 @@ #ifndef __UTIL_SYSTEM_H__ #define __UTIL_SYSTEM_H__ -#include "util/string.h" -#include "util/vector.h" +#include <stdint.h> +#include <stdlib.h> + +#include <string> CCL_NAMESPACE_BEGIN /* Get width in characters of the current console output. */ int system_console_width(); -string system_cpu_brand_string(); +std::string system_cpu_brand_string(); int system_cpu_bits(); bool system_cpu_support_sse2(); bool system_cpu_support_sse3(); @@ -22,9 +24,6 @@ bool system_cpu_support_avx2(); size_t system_physical_ram(); -/* Start a new process of the current application with the given arguments. */ -bool system_call_self(const vector<string> &args); - /* Get identifier of the currently running process. */ uint64_t system_self_process_id(); diff --git a/intern/cycles/util/task.cpp b/intern/cycles/util/task.cpp index 2edc82eb7c3..12f661f752d 100644 --- a/intern/cycles/util/task.cpp +++ b/intern/cycles/util/task.cpp @@ -70,7 +70,7 @@ void TaskScheduler::init(int num_threads) } if (num_threads > 0) { /* Automatic number of threads. */ - VLOG(1) << "Overriding number of TBB threads to " << num_threads << "."; + VLOG_INFO << "Overriding number of TBB threads to " << num_threads << "."; global_control = new tbb::global_control(tbb::global_control::max_allowed_parallelism, num_threads); active_num_threads = num_threads; diff --git a/intern/cycles/util/time.cpp b/intern/cycles/util/time.cpp index d27a0415106..0295a071f39 100644 --- a/intern/cycles/util/time.cpp +++ b/intern/cycles/util/time.cpp @@ -102,7 +102,7 @@ double time_human_readable_to_seconds(const string &time_string) } else if (fraction_tokens.size() == 2) { result = atof(fraction_tokens[1].c_str()); - result *= pow(0.1, fraction_tokens[1].length()); + result *= ::pow(0.1, fraction_tokens[1].length()); } else { /* This is not a valid string, the result can not be reliable. */ diff --git a/intern/cycles/util/transform.cpp b/intern/cycles/util/transform.cpp index fa50e1db063..cb985c65dd8 100644 --- a/intern/cycles/util/transform.cpp +++ b/intern/cycles/util/transform.cpp @@ -11,7 +11,7 @@ CCL_NAMESPACE_BEGIN /* Transform Inverse */ -static bool transform_matrix4_gj_inverse(float R[][4], float M[][4]) +static bool projection_matrix4_inverse(float R[][4], float M[][4]) { /* SPDX-License-Identifier: BSD-3-Clause * Adapted from code: @@ -98,16 +98,8 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm) memcpy(R, &tfmR, sizeof(R)); memcpy(M, &tfm, sizeof(M)); - if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) { - /* matrix is degenerate (e.g. 0 scale on some axis), ideally we should - * never be in this situation, but try to invert it anyway with tweak */ - M[0][0] += 1e-8f; - M[1][1] += 1e-8f; - M[2][2] += 1e-8f; - - if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) { - return projection_identity(); - } + if (UNLIKELY(!projection_matrix4_inverse(R, M))) { + return projection_identity(); } memcpy(&tfmR, R, sizeof(R)); @@ -115,16 +107,9 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm) return tfmR; } -Transform transform_inverse(const Transform &tfm) -{ - ProjectionTransform projection(tfm); - return projection_to_transform(projection_inverse(projection)); -} - Transform transform_transposed_inverse(const Transform &tfm) { - ProjectionTransform projection(tfm); - ProjectionTransform iprojection = projection_inverse(projection); + ProjectionTransform iprojection(transform_inverse(tfm)); return projection_to_transform(projection_transpose(iprojection)); } @@ -229,17 +214,17 @@ static void transform_decompose(DecomposedTransform *decomp, const Transform *tf /* extract scale and shear first */ float3 scale, shear; scale.x = len(colx); - colx = safe_divide_float3_float(colx, scale.x); + colx = safe_divide(colx, scale.x); shear.z = dot(colx, coly); coly -= shear.z * colx; scale.y = len(coly); - coly = safe_divide_float3_float(coly, scale.y); + coly = safe_divide(coly, scale.y); shear.y = dot(colx, colz); colz -= shear.y * colx; shear.x = dot(coly, colz); colz -= shear.x * coly; scale.z = len(colz); - colz = safe_divide_float3_float(colz, scale.z); + colz = safe_divide(colz, scale.z); transform_set_column(&M, 0, colx); transform_set_column(&M, 1, coly); diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h index 477272f0ba6..24184dc7074 100644 --- a/intern/cycles/util/transform.h +++ b/intern/cycles/util/transform.h @@ -11,6 +11,10 @@ #include "util/math.h" #include "util/types.h" +#ifndef __KERNEL_GPU__ +# include "util/system.h" +#endif + CCL_NAMESPACE_BEGIN /* Affine transformation, stored as 4x3 matrix. */ @@ -38,6 +42,12 @@ typedef struct DecomposedTransform { float4 x, y, z, w; } DecomposedTransform; +CCL_NAMESPACE_END + +#include "util/transform_inverse.h" + +CCL_NAMESPACE_BEGIN + /* Functions */ #ifdef __KERNEL_METAL__ @@ -63,10 +73,10 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f _MM_TRANSPOSE4_PS(x, y, z, w); - ssef tmp = shuffle<0>(aa) * x; - tmp = madd(shuffle<1>(aa), y, tmp); + ssef tmp = w; tmp = madd(shuffle<2>(aa), z, tmp); - tmp += w; + tmp = madd(shuffle<1>(aa), y, tmp); + tmp = madd(shuffle<0>(aa), x, tmp); return float3(tmp.m128); #elif defined(__KERNEL_METAL__) @@ -93,9 +103,9 @@ ccl_device_inline float3 transform_direction(ccl_private const Transform *t, con _MM_TRANSPOSE4_PS(x, y, z, w); - ssef tmp = shuffle<0>(aa) * x; + ssef tmp = shuffle<2>(aa) * z; tmp = madd(shuffle<1>(aa), y, tmp); - tmp = madd(shuffle<2>(aa), z, tmp); + tmp = madd(shuffle<0>(aa), x, tmp); return float3(tmp.m128); #elif defined(__KERNEL_METAL__) @@ -312,7 +322,6 @@ ccl_device_inline void transform_set_column(Transform *t, int column, float3 val t->z[column] = value.z; } -Transform transform_inverse(const Transform &a); Transform transform_transposed_inverse(const Transform &a); ccl_device_inline bool transform_uniform_scale(const Transform &tfm, float &scale) @@ -392,39 +401,28 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t) #endif /* defined(__KERNEL_GPU_RAYTRACING__) */ } -ccl_device_inline Transform transform_quick_inverse(Transform M) -{ - /* possible optimization: can we avoid doing this altogether and construct - * the inverse matrix directly from negated translation, transposed rotation, - * scale can be inverted but what about shearing? */ - Transform R; - float det = M.x.x * (M.z.z * M.y.y - M.z.y * M.y.z) - M.y.x * (M.z.z * M.x.y - M.z.y * M.x.z) + - M.z.x * (M.y.z * M.x.y - M.y.y * M.x.z); - if (det == 0.0f) { - M.x.x += 1e-8f; - M.y.y += 1e-8f; - M.z.z += 1e-8f; - det = M.x.x * (M.z.z * M.y.y - M.z.y * M.y.z) - M.y.x * (M.z.z * M.x.y - M.z.y * M.x.z) + - M.z.x * (M.y.z * M.x.y - M.y.y * M.x.z); - } - det = (det != 0.0f) ? 1.0f / det : 0.0f; - - float3 Rx = det * make_float3(M.z.z * M.y.y - M.z.y * M.y.z, - M.z.y * M.x.z - M.z.z * M.x.y, - M.y.z * M.x.y - M.y.y * M.x.z); - float3 Ry = det * make_float3(M.z.x * M.y.z - M.z.z * M.y.x, - M.z.z * M.x.x - M.z.x * M.x.z, - M.y.x * M.x.z - M.y.z * M.x.x); - float3 Rz = det * make_float3(M.z.y * M.y.x - M.z.x * M.y.y, - M.z.x * M.x.y - M.z.y * M.x.x, - M.y.y * M.x.x - M.y.x * M.x.y); - float3 T = -make_float3(M.x.w, M.y.w, M.z.w); +#ifndef __KERNEL_GPU__ +void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm); +void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm); +#endif - R.x = make_float4(Rx.x, Rx.y, Rx.z, dot(Rx, T)); - R.y = make_float4(Ry.x, Ry.y, Ry.z, dot(Ry, T)); - R.z = make_float4(Rz.x, Rz.y, Rz.z, dot(Rz, T)); +ccl_device_inline Transform transform_inverse(const Transform tfm) +{ + /* Optimized transform implementations. */ +#ifndef __KERNEL_GPU__ + if (system_cpu_support_avx2()) { + Transform itfm; + transform_inverse_cpu_avx2(tfm, itfm); + return itfm; + } + else if (system_cpu_support_sse41()) { + Transform itfm; + transform_inverse_cpu_sse41(tfm, itfm); + return itfm; + } +#endif - return R; + return transform_inverse_impl(tfm); } ccl_device_inline void transform_compose(ccl_private Transform *tfm, @@ -493,13 +491,13 @@ ccl_device void transform_motion_array_interpolate(ccl_private Transform *tfm, ccl_device_inline bool transform_isfinite_safe(ccl_private Transform *tfm) { - return isfinite4_safe(tfm->x) && isfinite4_safe(tfm->y) && isfinite4_safe(tfm->z); + return isfinite_safe(tfm->x) && isfinite_safe(tfm->y) && isfinite_safe(tfm->z); } ccl_device_inline bool transform_decomposed_isfinite_safe(ccl_private DecomposedTransform *decomp) { - return isfinite4_safe(decomp->x) && isfinite4_safe(decomp->y) && isfinite4_safe(decomp->z) && - isfinite4_safe(decomp->w); + return isfinite_safe(decomp->x) && isfinite_safe(decomp->y) && isfinite_safe(decomp->z) && + isfinite_safe(decomp->w); } #ifndef __KERNEL_GPU__ diff --git a/intern/cycles/util/transform_avx2.cpp b/intern/cycles/util/transform_avx2.cpp new file mode 100644 index 00000000000..57c160388e2 --- /dev/null +++ b/intern/cycles/util/transform_avx2.cpp @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#include "util/transform.h" + +CCL_NAMESPACE_BEGIN + +void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm) +{ + itfm = transform_inverse_impl(tfm); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/util/transform_inverse.h b/intern/cycles/util/transform_inverse.h new file mode 100644 index 00000000000..07fd06c1467 --- /dev/null +++ b/intern/cycles/util/transform_inverse.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Custom cross and dot implementations that match Embree bit for bit. + * Normally we don't use SSE41/AVX outside the kernel, but for this it's + * important to match exactly for ray tracing precision. */ + +ccl_device_forceinline float3 transform_inverse_cross(const float3 a, const float3 b) +{ +#ifdef __AVX2__ + const ssef sse_a = (const __m128 &)a; + const ssef sse_b = (const __m128 &)b; + const ssef r = shuffle<1, 2, 0, 3>( + ssef(_mm_fmsub_ps(sse_a, shuffle<1, 2, 0, 3>(sse_b), shuffle<1, 2, 0, 3>(sse_a) * sse_b))); + return (const float3 &)r; +#endif + + return cross(a, b); +} + +ccl_device_forceinline float transform_inverse_dot(const float3 a, const float3 b) +{ +#ifdef __SSE4_1__ + return _mm_cvtss_f32(_mm_dp_ps((const __m128 &)a, (const __m128 &)b, 0x7F)); +#endif + + return dot(a, b); +} + +ccl_device_inline Transform transform_inverse_impl(const Transform tfm) +{ + /* This implementation matches the one in Embree exactly, to ensure consistent + * results with the ray intersection of instances. */ + float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x); + float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y); + float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z); + float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w); + + /* Compute determinant. */ + float det = transform_inverse_dot(x, transform_inverse_cross(y, z)); + + if (det == 0.0f) { + /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should + * never be in this situation, but try to invert it anyway with tweak. + * + * This logic does not match Embree which would just give an invalid + * matrix. A better solution would be to remove this and ensure any object + * matrix is valid. */ + x.x += 1e-8f; + y.y += 1e-8f; + z.z += 1e-8f; + + det = transform_inverse_dot(x, cross(y, z)); + if (det == 0.0f) { + det = FLT_MAX; + } + } + + /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */ + const float3 inverse_x = transform_inverse_cross(y, z) / det; + const float3 inverse_y = transform_inverse_cross(z, x) / det; + const float3 inverse_z = transform_inverse_cross(x, y) / det; + + /* Compute translation and fill transform. */ + Transform itfm; + itfm.x = float3_to_float4(inverse_x, -transform_inverse_dot(inverse_x, w)); + itfm.y = float3_to_float4(inverse_y, -transform_inverse_dot(inverse_y, w)); + itfm.z = float3_to_float4(inverse_z, -transform_inverse_dot(inverse_z, w)); + + return itfm; +} +CCL_NAMESPACE_END diff --git a/intern/cycles/util/transform_sse41.cpp b/intern/cycles/util/transform_sse41.cpp new file mode 100644 index 00000000000..8a698807a9c --- /dev/null +++ b/intern/cycles/util/transform_sse41.cpp @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#include "util/transform.h" + +CCL_NAMESPACE_BEGIN + +void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm) +{ + itfm = transform_inverse_impl(tfm); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/util/types.h b/intern/cycles/util/types.h index 031c2f7c4c1..1ab6f76f9bc 100644 --- a/intern/cycles/util/types.h +++ b/intern/cycles/util/types.h @@ -12,6 +12,7 @@ #if !defined(__KERNEL_GPU__) # include <stdint.h> +# include <stdio.h> #endif #include "util/defines.h" @@ -70,6 +71,24 @@ ccl_device_inline bool is_power_of_two(size_t x) CCL_NAMESPACE_END +/* Device side printf only tested on CUDA, may work on more GPU devices. */ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_CUDA__) +# define __KERNEL_PRINTF__ +#endif + +ccl_device_inline void print_float(ccl_private const char *label, const float a) +{ +#ifdef __KERNEL_PRINTF__ + printf("%s: %.8f\n", label, (double)a); +#endif +} + +/* Most GPU APIs matching native vector types, so we only need to implement them for + * CPU and oneAPI. */ +#if defined(__KERNEL_GPU__) && !defined(__KERNEL_ONEAPI__) +# define __KERNEL_NATIVE_VECTOR_TYPES__ +#endif + /* Vectorized types declaration. */ #include "util/types_uchar2.h" #include "util/types_uchar3.h" @@ -90,7 +109,7 @@ CCL_NAMESPACE_END #include "util/types_float4.h" #include "util/types_float8.h" -#include "util/types_vector3.h" +#include "util/types_spectrum.h" /* Vectorized types implementation. */ #include "util/types_uchar2_impl.h" @@ -110,8 +129,6 @@ CCL_NAMESPACE_END #include "util/types_float4_impl.h" #include "util/types_float8_impl.h" -#include "util/types_vector3_impl.h" - /* SSE types. */ #ifndef __KERNEL_GPU__ # include "util/sseb.h" diff --git a/intern/cycles/util/types_float2.h b/intern/cycles/util/types_float2.h index d8b2efb7b4b..ea510ef832c 100644 --- a/intern/cycles/util/types_float2.h +++ b/intern/cycles/util/types_float2.h @@ -1,8 +1,7 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#ifndef __UTIL_TYPES_FLOAT2_H__ -#define __UTIL_TYPES_FLOAT2_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." @@ -10,18 +9,19 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ struct float2 { float x, y; +# ifndef __KERNEL_GPU__ __forceinline float operator[](int i) const; __forceinline float &operator[](int i); +# endif }; ccl_device_inline float2 make_float2(float x, float y); -ccl_device_inline void print_float2(const char *label, const float2 &a); -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ -CCL_NAMESPACE_END +ccl_device_inline void print_float2(ccl_private const char *label, const float2 a); -#endif /* __UTIL_TYPES_FLOAT2_H__ */ +CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_float2_impl.h b/intern/cycles/util/types_float2_impl.h index d67ec946b79..7ba7dee2e3a 100644 --- a/intern/cycles/util/types_float2_impl.h +++ b/intern/cycles/util/types_float2_impl.h @@ -1,20 +1,16 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#ifndef __UTIL_TYPES_FLOAT2_IMPL_H__ -#define __UTIL_TYPES_FLOAT2_IMPL_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." #endif -#ifndef __KERNEL_GPU__ -# include <cstdio> -#endif - CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ +# ifndef __KERNEL_GPU__ __forceinline float float2::operator[](int i) const { util_assert(i >= 0); @@ -28,19 +24,20 @@ __forceinline float &float2::operator[](int i) util_assert(i < 2); return *(&x + i); } +# endif ccl_device_inline float2 make_float2(float x, float y) { float2 a = {x, y}; return a; } +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ -ccl_device_inline void print_float2(const char *label, const float2 &a) +ccl_device_inline void print_float2(ccl_private const char *label, const float2 a) { +#ifdef __KERNEL_PRINTF__ printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y); +#endif } -#endif /* __KERNEL_GPU__ */ CCL_NAMESPACE_END - -#endif /* __UTIL_TYPES_FLOAT2_IMPL_H__ */ diff --git a/intern/cycles/util/types_float3.h b/intern/cycles/util/types_float3.h index 060c2ac4152..87c6b1d3654 100644 --- a/intern/cycles/util/types_float3.h +++ b/intern/cycles/util/types_float3.h @@ -1,8 +1,7 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#ifndef __UTIL_TYPES_FLOAT3_H__ -#define __UTIL_TYPES_FLOAT3_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." @@ -10,17 +9,28 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ struct ccl_try_align(16) float3 { -# ifdef __KERNEL_SSE__ +# ifdef __KERNEL_GPU__ + /* Compact structure for GPU. */ + float x, y, z; +# else + /* SIMD aligned structure for CPU. */ +# ifdef __KERNEL_SSE__ union { __m128 m128; struct { float x, y, z, w; }; }; +# else + float x, y, z, w; +# endif +# endif +# ifdef __KERNEL_SSE__ + /* Convenient constructors and operators for SIMD, otherwise default is enough. */ __forceinline float3(); __forceinline float3(const float3 &a); __forceinline explicit float3(const __m128 &a); @@ -29,18 +39,19 @@ struct ccl_try_align(16) float3 __forceinline operator __m128 &(); __forceinline float3 &operator=(const float3 &a); -# else /* __KERNEL_SSE__ */ - float x, y, z, w; -# endif /* __KERNEL_SSE__ */ +# endif +# ifndef __KERNEL_GPU__ __forceinline float operator[](int i) const; __forceinline float &operator[](int i); +# endif }; -ccl_device_inline float3 make_float3(float f); ccl_device_inline float3 make_float3(float x, float y, float z); -ccl_device_inline void print_float3(const char *label, const float3 &a); -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ + +ccl_device_inline float3 make_float3(float f); +ccl_device_inline void print_float3(ccl_private const char *label, const float3 a); /* Smaller float3 for storage. For math operations this must be converted to float3, so that on the * CPU SIMD instructions can be used. */ @@ -78,5 +89,3 @@ struct packed_float3 { static_assert(sizeof(packed_float3) == 12, "packed_float3 expected to be exactly 12 bytes"); CCL_NAMESPACE_END - -#endif /* __UTIL_TYPES_FLOAT3_H__ */ diff --git a/intern/cycles/util/types_float3_impl.h b/intern/cycles/util/types_float3_impl.h index f5ffc48c1be..da76ab2ab2a 100644 --- a/intern/cycles/util/types_float3_impl.h +++ b/intern/cycles/util/types_float3_impl.h @@ -1,20 +1,15 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#ifndef __UTIL_TYPES_FLOAT3_IMPL_H__ -#define __UTIL_TYPES_FLOAT3_IMPL_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." #endif -#ifndef __KERNEL_GPU__ -# include <cstdio> -#endif - CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ # ifdef __KERNEL_SSE__ __forceinline float3::float3() { @@ -45,6 +40,7 @@ __forceinline float3 &float3::operator=(const float3 &a) } # endif /* __KERNEL_SSE__ */ +# ifndef __KERNEL_GPU__ __forceinline float float3::operator[](int i) const { util_assert(i >= 0); @@ -58,33 +54,37 @@ __forceinline float &float3::operator[](int i) util_assert(i < 3); return *(&x + i); } +# endif -ccl_device_inline float3 make_float3(float f) +ccl_device_inline float3 make_float3(float x, float y, float z) { -# ifdef __KERNEL_SSE__ - float3 a(_mm_set1_ps(f)); +# if defined(__KERNEL_GPU__) + return {x, y, z}; +# elif defined(__KERNEL_SSE__) + return float3(_mm_set_ps(0.0f, z, y, x)); # else - float3 a = {f, f, f, f}; + return {x, y, z, 0.0f}; # endif - return a; } -ccl_device_inline float3 make_float3(float x, float y, float z) +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ + +ccl_device_inline float3 make_float3(float f) { -# ifdef __KERNEL_SSE__ - float3 a(_mm_set_ps(0.0f, z, y, x)); -# else - float3 a = {x, y, z, 0.0f}; -# endif - return a; +#if defined(__KERNEL_GPU__) + return make_float3(f, f, f); +#elif defined(__KERNEL_SSE__) + return float3(_mm_set1_ps(f)); +#else + return {f, f, f, f}; +#endif } -ccl_device_inline void print_float3(const char *label, const float3 &a) +ccl_device_inline void print_float3(ccl_private const char *label, const float3 a) { +#ifdef __KERNEL_PRINTF__ printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z); +#endif } -#endif /* __KERNEL_GPU__ */ CCL_NAMESPACE_END - -#endif /* __UTIL_TYPES_FLOAT3_IMPL_H__ */ diff --git a/intern/cycles/util/types_float4.h b/intern/cycles/util/types_float4.h index 68ba787dac0..a347cfce9a1 100644 --- a/intern/cycles/util/types_float4.h +++ b/intern/cycles/util/types_float4.h @@ -1,8 +1,7 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#ifndef __UTIL_TYPES_FLOAT4_H__ -#define __UTIL_TYPES_FLOAT4_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." @@ -10,7 +9,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ struct int4; struct ccl_try_align(16) float4 @@ -35,16 +34,17 @@ struct ccl_try_align(16) float4 float x, y, z, w; # endif /* __KERNEL_SSE__ */ +# ifndef __KERNEL_GPU__ __forceinline float operator[](int i) const; __forceinline float &operator[](int i); +# endif }; -ccl_device_inline float4 make_float4(float f); ccl_device_inline float4 make_float4(float x, float y, float z, float w); -ccl_device_inline float4 make_float4(const int4 &i); -ccl_device_inline void print_float4(const char *label, const float4 &a); -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ -CCL_NAMESPACE_END +ccl_device_inline float4 make_float4(float f); +ccl_device_inline float4 make_float4(const int4 i); +ccl_device_inline void print_float4(ccl_private const char *label, const float4 a); -#endif /* __UTIL_TYPES_FLOAT4_H__ */ +CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_float4_impl.h b/intern/cycles/util/types_float4_impl.h index de2e7cb7061..420d9316926 100644 --- a/intern/cycles/util/types_float4_impl.h +++ b/intern/cycles/util/types_float4_impl.h @@ -1,20 +1,15 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#ifndef __UTIL_TYPES_FLOAT4_IMPL_H__ -#define __UTIL_TYPES_FLOAT4_IMPL_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." #endif -#ifndef __KERNEL_GPU__ -# include <cstdio> -#endif - CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ # ifdef __KERNEL_SSE__ __forceinline float4::float4() { @@ -41,6 +36,7 @@ __forceinline float4 &float4::operator=(const float4 &a) } # endif /* __KERNEL_SSE__ */ +# ifndef __KERNEL_GPU__ __forceinline float float4::operator[](int i) const { util_assert(i >= 0); @@ -54,43 +50,42 @@ __forceinline float &float4::operator[](int i) util_assert(i < 4); return *(&x + i); } +# endif -ccl_device_inline float4 make_float4(float f) +ccl_device_inline float4 make_float4(float x, float y, float z, float w) { # ifdef __KERNEL_SSE__ - float4 a(_mm_set1_ps(f)); + return float4(_mm_set_ps(w, z, y, x)); # else - float4 a = {f, f, f, f}; + return {x, y, z, w}; # endif - return a; } -ccl_device_inline float4 make_float4(float x, float y, float z, float w) +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ + +ccl_device_inline float4 make_float4(float f) { -# ifdef __KERNEL_SSE__ - float4 a(_mm_set_ps(w, z, y, x)); -# else - float4 a = {x, y, z, w}; -# endif - return a; +#ifdef __KERNEL_SSE__ + return float4(_mm_set1_ps(f)); +#else + return make_float4(f, f, f, f); +#endif } -ccl_device_inline float4 make_float4(const int4 &i) +ccl_device_inline float4 make_float4(const int4 i) { -# ifdef __KERNEL_SSE__ - float4 a(_mm_cvtepi32_ps(i.m128)); -# else - float4 a = {(float)i.x, (float)i.y, (float)i.z, (float)i.w}; -# endif - return a; +#ifdef __KERNEL_SSE__ + return float4(_mm_cvtepi32_ps(i.m128)); +#else + return make_float4((float)i.x, (float)i.y, (float)i.z, (float)i.w); +#endif } -ccl_device_inline void print_float4(const char *label, const float4 &a) +ccl_device_inline void print_float4(ccl_private const char *label, const float4 a) { +#ifdef __KERNEL_PRINTF__ printf("%s: %.8f %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z, (double)a.w); +#endif } -#endif /* __KERNEL_GPU__ */ CCL_NAMESPACE_END - -#endif /* __UTIL_TYPES_FLOAT4_IMPL_H__ */ diff --git a/intern/cycles/util/types_float8.h b/intern/cycles/util/types_float8.h index 99f9ec9b867..29fd632f08e 100644 --- a/intern/cycles/util/types_float8.h +++ b/intern/cycles/util/types_float8.h @@ -2,8 +2,7 @@ * Original code Copyright 2017, Intel Corporation * Modifications Copyright 2018-2022 Blender Foundation. */ -#ifndef __UTIL_TYPES_FLOAT8_H__ -#define __UTIL_TYPES_FLOAT8_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." @@ -11,11 +10,16 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +/* float8 is a reserved type in Metal that has not been implemented. For + * that reason this is named float8_t and not using native vector types. */ -struct ccl_try_align(32) float8 +#ifdef __KERNEL_GPU__ +struct float8_t +#else +struct ccl_try_align(32) float8_t +#endif { -# ifdef __KERNEL_AVX2__ +#ifdef __KERNEL_AVX2__ union { __m256 m256; struct { @@ -23,28 +27,27 @@ struct ccl_try_align(32) float8 }; }; - __forceinline float8(); - __forceinline float8(const float8 &a); - __forceinline explicit float8(const __m256 &a); + __forceinline float8_t(); + __forceinline float8_t(const float8_t &a); + __forceinline explicit float8_t(const __m256 &a); __forceinline operator const __m256 &() const; __forceinline operator __m256 &(); - __forceinline float8 &operator=(const float8 &a); + __forceinline float8_t &operator=(const float8_t &a); -# else /* __KERNEL_AVX2__ */ +#else /* __KERNEL_AVX2__ */ float a, b, c, d, e, f, g, h; -# endif /* __KERNEL_AVX2__ */ +#endif /* __KERNEL_AVX2__ */ +#ifndef __KERNEL_GPU__ __forceinline float operator[](int i) const; __forceinline float &operator[](int i); +#endif }; -ccl_device_inline float8 make_float8(float f); -ccl_device_inline float8 -make_float8(float a, float b, float c, float d, float e, float f, float g, float h); -#endif /* __KERNEL_GPU__ */ +ccl_device_inline float8_t make_float8_t(float f); +ccl_device_inline float8_t +make_float8_t(float a, float b, float c, float d, float e, float f, float g, float h); CCL_NAMESPACE_END - -#endif /* __UTIL_TYPES_FLOAT8_H__ */ diff --git a/intern/cycles/util/types_float8_impl.h b/intern/cycles/util/types_float8_impl.h index 19818976b50..e8576cdaf70 100644 --- a/intern/cycles/util/types_float8_impl.h +++ b/intern/cycles/util/types_float8_impl.h @@ -2,87 +2,79 @@ * Original code Copyright 2017, Intel Corporation * Modifications Copyright 2018-2022 Blender Foundation. */ -#ifndef __UTIL_TYPES_FLOAT8_IMPL_H__ -#define __UTIL_TYPES_FLOAT8_IMPL_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." #endif -#ifndef __KERNEL_GPU__ -# include <cstdio> -#endif - CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ -# ifdef __KERNEL_AVX2__ -__forceinline float8::float8() +#ifdef __KERNEL_AVX2__ +__forceinline float8_t::float8_t() { } -__forceinline float8::float8(const float8 &f) : m256(f.m256) +__forceinline float8_t::float8_t(const float8_t &f) : m256(f.m256) { } -__forceinline float8::float8(const __m256 &f) : m256(f) +__forceinline float8_t::float8_t(const __m256 &f) : m256(f) { } -__forceinline float8::operator const __m256 &() const +__forceinline float8_t::operator const __m256 &() const { return m256; } -__forceinline float8::operator __m256 &() +__forceinline float8_t::operator __m256 &() { return m256; } -__forceinline float8 &float8::operator=(const float8 &f) +__forceinline float8_t &float8_t::operator=(const float8_t &f) { m256 = f.m256; return *this; } -# endif /* __KERNEL_AVX2__ */ +#endif /* __KERNEL_AVX2__ */ -__forceinline float float8::operator[](int i) const +#ifndef __KERNEL_GPU__ +__forceinline float float8_t::operator[](int i) const { util_assert(i >= 0); util_assert(i < 8); return *(&a + i); } -__forceinline float &float8::operator[](int i) +__forceinline float &float8_t::operator[](int i) { util_assert(i >= 0); util_assert(i < 8); return *(&a + i); } +#endif -ccl_device_inline float8 make_float8(float f) +ccl_device_inline float8_t make_float8_t(float f) { -# ifdef __KERNEL_AVX2__ - float8 r(_mm256_set1_ps(f)); -# else - float8 r = {f, f, f, f, f, f, f, f}; -# endif +#ifdef __KERNEL_AVX2__ + float8_t r(_mm256_set1_ps(f)); +#else + float8_t r = {f, f, f, f, f, f, f, f}; +#endif return r; } -ccl_device_inline float8 -make_float8(float a, float b, float c, float d, float e, float f, float g, float h) +ccl_device_inline float8_t +make_float8_t(float a, float b, float c, float d, float e, float f, float g, float h) { -# ifdef __KERNEL_AVX2__ - float8 r(_mm256_set_ps(a, b, c, d, e, f, g, h)); -# else - float8 r = {a, b, c, d, e, f, g, h}; -# endif +#ifdef __KERNEL_AVX2__ + float8_t r(_mm256_setr_ps(a, b, c, d, e, f, g, h)); +#else + float8_t r = {a, b, c, d, e, f, g, h}; +#endif return r; } -#endif /* __KERNEL_GPU__ */ - CCL_NAMESPACE_END - -#endif /* __UTIL_TYPES_FLOAT8_IMPL_H__ */ diff --git a/intern/cycles/util/types_int2.h b/intern/cycles/util/types_int2.h index 4daf387d9cf..604713dffcd 100644 --- a/intern/cycles/util/types_int2.h +++ b/intern/cycles/util/types_int2.h @@ -1,8 +1,7 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#ifndef __UTIL_TYPES_INT2_H__ -#define __UTIL_TYPES_INT2_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." @@ -10,17 +9,17 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ struct int2 { int x, y; +# ifndef __KERNEL_GPU__ __forceinline int operator[](int i) const; __forceinline int &operator[](int i); +# endif }; ccl_device_inline int2 make_int2(int x, int y); -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ CCL_NAMESPACE_END - -#endif /* __UTIL_TYPES_INT2_H__ */ diff --git a/intern/cycles/util/types_int2_impl.h b/intern/cycles/util/types_int2_impl.h index 7989c4d5506..f48c6f46729 100644 --- a/intern/cycles/util/types_int2_impl.h +++ b/intern/cycles/util/types_int2_impl.h @@ -1,8 +1,7 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#ifndef __UTIL_TYPES_INT2_IMPL_H__ -#define __UTIL_TYPES_INT2_IMPL_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." @@ -10,7 +9,8 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ +# ifndef __KERNEL_GPU__ int int2::operator[](int i) const { util_assert(i >= 0); @@ -24,14 +24,13 @@ int &int2::operator[](int i) util_assert(i < 2); return *(&x + i); } +# endif ccl_device_inline int2 make_int2(int x, int y) { int2 a = {x, y}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ CCL_NAMESPACE_END - -#endif /* __UTIL_TYPES_INT2_IMPL_H__ */ diff --git a/intern/cycles/util/types_int3.h b/intern/cycles/util/types_int3.h index ad9bcb39bbe..e059ddd3660 100644 --- a/intern/cycles/util/types_int3.h +++ b/intern/cycles/util/types_int3.h @@ -1,8 +1,7 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#ifndef __UTIL_TYPES_INT3_H__ -#define __UTIL_TYPES_INT3_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." @@ -10,10 +9,15 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ struct ccl_try_align(16) int3 { -# ifdef __KERNEL_SSE__ +# ifdef __KERNEL_GPU__ + /* Compact structure on the GPU. */ + int x, y, z; +# else + /* SIMD aligned structure for CPU. */ +# ifdef __KERNEL_SSE__ union { __m128i m128; struct { @@ -29,19 +33,21 @@ struct ccl_try_align(16) int3 __forceinline operator __m128i &(); __forceinline int3 &operator=(const int3 &a); -# else /* __KERNEL_SSE__ */ +# else /* __KERNEL_SSE__ */ int x, y, z, w; -# endif /* __KERNEL_SSE__ */ +# endif /* __KERNEL_SSE__ */ +# endif +# ifndef __KERNEL_GPU__ __forceinline int operator[](int i) const; __forceinline int &operator[](int i); +# endif }; -ccl_device_inline int3 make_int3(int i); ccl_device_inline int3 make_int3(int x, int y, int z); -ccl_device_inline void print_int3(const char *label, const int3 &a); -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ -CCL_NAMESPACE_END +ccl_device_inline int3 make_int3(int i); +ccl_device_inline void print_int3(ccl_private const char *label, const int3 a); -#endif /* __UTIL_TYPES_INT3_H__ */ +CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_int3_impl.h b/intern/cycles/util/types_int3_impl.h index 4cfc1cf2987..830dfa3c658 100644 --- a/intern/cycles/util/types_int3_impl.h +++ b/intern/cycles/util/types_int3_impl.h @@ -1,20 +1,15 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#ifndef __UTIL_TYPES_INT3_IMPL_H__ -#define __UTIL_TYPES_INT3_IMPL_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." #endif -#ifndef __KERNEL_GPU__ -# include <cstdio> -#endif - CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ # ifdef __KERNEL_SSE__ __forceinline int3::int3() { @@ -45,6 +40,7 @@ __forceinline int3 &int3::operator=(const int3 &a) } # endif /* __KERNEL_SSE__ */ +# ifndef __KERNEL_GPU__ __forceinline int int3::operator[](int i) const { util_assert(i >= 0); @@ -58,34 +54,37 @@ __forceinline int &int3::operator[](int i) util_assert(i < 3); return *(&x + i); } - -ccl_device_inline int3 make_int3(int i) -{ -# ifdef __KERNEL_SSE__ - int3 a(_mm_set1_epi32(i)); -# else - int3 a = {i, i, i, i}; # endif - return a; -} ccl_device_inline int3 make_int3(int x, int y, int z) { -# ifdef __KERNEL_SSE__ - int3 a(_mm_set_epi32(0, z, y, x)); +# if defined(__KERNEL_GPU__) + return {x, y, z}; +# elif defined(__KERNEL_SSE__) + return int3(_mm_set_epi32(0, z, y, x)); # else - int3 a = {x, y, z, 0}; + return {x, y, z, 0}; # endif +} - return a; +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ + +ccl_device_inline int3 make_int3(int i) +{ +#if defined(__KERNEL_GPU__) + return make_int3(i, i, i); +#elif defined(__KERNEL_SSE__) + return int3(_mm_set1_epi32(i)); +#else + return {i, i, i, i}; +#endif } -ccl_device_inline void print_int3(const char *label, const int3 &a) +ccl_device_inline void print_int3(ccl_private const char *label, const int3 a) { +#ifdef __KERNEL_PRINTF__ printf("%s: %d %d %d\n", label, a.x, a.y, a.z); +#endif } -#endif /* __KERNEL_GPU__ */ CCL_NAMESPACE_END - -#endif /* __UTIL_TYPES_INT3_IMPL_H__ */ diff --git a/intern/cycles/util/types_int4.h b/intern/cycles/util/types_int4.h index f35632fb52f..1a13c03e60e 100644 --- a/intern/cycles/util/types_int4.h +++ b/intern/cycles/util/types_int4.h @@ -1,8 +1,7 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#ifndef __UTIL_TYPES_INT4_H__ -#define __UTIL_TYPES_INT4_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." @@ -10,7 +9,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ struct float3; struct float4; @@ -37,17 +36,18 @@ struct ccl_try_align(16) int4 int x, y, z, w; # endif /* __KERNEL_SSE__ */ +# ifndef __KERNEL_GPU__ __forceinline int operator[](int i) const; __forceinline int &operator[](int i); +# endif }; -ccl_device_inline int4 make_int4(int i); ccl_device_inline int4 make_int4(int x, int y, int z, int w); -ccl_device_inline int4 make_int4(const float3 &f); -ccl_device_inline int4 make_int4(const float4 &f); -ccl_device_inline void print_int4(const char *label, const int4 &a); -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ -CCL_NAMESPACE_END +ccl_device_inline int4 make_int4(int i); +ccl_device_inline int4 make_int4(const float3 f); +ccl_device_inline int4 make_int4(const float4 f); +ccl_device_inline void print_int4(ccl_private const char *label, const int4 a); -#endif /* __UTIL_TYPES_INT4_H__ */ +CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_int4_impl.h b/intern/cycles/util/types_int4_impl.h index adb4a4cebac..067794e67b4 100644 --- a/intern/cycles/util/types_int4_impl.h +++ b/intern/cycles/util/types_int4_impl.h @@ -1,20 +1,15 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#ifndef __UTIL_TYPES_INT4_IMPL_H__ -#define __UTIL_TYPES_INT4_IMPL_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." #endif -#ifndef __KERNEL_GPU__ -# include <cstdio> -#endif - CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ # ifdef __KERNEL_SSE__ __forceinline int4::int4() { @@ -45,6 +40,7 @@ __forceinline int4 &int4::operator=(const int4 &a) } # endif /* __KERNEL_SSE__ */ +# ifndef __KERNEL_GPU__ __forceinline int int4::operator[](int i) const { util_assert(i >= 0); @@ -58,53 +54,53 @@ __forceinline int &int4::operator[](int i) util_assert(i < 4); return *(&x + i); } +# endif -ccl_device_inline int4 make_int4(int i) +ccl_device_inline int4 make_int4(int x, int y, int z, int w) { # ifdef __KERNEL_SSE__ - int4 a(_mm_set1_epi32(i)); + return int4(_mm_set_epi32(w, z, y, x)); # else - int4 a = {i, i, i, i}; + return {x, y, z, w}; # endif - return a; } -ccl_device_inline int4 make_int4(int x, int y, int z, int w) +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ + +ccl_device_inline int4 make_int4(int i) { -# ifdef __KERNEL_SSE__ - int4 a(_mm_set_epi32(w, z, y, x)); -# else - int4 a = {x, y, z, w}; -# endif - return a; +#ifdef __KERNEL_SSE__ + return int4(_mm_set1_epi32(i)); +#else + return make_int4(i, i, i, i); +#endif } -ccl_device_inline int4 make_int4(const float3 &f) +ccl_device_inline int4 make_int4(const float3 f) { -# ifdef __KERNEL_SSE__ - int4 a(_mm_cvtps_epi32(f.m128)); -# else - int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w}; -# endif - return a; +#if defined(__KERNEL_GPU__) + return make_int4((int)f.x, (int)f.y, (int)f.z, 0); +#elif defined(__KERNEL_SSE__) + return int4(_mm_cvtps_epi32(f.m128)); +#else + return make_int4((int)f.x, (int)f.y, (int)f.z, (int)f.w); +#endif } -ccl_device_inline int4 make_int4(const float4 &f) +ccl_device_inline int4 make_int4(const float4 f) { -# ifdef __KERNEL_SSE__ - int4 a(_mm_cvtps_epi32(f.m128)); -# else - int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w}; -# endif - return a; +#ifdef __KERNEL_SSE__ + return int4(_mm_cvtps_epi32(f.m128)); +#else + return make_int4((int)f.x, (int)f.y, (int)f.z, (int)f.w); +#endif } -ccl_device_inline void print_int4(const char *label, const int4 &a) +ccl_device_inline void print_int4(ccl_private const char *label, const int4 a) { +#ifdef __KERNEL_PRINTF__ printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w); +#endif } -#endif /* __KERNEL_GPU__ */ CCL_NAMESPACE_END - -#endif /* __UTIL_TYPES_INT4_IMPL_H__ */ diff --git a/intern/cycles/util/types_spectrum.h b/intern/cycles/util/types_spectrum.h new file mode 100644 index 00000000000..c59230b83ae --- /dev/null +++ b/intern/cycles/util/types_spectrum.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2022 Blender Foundation */ + +#ifndef __UTIL_TYPES_SPECTRUM_H__ +#define __UTIL_TYPES_SPECTRUM_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util/types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#define SPECTRUM_CHANNELS 3 +#define SPECTRUM_DATA_TYPE float3 +#define PACKED_SPECTRUM_DATA_TYPE packed_float3 + +using Spectrum = SPECTRUM_DATA_TYPE; +using PackedSpectrum = PACKED_SPECTRUM_DATA_TYPE; + +#define make_spectrum(f) CONCAT(make_, SPECTRUM_DATA_TYPE(f)) +#define load_spectrum(f) CONCAT(load_, SPECTRUM_DATA_TYPE(f)) +#define store_spectrum(s, f) CONCAT(store_, SPECTRUM_DATA_TYPE((s), (f))) + +#define zero_spectrum CONCAT(zero_, SPECTRUM_DATA_TYPE) +#define one_spectrum CONCAT(one_, SPECTRUM_DATA_TYPE) + +#define FOREACH_SPECTRUM_CHANNEL(counter) \ + for (int counter = 0; counter < SPECTRUM_CHANNELS; counter++) + +#define GET_SPECTRUM_CHANNEL(v, i) (((ccl_private float *)(&(v)))[i]) + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_SPECTRUM_H__ */ diff --git a/intern/cycles/util/types_uchar2.h b/intern/cycles/util/types_uchar2.h index 445fa8dd703..ce617248e6e 100644 --- a/intern/cycles/util/types_uchar2.h +++ b/intern/cycles/util/types_uchar2.h @@ -1,8 +1,7 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#ifndef __UTIL_TYPES_UCHAR2_H__ -#define __UTIL_TYPES_UCHAR2_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." @@ -10,17 +9,17 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ struct uchar2 { uchar x, y; +# ifndef __KERNEL_GPU__ __forceinline uchar operator[](int i) const; __forceinline uchar &operator[](int i); +# endif }; ccl_device_inline uchar2 make_uchar2(uchar x, uchar y); -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ CCL_NAMESPACE_END - -#endif /* __UTIL_TYPES_UCHAR2_H__ */ diff --git a/intern/cycles/util/types_uchar2_impl.h b/intern/cycles/util/types_uchar2_impl.h index cec1c679050..9f3f3a4efb9 100644 --- a/intern/cycles/util/types_uchar2_impl.h +++ b/intern/cycles/util/types_uchar2_impl.h @@ -10,7 +10,8 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ +# ifndef __KERNEL_GPU__ uchar uchar2::operator[](int i) const { util_assert(i >= 0); @@ -24,13 +25,14 @@ uchar &uchar2::operator[](int i) util_assert(i < 2); return *(&x + i); } +# endif ccl_device_inline uchar2 make_uchar2(uchar x, uchar y) { uchar2 a = {x, y}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uchar3.h b/intern/cycles/util/types_uchar3.h index 1ebd86441c3..aed04c4775e 100644 --- a/intern/cycles/util/types_uchar3.h +++ b/intern/cycles/util/types_uchar3.h @@ -10,16 +10,18 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ struct uchar3 { uchar x, y, z; +# ifndef __KERNEL_GPU__ __forceinline uchar operator[](int i) const; __forceinline uchar &operator[](int i); +# endif }; ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z); -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uchar3_impl.h b/intern/cycles/util/types_uchar3_impl.h index 0656baa3da4..83eb3c99b3c 100644 --- a/intern/cycles/util/types_uchar3_impl.h +++ b/intern/cycles/util/types_uchar3_impl.h @@ -10,7 +10,8 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ +# ifndef __KERNEL_GPU__ uchar uchar3::operator[](int i) const { util_assert(i >= 0); @@ -24,13 +25,14 @@ uchar &uchar3::operator[](int i) util_assert(i < 3); return *(&x + i); } +# endif ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z) { uchar3 a = {x, y, z}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uchar4.h b/intern/cycles/util/types_uchar4.h index 2ac4fb56cbb..fb13a98875e 100644 --- a/intern/cycles/util/types_uchar4.h +++ b/intern/cycles/util/types_uchar4.h @@ -1,8 +1,7 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#ifndef __UTIL_TYPES_UCHAR4_H__ -#define __UTIL_TYPES_UCHAR4_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." @@ -10,17 +9,17 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ struct uchar4 { uchar x, y, z, w; +# ifndef __KERNEL_GPU__ __forceinline uchar operator[](int i) const; __forceinline uchar &operator[](int i); +# endif }; ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w); -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ CCL_NAMESPACE_END - -#endif /* __UTIL_TYPES_UCHAR4_H__ */ diff --git a/intern/cycles/util/types_uchar4_impl.h b/intern/cycles/util/types_uchar4_impl.h index b3e8abfe873..244bb98f883 100644 --- a/intern/cycles/util/types_uchar4_impl.h +++ b/intern/cycles/util/types_uchar4_impl.h @@ -10,7 +10,8 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ +# ifndef __KERNEL_GPU__ uchar uchar4::operator[](int i) const { util_assert(i >= 0); @@ -24,13 +25,14 @@ uchar &uchar4::operator[](int i) util_assert(i < 4); return *(&x + i); } +# endif ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w) { uchar4 a = {x, y, z, w}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uint2.h b/intern/cycles/util/types_uint2.h index e3254b9f0e1..4d76b628088 100644 --- a/intern/cycles/util/types_uint2.h +++ b/intern/cycles/util/types_uint2.h @@ -1,8 +1,7 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#ifndef __UTIL_TYPES_UINT2_H__ -#define __UTIL_TYPES_UINT2_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." @@ -10,17 +9,17 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ struct uint2 { uint x, y; +# ifndef __KERNEL_GPU__ __forceinline uint operator[](uint i) const; __forceinline uint &operator[](uint i); +# endif }; ccl_device_inline uint2 make_uint2(uint x, uint y); -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ CCL_NAMESPACE_END - -#endif /* __UTIL_TYPES_UINT2_H__ */ diff --git a/intern/cycles/util/types_uint2_impl.h b/intern/cycles/util/types_uint2_impl.h index e67134a011e..b508aaf2543 100644 --- a/intern/cycles/util/types_uint2_impl.h +++ b/intern/cycles/util/types_uint2_impl.h @@ -1,8 +1,7 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#ifndef __UTIL_TYPES_UINT2_IMPL_H__ -#define __UTIL_TYPES_UINT2_IMPL_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." @@ -10,7 +9,8 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ +# ifndef __KERNEL_GPU__ __forceinline uint uint2::operator[](uint i) const { util_assert(i < 2); @@ -22,14 +22,13 @@ __forceinline uint &uint2::operator[](uint i) util_assert(i < 2); return *(&x + i); } +# endif ccl_device_inline uint2 make_uint2(uint x, uint y) { uint2 a = {x, y}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ CCL_NAMESPACE_END - -#endif /* __UTIL_TYPES_UINT2_IMPL_H__ */ diff --git a/intern/cycles/util/types_uint3.h b/intern/cycles/util/types_uint3.h index 885a8fb84ce..b1571716fc7 100644 --- a/intern/cycles/util/types_uint3.h +++ b/intern/cycles/util/types_uint3.h @@ -1,8 +1,7 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#ifndef __UTIL_TYPES_UINT3_H__ -#define __UTIL_TYPES_UINT3_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." @@ -10,17 +9,17 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ struct uint3 { uint x, y, z; +# ifndef __KERNEL_GPU__ __forceinline uint operator[](uint i) const; __forceinline uint &operator[](uint i); +# endif }; ccl_device_inline uint3 make_uint3(uint x, uint y, uint z); -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ CCL_NAMESPACE_END - -#endif /* __UTIL_TYPES_UINT3_H__ */ diff --git a/intern/cycles/util/types_uint3_impl.h b/intern/cycles/util/types_uint3_impl.h index f4d3d72469c..d36c9f52de9 100644 --- a/intern/cycles/util/types_uint3_impl.h +++ b/intern/cycles/util/types_uint3_impl.h @@ -1,8 +1,7 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#ifndef __UTIL_TYPES_UINT3_IMPL_H__ -#define __UTIL_TYPES_UINT3_IMPL_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." @@ -10,7 +9,8 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ +# ifndef __KERNEL_GPU__ __forceinline uint uint3::operator[](uint i) const { util_assert(i < 3); @@ -22,14 +22,13 @@ __forceinline uint &uint3::operator[](uint i) util_assert(i < 3); return *(&x + i); } +# endif ccl_device_inline uint3 make_uint3(uint x, uint y, uint z) { uint3 a = {x, y, z}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ CCL_NAMESPACE_END - -#endif /* __UTIL_TYPES_UINT3_IMPL_H__ */ diff --git a/intern/cycles/util/types_uint4.h b/intern/cycles/util/types_uint4.h index d582b91d2a0..4982b30f577 100644 --- a/intern/cycles/util/types_uint4.h +++ b/intern/cycles/util/types_uint4.h @@ -1,8 +1,7 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#ifndef __UTIL_TYPES_UINT4_H__ -#define __UTIL_TYPES_UINT4_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." @@ -10,17 +9,17 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ struct uint4 { uint x, y, z, w; +# ifndef __KERNEL_GPU__ __forceinline uint operator[](uint i) const; __forceinline uint &operator[](uint i); +# endif }; ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w); -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ CCL_NAMESPACE_END - -#endif /* __UTIL_TYPES_UINT4_H__ */ diff --git a/intern/cycles/util/types_uint4_impl.h b/intern/cycles/util/types_uint4_impl.h index 98a4c5e9fe9..1cfdb9e0992 100644 --- a/intern/cycles/util/types_uint4_impl.h +++ b/intern/cycles/util/types_uint4_impl.h @@ -1,8 +1,7 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#ifndef __UTIL_TYPES_UINT4_IMPL_H__ -#define __UTIL_TYPES_UINT4_IMPL_H__ +#pragma once #ifndef __UTIL_TYPES_H__ # error "Do not include this file directly, include util/types.h instead." @@ -10,7 +9,8 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ +# ifndef __KERNEL_GPU__ __forceinline uint uint4::operator[](uint i) const { util_assert(i < 3); @@ -22,14 +22,13 @@ __forceinline uint &uint4::operator[](uint i) util_assert(i < 3); return *(&x + i); } +# endif ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w) { uint4 a = {x, y, z, w}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */ CCL_NAMESPACE_END - -#endif /* __UTIL_TYPES_UINT4_IMPL_H__ */ diff --git a/intern/cycles/util/types_ushort4.h b/intern/cycles/util/types_ushort4.h index 1766c6bf734..aef36f63285 100644 --- a/intern/cycles/util/types_ushort4.h +++ b/intern/cycles/util/types_ushort4.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#ifndef __KERNEL_NATIVE_VECTOR_TYPES__ struct ushort4 { uint16_t x, y, z, w; diff --git a/intern/cycles/util/types_vector3.h b/intern/cycles/util/types_vector3.h deleted file mode 100644 index 2e0d68e1bd0..00000000000 --- a/intern/cycles/util/types_vector3.h +++ /dev/null @@ -1,26 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright 2011-2022 Blender Foundation */ - -#ifndef __UTIL_TYPES_VECTOR3_H__ -#define __UTIL_TYPES_VECTOR3_H__ - -#ifndef __UTIL_TYPES_H__ -# error "Do not include this file directly, include util/types.h instead." -#endif - -CCL_NAMESPACE_BEGIN - -#ifndef __KERNEL_GPU__ -template<typename T> class vector3 { - public: - T x, y, z; - - __forceinline vector3(); - __forceinline vector3(const T &a); - __forceinline vector3(const T &x, const T &y, const T &z); -}; -#endif /* __KERNEL_GPU__ */ - -CCL_NAMESPACE_END - -#endif /* __UTIL_TYPES_VECTOR3_H__ */ diff --git a/intern/cycles/util/types_vector3_impl.h b/intern/cycles/util/types_vector3_impl.h deleted file mode 100644 index a765780e2d3..00000000000 --- a/intern/cycles/util/types_vector3_impl.h +++ /dev/null @@ -1,30 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright 2011-2022 Blender Foundation */ - -#ifndef __UTIL_TYPES_VECTOR3_IMPL_H__ -#define __UTIL_TYPES_VECTOR3_IMPL_H__ - -#ifndef __UTIL_TYPES_H__ -# error "Do not include this file directly, include util/types.h instead." -#endif - -CCL_NAMESPACE_BEGIN - -#ifndef __KERNEL_GPU__ -template<typename T> ccl_always_inline vector3<T>::vector3() -{ -} - -template<typename T> ccl_always_inline vector3<T>::vector3(const T &a) : x(a), y(a), z(a) -{ -} - -template<typename T> -ccl_always_inline vector3<T>::vector3(const T &x, const T &y, const T &z) : x(x), y(y), z(z) -{ -} -#endif /* __KERNEL_GPU__ */ - -CCL_NAMESPACE_END - -#endif /* __UTIL_TYPES_VECTOR3_IMPL_H__ */ diff --git a/intern/cycles/util/vector.h b/intern/cycles/util/vector.h index 0056fb269ae..9e27997cf2c 100644 --- a/intern/cycles/util/vector.h +++ b/intern/cycles/util/vector.h @@ -10,7 +10,6 @@ #include "util/aligned_malloc.h" #include "util/guarded_allocator.h" -#include "util/types.h" CCL_NAMESPACE_BEGIN |