From a296b8f694d1a93d40da78312758580f69b43be7 Mon Sep 17 00:00:00 2001 From: Christian Rauch Date: Mon, 15 Aug 2022 14:58:04 +0200 Subject: GPU: replace GLEW with libepoxy With libepoxy we can choose between EGL and GLX at runtime, as well as dynamically open EGL and GLX libraries without linking to them. This will make it possible to build with Wayland, EGL, GLVND support while still running on systems that only have X11, GLX and libGL. It also paves the way for headless rendering through EGL. libepoxy is a new library dependency, and is included in the precompiled libraries. GLEW is no longer a dependency, and WITH_SYSTEM_GLEW was removed. Includes contributions by Brecht Van Lommel, Ray Molenkamp, Campbell Barton and Sergey Sharybin. Ref T76428 Differential Revision: https://developer.blender.org/D15291 --- intern/cycles/util/CMakeLists.txt | 1 - intern/cycles/util/opengl.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'intern/cycles/util') diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 81a7607baab..2dafe729dfe 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -3,7 +3,6 @@ set(INC .. - ../../glew-mx ) set(INC_SYS diff --git a/intern/cycles/util/opengl.h b/intern/cycles/util/opengl.h index 090deb861c4..fefee4ec022 100644 --- a/intern/cycles/util/opengl.h +++ b/intern/cycles/util/opengl.h @@ -7,6 +7,6 @@ /* OpenGL header includes, used everywhere we use OpenGL, to deal with * platform differences in one central place. */ -#include <GL/glew.h> +#include <epoxy/gl.h> #endif /* __UTIL_OPENGL_H__ */ -- cgit v1.2.3 From 8ffc11dbcb21e81634e8f22cd65fdc921c7320d1 Mon Sep 17 00:00:00 2001 From: Sebastian Parborg Date: Mon, 15 Aug 2022 16:44:24 +0200 Subject: Cleanup OpenGL linking and related code after libepoxy merge This cleans up the OpenGL build flags and linking. It also removes some dead code.
One of these dead code paths is WITH_X11_ALPHA which actually never was active even with the build flag on. The call to use this was never called because the default initializer for GHOST was set to have it off by default. Nothing called this function with a boolean value to enable it. These cleanups are needed to support true headless OpenGL rendering. Without these cleanups libepoxy will fail to load the correct OpenGL Libraries as we have already linked them to the blender binary. Reviewed By: Brecht, Campbell, Jeroen Differential Revision: http://developer.blender.org/D15554 --- intern/cycles/util/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) (limited to 'intern/cycles/util') diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 2dafe729dfe..997d574a3b0 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -148,6 +148,4 @@ endif() include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) -add_definitions(${GL_DEFINITIONS}) - cycles_add_library(cycles_util "${LIB}" ${SRC} ${SRC_HEADERS}) -- cgit v1.2.3 From a06c9b5ca8364f95bbfa6c3bedd23307e6817437 Mon Sep 17 00:00:00 2001 From: Nathan Vegdahl Date: Thu, 18 Aug 2022 20:45:09 +0200 Subject: Cycles: add Sobol-Burley sampling pattern Based on the paper "Practical Hash-based Owen Scrambling" by Brent Burley, 2020, Journal of Computer Graphics Techniques. It is distinct from the existing Sobol sampler in two important ways: * It is Owen scrambled, which gives it a much better convergence rate in many situations. * It uses padding for higher dimensions, rather than using higher Sobol dimensions directly. In practice this is advantageous because high-dimensional Sobol sequences have holes in their sampling patterns that don't resolve until an unreasonable number of samples are taken. (See Burley's paper for details.) The pattern reduces noise in some benchmark scenes, however it is also slower, particularly on the CPU.
So for now Progressive Multi-Jittered sampling remains the default. Differential Revision: https://developer.blender.org/D15679 --- intern/cycles/util/hash.h | 120 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 116 insertions(+), 4 deletions(-) (limited to 'intern/cycles/util') diff --git a/intern/cycles/util/hash.h b/intern/cycles/util/hash.h index 081b33025d8..61705276a90 100644 --- a/intern/cycles/util/hash.h +++ b/intern/cycles/util/hash.h @@ -8,6 +8,23 @@ CCL_NAMESPACE_BEGIN +/* [0, uint_max] -> [0.0, 1.0) */ +ccl_device_forceinline float uint_to_float_excl(uint n) +{ + // Note: we divide by 4294967808 instead of 2^32 because the latter + // leads to a [0.0, 1.0] mapping instead of [0.0, 1.0) due to floating + // point rounding error. 4294967808 unfortunately leaves (precisely) + // one unused ulp between the max number this outputs and 1.0, but + // that's the best you can do with this construction. + return (float)n * (1.0f / 4294967808.0f); +} + +/* [0, uint_max] -> [0.0, 1.0] */ +ccl_device_forceinline float uint_to_float_incl(uint n) +{ + return (float)n * (1.0f / (float)0xFFFFFFFFu); +} + /* ***** Jenkins Lookup3 Hash Functions ***** */ /* Source: http://burtleburtle.net/bob/c/lookup3.c */ @@ -116,22 +133,22 @@ ccl_device_inline uint hash_uint4(uint kx, uint ky, uint kz, uint kw) ccl_device_inline float hash_uint_to_float(uint kx) { - return (float)hash_uint(kx) / (float)0xFFFFFFFFu; + return uint_to_float_incl(hash_uint(kx)); } ccl_device_inline float hash_uint2_to_float(uint kx, uint ky) { - return (float)hash_uint2(kx, ky) / (float)0xFFFFFFFFu; + return uint_to_float_incl(hash_uint2(kx, ky)); } ccl_device_inline float hash_uint3_to_float(uint kx, uint ky, uint kz) { - return (float)hash_uint3(kx, ky, kz) / (float)0xFFFFFFFFu; + return uint_to_float_incl(hash_uint3(kx, ky, kz)); } ccl_device_inline float hash_uint4_to_float(uint kx, uint ky, uint kz, uint kw) { - return (float)hash_uint4(kx, ky, kz, kw) / (float)0xFFFFFFFFu; + return 
uint_to_float_incl(hash_uint4(kx, ky, kz, kw)); } /* Hashing float or float[234] into a float in the range [0, 1]. */ @@ -359,6 +376,101 @@ ccl_device_inline avxi hash_avxi4(avxi kx, avxi ky, avxi kz, avxi kw) #endif +/* ***** Hash Prospector Hash Functions ***** + * + * These are based on the high-quality 32-bit hash/mixings functions from + * https://github.com/skeeto/hash-prospector + */ + +ccl_device_inline uint hash_hp_uint(uint i) +{ + // The actual mixing function from Hash Prospector. + i ^= i >> 16; + i *= 0x21f0aaad; + i ^= i >> 15; + i *= 0xd35a2d97; + i ^= i >> 15; + + // The xor is just to make input zero not map to output zero. + // The number is randomly selected and isn't special. + return i ^ 0xe6fe3beb; +} + +/* Seedable version of hash_hp_uint() above. */ +ccl_device_inline uint hash_hp_seeded_uint(uint i, uint seed) +{ + // Manipulate the seed so it doesn't interact poorly with n when they + // are both e.g. incrementing. This isn't fool-proof, but is good + // enough for practical use. + seed ^= seed << 19; + + return hash_hp_uint(i ^ seed); +} + +/* Outputs [0.0, 1.0]. */ +ccl_device_inline float hash_hp_seeded_float(uint i, uint seed) +{ + return uint_to_float_incl(hash_hp_seeded_uint(i, seed)); +} + +/* ***** CMJ Hash Functions ***** + * + * These are based on one of the hash functions in the paper + * "Correlated Multi-Jittered Sampling" by Andrew Kensler, 2013. + * + * These are here for backwards-compatibility, and can be replaced + * by the Hash Prospector hashes above at some point. + * See https://developer.blender.org/D15679#426304 + */ + +ccl_device_inline uint hash_cmj_seeded_uint(uint i, uint seed) +{ + i ^= seed; + i ^= i >> 17; + i ^= i >> 10; + i *= 0xb36534e5; + i ^= i >> 12; + i ^= i >> 21; + i *= 0x93fc4795; + i ^= 0xdf6e307f; + i ^= i >> 17; + i *= 1 | seed >> 18; + + return i; +} + +/* Outputs [0.0, 1.0]. 
*/ +ccl_device_inline float hash_cmj_seeded_float(uint i, uint seed) +{ + return uint_to_float_excl(hash_cmj_seeded_uint(i, seed)); +} + +/* ***** Modified Wang Hash Functions ***** + * + * These are based on a bespoke modified version of the Wang hash, and + * can serve as a faster hash when quality isn't critical. + * + * The original Wang hash is documented here: + * https://www.burtleburtle.net/bob/hash/integer.html + */ + +ccl_device_inline uint hash_wang_seeded_uint(uint i, uint seed) +{ + i = (i ^ 61) ^ seed; + i += i << 3; + i ^= i >> 4; + i *= 0x27d4eb2d; + return i; +} + +/* Outputs [0.0, 1.0]. */ +ccl_device_inline float hash_wang_seeded_float(uint i, uint seed) +{ + return uint_to_float_incl(hash_wang_seeded_uint(i, seed)); +} + +/* ********** */ + #ifndef __KERNEL_GPU__ static inline uint hash_string(const char *str) { -- cgit v1.2.3 From 4b62970dd378164a9f5d4592f923ae92a894da87 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Fri, 19 Aug 2022 20:32:42 +0200 Subject: Cleanup: replace CHECK_TYPE macro with static_assert To avoid conflicts with BLI headers and simplify code. --- intern/cycles/util/defines.h | 40 ---------------------------------------- 1 file changed, 40 deletions(-) (limited to 'intern/cycles/util') diff --git a/intern/cycles/util/defines.h b/intern/cycles/util/defines.h index d0df1a221fc..56a41a1dc45 100644 --- a/intern/cycles/util/defines.h +++ b/intern/cycles/util/defines.h @@ -89,46 +89,6 @@ # define UNLIKELY(x) (x) #endif -#if defined(__GNUC__) || defined(__clang__) -# if defined(__cplusplus) -/* Some magic to be sure we don't have reference in the type. */ -template<typename T> static inline T decltype_helper(T x) -{ - return x; -} -# define TYPEOF(x) decltype(decltype_helper(x)) -# else -# define TYPEOF(x) typeof(x) -# endif -#endif - -/* Causes warning: - * incompatible types when assigning to type 'Foo' from type 'Bar' - * ...
the compiler optimizes away the temp var */ -#ifdef __GNUC__ -# define CHECK_TYPE(var, type) \ - { \ - TYPEOF(var) * __tmp; \ - __tmp = (type *)NULL; \ - (void)__tmp; \ - } \ - (void)0 - -# define CHECK_TYPE_PAIR(var_a, var_b) \ - { \ - TYPEOF(var_a) * __tmp; \ - __tmp = (typeof(var_b) *)NULL; \ - (void)__tmp; \ - } \ - (void)0 -#else -# define CHECK_TYPE(var, type) -# define CHECK_TYPE_PAIR(var_a, var_b) -#endif - -/* can be used in simple macros */ -#define CHECK_TYPE_INLINE(val, type) ((void)(((type)0) != (val))) - #ifndef __KERNEL_GPU__ # include # define util_assert(statement) assert(statement) -- cgit v1.2.3 From a3e1a9e2aace26a71c2698cd96ce4086db25e94d Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Fri, 26 Aug 2022 12:45:20 +1000 Subject: Cleanup: spelling in comments, format --- intern/cycles/util/hash.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'intern/cycles/util') diff --git a/intern/cycles/util/hash.h b/intern/cycles/util/hash.h index 61705276a90..351b8796be7 100644 --- a/intern/cycles/util/hash.h +++ b/intern/cycles/util/hash.h @@ -378,7 +378,7 @@ ccl_device_inline avxi hash_avxi4(avxi kx, avxi ky, avxi kz, avxi kw) /* ***** Hash Prospector Hash Functions ***** * - * These are based on the high-quality 32-bit hash/mixings functions from + * These are based on the high-quality 32-bit hash/mixing functions from * https://github.com/skeeto/hash-prospector */ -- cgit v1.2.3 From 50df9caef01a4225db216d9c4c0515134f7a37bf Mon Sep 17 00:00:00 2001 From: Nathan Vegdahl Date: Tue, 23 Aug 2022 20:48:48 +0200 Subject: Cycles: improve Progressive Multi-Jittered sampling Fix two issues in the previous implementation: * Only power-of-two prefixes were progressively stratified, not suffixes. This resulted in unnecessarily increased noise when using non-power-of-two sample counts. * In order to try to get away with just a single sample pattern, the code used a combination of sample index shuffling and Cranley-Patterson rotation. 
Index shuffling is normally fine, but due to the sample patterns themselves not being quite right (as described above) this actually resulted in additional increased noise. Cranley-Patterson, on the other hand, always increases noise with randomized (t,s) nets like PMJ02, and should be avoided with these kinds of sequences. Addressed with the following changes: * Replace the sample pattern generation code with a much simpler algorithm recently published in the paper "Stochastic Generation of (t, s) Sample Sequences". This new implementation is easier to verify, produces fully progressively stratified PMJ02, and is *far* faster than the previous code, being O(N) in the number of samples generated. * It keeps the sample index shuffling, which works correctly now due to the improved sample patterns. But it now uses a newer high-quality hash instead of the original Laine-Karras hash. * The scrambling distance feature cannot (to my knowledge) be implemented with any decorrelation strategy other than Cranley-Patterson, so Cranley-Patterson is still used when that feature is enabled. But it is now disabled otherwise, since it increases noise. * In place of Cranley-Patterson, multiple independent patterns are generated and randomly chosen for different pixels and dimensions as described in the original PMJ paper. In this patch, the pattern selection is done via hash-based shuffling to ensure there are no repeats within a single pixel until all patterns have been used. The combination of these fixes brings the quality of Cycles' PMJ sampler in line with the previously submitted Sobol-Burley sampler in D15679. They are essentially indistinguishable in terms of quality/noise, which is expected since they are both randomized (0,2) sequences. 
Differential Revision: https://developer.blender.org/D15746 --- intern/cycles/util/hash.h | 91 +++++++++++++++++++++++++++++------------------ 1 file changed, 57 insertions(+), 34 deletions(-) (limited to 'intern/cycles/util') diff --git a/intern/cycles/util/hash.h b/intern/cycles/util/hash.h index 351b8796be7..4f83f331229 100644 --- a/intern/cycles/util/hash.h +++ b/intern/cycles/util/hash.h @@ -4,6 +4,7 @@ #ifndef __UTIL_HASH_H__ #define __UTIL_HASH_H__ +#include "util/math.h" #include "util/types.h" CCL_NAMESPACE_BEGIN @@ -407,42 +408,16 @@ ccl_device_inline uint hash_hp_seeded_uint(uint i, uint seed) return hash_hp_uint(i ^ seed); } -/* Outputs [0.0, 1.0]. */ -ccl_device_inline float hash_hp_seeded_float(uint i, uint seed) +/* Outputs [0.0, 1.0). */ +ccl_device_inline float hash_hp_float(uint i) { - return uint_to_float_incl(hash_hp_seeded_uint(i, seed)); + return uint_to_float_excl(hash_hp_uint(i)); } -/* ***** CMJ Hash Functions ***** - * - * These are based on one of the hash functions in the paper - * "Correlated Multi-Jittered Sampling" by Andrew Kensler, 2013. - * - * These are here for backwards-compatibility, and can be replaced - * by the Hash Prospector hashes above at some point. - * See https://developer.blender.org/D15679#426304 - */ - -ccl_device_inline uint hash_cmj_seeded_uint(uint i, uint seed) -{ - i ^= seed; - i ^= i >> 17; - i ^= i >> 10; - i *= 0xb36534e5; - i ^= i >> 12; - i ^= i >> 21; - i *= 0x93fc4795; - i ^= 0xdf6e307f; - i ^= i >> 17; - i *= 1 | seed >> 18; - - return i; -} - -/* Outputs [0.0, 1.0]. */ -ccl_device_inline float hash_cmj_seeded_float(uint i, uint seed) +/* Outputs [0.0, 1.0). 
*/ +ccl_device_inline float hash_hp_seeded_float(uint i, uint seed) { - return uint_to_float_excl(hash_cmj_seeded_uint(i, seed)); + return uint_to_float_excl(hash_hp_seeded_uint(i, seed)); } /* ***** Modified Wang Hash Functions ***** @@ -463,10 +438,58 @@ ccl_device_inline uint hash_wang_seeded_uint(uint i, uint seed) return i; } -/* Outputs [0.0, 1.0]. */ +/* Outputs [0.0, 1.0). */ ccl_device_inline float hash_wang_seeded_float(uint i, uint seed) { - return uint_to_float_incl(hash_wang_seeded_uint(i, seed)); + return uint_to_float_excl(hash_wang_seeded_uint(i, seed)); +} + +/* ***** Index Shuffling Hash Function ***** + * + * This function takes an index, the length of the thing the index points + * into, and returns a shuffled index. For example, if you pass indices + * 0 through 19 to this function with a length parameter of 20, it will + * return the indices in a shuffled order with no repeats. Indices + * larger than the length parameter will simply repeat the same shuffled + * pattern over and over. + * + * This is useful for iterating over an array in random shuffled order + * without having to shuffle the array itself. + * + * Passing different seeds results in different random shuffles. + * + * This function runs in average O(1) time. + * + * See https://andrew-helmer.github.io/permute/ for details on how this + * works. 
+ */ +ccl_device_inline uint hash_shuffle_uint(uint i, uint length, uint seed) +{ + i = i % length; + uint mask = (1 << (32 - count_leading_zeros(length - 1))) - 1; + + do { + i ^= seed; + i *= 0xe170893d; + i ^= seed >> 16; + i ^= (i & mask) >> 4; + i ^= seed >> 8; + i *= 0x0929eb3f; + i ^= seed >> 23; + i ^= (i & mask) >> 1; + i *= 1 | seed >> 27; + i *= 0x6935fa69; + i ^= (i & mask) >> 11; + i *= 0x74dcb303; + i ^= (i & mask) >> 2; + i *= 0x9e501cc3; + i ^= (i & mask) >> 2; + i *= 0xc860a3df; + i &= mask; + i ^= i >> 5; + } while (i >= length); + + return i; } /* ********** */ -- cgit v1.2.3 From 6951e8890ae3d0923e377cff6023d78202d81a03 Mon Sep 17 00:00:00 2001 From: Lukas Stockner Date: Sat, 3 Sep 2022 17:21:44 +0200 Subject: Mikktspace: Optimized port to C++ This commit is a big overhaul to the Mikktspace module, which is used to compute tangents. I'm not calling it a rewrite since it's the result of a lot of iterations on the original code, but pretty much everything is reworked somehow. Overall goal was to a) make it faster and b) make it maintainable. Notable changes: - Since the callbacks for requesting geometry data were a big bottleneck before, I've ported it to C++ and made it header-only, templating on the data source. That way, the compiler generates code specific to the caller, which allows it to inline the data source and specialize for some cases (e.g. subd vs. non-subd in Cycles). - The one input parameter, an optional angle threshold, was not used anywhere. Turns out that removing it allows for considerable algorithmic simplification, removing a lot of the complexity in the later stages. Therefore, I've just removed the option in the new code. - The code computes several outputs, but only one (the tangent itself) is ever used in Blender. Therefore, I've removed the others to simplify the code. They could easily be brought back if needed, none of the algorithmic simplifications are conflicting with them. 
- The original code had fallback paths for many steps in case temporary memory allocation fails, but that never actually gets used anyways since malloc() doesn't really ever return NULL in practice, so I removed them. - In general, I've restructured A LOT of the code to make the algorithms clearer and make use of some C++ features (vectors, std::array, booleans, classes), though there's still some cleanup that could be done. - Parallelized duplicate detection, neighbor detection, triangle tangent computation, degenerate triangle handling and tangent space accumulation. - Replaced several algorithms with faster equivalents: Duplicate detection uses a (concurrent) hash set now, neighbor detection uses Radixsort and splits vertices by index pairs etc. As for results, the exact speedup depends on the scene of course, but let's consider the file from T97378: - Blender 3.1 (before D14675): 6.07sec - Blender 3.2 (with D14675): 4.62sec - rBf0a36599007d (last nightly build): 4.42sec - With this commit: 0.90sec This speedup will mostly be noticed at the start of Cycles renders and, even more importantly, in Eevee when doing something that changes the geometry (e.g. animating) on a model using normal maps. Differential Revision: https://developer.blender.org/D15589 --- intern/cycles/util/math.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'intern/cycles/util') diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h index 0585dcc8ad5..0905b3ec5c9 100644 --- a/intern/cycles/util/math.h +++ b/intern/cycles/util/math.h @@ -886,16 +886,16 @@ ccl_device_inline float2 map_to_tube(const float3 co) ccl_device_inline float2 map_to_sphere(const float3 co) { - float l = len(co); + float l = dot(co, co); float u, v; if (l > 0.0f) { if (UNLIKELY(co.x == 0.0f && co.y == 0.0f)) { u = 0.0f; /* Otherwise domain error.
*/ } else { - u = (1.0f - atan2f(co.x, co.y) / M_PI_F) / 2.0f; + u = (0.5f - atan2f(co.x, co.y) * M_1_2PI_F); } - v = 1.0f - safe_acosf(co.z / l) / M_PI_F; + v = 1.0f - safe_acosf(co.z / sqrtf(l)) * M_1_PI_F; } else { u = v = 0.0f; -- cgit v1.2.3 From fd1bc90679a89a29172683939827be9c565e2217 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Sun, 18 Sep 2022 17:32:53 +0200 Subject: Cycles: sync changes from standalone repository * Windows build fixes * Workaround for Hydra + OpenColorIO link issue * Bump version --- intern/cycles/util/version.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'intern/cycles/util') diff --git a/intern/cycles/util/version.h b/intern/cycles/util/version.h index 8260dd4f55d..fb02c3eaeb7 100644 --- a/intern/cycles/util/version.h +++ b/intern/cycles/util/version.h @@ -9,7 +9,7 @@ CCL_NAMESPACE_BEGIN #define CYCLES_VERSION_MAJOR 3 -#define CYCLES_VERSION_MINOR 2 +#define CYCLES_VERSION_MINOR 4 #define CYCLES_VERSION_PATCH 0 #define CYCLES_MAKE_VERSION_STRING2(a, b, c) #a "." #b "." #c @@ -19,7 +19,7 @@ CCL_NAMESPACE_BEGIN /* Blender libraries version compatible with this version */ -#define CYCLES_BLENDER_LIBRARIES_VERSION 3.1 +#define CYCLES_BLENDER_LIBRARIES_VERSION 3.3 CCL_NAMESPACE_END -- cgit v1.2.3