diff options
Diffstat (limited to 'intern/cycles/util')
-rw-r--r-- | intern/cycles/util/CMakeLists.txt | 1 | ||||
-rw-r--r-- | intern/cycles/util/util_atomic.h | 14 | ||||
-rw-r--r-- | intern/cycles/util/util_debug.cpp | 7 | ||||
-rw-r--r-- | intern/cycles/util/util_debug.h | 7 | ||||
-rw-r--r-- | intern/cycles/util/util_defines.h | 135 | ||||
-rw-r--r-- | intern/cycles/util/util_logging.h | 16 | ||||
-rw-r--r-- | intern/cycles/util/util_math.h | 12 | ||||
-rw-r--r-- | intern/cycles/util/util_math_float3.h | 26 | ||||
-rw-r--r-- | intern/cycles/util/util_math_float4.h | 109 | ||||
-rw-r--r-- | intern/cycles/util/util_math_matrix.h | 56 | ||||
-rw-r--r-- | intern/cycles/util/util_optimization.h | 52 | ||||
-rw-r--r-- | intern/cycles/util/util_path.cpp | 213 | ||||
-rw-r--r-- | intern/cycles/util/util_progress.h | 19 | ||||
-rw-r--r-- | intern/cycles/util/util_simd.h | 184 | ||||
-rw-r--r-- | intern/cycles/util/util_sseb.h | 3 | ||||
-rw-r--r-- | intern/cycles/util/util_ssef.h | 3 | ||||
-rw-r--r-- | intern/cycles/util/util_ssei.h | 9 | ||||
-rw-r--r-- | intern/cycles/util/util_stats.h | 2 | ||||
-rw-r--r-- | intern/cycles/util/util_string.cpp | 6 | ||||
-rw-r--r-- | intern/cycles/util/util_task.cpp | 4 | ||||
-rw-r--r-- | intern/cycles/util/util_time.h | 7 | ||||
-rw-r--r-- | intern/cycles/util/util_types.h | 131 |
22 files changed, 593 insertions, 423 deletions
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 43f9a57d099..7f3747a0f58 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -38,6 +38,7 @@ set(SRC_HEADERS util_atomic.h util_boundbox.h util_debug.h + util_defines.h util_guarded_allocator.cpp util_foreach.h util_function.h diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h index 6c52117ef9a..f3c7ae546a0 100644 --- a/intern/cycles/util/util_atomic.h +++ b/intern/cycles/util/util_atomic.h @@ -22,19 +22,10 @@ /* Using atomic ops header from Blender. */ #include "atomic_ops.h" -ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value) -{ - size_t prev_value = *maximum_value; - while(prev_value < value) { - if(atomic_cas_z(maximum_value, prev_value, value) != prev_value) { - break; - } - } -} - #define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x)) #define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1) +#define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_add_uint32((p), -1) #define CCL_LOCAL_MEM_FENCE 0 #define ccl_barrier(flags) (void)0 @@ -68,6 +59,7 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *so #define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x)) #define atomic_fetch_and_inc_uint32(p) atomic_inc((p)) +#define atomic_fetch_and_dec_uint32(p) atomic_dec((p)) #define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE #define ccl_barrier(flags) barrier(flags) @@ -79,7 +71,9 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *so #define atomic_add_and_fetch_float(p, x) (atomicAdd((float*)(p), (float)(x)) + (float)(x)) #define atomic_fetch_and_add_uint32(p, x) atomicAdd((unsigned int*)(p), (unsigned int)(x)) +#define atomic_fetch_and_sub_uint32(p, x) atomicSub((unsigned int*)(p), (unsigned int)(x)) #define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1) +#define 
atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1) #define CCL_LOCAL_MEM_FENCE #define ccl_barrier(flags) __syncthreads() diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index ab038d2b9fb..eb078d69252 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -122,13 +122,16 @@ void DebugFlags::OpenCL::reset() } DebugFlags::DebugFlags() +: viewport_static_bvh(false) { /* Nothing for now. */ } void DebugFlags::reset() { + viewport_static_bvh = false; cpu.reset(); + cuda.reset(); opencl.reset(); } @@ -184,8 +187,8 @@ std::ostream& operator <<(std::ostream &os, << " Device type : " << opencl_device_type << "\n" << " Kernel type : " << opencl_kernel_type << "\n" << " Debug : " << string_from_bool(debug_flags.opencl.debug) << "\n" - << " Single program : " << string_from_bool(debug_flags.opencl.single_program) - << "\n"; + << " Single program : " << string_from_bool(debug_flags.opencl.single_program) << "\n" + << " Memory limit : " << string_human_readable_size(debug_flags.opencl.mem_limit) << "\n"; return os; } diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h index 4505d584490..9255279c5ab 100644 --- a/intern/cycles/util/util_debug.h +++ b/intern/cycles/util/util_debug.h @@ -30,6 +30,9 @@ CCL_NAMESPACE_BEGIN */ class DebugFlags { public: + /* Use static BVH in viewport, to match final render exactly. */ + bool viewport_static_bvh; + /* Descriptor of CPU feature-set to be used. */ struct CPU { CPU(); @@ -115,6 +118,10 @@ public: /* Use single program */ bool single_program; + + /* TODO(mai): Currently this is only for OpenCL, but we should have it implemented for all devices. */ + /* Artificial memory limit in bytes (0 if disabled). */ + size_t mem_limit; }; /* Get instance of debug flags registry. 
*/ diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h new file mode 100644 index 00000000000..ae654092c87 --- /dev/null +++ b/intern/cycles/util/util_defines.h @@ -0,0 +1,135 @@ + +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_DEFINES_H__ +#define __UTIL_DEFINES_H__ + +/* Bitness */ + +#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) +# define __KERNEL_64_BIT__ +#endif + +/* Qualifiers for kernel code shared by CPU and GPU */ + +#ifndef __KERNEL_GPU__ +# define ccl_device static inline +# define ccl_device_noinline static +# define ccl_global +# define ccl_constant +# define ccl_local +# define ccl_local_param +# define ccl_private +# define ccl_restrict __restrict +# define ccl_ref & +# define __KERNEL_WITH_SSE_ALIGN__ + +# if defined(_WIN32) && !defined(FREE_WINDOWS) +# define ccl_device_inline static __forceinline +# define ccl_device_forceinline static __forceinline +# define ccl_align(...) __declspec(align(__VA_ARGS__)) +# ifdef __KERNEL_64_BIT__ +# define ccl_try_align(...) __declspec(align(__VA_ARGS__)) +# else /* __KERNEL_64_BIT__ */ +# undef __KERNEL_WITH_SSE_ALIGN__ +/* No support for function arguments (error C2719). */ +# define ccl_try_align(...) 
+# endif /* __KERNEL_64_BIT__ */ +# define ccl_may_alias +# define ccl_always_inline __forceinline +# define ccl_never_inline __declspec(noinline) +# define ccl_maybe_unused +# else /* _WIN32 && !FREE_WINDOWS */ +# define ccl_device_inline static inline __attribute__((always_inline)) +# define ccl_device_forceinline static inline __attribute__((always_inline)) +# define ccl_align(...) __attribute__((aligned(__VA_ARGS__))) +# ifndef FREE_WINDOWS64 +# define __forceinline inline __attribute__((always_inline)) +# endif +# define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) +# define ccl_may_alias __attribute__((__may_alias__)) +# define ccl_always_inline __attribute__((always_inline)) +# define ccl_never_inline __attribute__((noinline)) +# define ccl_maybe_unused __attribute__((used)) +# endif /* _WIN32 && !FREE_WINDOWS */ + +/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */ +# if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */ +# define ATTR_FALLTHROUGH __attribute__((fallthrough)) +# else +# define ATTR_FALLTHROUGH ((void)0) +# endif +#endif /* __KERNEL_GPU__ */ + +/* macros */ + +/* hints for branch prediction, only use in code that runs a _lot_ */ +#if defined(__GNUC__) && defined(__KERNEL_CPU__) +# define LIKELY(x) __builtin_expect(!!(x), 1) +# define UNLIKELY(x) __builtin_expect(!!(x), 0) +#else +# define LIKELY(x) (x) +# define UNLIKELY(x) (x) +#endif + +#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800)) +# define HAS_CPP11_FEATURES +#endif + +#if defined(__GNUC__) || defined(__clang__) +# if defined(HAS_CPP11_FEATURES) +/* Some magic to be sure we don't have reference in the type. */ +template<typename T> static inline T decltype_helper(T x) { return x; } +# define TYPEOF(x) decltype(decltype_helper(x)) +# else +# define TYPEOF(x) typeof(x) +# endif +#endif + +/* Causes warning: + * incompatible types when assigning to type 'Foo' from type 'Bar' + * ... 
the compiler optimizes away the temp var */ +#ifdef __GNUC__ +#define CHECK_TYPE(var, type) { \ + TYPEOF(var) *__tmp; \ + __tmp = (type *)NULL; \ + (void)__tmp; \ +} (void)0 + +#define CHECK_TYPE_PAIR(var_a, var_b) { \ + TYPEOF(var_a) *__tmp; \ + __tmp = (typeof(var_b) *)NULL; \ + (void)__tmp; \ +} (void)0 +#else +# define CHECK_TYPE(var, type) +# define CHECK_TYPE_PAIR(var_a, var_b) +#endif + +/* can be used in simple macros */ +#define CHECK_TYPE_INLINE(val, type) \ + ((void)(((type)0) != (val))) + +#ifndef __KERNEL_GPU__ +# include <cassert> +# define util_assert(statement) assert(statement) +#else +# define util_assert(statement) +#endif + +#endif /* __UTIL_DEFINES_H__ */ + diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h index ecf9c9cfee0..492f830e67c 100644 --- a/intern/cycles/util/util_logging.h +++ b/intern/cycles/util/util_logging.h @@ -19,28 +19,30 @@ #if defined(WITH_CYCLES_LOGGING) && !defined(__KERNEL_GPU__) # include <glog/logging.h> -#else -# include <iostream> #endif +#include <iostream> + CCL_NAMESPACE_BEGIN #if !defined(WITH_CYCLES_LOGGING) || defined(__KERNEL_GPU__) -class StubStream : public std::ostream { - public: - StubStream() : std::ostream(NULL) { } +class StubStream { +public: + template<class T> + StubStream& operator<<(const T&) { + return *this; + } }; class LogMessageVoidify { public: LogMessageVoidify() { } - void operator&(::std::ostream&) { } + void operator&(StubStream&) { } }; # define LOG_SUPPRESS() (true) ? 
(void) 0 : LogMessageVoidify() & StubStream() # define LOG(severity) LOG_SUPPRESS() # define VLOG(severity) LOG_SUPPRESS() - #endif #define VLOG_ONCE(level, flag) if(!flag) flag = true, VLOG(level) diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index b719640b19c..fb04d49bcd9 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -94,6 +94,7 @@ ccl_device_inline float fminf(float a, float b) #ifndef __KERNEL_GPU__ using std::isfinite; using std::isnan; +using std::sqrt; ccl_device_inline int abs(int x) { @@ -223,7 +224,7 @@ ccl_device_inline bool isfinite_safe(float f) { /* By IEEE 754 rule, 2*Inf equals Inf */ unsigned int x = __float_as_uint(f); - return (f == f) && (x == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u); + return (f == f) && (x == 0 || x == (1u << 31) || (f != 2.0f*f)) && !((x << 1) > 0xff000000u); } ccl_device_inline float ensure_finite(float v) @@ -329,15 +330,22 @@ template<class A, class B> A lerp(const A& a, const A& b, const B& t) return (A)(a * ((B)1 - t) + b * t); } +#endif /* __KERNEL_OPENCL__ */ + /* Triangle */ +#ifndef __KERNEL_OPENCL__ ccl_device_inline float triangle_area(const float3& v1, const float3& v2, const float3& v3) +#else +ccl_device_inline float triangle_area(const float3 v1, + const float3 v2, + const float3 v3) +#endif { return len(cross(v3 - v2, v1 - v2))*0.5f; } -#endif /* __KERNEL_OPENCL__ */ /* Orthonormal vectors */ diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h index 5327d9f7cc6..e73e5bc17a2 100644 --- a/intern/cycles/util/util_math_float3.h +++ b/intern/cycles/util/util_math_float3.h @@ -108,8 +108,7 @@ ccl_device_inline float3 operator*(const float3& a, const float f) ccl_device_inline float3 operator*(const float f, const float3& a) { - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. 
*/ -#if defined(__KERNEL_SSE__) && 0 +#if defined(__KERNEL_SSE__) return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128)); #else return make_float3(a.x*f, a.y*f, a.z*f); @@ -118,10 +117,8 @@ ccl_device_inline float3 operator*(const float f, const float3& a) ccl_device_inline float3 operator/(const float f, const float3& a) { - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 - __m128 rc = _mm_rcp_ps(a.m128); - return float3(_mm_mul_ps(_mm_set1_ps(f),rc)); +#if defined(__KERNEL_SSE__) + return float3(_mm_div_ps(_mm_set1_ps(f), a.m128)); #else return make_float3(f / a.x, f / a.y, f / a.z); #endif @@ -135,10 +132,8 @@ ccl_device_inline float3 operator/(const float3& a, const float f) ccl_device_inline float3 operator/(const float3& a, const float3& b) { - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 - __m128 rc = _mm_rcp_ps(b.m128); - return float3(_mm_mul_ps(a, rc)); +#if defined(__KERNEL_SSE__) + return float3(_mm_div_ps(a.m128, b.m128)); #else return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); #endif @@ -282,9 +277,8 @@ ccl_device_inline float3 mix(const float3& a, const float3& b, float t) ccl_device_inline float3 rcp(const float3& a) { #ifdef __KERNEL_SSE__ - const float4 r(_mm_rcp_ps(a.m128)); - return float3(_mm_sub_ps(_mm_add_ps(r, r), - _mm_mul_ps(_mm_mul_ps(r, r), a))); + /* Don't use _mm_rcp_ps due to poor precision. 
*/ + return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128)); #else return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z); #endif @@ -374,9 +368,9 @@ ccl_device_inline bool isfinite3_safe(float3 v) ccl_device_inline float3 ensure_finite3(float3 v) { - if(!isfinite_safe(v.x)) v.x = 0.0; - if(!isfinite_safe(v.y)) v.y = 0.0; - if(!isfinite_safe(v.z)) v.z = 0.0; + if(!isfinite_safe(v.x)) v.x = 0.0f; + if(!isfinite_safe(v.y)) v.y = 0.0f; + if(!isfinite_safe(v.z)) v.z = 0.0f; return v; } diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h index d89121b3a1d..aa7e56fefe9 100644 --- a/intern/cycles/util/util_math_float4.h +++ b/intern/cycles/util/util_math_float4.h @@ -48,23 +48,30 @@ ccl_device_inline bool operator==(const float4& a, const float4& b); ccl_device_inline float dot(const float4& a, const float4& b); ccl_device_inline float len_squared(const float4& a); ccl_device_inline float4 rcp(const float4& a); +ccl_device_inline float4 sqrt(const float4& a); +ccl_device_inline float4 sqr(const float4& a); ccl_device_inline float4 cross(const float4& a, const float4& b); ccl_device_inline bool is_zero(const float4& a); -ccl_device_inline float reduce_add(const float4& a); ccl_device_inline float average(const float4& a); ccl_device_inline float len(const float4& a); ccl_device_inline float4 normalize(const float4& a); ccl_device_inline float4 safe_normalize(const float4& a); ccl_device_inline float4 min(const float4& a, const float4& b); ccl_device_inline float4 max(const float4& a, const float4& b); +ccl_device_inline float4 fabs(const float4& a); #endif /* !__KERNEL_OPENCL__*/ #ifdef __KERNEL_SSE__ template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __forceinline const float4 shuffle(const float4& b); +template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> +__forceinline const float4 shuffle(const float4& a, const float4& b); template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b); 
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b); +template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b); + # ifdef __KERNEL_SSE3__ template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b); template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b); @@ -77,9 +84,7 @@ ccl_device_inline float4 select(const int4& mask, const float4& b); ccl_device_inline float4 reduce_min(const float4& a); ccl_device_inline float4 reduce_max(const float4& a); -# if 0 ccl_device_inline float4 reduce_add(const float4& a); -# endif #endif /* !__KERNEL_GPU__ */ /******************************************************************************* @@ -128,7 +133,7 @@ ccl_device_inline float4 operator/(const float4& a, float f) ccl_device_inline float4 operator/(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ - return a * rcp(b); + return float4(_mm_div_ps(a.m128, b.m128)); #else return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); #endif @@ -171,8 +176,7 @@ ccl_device_inline float4 operator/=(float4& a, float f) ccl_device_inline int4 operator<(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ - /* TODO(sergey): avoid cvt. */ - return int4(_mm_cvtps_epi32(_mm_cmplt_ps(a.m128, b.m128))); + return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128))); #else return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w); #endif @@ -181,8 +185,7 @@ ccl_device_inline int4 operator<(const float4& a, const float4& b) ccl_device_inline int4 operator>=(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ - /* TODO(sergey): avoid cvt. 
*/ - return int4(_mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128))); + return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128))); #else return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w); #endif @@ -191,8 +194,7 @@ ccl_device_inline int4 operator>=(const float4& a, const float4& b) ccl_device_inline int4 operator<=(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ - /* TODO(sergey): avoid cvt. */ - return int4(_mm_cvtps_epi32(_mm_cmple_ps(a.m128, b.m128))); + return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128))); #else return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w); #endif @@ -224,14 +226,30 @@ ccl_device_inline float len_squared(const float4& a) ccl_device_inline float4 rcp(const float4& a) { #ifdef __KERNEL_SSE__ - float4 r(_mm_rcp_ps(a.m128)); - return float4(_mm_sub_ps(_mm_add_ps(r, r), - _mm_mul_ps(_mm_mul_ps(r, r), a))); + /* Don't use _mm_rcp_ps due to poor precision. */ + return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128)); #else return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w); #endif } +ccl_device_inline float4 sqrt(const float4& a) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_sqrt_ps(a.m128)); +#else + return make_float4(sqrtf(a.x), + sqrtf(a.y), + sqrtf(a.z), + sqrtf(a.w)); +#endif +} + +ccl_device_inline float4 sqr(const float4& a) +{ + return a * a; +} + ccl_device_inline float4 cross(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ @@ -254,20 +272,25 @@ ccl_device_inline bool is_zero(const float4& a) #endif } -ccl_device_inline float reduce_add(const float4& a) +ccl_device_inline float4 reduce_add(const float4& a) { #ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE3__ + float4 h(_mm_hadd_ps(a.m128, a.m128)); + return float4( _mm_hadd_ps(h.m128, h.m128)); +# else float4 h(shuffle<1,0,3,2>(a) + a); - /* TODO(sergey): Investigate efficiency. 
*/ - return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); + return shuffle<2,3,0,1>(h) + h; +# endif #else - return ((a.x + a.y) + (a.z + a.w)); + float sum = (a.x + a.y) + (a.z + a.w); + return make_float4(sum, sum, sum, sum); #endif } ccl_device_inline float average(const float4& a) { - return reduce_add(a) * 0.25f; + return reduce_add(a).x * 0.25f; } ccl_device_inline float len(const float4& a) @@ -309,6 +332,18 @@ ccl_device_inline float4 max(const float4& a, const float4& b) max(a.w, b.w)); #endif } + +ccl_device_inline float4 fabs(const float4& a) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); +#else + return make_float4(fabsf(a.x), + fabsf(a.y), + fabsf(a.z), + fabsf(a.w)); +#endif +} #endif /* !__KERNEL_OPENCL__*/ #ifdef __KERNEL_SSE__ @@ -320,11 +355,28 @@ __forceinline const float4 shuffle(const float4& b) _MM_SHUFFLE(index_3, index_2, index_1, index_0)))); } +template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> +__forceinline const float4 shuffle(const float4& a, const float4& b) +{ + return float4(_mm_shuffle_ps(a.m128, b.m128, + _MM_SHUFFLE(index_3, index_2, index_1, index_0))); +} + template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b) { return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b)))); } +template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b) +{ + return float4(_mm_movelh_ps(a.m128, b.m128)); +} + +template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b) +{ + return float4(_mm_movehl_ps(b.m128, a.m128)); +} + # ifdef __KERNEL_SSE3__ template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b) { @@ -344,9 +396,7 @@ ccl_device_inline float4 select(const int4& mask, const float4& b) { #ifdef __KERNEL_SSE__ - /* TODO(sergey): avoid cvt. 
*/ - return float4(_mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a), - _mm_andnot_ps(_mm_cvtepi32_ps(mask), b))); + return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128))); #else return make_float4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, @@ -355,6 +405,13 @@ ccl_device_inline float4 select(const int4& mask, #endif } +ccl_device_inline float4 mask(const int4& mask, + const float4& a) +{ + /* Replace elements of x with zero where mask isn't set. */ + return select(mask, a, make_float4(0.0f)); +} + ccl_device_inline float4 reduce_min(const float4& a) { #ifdef __KERNEL_SSE__ @@ -375,17 +432,15 @@ ccl_device_inline float4 reduce_max(const float4& a) #endif } -#if 0 -ccl_device_inline float4 reduce_add(const float4& a) +ccl_device_inline float4 load_float4(const float *v) { #ifdef __KERNEL_SSE__ - float4 h = shuffle<1,0,3,2>(a) + a; - return shuffle<2,3,0,1>(h) + h; + return float4(_mm_loadu_ps(v)); #else - return make_float4((a.x + a.y) + (a.z + a.w)); + return make_float4(v[0], v[1], v[2], v[3]); #endif } -#endif + #endif /* !__KERNEL_GPU__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h index c7511f8306e..b31dbe4fc67 100644 --- a/intern/cycles/util/util_math_matrix.h +++ b/intern/cycles/util/util_math_matrix.h @@ -223,20 +223,20 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float { const float singular_epsilon = 1e-9f; - for (int row = 0; row < n; row++) { - for (int col = 0; col < n; col++) { + for(int row = 0; row < n; row++) { + for(int col = 0; col < n; col++) { MATS(V, n, row, col, v_stride) = (col == row) ? 
1.0f : 0.0f; } } - for (int sweep = 0; sweep < 8; sweep++) { + for(int sweep = 0; sweep < 8; sweep++) { float off_diagonal = 0.0f; - for (int row = 1; row < n; row++) { - for (int col = 0; col < row; col++) { + for(int row = 1; row < n; row++) { + for(int col = 0; col < row; col++) { off_diagonal += fabsf(MAT(A, n, row, col)); } } - if (off_diagonal < 1e-7f) { + if(off_diagonal < 1e-7f) { /* The matrix has nearly reached diagonal form. * Since the eigenvalues are only used to determine truncation, their exact values aren't required - a relative error of a few ULPs won't matter at all. */ break; @@ -253,7 +253,7 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float float abs_element = fabsf(element); /* If we're in a later sweep and the element already is very small, just set it to zero and skip the rotation. */ - if (sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) { + if(sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) { MAT(A, n, row, col) = 0.0f; continue; } @@ -272,10 +272,10 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float * Then, we compute sin(phi) and cos(phi) themselves. */ float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col); float ratio; - if (abs_element > singular_epsilon*fabsf(singular_diff)) { + if(abs_element > singular_epsilon*fabsf(singular_diff)) { float cot_2phi = 0.5f*singular_diff / element; ratio = 1.0f / (fabsf(cot_2phi) + sqrtf(1.0f + cot_2phi*cot_2phi)); - if (cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */ + if(cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */ } else { ratio = element / singular_diff; @@ -315,21 +315,21 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float } /* Sort eigenvalues and the associated eigenvectors. 
*/ - for (int i = 0; i < n - 1; i++) { + for(int i = 0; i < n - 1; i++) { float v = MAT(A, n, i, i); int k = i; - for (int j = i; j < n; j++) { - if (MAT(A, n, j, j) >= v) { + for(int j = i; j < n; j++) { + if(MAT(A, n, j, j) >= v) { v = MAT(A, n, j, j); k = j; } } - if (k != i) { + if(k != i) { /* Swap eigenvalues. */ MAT(A, n, k, k) = MAT(A, n, i, i); MAT(A, n, i, i) = v; /* Swap eigenvectors. */ - for (int j = 0; j < n; j++) { + for(int j = 0; j < n; j++) { float v = MATS(V, n, i, j, v_stride); MATS(V, n, i, j, v_stride) = MATS(V, n, k, j, v_stride); MATS(V, n, k, j, v_stride) = v; @@ -339,59 +339,59 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float } #ifdef __KERNEL_SSE3__ -ccl_device_inline void math_vector_zero_sse(__m128 *A, int n) +ccl_device_inline void math_vector_zero_sse(float4 *A, int n) { for(int i = 0; i < n; i++) { - A[i] = _mm_setzero_ps(); + A[i] = make_float4(0.0f); } } -ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n) +ccl_device_inline void math_matrix_zero_sse(float4 *A, int n) { for(int row = 0; row < n; row++) { for(int col = 0; col <= row; col++) { - MAT(A, n, row, col) = _mm_setzero_ps(); + MAT(A, n, row, col) = make_float4(0.0f); } } } /* Add Gramian matrix of v to A. * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. 
*/ -ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, const __m128 *ccl_restrict v, __m128 weight) +ccl_device_inline void math_matrix_add_gramian_sse(float4 *A, int n, const float4 *ccl_restrict v, float4 weight) { for(int row = 0; row < n; row++) { for(int col = 0; col <= row; col++) { - MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight)); + MAT(A, n, row, col) = MAT(A, n, row, col) + v[row] * v[col] * weight; } } } -ccl_device_inline void math_vector_add_sse(__m128 *V, int n, const __m128 *ccl_restrict a) +ccl_device_inline void math_vector_add_sse(float4 *V, int n, const float4 *ccl_restrict a) { for(int i = 0; i < n; i++) { - V[i] = _mm_add_ps(V[i], a[i]); + V[i] += a[i]; } } -ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, const __m128 *ccl_restrict a) +ccl_device_inline void math_vector_mul_sse(float4 *V, int n, const float4 *ccl_restrict a) { for(int i = 0; i < n; i++) { - V[i] = _mm_mul_ps(V[i], a[i]); + V[i] *= a[i]; } } -ccl_device_inline void math_vector_max_sse(__m128 *a, const __m128 *ccl_restrict b, int n) +ccl_device_inline void math_vector_max_sse(float4 *a, const float4 *ccl_restrict b, int n) { for(int i = 0; i < n; i++) { - a[i] = _mm_max_ps(a[i], b[i]); + a[i] = max(a[i], b[i]); } } -ccl_device_inline void math_matrix_hsum(float *A, int n, const __m128 *ccl_restrict B) +ccl_device_inline void math_matrix_hsum(float *A, int n, const float4 *ccl_restrict B) { for(int row = 0; row < n; row++) { for(int col = 0; col <= row; col++) { - MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col)); + MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col))[0]; } } } diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h index 6f70a474fe7..0382c0811dd 100644 --- a/intern/cycles/util/util_optimization.h +++ b/intern/cycles/util/util_optimization.h @@ -19,16 +19,6 @@ #ifndef __KERNEL_GPU__ -/* quiet unused define warnings */ -#if 
defined(__KERNEL_SSE2__) || \ - defined(__KERNEL_SSE3__) || \ - defined(__KERNEL_SSSE3__) || \ - defined(__KERNEL_SSE41__) || \ - defined(__KERNEL_AVX__) || \ - defined(__KERNEL_AVX2__) - /* do nothing */ -#endif - /* x86 * * Compile a regular, SSE2 and SSE3 kernel. */ @@ -73,48 +63,6 @@ #endif /* defined(__x86_64__) || defined(_M_X64) */ -/* SSE Experiment - * - * This is disabled code for an experiment to use SSE types globally for types - * such as float3 and float4. Currently this gives an overall slowdown. */ - -#if 0 -# define __KERNEL_SSE__ -# ifndef __KERNEL_SSE2__ -# define __KERNEL_SSE2__ -# endif -# ifndef __KERNEL_SSE3__ -# define __KERNEL_SSE3__ -# endif -# ifndef __KERNEL_SSSE3__ -# define __KERNEL_SSSE3__ -# endif -# ifndef __KERNEL_SSE4__ -# define __KERNEL_SSE4__ -# endif -#endif - -/* SSE Intrinsics includes - * - * We assume __KERNEL_SSEX__ flags to have been defined at this point */ - -/* SSE intrinsics headers */ -#ifndef FREE_WINDOWS64 - -#ifdef _MSC_VER -# include <intrin.h> -#elif (defined(__x86_64__) || defined(__i386__)) -# include <x86intrin.h> -#endif - -#else - -/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>. - * Since we can't avoid including <windows.h>, better only include that */ -#include "util/util_windows.h" - -#endif - #endif #endif /* __UTIL_OPTIMIZATION_H__ */ diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp index f9c3b4bb139..bae5d5bd6d1 100644 --- a/intern/cycles/util/util_path.cpp +++ b/intern/cycles/util/util_path.cpp @@ -45,6 +45,7 @@ OIIO_NAMESPACE_USING # include <shlwapi.h> #endif +#include "util/util_map.h" #include "util/util_windows.h" CCL_NAMESPACE_BEGIN @@ -768,68 +769,180 @@ bool path_remove(const string& path) return remove(path.c_str()) == 0; } -static string line_directive(const string& base, const string& path, int line) +struct SourceReplaceState { + typedef map<string, string> ProcessedMapping; + /* Base director for all relative include headers. 
*/ + string base; + /* Result of processed files. */ + ProcessedMapping processed_files; + /* Set of files which are considered "precompiled" and which are replaced + * with and empty string on a subsequent occurrence in include statement. + */ + set<string> precompiled_headers; +}; + +static string path_source_replace_includes_recursive( + const string& source, + const string& source_filepath, + SourceReplaceState *state); + +static string line_directive(const SourceReplaceState& state, + const string& path, + const int line) { - string escaped_path = path; + string unescaped_path = path; /* First we make path relative. */ - if(string_startswith(escaped_path, base.c_str())) { - const string base_file = path_filename(base); - const size_t base_len = base.length(); - escaped_path = base_file + escaped_path.substr(base_len, - escaped_path.length() - base_len); + if(string_startswith(unescaped_path, state.base.c_str())) { + const string base_file = path_filename(state.base); + const size_t base_len = state.base.length(); + unescaped_path = base_file + + unescaped_path.substr(base_len, + unescaped_path.length() - base_len); } /* Second, we replace all unsafe characters. */ - string_replace(escaped_path, "\"", "\\\""); - string_replace(escaped_path, "\'", "\\\'"); - string_replace(escaped_path, "\?", "\\\?"); - string_replace(escaped_path, "\\", "\\\\"); + const size_t length = unescaped_path.length(); + string escaped_path = ""; + for(size_t i = 0; i < length; ++i) { + const char ch = unescaped_path[i]; + if(strchr("\"\'\?\\", ch) != NULL) { + escaped_path += "\\"; + } + escaped_path += ch; + } + /* TODO(sergey): Check whether using std::to_string combined with several + * concatenation operations is any faster. 
+ */ return string_printf("#line %d \"%s\"", line, escaped_path.c_str()); } +static string path_source_handle_preprocessor( + const string& preprocessor_line, + const string& source_filepath, + const size_t line_number, + SourceReplaceState *state) +{ + string result = preprocessor_line; + string token = string_strip( + preprocessor_line.substr(1, preprocessor_line.size() - 1)); + if(string_startswith(token, "include")) { + token = string_strip(token.substr(7, token.size() - 7)); + if(token[0] == '"') { + const size_t n_start = 1; + const size_t n_end = token.find("\"", n_start); + const string filename = token.substr(n_start, n_end - n_start); + const bool is_precompiled = string_endswith(token, "// PRECOMPILED"); + string filepath = path_join(state->base, filename); + if(!path_exists(filepath)) { + filepath = path_join(path_dirname(source_filepath), + filename); + } + if(is_precompiled) { + state->precompiled_headers.insert(filepath); + } + string text; + if(path_read_text(filepath, text)) { + text = path_source_replace_includes_recursive( + text, filepath, state); + /* Use line directives for better error messages. */ + result = line_directive(*state, filepath, 1) + "\n" + + text + "\n" + + line_directive(*state, source_filepath, line_number + 1); + } + } + } + return result; +} + +/* Our own little c preprocessor that replaces #includes with the file + * contents, to work around issue of OpenCL drivers not supporting + * include paths with spaces in them. + */ static string path_source_replace_includes_recursive( - const string& base, const string& source, - const string& source_filepath) + const string& source_filepath, + SourceReplaceState *state) { - /* Our own little c preprocessor that replaces #includes with the file - * contents, to work around issue of OpenCL drivers not supporting - * include paths with spaces in them. + /* Try to re-use processed file without spending time on replacing all + * include directives again. 
*/ - + SourceReplaceState::ProcessedMapping::iterator replaced_file = + state->processed_files.find(source_filepath); + if(replaced_file != state->processed_files.end()) { + if(state->precompiled_headers.find(source_filepath) != + state->precompiled_headers.end()) { + return ""; + } + return replaced_file->second; + } + /* Perform full file processing. */ string result = ""; - vector<string> lines; - string_split(lines, source, "\n", false); - - for(size_t i = 0; i < lines.size(); ++i) { - string line = lines[i]; - if(line[0] == '#') { - string token = string_strip(line.substr(1, line.size() - 1)); - if(string_startswith(token, "include")) { - token = string_strip(token.substr(7, token.size() - 7)); - if(token[0] == '"') { - const size_t n_start = 1; - const size_t n_end = token.find("\"", n_start); - const string filename = token.substr(n_start, n_end - n_start); - string filepath = path_join(base, filename); - if(!path_exists(filepath)) { - filepath = path_join(path_dirname(source_filepath), - filename); - } - string text; - if(path_read_text(filepath, text)) { - text = path_source_replace_includes_recursive( - base, text, filepath); - /* Use line directives for better error messages. */ - line = line_directive(base, filepath, 1) - + token.replace(0, n_end + 1, "\n" + text + "\n") - + line_directive(base, source_filepath, i + 1); - } - } + const size_t source_length = source.length(); + size_t index = 0; + /* Information about where we are in the source. */ + size_t line_number = 0, column_number = 1; + /* Currently gathered non-preprocessor token. + * Store as start/length rather than token itself to avoid overhead of + * memory re-allocations on each character concatenation. + */ + size_t token_start = 0, token_length = 0; + /* Denotes whether we're inside of preprocessor line, together with + * preprocessor line itself. + * + * TODO(sergey): Investigate whether using token start/end position + * gives measurable speedup. 
+ */ + bool inside_preprocessor = false; + string preprocessor_line = ""; + /* Actual loop over the whole source. */ + while(index < source_length) { + const char ch = source[index]; + if(ch == '\n') { + if(inside_preprocessor) { + result += path_source_handle_preprocessor(preprocessor_line, + source_filepath, + line_number, + state); + /* Start gathering next part of the token. */ + token_start = index; + token_length = 0; + } + inside_preprocessor = false; + preprocessor_line = ""; + column_number = 0; + ++line_number; + } + else if(ch == '#' && column_number == 1 && !inside_preprocessor) { + /* Append all possible non-preprocessor token to the result. */ + if(token_length != 0) { + result.append(source, token_start, token_length); + token_start = index; + token_length = 0; } + inside_preprocessor = true; + } + if(inside_preprocessor) { + preprocessor_line += ch; + } + else { + ++token_length; } - result += line + "\n"; + ++index; + ++column_number; } - + /* Append possible tokens which happened before special events handled + * above. + */ + if(token_length != 0) { + result.append(source, token_start, token_length); + } + if(inside_preprocessor) { + result += path_source_handle_preprocessor(preprocessor_line, + source_filepath, + line_number, + state); + } + /* Store result for further reuse.
*/ + state->processed_files[source_filepath] = result; return result; } @@ -837,10 +950,12 @@ string path_source_replace_includes(const string& source, const string& path, const string& source_filename) { + SourceReplaceState state; + state.base = path; return path_source_replace_includes_recursive( - path, source, - path_join(path, source_filename)); + path_join(path, source_filename), + &state); } FILE *path_fopen(const string& path, const string& mode) diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h index bc672669e1f..134383e88db 100644 --- a/intern/cycles/util/util_progress.h +++ b/intern/cycles/util/util_progress.h @@ -41,6 +41,7 @@ public: denoised_tiles = 0; start_time = time_dt(); render_start_time = time_dt(); + end_time = 0.0; status = "Initializing"; substatus = ""; sync_status = ""; @@ -80,6 +81,7 @@ public: denoised_tiles = 0; start_time = time_dt(); render_start_time = time_dt(); + end_time = 0.0; status = "Initializing"; substatus = ""; sync_status = ""; @@ -146,6 +148,7 @@ public: thread_scoped_lock lock(progress_mutex); start_time = time_dt(); + end_time = 0.0; } void set_render_start_time() @@ -169,8 +172,15 @@ public: { thread_scoped_lock lock(progress_mutex); - total_time_ = time_dt() - start_time; - render_time_ = time_dt() - render_start_time; + double time = (end_time > 0) ? end_time : time_dt(); + + total_time_ = time - start_time; + render_time_ = time - render_start_time; + } + + void set_end_time() + { + end_time = time_dt(); } void reset_sample() @@ -226,6 +236,7 @@ public: int get_current_sample() { + thread_scoped_lock lock(progress_mutex); /* Note that the value here always belongs to the last tile that updated, * so it's only useful if there is only one active tile. 
*/ return current_tile_sample; @@ -233,11 +244,13 @@ public: int get_rendered_tiles() { + thread_scoped_lock lock(progress_mutex); return rendered_tiles; } int get_denoised_tiles() { + thread_scoped_lock lock(progress_mutex); return denoised_tiles; } @@ -334,6 +347,8 @@ protected: int rendered_tiles, denoised_tiles; double start_time, render_start_time; + /* End time written when render is done, so it doesn't keep increasing on redraws. */ + double end_time; string status; string substatus; diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index 587febe3e52..58b3d267266 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -18,19 +18,38 @@ #ifndef __UTIL_SIMD_TYPES_H__ #define __UTIL_SIMD_TYPES_H__ +#ifndef __KERNEL_GPU__ + #include <limits> #include "util/util_debug.h" -#include "util/util_types.h" +#include "util/util_defines.h" + +/* SSE Intrinsics includes + * + * We assume __KERNEL_SSEX__ flags to have been defined at this point */ + +/* SSE intrinsics headers */ +#ifndef FREE_WINDOWS64 + +#ifdef _MSC_VER +# include <intrin.h> +#elif (defined(__x86_64__) || defined(__i386__)) +# include <x86intrin.h> +#endif + +#else + +/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>. + * Since we can't avoid including <windows.h>, better only include that */ +#include "util/util_windows.h" + +#endif CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ -struct sseb; -struct ssei; -struct ssef; - extern const __m128 _mm_lookupmask_ps[16]; /* Special Types */ @@ -328,12 +347,12 @@ __forceinline size_t __bscf(size_t& v) #endif /* _WIN32 */ -static const unsigned int BITSCAN_NO_BIT_SET_32 = 32; -static const size_t BITSCAN_NO_BIT_SET_64 = 64; +/* Test __KERNEL_SSE41__ for MSVC which does not define __SSE4_1__, and test + * __SSE4_1__ to avoid OpenImageIO conflicts with our emulation macros on other + * platforms when compiling code outside the kernel. 
*/ +#if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) -#ifdef __KERNEL_SSE3__ -/* Emulation of SSE4 functions with SSE3 */ -# ifndef __KERNEL_SSE41__ +/* Emulation of SSE4 functions with SSE2 */ #define _MM_FROUND_TO_NEAREST_INT 0x00 #define _MM_FROUND_TO_NEG_INF 0x01 @@ -342,50 +361,50 @@ static const size_t BITSCAN_NO_BIT_SET_64 = 64; #define _MM_FROUND_CUR_DIRECTION 0x04 #undef _mm_blendv_ps -#define _mm_blendv_ps __emu_mm_blendv_ps -__forceinline __m128 _mm_blendv_ps( __m128 value, __m128 input, __m128 mask ) { - return _mm_or_ps(_mm_and_ps(mask, input), _mm_andnot_ps(mask, value)); +#define _mm_blendv_ps _mm_blendv_ps_emu +__forceinline __m128 _mm_blendv_ps_emu( __m128 value, __m128 input, __m128 mask) +{ + __m128i isignmask = _mm_set1_epi32(0x80000000); + __m128 signmask = _mm_castsi128_ps(isignmask); + __m128i iandsign = _mm_castps_si128(_mm_and_ps(mask, signmask)); + __m128i icmpmask = _mm_cmpeq_epi32(iandsign, isignmask); + __m128 cmpmask = _mm_castsi128_ps(icmpmask); + return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value)); } #undef _mm_blend_ps -#define _mm_blend_ps __emu_mm_blend_ps -__forceinline __m128 _mm_blend_ps( __m128 value, __m128 input, const int mask ) { +#define _mm_blend_ps _mm_blend_ps_emu +__forceinline __m128 _mm_blend_ps_emu( __m128 value, __m128 input, const int mask) +{ assert(mask < 0x10); return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]); } #undef _mm_blendv_epi8 -#define _mm_blendv_epi8 __emu_mm_blendv_epi8 -__forceinline __m128i _mm_blendv_epi8( __m128i value, __m128i input, __m128i mask ) { +#define _mm_blendv_epi8 _mm_blendv_epi8_emu +__forceinline __m128i _mm_blendv_epi8_emu( __m128i value, __m128i input, __m128i mask) +{ return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value)); } -#undef _mm_mullo_epi32 -#define _mm_mullo_epi32 __emu_mm_mullo_epi32 -__forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) { - __m128i rvalue; - char* 
_r = (char*)(&rvalue + 1); - char* _v = (char*)(& value + 1); - char* _i = (char*)(& input + 1); - for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32_t*)(_r + i)) = *((int32_t*)(_v + i))* *((int32_t*)(_i + i)); - return rvalue; -} - #undef _mm_min_epi32 -#define _mm_min_epi32 __emu_mm_min_epi32 -__forceinline __m128i _mm_min_epi32( __m128i value, __m128i input ) { +#define _mm_min_epi32 _mm_min_epi32_emu +__forceinline __m128i _mm_min_epi32_emu( __m128i value, __m128i input) +{ return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input)); } #undef _mm_max_epi32 -#define _mm_max_epi32 __emu_mm_max_epi32 -__forceinline __m128i _mm_max_epi32( __m128i value, __m128i input ) { +#define _mm_max_epi32 _mm_max_epi32_emu +__forceinline __m128i _mm_max_epi32_emu( __m128i value, __m128i input) +{ return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input)); } #undef _mm_extract_epi32 -#define _mm_extract_epi32 __emu_mm_extract_epi32 -__forceinline int _mm_extract_epi32( __m128i input, const int index ) { - switch ( index ) { +#define _mm_extract_epi32 _mm_extract_epi32_emu +__forceinline int _mm_extract_epi32_emu( __m128i input, const int index) +{ + switch(index) { case 0: return _mm_cvtsi128_si32(input); case 1: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(1, 1, 1, 1))); case 2: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(2, 2, 2, 2))); @@ -395,27 +414,26 @@ __forceinline int _mm_extract_epi32( __m128i input, const int index ) { } #undef _mm_insert_epi32 -#define _mm_insert_epi32 __emu_mm_insert_epi32 -__forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int index ) { +#define _mm_insert_epi32 _mm_insert_epi32_emu +__forceinline __m128i _mm_insert_epi32_emu( __m128i value, int input, const int index) +{ assert(index >= 0 && index < 4); ((int*)&value)[index] = input; return value; } -#undef _mm_extract_ps -#define _mm_extract_ps __emu_mm_extract_ps -__forceinline int _mm_extract_ps( __m128 input, const int 
index ) { - int32_t* ptr = (int32_t*)&input; return ptr[index]; -} - #undef _mm_insert_ps -#define _mm_insert_ps __emu_mm_insert_ps -__forceinline __m128 _mm_insert_ps( __m128 value, __m128 input, const int index ) -{ assert(index < 0x100); ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6]; return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value); } +#define _mm_insert_ps _mm_insert_ps_emu +__forceinline __m128 _mm_insert_ps_emu( __m128 value, __m128 input, const int index) +{ + assert(index < 0x100); + ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6]; + return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value); +} #undef _mm_round_ps -#define _mm_round_ps __emu_mm_round_ps -__forceinline __m128 _mm_round_ps( __m128 value, const int flags ) +#define _mm_round_ps _mm_round_ps_emu +__forceinline __m128 _mm_round_ps_emu( __m128 value, const int flags) { - switch ( flags ) + switch(flags) { case _MM_FROUND_TO_NEAREST_INT: return _mm_cvtepi32_ps(_mm_cvtps_epi32(value)); case _MM_FROUND_TO_NEG_INF : return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f)))); @@ -425,57 +443,7 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags ) return value; } -# ifdef _M_X64 -#undef _mm_insert_epi64 -#define _mm_insert_epi64 __emu_mm_insert_epi64 -__forceinline __m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) { - assert(size_t(index) < 4); ((__int64*)&value)[index] = input; return value; -} - -#undef _mm_extract_epi64 -#define _mm_extract_epi64 __emu_mm_extract_epi64 -__forceinline __int64 _mm_extract_epi64( __m128i input, const int index ) { - assert(size_t(index) < 2); - return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input)); -} -# endif - -# endif - -#undef _mm_fabs_ps -#define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))) - -/* Return a __m128 with every element set to the largest element of v. 
*/ -ccl_device_inline __m128 _mm_hmax_ps(__m128 v) -{ - /* v[0, 1, 2, 3] => [0, 1, 0, 1] and [2, 3, 2, 3] => v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] */ - v = _mm_max_ps(_mm_movehl_ps(v, v), _mm_movelh_ps(v, v)); - /* v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] => [4 times max(1, 3)] and [4 times max(0, 2)] => v[4 times max(0, 1, 2, 3)] */ - v = _mm_max_ps(_mm_movehdup_ps(v), _mm_moveldup_ps(v)); - return v; -} - -/* Return the sum of the four elements of x. */ -ccl_device_inline float _mm_hsum_ss(__m128 x) -{ - __m128 a = _mm_movehdup_ps(x); - __m128 b = _mm_add_ps(x, a); - return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(a, b), b)); -} - -/* Return a __m128 with every element set to the sum of the four elements of x. */ -ccl_device_inline __m128 _mm_hsum_ps(__m128 x) -{ - x = _mm_hadd_ps(x, x); - x = _mm_hadd_ps(x, x); - return x; -} - -/* Replace elements of x with zero where mask isn't set. */ -#undef _mm_mask_ps -#define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask) - -#endif +#endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */ #else /* __KERNEL_SSE2__ */ @@ -496,13 +464,19 @@ ccl_device_inline int bitscan(int value) #endif /* __KERNEL_SSE2__ */ +/* quiet unused define warnings */ +#if defined(__KERNEL_SSE2__) || \ + defined(__KERNEL_SSE3__) || \ + defined(__KERNEL_SSSE3__) || \ + defined(__KERNEL_SSE41__) || \ + defined(__KERNEL_AVX__) || \ + defined(__KERNEL_AVX2__) + /* do nothing */ +#endif + CCL_NAMESPACE_END -#include "util/util_math.h" -#include "util/util_sseb.h" -#include "util/util_ssei.h" -#include "util/util_ssef.h" -#include "util/util_avxf.h" +#endif /* __KERNEL_GPU__ */ #endif /* __UTIL_SIMD_TYPES_H__ */ diff --git a/intern/cycles/util/util_sseb.h b/intern/cycles/util/util_sseb.h index 6e669701f3b..93c22aafdcd 100644 --- a/intern/cycles/util/util_sseb.h +++ b/intern/cycles/util/util_sseb.h @@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ +struct ssei; +struct ssef; + /*! 
4-wide SSE bool type. */ struct sseb { diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h index cf99a08efae..bb007ff84a9 100644 --- a/intern/cycles/util/util_ssef.h +++ b/intern/cycles/util/util_ssef.h @@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ +struct sseb; +struct ssef; + /*! 4-wide SSE float type. */ struct ssef { diff --git a/intern/cycles/util/util_ssei.h b/intern/cycles/util/util_ssei.h index 5f62569268c..ef2a9e68b7d 100644 --- a/intern/cycles/util/util_ssei.h +++ b/intern/cycles/util/util_ssei.h @@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ +struct sseb; +struct ssef; + /*! 4-wide SSE integer type. */ struct ssei { @@ -234,8 +237,10 @@ __forceinline size_t select_max(const sseb& valid, const ssei& v) { const ssei a #else -__forceinline int reduce_min(const ssei& v) { return min(min(v[0],v[1]),min(v[2],v[3])); } -__forceinline int reduce_max(const ssei& v) { return max(max(v[0],v[1]),max(v[2],v[3])); } +__forceinline int ssei_min(int a, int b) { return (a < b)? a: b; } +__forceinline int ssei_max(int a, int b) { return (a > b)? 
a: b; } +__forceinline int reduce_min(const ssei& v) { return ssei_min(ssei_min(v[0],v[1]),ssei_min(v[2],v[3])); } +__forceinline int reduce_max(const ssei& v) { return ssei_max(ssei_max(v[0],v[1]),ssei_max(v[2],v[3])); } __forceinline int reduce_add(const ssei& v) { return v[0]+v[1]+v[2]+v[3]; } #endif diff --git a/intern/cycles/util/util_stats.h b/intern/cycles/util/util_stats.h index baba549753d..7667f58eb7d 100644 --- a/intern/cycles/util/util_stats.h +++ b/intern/cycles/util/util_stats.h @@ -30,7 +30,7 @@ public: void mem_alloc(size_t size) { atomic_add_and_fetch_z(&mem_used, size); - atomic_update_max_z(&mem_peak, mem_used); + atomic_fetch_and_update_max_z(&mem_peak, mem_used); } void mem_free(size_t size) { diff --git a/intern/cycles/util/util_string.cpp b/intern/cycles/util/util_string.cpp index a1008d510d1..94ad512982c 100644 --- a/intern/cycles/util/util_string.cpp +++ b/intern/cycles/util/util_string.cpp @@ -148,6 +148,12 @@ void string_replace(string& haystack, const string& needle, const string& other) string string_remove_trademark(const string &s) { string result = s; + + /* Special case, so we don't leave sequential spaces behind. */ + /* TODO(sergey): Consider using regex perhaps?
*/ + string_replace(result, " (TM)", ""); + string_replace(result, " (R)", ""); + string_replace(result, "(TM)", ""); string_replace(result, "(R)", ""); diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp index fb0c34e1dc4..6ed97b0e0a6 100644 --- a/intern/cycles/util/util_task.cpp +++ b/intern/cycles/util/util_task.cpp @@ -206,9 +206,9 @@ void TaskScheduler::init(int num_threads) threads.resize(num_threads); const int num_groups = system_cpu_group_count(); - unsigned short num_process_groups; + unsigned short num_process_groups = 0; vector<unsigned short> process_groups; - int current_group_threads; + int current_group_threads = 0; if(num_groups > 1) { process_groups.resize(num_groups); num_process_groups = system_cpu_process_groups(num_groups, diff --git a/intern/cycles/util/util_time.h b/intern/cycles/util/util_time.h index 65798244111..f03aa590e9b 100644 --- a/intern/cycles/util/util_time.h +++ b/intern/cycles/util/util_time.h @@ -37,7 +37,7 @@ public: ~scoped_timer() { if(value_ != NULL) { - *value_ = time_dt() - time_start_; + *value_ = get_time(); } } @@ -46,6 +46,11 @@ public: return time_start_; } + double get_time() const + { + return time_dt() - time_start_; + } + protected: double *value_; double time_start_; diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index a5d1d7152d5..aabca6c81fc 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -21,72 +21,18 @@ # include <stdlib.h> #endif -/* Bitness */ +/* Standard Integer Types */ -#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) -# define __KERNEL_64_BIT__ +#if !defined(__KERNEL_GPU__) && !defined(_WIN32) +# include <stdint.h> #endif -/* Qualifiers for kernel code shared by CPU and GPU */ - -#ifndef __KERNEL_GPU__ -# define ccl_device static inline -# define ccl_device_noinline static -# define ccl_global -# define ccl_constant -# define ccl_local -# define 
ccl_local_param -# define ccl_private -# define ccl_restrict __restrict -# define __KERNEL_WITH_SSE_ALIGN__ - -# if defined(_WIN32) && !defined(FREE_WINDOWS) -# define ccl_device_inline static __forceinline -# define ccl_device_forceinline static __forceinline -# define ccl_align(...) __declspec(align(__VA_ARGS__)) -# ifdef __KERNEL_64_BIT__ -# define ccl_try_align(...) __declspec(align(__VA_ARGS__)) -# else /* __KERNEL_64_BIT__ */ -# undef __KERNEL_WITH_SSE_ALIGN__ -/* No support for function arguments (error C2719). */ -# define ccl_try_align(...) -# endif /* __KERNEL_64_BIT__ */ -# define ccl_may_alias -# define ccl_always_inline __forceinline -# define ccl_never_inline __declspec(noinline) -# define ccl_maybe_unused -# else /* _WIN32 && !FREE_WINDOWS */ -# define ccl_device_inline static inline __attribute__((always_inline)) -# define ccl_device_forceinline static inline __attribute__((always_inline)) -# define ccl_align(...) __attribute__((aligned(__VA_ARGS__))) -# ifndef FREE_WINDOWS64 -# define __forceinline inline __attribute__((always_inline)) -# endif -# define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) -# define ccl_may_alias __attribute__((__may_alias__)) -# define ccl_always_inline __attribute__((always_inline)) -# define ccl_never_inline __attribute__((noinline)) -# define ccl_maybe_unused __attribute__((used)) -# endif /* _WIN32 && !FREE_WINDOWS */ - -/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). 
*/ -# if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */ -# define ATTR_FALLTHROUGH __attribute__((fallthrough)) -# else -# define ATTR_FALLTHROUGH ((void)0) -# endif -#endif /* __KERNEL_GPU__ */ - -/* Standard Integer Types */ +#include "util/util_defines.h" #ifndef __KERNEL_GPU__ -/* int8_t, uint16_t, and friends */ -# ifndef _WIN32 -# include <stdint.h> -# endif -/* SIMD Types */ # include "util/util_optimization.h" -#endif /* __KERNEL_GPU__ */ +# include "util/util_simd.h" +#endif CCL_NAMESPACE_BEGIN @@ -201,65 +147,8 @@ enum ExtensionType { EXTENSION_NUM_TYPES, }; -/* macros */ - -/* hints for branch prediction, only use in code that runs a _lot_ */ -#if defined(__GNUC__) && defined(__KERNEL_CPU__) -# define LIKELY(x) __builtin_expect(!!(x), 1) -# define UNLIKELY(x) __builtin_expect(!!(x), 0) -#else -# define LIKELY(x) (x) -# define UNLIKELY(x) (x) -#endif - -#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800)) -# define HAS_CPP11_FEATURES -#endif - -#if defined(__GNUC__) || defined(__clang__) -# if defined(HAS_CPP11_FEATURES) -/* Some magic to be sure we don't have reference in the type. */ -template<typename T> static inline T decltype_helper(T x) { return x; } -# define TYPEOF(x) decltype(decltype_helper(x)) -# else -# define TYPEOF(x) typeof(x) -# endif -#endif - -/* Causes warning: - * incompatible types when assigning to type 'Foo' from type 'Bar' - * ... 
the compiler optimizes away the temp var */ -#ifdef __GNUC__ -#define CHECK_TYPE(var, type) { \ - TYPEOF(var) *__tmp; \ - __tmp = (type *)NULL; \ - (void)__tmp; \ -} (void)0 - -#define CHECK_TYPE_PAIR(var_a, var_b) { \ - TYPEOF(var_a) *__tmp; \ - __tmp = (typeof(var_b) *)NULL; \ - (void)__tmp; \ -} (void)0 -#else -# define CHECK_TYPE(var, type) -# define CHECK_TYPE_PAIR(var_a, var_b) -#endif - -/* can be used in simple macros */ -#define CHECK_TYPE_INLINE(val, type) \ - ((void)(((type)0) != (val))) - - CCL_NAMESPACE_END -#ifndef __KERNEL_GPU__ -# include <cassert> -# define util_assert(statement) assert(statement) -#else -# define util_assert(statement) -#endif - /* Vectorized types declaration. */ #include "util/util_types_uchar2.h" #include "util/util_types_uchar3.h" @@ -298,5 +187,13 @@ CCL_NAMESPACE_END #include "util/util_types_vector3_impl.h" +/* SSE types. */ +#ifndef __KERNEL_GPU__ +# include "util/util_sseb.h" +# include "util/util_ssei.h" +# include "util/util_ssef.h" +# include "util/util_avxf.h" +#endif + #endif /* __UTIL_TYPES_H__ */ |