Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/util')
-rw-r--r--intern/cycles/util/CMakeLists.txt1
-rw-r--r--intern/cycles/util/util_atomic.h14
-rw-r--r--intern/cycles/util/util_debug.cpp7
-rw-r--r--intern/cycles/util/util_debug.h7
-rw-r--r--intern/cycles/util/util_defines.h135
-rw-r--r--intern/cycles/util/util_logging.h16
-rw-r--r--intern/cycles/util/util_math.h12
-rw-r--r--intern/cycles/util/util_math_float3.h26
-rw-r--r--intern/cycles/util/util_math_float4.h109
-rw-r--r--intern/cycles/util/util_math_matrix.h56
-rw-r--r--intern/cycles/util/util_optimization.h52
-rw-r--r--intern/cycles/util/util_path.cpp213
-rw-r--r--intern/cycles/util/util_progress.h19
-rw-r--r--intern/cycles/util/util_simd.h184
-rw-r--r--intern/cycles/util/util_sseb.h3
-rw-r--r--intern/cycles/util/util_ssef.h3
-rw-r--r--intern/cycles/util/util_ssei.h9
-rw-r--r--intern/cycles/util/util_stats.h2
-rw-r--r--intern/cycles/util/util_string.cpp6
-rw-r--r--intern/cycles/util/util_task.cpp4
-rw-r--r--intern/cycles/util/util_time.h7
-rw-r--r--intern/cycles/util/util_types.h131
22 files changed, 593 insertions, 423 deletions
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 43f9a57d099..7f3747a0f58 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -38,6 +38,7 @@ set(SRC_HEADERS
util_atomic.h
util_boundbox.h
util_debug.h
+ util_defines.h
util_guarded_allocator.cpp
util_foreach.h
util_function.h
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index 6c52117ef9a..f3c7ae546a0 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -22,19 +22,10 @@
/* Using atomic ops header from Blender. */
#include "atomic_ops.h"
-ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value)
-{
- size_t prev_value = *maximum_value;
- while(prev_value < value) {
- if(atomic_cas_z(maximum_value, prev_value, value) != prev_value) {
- break;
- }
- }
-}
-
#define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x))
#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
+#define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_add_uint32((p), -1)
#define CCL_LOCAL_MEM_FENCE 0
#define ccl_barrier(flags) (void)0
@@ -68,6 +59,7 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *so
#define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x))
#define atomic_fetch_and_inc_uint32(p) atomic_inc((p))
+#define atomic_fetch_and_dec_uint32(p) atomic_dec((p))
#define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE
#define ccl_barrier(flags) barrier(flags)
@@ -79,7 +71,9 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *so
#define atomic_add_and_fetch_float(p, x) (atomicAdd((float*)(p), (float)(x)) + (float)(x))
#define atomic_fetch_and_add_uint32(p, x) atomicAdd((unsigned int*)(p), (unsigned int)(x))
+#define atomic_fetch_and_sub_uint32(p, x) atomicSub((unsigned int*)(p), (unsigned int)(x))
#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
+#define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1)
#define CCL_LOCAL_MEM_FENCE
#define ccl_barrier(flags) __syncthreads()
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index ab038d2b9fb..eb078d69252 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -122,13 +122,16 @@ void DebugFlags::OpenCL::reset()
}
DebugFlags::DebugFlags()
+: viewport_static_bvh(false)
{
/* Nothing for now. */
}
void DebugFlags::reset()
{
+ viewport_static_bvh = false;
cpu.reset();
+ cuda.reset();
opencl.reset();
}
@@ -184,8 +187,8 @@ std::ostream& operator <<(std::ostream &os,
<< " Device type : " << opencl_device_type << "\n"
<< " Kernel type : " << opencl_kernel_type << "\n"
<< " Debug : " << string_from_bool(debug_flags.opencl.debug) << "\n"
- << " Single program : " << string_from_bool(debug_flags.opencl.single_program)
- << "\n";
+ << " Single program : " << string_from_bool(debug_flags.opencl.single_program) << "\n"
+ << " Memory limit : " << string_human_readable_size(debug_flags.opencl.mem_limit) << "\n";
return os;
}
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index 4505d584490..9255279c5ab 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -30,6 +30,9 @@ CCL_NAMESPACE_BEGIN
*/
class DebugFlags {
public:
+ /* Use static BVH in viewport, to match final render exactly. */
+ bool viewport_static_bvh;
+
/* Descriptor of CPU feature-set to be used. */
struct CPU {
CPU();
@@ -115,6 +118,10 @@ public:
/* Use single program */
bool single_program;
+
+ /* TODO(mai): Currently this is only for OpenCL, but we should have it implemented for all devices. */
+ /* Artificial memory limit in bytes (0 if disabled). */
+ size_t mem_limit;
};
/* Get instance of debug flags registry. */
diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h
new file mode 100644
index 00000000000..ae654092c87
--- /dev/null
+++ b/intern/cycles/util/util_defines.h
@@ -0,0 +1,135 @@
+
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_DEFINES_H__
+#define __UTIL_DEFINES_H__
+
+/* Bitness */
+
+#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
+# define __KERNEL_64_BIT__
+#endif
+
+/* Qualifiers for kernel code shared by CPU and GPU */
+
+#ifndef __KERNEL_GPU__
+# define ccl_device static inline
+# define ccl_device_noinline static
+# define ccl_global
+# define ccl_constant
+# define ccl_local
+# define ccl_local_param
+# define ccl_private
+# define ccl_restrict __restrict
+# define ccl_ref &
+# define __KERNEL_WITH_SSE_ALIGN__
+
+# if defined(_WIN32) && !defined(FREE_WINDOWS)
+# define ccl_device_inline static __forceinline
+# define ccl_device_forceinline static __forceinline
+# define ccl_align(...) __declspec(align(__VA_ARGS__))
+# ifdef __KERNEL_64_BIT__
+# define ccl_try_align(...) __declspec(align(__VA_ARGS__))
+# else /* __KERNEL_64_BIT__ */
+# undef __KERNEL_WITH_SSE_ALIGN__
+/* No support for function arguments (error C2719). */
+# define ccl_try_align(...)
+# endif /* __KERNEL_64_BIT__ */
+# define ccl_may_alias
+# define ccl_always_inline __forceinline
+# define ccl_never_inline __declspec(noinline)
+# define ccl_maybe_unused
+# else /* _WIN32 && !FREE_WINDOWS */
+# define ccl_device_inline static inline __attribute__((always_inline))
+# define ccl_device_forceinline static inline __attribute__((always_inline))
+# define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
+# ifndef FREE_WINDOWS64
+# define __forceinline inline __attribute__((always_inline))
+# endif
+# define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
+# define ccl_may_alias __attribute__((__may_alias__))
+# define ccl_always_inline __attribute__((always_inline))
+# define ccl_never_inline __attribute__((noinline))
+# define ccl_maybe_unused __attribute__((used))
+# endif /* _WIN32 && !FREE_WINDOWS */
+
+/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */
+# if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */
+# define ATTR_FALLTHROUGH __attribute__((fallthrough))
+# else
+# define ATTR_FALLTHROUGH ((void)0)
+# endif
+#endif /* __KERNEL_GPU__ */
+
+/* macros */
+
+/* hints for branch prediction, only use in code that runs a _lot_ */
+#if defined(__GNUC__) && defined(__KERNEL_CPU__)
+# define LIKELY(x) __builtin_expect(!!(x), 1)
+# define UNLIKELY(x) __builtin_expect(!!(x), 0)
+#else
+# define LIKELY(x) (x)
+# define UNLIKELY(x) (x)
+#endif
+
+#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800))
+# define HAS_CPP11_FEATURES
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+# if defined(HAS_CPP11_FEATURES)
+/* Some magic to be sure we don't have reference in the type. */
+template<typename T> static inline T decltype_helper(T x) { return x; }
+# define TYPEOF(x) decltype(decltype_helper(x))
+# else
+# define TYPEOF(x) typeof(x)
+# endif
+#endif
+
+/* Causes warning:
+ * incompatible types when assigning to type 'Foo' from type 'Bar'
+ * ... the compiler optimizes away the temp var */
+#ifdef __GNUC__
+#define CHECK_TYPE(var, type) { \
+ TYPEOF(var) *__tmp; \
+ __tmp = (type *)NULL; \
+ (void)__tmp; \
+} (void)0
+
+#define CHECK_TYPE_PAIR(var_a, var_b) { \
+ TYPEOF(var_a) *__tmp; \
+ __tmp = (typeof(var_b) *)NULL; \
+ (void)__tmp; \
+} (void)0
+#else
+# define CHECK_TYPE(var, type)
+# define CHECK_TYPE_PAIR(var_a, var_b)
+#endif
+
+/* can be used in simple macros */
+#define CHECK_TYPE_INLINE(val, type) \
+ ((void)(((type)0) != (val)))
+
+#ifndef __KERNEL_GPU__
+# include <cassert>
+# define util_assert(statement) assert(statement)
+#else
+# define util_assert(statement)
+#endif
+
+#endif /* __UTIL_DEFINES_H__ */
+
diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h
index ecf9c9cfee0..492f830e67c 100644
--- a/intern/cycles/util/util_logging.h
+++ b/intern/cycles/util/util_logging.h
@@ -19,28 +19,30 @@
#if defined(WITH_CYCLES_LOGGING) && !defined(__KERNEL_GPU__)
# include <glog/logging.h>
-#else
-# include <iostream>
#endif
+#include <iostream>
+
CCL_NAMESPACE_BEGIN
#if !defined(WITH_CYCLES_LOGGING) || defined(__KERNEL_GPU__)
-class StubStream : public std::ostream {
- public:
- StubStream() : std::ostream(NULL) { }
+class StubStream {
+public:
+ template<class T>
+ StubStream& operator<<(const T&) {
+ return *this;
+ }
};
class LogMessageVoidify {
public:
LogMessageVoidify() { }
- void operator&(::std::ostream&) { }
+ void operator&(StubStream&) { }
};
# define LOG_SUPPRESS() (true) ? (void) 0 : LogMessageVoidify() & StubStream()
# define LOG(severity) LOG_SUPPRESS()
# define VLOG(severity) LOG_SUPPRESS()
-
#endif
#define VLOG_ONCE(level, flag) if(!flag) flag = true, VLOG(level)
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index b719640b19c..fb04d49bcd9 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -94,6 +94,7 @@ ccl_device_inline float fminf(float a, float b)
#ifndef __KERNEL_GPU__
using std::isfinite;
using std::isnan;
+using std::sqrt;
ccl_device_inline int abs(int x)
{
@@ -223,7 +224,7 @@ ccl_device_inline bool isfinite_safe(float f)
{
/* By IEEE 754 rule, 2*Inf equals Inf */
unsigned int x = __float_as_uint(f);
- return (f == f) && (x == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u);
+ return (f == f) && (x == 0 || x == (1u << 31) || (f != 2.0f*f)) && !((x << 1) > 0xff000000u);
}
ccl_device_inline float ensure_finite(float v)
@@ -329,15 +330,22 @@ template<class A, class B> A lerp(const A& a, const A& b, const B& t)
return (A)(a * ((B)1 - t) + b * t);
}
+#endif /* __KERNEL_OPENCL__ */
+
/* Triangle */
+#ifndef __KERNEL_OPENCL__
ccl_device_inline float triangle_area(const float3& v1,
const float3& v2,
const float3& v3)
+#else
+ccl_device_inline float triangle_area(const float3 v1,
+ const float3 v2,
+ const float3 v3)
+#endif
{
return len(cross(v3 - v2, v1 - v2))*0.5f;
}
-#endif /* __KERNEL_OPENCL__ */
/* Orthonormal vectors */
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
index 5327d9f7cc6..e73e5bc17a2 100644
--- a/intern/cycles/util/util_math_float3.h
+++ b/intern/cycles/util/util_math_float3.h
@@ -108,8 +108,7 @@ ccl_device_inline float3 operator*(const float3& a, const float f)
ccl_device_inline float3 operator*(const float f, const float3& a)
{
- /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
+#if defined(__KERNEL_SSE__)
return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
#else
return make_float3(a.x*f, a.y*f, a.z*f);
@@ -118,10 +117,8 @@ ccl_device_inline float3 operator*(const float f, const float3& a)
ccl_device_inline float3 operator/(const float f, const float3& a)
{
- /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
- __m128 rc = _mm_rcp_ps(a.m128);
- return float3(_mm_mul_ps(_mm_set1_ps(f),rc));
+#if defined(__KERNEL_SSE__)
+ return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
#else
return make_float3(f / a.x, f / a.y, f / a.z);
#endif
@@ -135,10 +132,8 @@ ccl_device_inline float3 operator/(const float3& a, const float f)
ccl_device_inline float3 operator/(const float3& a, const float3& b)
{
- /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
- __m128 rc = _mm_rcp_ps(b.m128);
- return float3(_mm_mul_ps(a, rc));
+#if defined(__KERNEL_SSE__)
+ return float3(_mm_div_ps(a.m128, b.m128));
#else
return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
#endif
@@ -282,9 +277,8 @@ ccl_device_inline float3 mix(const float3& a, const float3& b, float t)
ccl_device_inline float3 rcp(const float3& a)
{
#ifdef __KERNEL_SSE__
- const float4 r(_mm_rcp_ps(a.m128));
- return float3(_mm_sub_ps(_mm_add_ps(r, r),
- _mm_mul_ps(_mm_mul_ps(r, r), a)));
+ /* Don't use _mm_rcp_ps due to poor precision. */
+ return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
#else
return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z);
#endif
@@ -374,9 +368,9 @@ ccl_device_inline bool isfinite3_safe(float3 v)
ccl_device_inline float3 ensure_finite3(float3 v)
{
- if(!isfinite_safe(v.x)) v.x = 0.0;
- if(!isfinite_safe(v.y)) v.y = 0.0;
- if(!isfinite_safe(v.z)) v.z = 0.0;
+ if(!isfinite_safe(v.x)) v.x = 0.0f;
+ if(!isfinite_safe(v.y)) v.y = 0.0f;
+ if(!isfinite_safe(v.z)) v.z = 0.0f;
return v;
}
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
index d89121b3a1d..aa7e56fefe9 100644
--- a/intern/cycles/util/util_math_float4.h
+++ b/intern/cycles/util/util_math_float4.h
@@ -48,23 +48,30 @@ ccl_device_inline bool operator==(const float4& a, const float4& b);
ccl_device_inline float dot(const float4& a, const float4& b);
ccl_device_inline float len_squared(const float4& a);
ccl_device_inline float4 rcp(const float4& a);
+ccl_device_inline float4 sqrt(const float4& a);
+ccl_device_inline float4 sqr(const float4& a);
ccl_device_inline float4 cross(const float4& a, const float4& b);
ccl_device_inline bool is_zero(const float4& a);
-ccl_device_inline float reduce_add(const float4& a);
ccl_device_inline float average(const float4& a);
ccl_device_inline float len(const float4& a);
ccl_device_inline float4 normalize(const float4& a);
ccl_device_inline float4 safe_normalize(const float4& a);
ccl_device_inline float4 min(const float4& a, const float4& b);
ccl_device_inline float4 max(const float4& a, const float4& b);
+ccl_device_inline float4 fabs(const float4& a);
#endif /* !__KERNEL_OPENCL__*/
#ifdef __KERNEL_SSE__
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4& b);
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
+__forceinline const float4 shuffle(const float4& a, const float4& b);
template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b);
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b);
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b);
+
# ifdef __KERNEL_SSE3__
template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b);
template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b);
@@ -77,9 +84,7 @@ ccl_device_inline float4 select(const int4& mask,
const float4& b);
ccl_device_inline float4 reduce_min(const float4& a);
ccl_device_inline float4 reduce_max(const float4& a);
-# if 0
ccl_device_inline float4 reduce_add(const float4& a);
-# endif
#endif /* !__KERNEL_GPU__ */
/*******************************************************************************
@@ -128,7 +133,7 @@ ccl_device_inline float4 operator/(const float4& a, float f)
ccl_device_inline float4 operator/(const float4& a, const float4& b)
{
#ifdef __KERNEL_SSE__
- return a * rcp(b);
+ return float4(_mm_div_ps(a.m128, b.m128));
#else
return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
#endif
@@ -171,8 +176,7 @@ ccl_device_inline float4 operator/=(float4& a, float f)
ccl_device_inline int4 operator<(const float4& a, const float4& b)
{
#ifdef __KERNEL_SSE__
- /* TODO(sergey): avoid cvt. */
- return int4(_mm_cvtps_epi32(_mm_cmplt_ps(a.m128, b.m128)));
+ return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
#else
return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
#endif
@@ -181,8 +185,7 @@ ccl_device_inline int4 operator<(const float4& a, const float4& b)
ccl_device_inline int4 operator>=(const float4& a, const float4& b)
{
#ifdef __KERNEL_SSE__
- /* TODO(sergey): avoid cvt. */
- return int4(_mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128)));
+ return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
#else
return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
#endif
@@ -191,8 +194,7 @@ ccl_device_inline int4 operator>=(const float4& a, const float4& b)
ccl_device_inline int4 operator<=(const float4& a, const float4& b)
{
#ifdef __KERNEL_SSE__
- /* TODO(sergey): avoid cvt. */
- return int4(_mm_cvtps_epi32(_mm_cmple_ps(a.m128, b.m128)));
+ return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128)));
#else
return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w);
#endif
@@ -224,14 +226,30 @@ ccl_device_inline float len_squared(const float4& a)
ccl_device_inline float4 rcp(const float4& a)
{
#ifdef __KERNEL_SSE__
- float4 r(_mm_rcp_ps(a.m128));
- return float4(_mm_sub_ps(_mm_add_ps(r, r),
- _mm_mul_ps(_mm_mul_ps(r, r), a)));
+ /* Don't use _mm_rcp_ps due to poor precision. */
+ return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
#else
return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w);
#endif
}
+ccl_device_inline float4 sqrt(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+ return float4(_mm_sqrt_ps(a.m128));
+#else
+ return make_float4(sqrtf(a.x),
+ sqrtf(a.y),
+ sqrtf(a.z),
+ sqrtf(a.w));
+#endif
+}
+
+ccl_device_inline float4 sqr(const float4& a)
+{
+ return a * a;
+}
+
ccl_device_inline float4 cross(const float4& a, const float4& b)
{
#ifdef __KERNEL_SSE__
@@ -254,20 +272,25 @@ ccl_device_inline bool is_zero(const float4& a)
#endif
}
-ccl_device_inline float reduce_add(const float4& a)
+ccl_device_inline float4 reduce_add(const float4& a)
{
#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE3__
+ float4 h(_mm_hadd_ps(a.m128, a.m128));
+ return float4( _mm_hadd_ps(h.m128, h.m128));
+# else
float4 h(shuffle<1,0,3,2>(a) + a);
- /* TODO(sergey): Investigate efficiency. */
- return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h);
+ return shuffle<2,3,0,1>(h) + h;
+# endif
#else
- return ((a.x + a.y) + (a.z + a.w));
+ float sum = (a.x + a.y) + (a.z + a.w);
+ return make_float4(sum, sum, sum, sum);
#endif
}
ccl_device_inline float average(const float4& a)
{
- return reduce_add(a) * 0.25f;
+ return reduce_add(a).x * 0.25f;
}
ccl_device_inline float len(const float4& a)
@@ -309,6 +332,18 @@ ccl_device_inline float4 max(const float4& a, const float4& b)
max(a.w, b.w));
#endif
}
+
+ccl_device_inline float4 fabs(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+ return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
+#else
+ return make_float4(fabsf(a.x),
+ fabsf(a.y),
+ fabsf(a.z),
+ fabsf(a.w));
+#endif
+}
#endif /* !__KERNEL_OPENCL__*/
#ifdef __KERNEL_SSE__
@@ -320,11 +355,28 @@ __forceinline const float4 shuffle(const float4& b)
_MM_SHUFFLE(index_3, index_2, index_1, index_0))));
}
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
+__forceinline const float4 shuffle(const float4& a, const float4& b)
+{
+ return float4(_mm_shuffle_ps(a.m128, b.m128,
+ _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
+}
+
template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b)
{
return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))));
}
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b)
+{
+ return float4(_mm_movelh_ps(a.m128, b.m128));
+}
+
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b)
+{
+ return float4(_mm_movehl_ps(b.m128, a.m128));
+}
+
# ifdef __KERNEL_SSE3__
template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b)
{
@@ -344,9 +396,7 @@ ccl_device_inline float4 select(const int4& mask,
const float4& b)
{
#ifdef __KERNEL_SSE__
- /* TODO(sergey): avoid cvt. */
- return float4(_mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a),
- _mm_andnot_ps(_mm_cvtepi32_ps(mask), b)));
+ return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128)));
#else
return make_float4((mask.x)? a.x: b.x,
(mask.y)? a.y: b.y,
@@ -355,6 +405,13 @@ ccl_device_inline float4 select(const int4& mask,
#endif
}
+ccl_device_inline float4 mask(const int4& mask,
+ const float4& a)
+{
+ /* Replace elements of x with zero where mask isn't set. */
+ return select(mask, a, make_float4(0.0f));
+}
+
ccl_device_inline float4 reduce_min(const float4& a)
{
#ifdef __KERNEL_SSE__
@@ -375,17 +432,15 @@ ccl_device_inline float4 reduce_max(const float4& a)
#endif
}
-#if 0
-ccl_device_inline float4 reduce_add(const float4& a)
+ccl_device_inline float4 load_float4(const float *v)
{
#ifdef __KERNEL_SSE__
- float4 h = shuffle<1,0,3,2>(a) + a;
- return shuffle<2,3,0,1>(h) + h;
+ return float4(_mm_loadu_ps(v));
#else
- return make_float4((a.x + a.y) + (a.z + a.w));
+ return make_float4(v[0], v[1], v[2], v[3]);
#endif
}
-#endif
+
#endif /* !__KERNEL_GPU__ */
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
index c7511f8306e..b31dbe4fc67 100644
--- a/intern/cycles/util/util_math_matrix.h
+++ b/intern/cycles/util/util_math_matrix.h
@@ -223,20 +223,20 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
{
const float singular_epsilon = 1e-9f;
- for (int row = 0; row < n; row++) {
- for (int col = 0; col < n; col++) {
+ for(int row = 0; row < n; row++) {
+ for(int col = 0; col < n; col++) {
MATS(V, n, row, col, v_stride) = (col == row) ? 1.0f : 0.0f;
}
}
- for (int sweep = 0; sweep < 8; sweep++) {
+ for(int sweep = 0; sweep < 8; sweep++) {
float off_diagonal = 0.0f;
- for (int row = 1; row < n; row++) {
- for (int col = 0; col < row; col++) {
+ for(int row = 1; row < n; row++) {
+ for(int col = 0; col < row; col++) {
off_diagonal += fabsf(MAT(A, n, row, col));
}
}
- if (off_diagonal < 1e-7f) {
+ if(off_diagonal < 1e-7f) {
/* The matrix has nearly reached diagonal form.
* Since the eigenvalues are only used to determine truncation, their exact values aren't required - a relative error of a few ULPs won't matter at all. */
break;
@@ -253,7 +253,7 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
float abs_element = fabsf(element);
/* If we're in a later sweep and the element already is very small, just set it to zero and skip the rotation. */
- if (sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) {
+ if(sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) {
MAT(A, n, row, col) = 0.0f;
continue;
}
@@ -272,10 +272,10 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
* Then, we compute sin(phi) and cos(phi) themselves. */
float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col);
float ratio;
- if (abs_element > singular_epsilon*fabsf(singular_diff)) {
+ if(abs_element > singular_epsilon*fabsf(singular_diff)) {
float cot_2phi = 0.5f*singular_diff / element;
ratio = 1.0f / (fabsf(cot_2phi) + sqrtf(1.0f + cot_2phi*cot_2phi));
- if (cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */
+ if(cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */
}
else {
ratio = element / singular_diff;
@@ -315,21 +315,21 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
}
/* Sort eigenvalues and the associated eigenvectors. */
- for (int i = 0; i < n - 1; i++) {
+ for(int i = 0; i < n - 1; i++) {
float v = MAT(A, n, i, i);
int k = i;
- for (int j = i; j < n; j++) {
- if (MAT(A, n, j, j) >= v) {
+ for(int j = i; j < n; j++) {
+ if(MAT(A, n, j, j) >= v) {
v = MAT(A, n, j, j);
k = j;
}
}
- if (k != i) {
+ if(k != i) {
/* Swap eigenvalues. */
MAT(A, n, k, k) = MAT(A, n, i, i);
MAT(A, n, i, i) = v;
/* Swap eigenvectors. */
- for (int j = 0; j < n; j++) {
+ for(int j = 0; j < n; j++) {
float v = MATS(V, n, i, j, v_stride);
MATS(V, n, i, j, v_stride) = MATS(V, n, k, j, v_stride);
MATS(V, n, k, j, v_stride) = v;
@@ -339,59 +339,59 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
}
#ifdef __KERNEL_SSE3__
-ccl_device_inline void math_vector_zero_sse(__m128 *A, int n)
+ccl_device_inline void math_vector_zero_sse(float4 *A, int n)
{
for(int i = 0; i < n; i++) {
- A[i] = _mm_setzero_ps();
+ A[i] = make_float4(0.0f);
}
}
-ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n)
+ccl_device_inline void math_matrix_zero_sse(float4 *A, int n)
{
for(int row = 0; row < n; row++) {
for(int col = 0; col <= row; col++) {
- MAT(A, n, row, col) = _mm_setzero_ps();
+ MAT(A, n, row, col) = make_float4(0.0f);
}
}
}
/* Add Gramian matrix of v to A.
* The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */
-ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, const __m128 *ccl_restrict v, __m128 weight)
+ccl_device_inline void math_matrix_add_gramian_sse(float4 *A, int n, const float4 *ccl_restrict v, float4 weight)
{
for(int row = 0; row < n; row++) {
for(int col = 0; col <= row; col++) {
- MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight));
+ MAT(A, n, row, col) = MAT(A, n, row, col) + v[row] * v[col] * weight;
}
}
}
-ccl_device_inline void math_vector_add_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
+ccl_device_inline void math_vector_add_sse(float4 *V, int n, const float4 *ccl_restrict a)
{
for(int i = 0; i < n; i++) {
- V[i] = _mm_add_ps(V[i], a[i]);
+ V[i] += a[i];
}
}
-ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
+ccl_device_inline void math_vector_mul_sse(float4 *V, int n, const float4 *ccl_restrict a)
{
for(int i = 0; i < n; i++) {
- V[i] = _mm_mul_ps(V[i], a[i]);
+ V[i] *= a[i];
}
}
-ccl_device_inline void math_vector_max_sse(__m128 *a, const __m128 *ccl_restrict b, int n)
+ccl_device_inline void math_vector_max_sse(float4 *a, const float4 *ccl_restrict b, int n)
{
for(int i = 0; i < n; i++) {
- a[i] = _mm_max_ps(a[i], b[i]);
+ a[i] = max(a[i], b[i]);
}
}
-ccl_device_inline void math_matrix_hsum(float *A, int n, const __m128 *ccl_restrict B)
+ccl_device_inline void math_matrix_hsum(float *A, int n, const float4 *ccl_restrict B)
{
for(int row = 0; row < n; row++) {
for(int col = 0; col <= row; col++) {
- MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col));
+ MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col))[0];
}
}
}
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
index 6f70a474fe7..0382c0811dd 100644
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@@ -19,16 +19,6 @@
#ifndef __KERNEL_GPU__
-/* quiet unused define warnings */
-#if defined(__KERNEL_SSE2__) || \
- defined(__KERNEL_SSE3__) || \
- defined(__KERNEL_SSSE3__) || \
- defined(__KERNEL_SSE41__) || \
- defined(__KERNEL_AVX__) || \
- defined(__KERNEL_AVX2__)
- /* do nothing */
-#endif
-
/* x86
*
* Compile a regular, SSE2 and SSE3 kernel. */
@@ -73,48 +63,6 @@
#endif /* defined(__x86_64__) || defined(_M_X64) */
-/* SSE Experiment
- *
- * This is disabled code for an experiment to use SSE types globally for types
- * such as float3 and float4. Currently this gives an overall slowdown. */
-
-#if 0
-# define __KERNEL_SSE__
-# ifndef __KERNEL_SSE2__
-# define __KERNEL_SSE2__
-# endif
-# ifndef __KERNEL_SSE3__
-# define __KERNEL_SSE3__
-# endif
-# ifndef __KERNEL_SSSE3__
-# define __KERNEL_SSSE3__
-# endif
-# ifndef __KERNEL_SSE4__
-# define __KERNEL_SSE4__
-# endif
-#endif
-
-/* SSE Intrinsics includes
- *
- * We assume __KERNEL_SSEX__ flags to have been defined at this point */
-
-/* SSE intrinsics headers */
-#ifndef FREE_WINDOWS64
-
-#ifdef _MSC_VER
-# include <intrin.h>
-#elif (defined(__x86_64__) || defined(__i386__))
-# include <x86intrin.h>
-#endif
-
-#else
-
-/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
- * Since we can't avoid including <windows.h>, better only include that */
-#include "util/util_windows.h"
-
-#endif
-
#endif
#endif /* __UTIL_OPTIMIZATION_H__ */
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index f9c3b4bb139..bae5d5bd6d1 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -45,6 +45,7 @@ OIIO_NAMESPACE_USING
# include <shlwapi.h>
#endif
+#include "util/util_map.h"
#include "util/util_windows.h"
CCL_NAMESPACE_BEGIN
@@ -768,68 +769,180 @@ bool path_remove(const string& path)
return remove(path.c_str()) == 0;
}
-static string line_directive(const string& base, const string& path, int line)
+struct SourceReplaceState {
+ typedef map<string, string> ProcessedMapping;
+ /* Base director for all relative include headers. */
+ string base;
+ /* Result of processed files. */
+ ProcessedMapping processed_files;
+ /* Set of files which are considered "precompiled" and which are replaced
+ * with and empty string on a subsequent occurrence in include statement.
+ */
+ set<string> precompiled_headers;
+};
+
+static string path_source_replace_includes_recursive(
+ const string& source,
+ const string& source_filepath,
+ SourceReplaceState *state);
+
+static string line_directive(const SourceReplaceState& state,
+ const string& path,
+ const int line)
{
- string escaped_path = path;
+ string unescaped_path = path;
/* First we make path relative. */
- if(string_startswith(escaped_path, base.c_str())) {
- const string base_file = path_filename(base);
- const size_t base_len = base.length();
- escaped_path = base_file + escaped_path.substr(base_len,
- escaped_path.length() - base_len);
+ if(string_startswith(unescaped_path, state.base.c_str())) {
+ const string base_file = path_filename(state.base);
+ const size_t base_len = state.base.length();
+ unescaped_path = base_file +
+ unescaped_path.substr(base_len,
+ unescaped_path.length() - base_len);
}
/* Second, we replace all unsafe characters. */
- string_replace(escaped_path, "\"", "\\\"");
- string_replace(escaped_path, "\'", "\\\'");
- string_replace(escaped_path, "\?", "\\\?");
- string_replace(escaped_path, "\\", "\\\\");
+ const size_t length = unescaped_path.length();
+ string escaped_path = "";
+ for(size_t i = 0; i < length; ++i) {
+ const char ch = unescaped_path[i];
+ if(strchr("\"\'\?\\", ch) != NULL) {
+ escaped_path += "\\";
+ }
+ escaped_path += ch;
+ }
+ /* TODO(sergey): Check whether using std::to_string combined with several
+ * concatenation operations is any faster.
+ */
return string_printf("#line %d \"%s\"", line, escaped_path.c_str());
}
+static string path_source_handle_preprocessor(
+ const string& preprocessor_line,
+ const string& source_filepath,
+ const size_t line_number,
+ SourceReplaceState *state)
+{
+ string result = preprocessor_line;
+ string token = string_strip(
+ preprocessor_line.substr(1, preprocessor_line.size() - 1));
+ if(string_startswith(token, "include")) {
+ token = string_strip(token.substr(7, token.size() - 7));
+ if(token[0] == '"') {
+ const size_t n_start = 1;
+ const size_t n_end = token.find("\"", n_start);
+ const string filename = token.substr(n_start, n_end - n_start);
+ const bool is_precompiled = string_endswith(token, "// PRECOMPILED");
+ string filepath = path_join(state->base, filename);
+ if(!path_exists(filepath)) {
+ filepath = path_join(path_dirname(source_filepath),
+ filename);
+ }
+ if(is_precompiled) {
+ state->precompiled_headers.insert(filepath);
+ }
+ string text;
+ if(path_read_text(filepath, text)) {
+ text = path_source_replace_includes_recursive(
+ text, filepath, state);
+ /* Use line directives for better error messages. */
+ result = line_directive(*state, filepath, 1) + "\n"
+ + text + "\n"
+ + line_directive(*state, source_filepath, line_number + 1);
+ }
+ }
+ }
+ return result;
+}
+
+/* Our own little c preprocessor that replaces #includes with the file
+ * contents, to work around issue of OpenCL drivers not supporting
+ * include paths with spaces in them.
+ */
static string path_source_replace_includes_recursive(
- const string& base,
const string& source,
- const string& source_filepath)
+ const string& source_filepath,
+ SourceReplaceState *state)
{
- /* Our own little c preprocessor that replaces #includes with the file
- * contents, to work around issue of OpenCL drivers not supporting
- * include paths with spaces in them.
+ /* Try to re-use processed file without spending time on replacing all
+ * include directives again.
*/
-
+ SourceReplaceState::ProcessedMapping::iterator replaced_file =
+ state->processed_files.find(source_filepath);
+ if(replaced_file != state->processed_files.end()) {
+ if(state->precompiled_headers.find(source_filepath) !=
+ state->precompiled_headers.end()) {
+ return "";
+ }
+ return replaced_file->second;
+ }
+ /* Perform full file processing. */
string result = "";
- vector<string> lines;
- string_split(lines, source, "\n", false);
-
- for(size_t i = 0; i < lines.size(); ++i) {
- string line = lines[i];
- if(line[0] == '#') {
- string token = string_strip(line.substr(1, line.size() - 1));
- if(string_startswith(token, "include")) {
- token = string_strip(token.substr(7, token.size() - 7));
- if(token[0] == '"') {
- const size_t n_start = 1;
- const size_t n_end = token.find("\"", n_start);
- const string filename = token.substr(n_start, n_end - n_start);
- string filepath = path_join(base, filename);
- if(!path_exists(filepath)) {
- filepath = path_join(path_dirname(source_filepath),
- filename);
- }
- string text;
- if(path_read_text(filepath, text)) {
- text = path_source_replace_includes_recursive(
- base, text, filepath);
- /* Use line directives for better error messages. */
- line = line_directive(base, filepath, 1)
- + token.replace(0, n_end + 1, "\n" + text + "\n")
- + line_directive(base, source_filepath, i + 1);
- }
- }
+ const size_t source_length = source.length();
+ size_t index = 0;
+ /* Information about where we are in the source. */
+ size_t line_number = 0, column_number = 1;
+ /* Currently gathered non-preprocessor token.
+ * Store as start/length rather than token itself to avoid overhead of
+ * memory re-allocations on each character concatenation.
+ */
+ size_t token_start = 0, token_length = 0;
+ /* Denotes whether we're inside of preprocessor line, together with
+ * preprocessor line itself.
+ *
+ * TODO(sergey): Investigate whether using token start/end position
+ * gives measurable speedup.
+ */
+ bool inside_preprocessor = false;
+ string preprocessor_line = "";
+ /* Actual loop over the whole source. */
+ while(index < source_length) {
+ const char ch = source[index];
+ if(ch == '\n') {
+ if(inside_preprocessor) {
+ result += path_source_handle_preprocessor(preprocessor_line,
+ source_filepath,
+ line_number,
+ state);
+ /* Start gathering net part of the token. */
+ token_start = index;
+ token_length = 0;
+ }
+ inside_preprocessor = false;
+ preprocessor_line = "";
+ column_number = 0;
+ ++line_number;
+ }
+ else if(ch == '#' && column_number == 1 && !inside_preprocessor) {
+ /* Append all possible non-preprocessor token to the result. */
+ if(token_length != 0) {
+ result.append(source, token_start, token_length);
+ token_start = index;
+ token_length = 0;
}
+ inside_preprocessor = true;
+ }
+ if(inside_preprocessor) {
+ preprocessor_line += ch;
+ }
+ else {
+ ++token_length;
}
- result += line + "\n";
+ ++index;
+ ++column_number;
}
-
+ /* Append possible tokens which happened before special events handled
+ * above.
+ */
+ if(token_length != 0) {
+ result.append(source, token_start, token_length);
+ }
+ if(inside_preprocessor) {
+ result += path_source_handle_preprocessor(preprocessor_line,
+ source_filepath,
+ line_number,
+ state);
+ }
+ /* Store result for further reuse. */
+ state->processed_files[source_filepath] = result;
return result;
}
@@ -837,10 +950,12 @@ string path_source_replace_includes(const string& source,
const string& path,
const string& source_filename)
{
+ SourceReplaceState state;
+ state.base = path;
return path_source_replace_includes_recursive(
- path,
source,
- path_join(path, source_filename));
+ path_join(path, source_filename),
+ &state);
}
FILE *path_fopen(const string& path, const string& mode)
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index bc672669e1f..134383e88db 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -41,6 +41,7 @@ public:
denoised_tiles = 0;
start_time = time_dt();
render_start_time = time_dt();
+ end_time = 0.0;
status = "Initializing";
substatus = "";
sync_status = "";
@@ -80,6 +81,7 @@ public:
denoised_tiles = 0;
start_time = time_dt();
render_start_time = time_dt();
+ end_time = 0.0;
status = "Initializing";
substatus = "";
sync_status = "";
@@ -146,6 +148,7 @@ public:
thread_scoped_lock lock(progress_mutex);
start_time = time_dt();
+ end_time = 0.0;
}
void set_render_start_time()
@@ -169,8 +172,15 @@ public:
{
thread_scoped_lock lock(progress_mutex);
- total_time_ = time_dt() - start_time;
- render_time_ = time_dt() - render_start_time;
+ double time = (end_time > 0) ? end_time : time_dt();
+
+ total_time_ = time - start_time;
+ render_time_ = time - render_start_time;
+ }
+
+ void set_end_time()
+ {
+ end_time = time_dt();
}
void reset_sample()
@@ -226,6 +236,7 @@ public:
int get_current_sample()
{
+ thread_scoped_lock lock(progress_mutex);
/* Note that the value here always belongs to the last tile that updated,
* so it's only useful if there is only one active tile. */
return current_tile_sample;
@@ -233,11 +244,13 @@ public:
int get_rendered_tiles()
{
+ thread_scoped_lock lock(progress_mutex);
return rendered_tiles;
}
int get_denoised_tiles()
{
+ thread_scoped_lock lock(progress_mutex);
return denoised_tiles;
}
@@ -334,6 +347,8 @@ protected:
int rendered_tiles, denoised_tiles;
double start_time, render_start_time;
+ /* End time written when render is done, so it doesn't keep increasing on redraws. */
+ double end_time;
string status;
string substatus;
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 587febe3e52..58b3d267266 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -18,19 +18,38 @@
#ifndef __UTIL_SIMD_TYPES_H__
#define __UTIL_SIMD_TYPES_H__
+#ifndef __KERNEL_GPU__
+
#include <limits>
#include "util/util_debug.h"
-#include "util/util_types.h"
+#include "util/util_defines.h"
+
+/* SSE Intrinsics includes
+ *
+ * We assume __KERNEL_SSEX__ flags to have been defined at this point */
+
+/* SSE intrinsics headers */
+#ifndef FREE_WINDOWS64
+
+#ifdef _MSC_VER
+# include <intrin.h>
+#elif (defined(__x86_64__) || defined(__i386__))
+# include <x86intrin.h>
+#endif
+
+#else
+
+/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
+ * Since we can't avoid including <windows.h>, better only include that */
+#include "util/util_windows.h"
+
+#endif
CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_SSE2__
-struct sseb;
-struct ssei;
-struct ssef;
-
extern const __m128 _mm_lookupmask_ps[16];
/* Special Types */
@@ -328,12 +347,12 @@ __forceinline size_t __bscf(size_t& v)
#endif /* _WIN32 */
-static const unsigned int BITSCAN_NO_BIT_SET_32 = 32;
-static const size_t BITSCAN_NO_BIT_SET_64 = 64;
+/* Test __KERNEL_SSE41__ for MSVC which does not define __SSE4_1__, and test
+ * __SSE4_1__ to avoid OpenImageIO conflicts with our emulation macros on other
+ * platforms when compiling code outside the kernel. */
+#if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__))
-#ifdef __KERNEL_SSE3__
-/* Emulation of SSE4 functions with SSE3 */
-# ifndef __KERNEL_SSE41__
+/* Emulation of SSE4 functions with SSE2 */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
@@ -342,50 +361,50 @@ static const size_t BITSCAN_NO_BIT_SET_64 = 64;
#define _MM_FROUND_CUR_DIRECTION 0x04
#undef _mm_blendv_ps
-#define _mm_blendv_ps __emu_mm_blendv_ps
-__forceinline __m128 _mm_blendv_ps( __m128 value, __m128 input, __m128 mask ) {
- return _mm_or_ps(_mm_and_ps(mask, input), _mm_andnot_ps(mask, value));
+#define _mm_blendv_ps _mm_blendv_ps_emu
+__forceinline __m128 _mm_blendv_ps_emu( __m128 value, __m128 input, __m128 mask)
+{
+ __m128i isignmask = _mm_set1_epi32(0x80000000);
+ __m128 signmask = _mm_castsi128_ps(isignmask);
+ __m128i iandsign = _mm_castps_si128(_mm_and_ps(mask, signmask));
+ __m128i icmpmask = _mm_cmpeq_epi32(iandsign, isignmask);
+ __m128 cmpmask = _mm_castsi128_ps(icmpmask);
+ return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value));
}
#undef _mm_blend_ps
-#define _mm_blend_ps __emu_mm_blend_ps
-__forceinline __m128 _mm_blend_ps( __m128 value, __m128 input, const int mask ) {
+#define _mm_blend_ps _mm_blend_ps_emu
+__forceinline __m128 _mm_blend_ps_emu( __m128 value, __m128 input, const int mask)
+{
assert(mask < 0x10); return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]);
}
#undef _mm_blendv_epi8
-#define _mm_blendv_epi8 __emu_mm_blendv_epi8
-__forceinline __m128i _mm_blendv_epi8( __m128i value, __m128i input, __m128i mask ) {
+#define _mm_blendv_epi8 _mm_blendv_epi8_emu
+__forceinline __m128i _mm_blendv_epi8_emu( __m128i value, __m128i input, __m128i mask)
+{
return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value));
}
-#undef _mm_mullo_epi32
-#define _mm_mullo_epi32 __emu_mm_mullo_epi32
-__forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) {
- __m128i rvalue;
- char* _r = (char*)(&rvalue + 1);
- char* _v = (char*)(& value + 1);
- char* _i = (char*)(& input + 1);
- for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32_t*)(_r + i)) = *((int32_t*)(_v + i))* *((int32_t*)(_i + i));
- return rvalue;
-}
-
#undef _mm_min_epi32
-#define _mm_min_epi32 __emu_mm_min_epi32
-__forceinline __m128i _mm_min_epi32( __m128i value, __m128i input ) {
+#define _mm_min_epi32 _mm_min_epi32_emu
+__forceinline __m128i _mm_min_epi32_emu( __m128i value, __m128i input)
+{
return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input));
}
#undef _mm_max_epi32
-#define _mm_max_epi32 __emu_mm_max_epi32
-__forceinline __m128i _mm_max_epi32( __m128i value, __m128i input ) {
+#define _mm_max_epi32 _mm_max_epi32_emu
+__forceinline __m128i _mm_max_epi32_emu( __m128i value, __m128i input)
+{
return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
}
#undef _mm_extract_epi32
-#define _mm_extract_epi32 __emu_mm_extract_epi32
-__forceinline int _mm_extract_epi32( __m128i input, const int index ) {
- switch ( index ) {
+#define _mm_extract_epi32 _mm_extract_epi32_emu
+__forceinline int _mm_extract_epi32_emu( __m128i input, const int index)
+{
+ switch(index) {
case 0: return _mm_cvtsi128_si32(input);
case 1: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(1, 1, 1, 1)));
case 2: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(2, 2, 2, 2)));
@@ -395,27 +414,26 @@ __forceinline int _mm_extract_epi32( __m128i input, const int index ) {
}
#undef _mm_insert_epi32
-#define _mm_insert_epi32 __emu_mm_insert_epi32
-__forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int index ) {
+#define _mm_insert_epi32 _mm_insert_epi32_emu
+__forceinline __m128i _mm_insert_epi32_emu( __m128i value, int input, const int index)
+{
assert(index >= 0 && index < 4); ((int*)&value)[index] = input; return value;
}
-#undef _mm_extract_ps
-#define _mm_extract_ps __emu_mm_extract_ps
-__forceinline int _mm_extract_ps( __m128 input, const int index ) {
- int32_t* ptr = (int32_t*)&input; return ptr[index];
-}
-
#undef _mm_insert_ps
-#define _mm_insert_ps __emu_mm_insert_ps
-__forceinline __m128 _mm_insert_ps( __m128 value, __m128 input, const int index )
-{ assert(index < 0x100); ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6]; return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value); }
+#define _mm_insert_ps _mm_insert_ps_emu
+__forceinline __m128 _mm_insert_ps_emu( __m128 value, __m128 input, const int index)
+{
+ assert(index < 0x100);
+ ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6];
+ return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value);
+}
#undef _mm_round_ps
-#define _mm_round_ps __emu_mm_round_ps
-__forceinline __m128 _mm_round_ps( __m128 value, const int flags )
+#define _mm_round_ps _mm_round_ps_emu
+__forceinline __m128 _mm_round_ps_emu( __m128 value, const int flags)
{
- switch ( flags )
+ switch(flags)
{
case _MM_FROUND_TO_NEAREST_INT: return _mm_cvtepi32_ps(_mm_cvtps_epi32(value));
case _MM_FROUND_TO_NEG_INF : return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f))));
@@ -425,57 +443,7 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags )
return value;
}
-# ifdef _M_X64
-#undef _mm_insert_epi64
-#define _mm_insert_epi64 __emu_mm_insert_epi64
-__forceinline __m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) {
- assert(size_t(index) < 4); ((__int64*)&value)[index] = input; return value;
-}
-
-#undef _mm_extract_epi64
-#define _mm_extract_epi64 __emu_mm_extract_epi64
-__forceinline __int64 _mm_extract_epi64( __m128i input, const int index ) {
- assert(size_t(index) < 2);
- return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input));
-}
-# endif
-
-# endif
-
-#undef _mm_fabs_ps
-#define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))
-
-/* Return a __m128 with every element set to the largest element of v. */
-ccl_device_inline __m128 _mm_hmax_ps(__m128 v)
-{
- /* v[0, 1, 2, 3] => [0, 1, 0, 1] and [2, 3, 2, 3] => v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] */
- v = _mm_max_ps(_mm_movehl_ps(v, v), _mm_movelh_ps(v, v));
- /* v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] => [4 times max(1, 3)] and [4 times max(0, 2)] => v[4 times max(0, 1, 2, 3)] */
- v = _mm_max_ps(_mm_movehdup_ps(v), _mm_moveldup_ps(v));
- return v;
-}
-
-/* Return the sum of the four elements of x. */
-ccl_device_inline float _mm_hsum_ss(__m128 x)
-{
- __m128 a = _mm_movehdup_ps(x);
- __m128 b = _mm_add_ps(x, a);
- return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(a, b), b));
-}
-
-/* Return a __m128 with every element set to the sum of the four elements of x. */
-ccl_device_inline __m128 _mm_hsum_ps(__m128 x)
-{
- x = _mm_hadd_ps(x, x);
- x = _mm_hadd_ps(x, x);
- return x;
-}
-
-/* Replace elements of x with zero where mask isn't set. */
-#undef _mm_mask_ps
-#define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask)
-
-#endif
+#endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
#else /* __KERNEL_SSE2__ */
@@ -496,13 +464,19 @@ ccl_device_inline int bitscan(int value)
#endif /* __KERNEL_SSE2__ */
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__) || \
+ defined(__KERNEL_SSE3__) || \
+ defined(__KERNEL_SSSE3__) || \
+ defined(__KERNEL_SSE41__) || \
+ defined(__KERNEL_AVX__) || \
+ defined(__KERNEL_AVX2__)
+ /* do nothing */
+#endif
+
CCL_NAMESPACE_END
-#include "util/util_math.h"
-#include "util/util_sseb.h"
-#include "util/util_ssei.h"
-#include "util/util_ssef.h"
-#include "util/util_avxf.h"
+#endif /* __KERNEL_GPU__ */
#endif /* __UTIL_SIMD_TYPES_H__ */
diff --git a/intern/cycles/util/util_sseb.h b/intern/cycles/util/util_sseb.h
index 6e669701f3b..93c22aafdcd 100644
--- a/intern/cycles/util/util_sseb.h
+++ b/intern/cycles/util/util_sseb.h
@@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_SSE2__
+struct ssei;
+struct ssef;
+
/*! 4-wide SSE bool type. */
struct sseb
{
diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h
index cf99a08efae..bb007ff84a9 100644
--- a/intern/cycles/util/util_ssef.h
+++ b/intern/cycles/util/util_ssef.h
@@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_SSE2__
+struct sseb;
+struct ssef;
+
/*! 4-wide SSE float type. */
struct ssef
{
diff --git a/intern/cycles/util/util_ssei.h b/intern/cycles/util/util_ssei.h
index 5f62569268c..ef2a9e68b7d 100644
--- a/intern/cycles/util/util_ssei.h
+++ b/intern/cycles/util/util_ssei.h
@@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_SSE2__
+struct sseb;
+struct ssef;
+
/*! 4-wide SSE integer type. */
struct ssei
{
@@ -234,8 +237,10 @@ __forceinline size_t select_max(const sseb& valid, const ssei& v) { const ssei a
#else
-__forceinline int reduce_min(const ssei& v) { return min(min(v[0],v[1]),min(v[2],v[3])); }
-__forceinline int reduce_max(const ssei& v) { return max(max(v[0],v[1]),max(v[2],v[3])); }
+__forceinline int ssei_min(int a, int b) { return (a < b)? a: b; }
+__forceinline int ssei_max(int a, int b) { return (a > b)? a: b; }
+__forceinline int reduce_min(const ssei& v) { return ssei_min(ssei_min(v[0],v[1]),ssei_min(v[2],v[3])); }
+__forceinline int reduce_max(const ssei& v) { return ssei_max(ssei_max(v[0],v[1]),ssei_max(v[2],v[3])); }
__forceinline int reduce_add(const ssei& v) { return v[0]+v[1]+v[2]+v[3]; }
#endif
diff --git a/intern/cycles/util/util_stats.h b/intern/cycles/util/util_stats.h
index baba549753d..7667f58eb7d 100644
--- a/intern/cycles/util/util_stats.h
+++ b/intern/cycles/util/util_stats.h
@@ -30,7 +30,7 @@ public:
void mem_alloc(size_t size) {
atomic_add_and_fetch_z(&mem_used, size);
- atomic_update_max_z(&mem_peak, mem_used);
+ atomic_fetch_and_update_max_z(&mem_peak, mem_used);
}
void mem_free(size_t size) {
diff --git a/intern/cycles/util/util_string.cpp b/intern/cycles/util/util_string.cpp
index a1008d510d1..94ad512982c 100644
--- a/intern/cycles/util/util_string.cpp
+++ b/intern/cycles/util/util_string.cpp
@@ -148,6 +148,12 @@ void string_replace(string& haystack, const string& needle, const string& other)
string string_remove_trademark(const string &s)
{
string result = s;
+
+ /* Special case, so we don;t leave sequential spaces behind. */
+ /* TODO(sergey): Consider using regex perhaps? */
+ string_replace(result, " (TM)", "");
+ string_replace(result, " (R)", "");
+
string_replace(result, "(TM)", "");
string_replace(result, "(R)", "");
diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp
index fb0c34e1dc4..6ed97b0e0a6 100644
--- a/intern/cycles/util/util_task.cpp
+++ b/intern/cycles/util/util_task.cpp
@@ -206,9 +206,9 @@ void TaskScheduler::init(int num_threads)
threads.resize(num_threads);
const int num_groups = system_cpu_group_count();
- unsigned short num_process_groups;
+ unsigned short num_process_groups = 0;
vector<unsigned short> process_groups;
- int current_group_threads;
+ int current_group_threads = 0;
if(num_groups > 1) {
process_groups.resize(num_groups);
num_process_groups = system_cpu_process_groups(num_groups,
diff --git a/intern/cycles/util/util_time.h b/intern/cycles/util/util_time.h
index 65798244111..f03aa590e9b 100644
--- a/intern/cycles/util/util_time.h
+++ b/intern/cycles/util/util_time.h
@@ -37,7 +37,7 @@ public:
~scoped_timer()
{
if(value_ != NULL) {
- *value_ = time_dt() - time_start_;
+ *value_ = get_time();
}
}
@@ -46,6 +46,11 @@ public:
return time_start_;
}
+ double get_time() const
+ {
+ return time_dt() - time_start_;
+ }
+
protected:
double *value_;
double time_start_;
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index a5d1d7152d5..aabca6c81fc 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -21,72 +21,18 @@
# include <stdlib.h>
#endif
-/* Bitness */
+/* Standard Integer Types */
-#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
-# define __KERNEL_64_BIT__
+#if !defined(__KERNEL_GPU__) && !defined(_WIN32)
+# include <stdint.h>
#endif
-/* Qualifiers for kernel code shared by CPU and GPU */
-
-#ifndef __KERNEL_GPU__
-# define ccl_device static inline
-# define ccl_device_noinline static
-# define ccl_global
-# define ccl_constant
-# define ccl_local
-# define ccl_local_param
-# define ccl_private
-# define ccl_restrict __restrict
-# define __KERNEL_WITH_SSE_ALIGN__
-
-# if defined(_WIN32) && !defined(FREE_WINDOWS)
-# define ccl_device_inline static __forceinline
-# define ccl_device_forceinline static __forceinline
-# define ccl_align(...) __declspec(align(__VA_ARGS__))
-# ifdef __KERNEL_64_BIT__
-# define ccl_try_align(...) __declspec(align(__VA_ARGS__))
-# else /* __KERNEL_64_BIT__ */
-# undef __KERNEL_WITH_SSE_ALIGN__
-/* No support for function arguments (error C2719). */
-# define ccl_try_align(...)
-# endif /* __KERNEL_64_BIT__ */
-# define ccl_may_alias
-# define ccl_always_inline __forceinline
-# define ccl_never_inline __declspec(noinline)
-# define ccl_maybe_unused
-# else /* _WIN32 && !FREE_WINDOWS */
-# define ccl_device_inline static inline __attribute__((always_inline))
-# define ccl_device_forceinline static inline __attribute__((always_inline))
-# define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
-# ifndef FREE_WINDOWS64
-# define __forceinline inline __attribute__((always_inline))
-# endif
-# define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
-# define ccl_may_alias __attribute__((__may_alias__))
-# define ccl_always_inline __attribute__((always_inline))
-# define ccl_never_inline __attribute__((noinline))
-# define ccl_maybe_unused __attribute__((used))
-# endif /* _WIN32 && !FREE_WINDOWS */
-
-/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */
-# if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */
-# define ATTR_FALLTHROUGH __attribute__((fallthrough))
-# else
-# define ATTR_FALLTHROUGH ((void)0)
-# endif
-#endif /* __KERNEL_GPU__ */
-
-/* Standard Integer Types */
+#include "util/util_defines.h"
#ifndef __KERNEL_GPU__
-/* int8_t, uint16_t, and friends */
-# ifndef _WIN32
-# include <stdint.h>
-# endif
-/* SIMD Types */
# include "util/util_optimization.h"
-#endif /* __KERNEL_GPU__ */
+# include "util/util_simd.h"
+#endif
CCL_NAMESPACE_BEGIN
@@ -201,65 +147,8 @@ enum ExtensionType {
EXTENSION_NUM_TYPES,
};
-/* macros */
-
-/* hints for branch prediction, only use in code that runs a _lot_ */
-#if defined(__GNUC__) && defined(__KERNEL_CPU__)
-# define LIKELY(x) __builtin_expect(!!(x), 1)
-# define UNLIKELY(x) __builtin_expect(!!(x), 0)
-#else
-# define LIKELY(x) (x)
-# define UNLIKELY(x) (x)
-#endif
-
-#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800))
-# define HAS_CPP11_FEATURES
-#endif
-
-#if defined(__GNUC__) || defined(__clang__)
-# if defined(HAS_CPP11_FEATURES)
-/* Some magic to be sure we don't have reference in the type. */
-template<typename T> static inline T decltype_helper(T x) { return x; }
-# define TYPEOF(x) decltype(decltype_helper(x))
-# else
-# define TYPEOF(x) typeof(x)
-# endif
-#endif
-
-/* Causes warning:
- * incompatible types when assigning to type 'Foo' from type 'Bar'
- * ... the compiler optimizes away the temp var */
-#ifdef __GNUC__
-#define CHECK_TYPE(var, type) { \
- TYPEOF(var) *__tmp; \
- __tmp = (type *)NULL; \
- (void)__tmp; \
-} (void)0
-
-#define CHECK_TYPE_PAIR(var_a, var_b) { \
- TYPEOF(var_a) *__tmp; \
- __tmp = (typeof(var_b) *)NULL; \
- (void)__tmp; \
-} (void)0
-#else
-# define CHECK_TYPE(var, type)
-# define CHECK_TYPE_PAIR(var_a, var_b)
-#endif
-
-/* can be used in simple macros */
-#define CHECK_TYPE_INLINE(val, type) \
- ((void)(((type)0) != (val)))
-
-
CCL_NAMESPACE_END
-#ifndef __KERNEL_GPU__
-# include <cassert>
-# define util_assert(statement) assert(statement)
-#else
-# define util_assert(statement)
-#endif
-
/* Vectorized types declaration. */
#include "util/util_types_uchar2.h"
#include "util/util_types_uchar3.h"
@@ -298,5 +187,13 @@ CCL_NAMESPACE_END
#include "util/util_types_vector3_impl.h"
+/* SSE types. */
+#ifndef __KERNEL_GPU__
+# include "util/util_sseb.h"
+# include "util/util_ssei.h"
+# include "util/util_ssef.h"
+# include "util/util_avxf.h"
+#endif
+
#endif /* __UTIL_TYPES_H__ */