Merge branch 'master' into blender2.8

author: Bastien Montagne <montagne29@wanadoo.fr> 2017-08-07 17:16:43 +0300
committer: Bastien Montagne <montagne29@wanadoo.fr> 2017-08-07 17:16:43 +0300
commit: b282716c3aae68d5fc781e7fc5df0241c3d91198 (patch)
tree: b6c50064fe3a0c07341cd70cf9745ea3092c8fb7 /intern/cycles/util
parent: 459365443f62d2f8e8718c1d1b0fbaafd6d765de (diff)
parent: 580741b317ae60eb3bf999d636da0325c7e67373 (diff)
12 files changed, 314 insertions, 326 deletions
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 43f9a57d099..7f3747a0f58 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -38,6 +38,7 @@ set(SRC_HEADERS
 	util_atomic.h
 	util_boundbox.h
 	util_debug.h
+	util_defines.h
 	util_guarded_allocator.cpp
 	util_foreach.h
 	util_function.h
diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h
new file mode 100644
index 00000000000..d0d87e74332
--- /dev/null
+++ b/intern/cycles/util/util_defines.h
@@ -0,0 +1,134 @@
+
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_DEFINES_H__
+#define __UTIL_DEFINES_H__
+
+/* Bitness */
+
+#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
+#  define __KERNEL_64_BIT__
+#endif
+
+/* Qualifiers for kernel code shared by CPU and GPU */
+
+#ifndef __KERNEL_GPU__
+#  define ccl_device static inline
+#  define ccl_device_noinline static
+#  define ccl_global
+#  define ccl_constant
+#  define ccl_local
+#  define ccl_local_param
+#  define ccl_private
+#  define ccl_restrict __restrict
+#  define __KERNEL_WITH_SSE_ALIGN__
+
+#  if defined(_WIN32) && !defined(FREE_WINDOWS)
+#    define ccl_device_inline static __forceinline
+#    define ccl_device_forceinline static __forceinline
+#    define ccl_align(...) __declspec(align(__VA_ARGS__))
+#    ifdef __KERNEL_64_BIT__
+#      define ccl_try_align(...) __declspec(align(__VA_ARGS__))
+#    else  /* __KERNEL_64_BIT__ */
+#      undef __KERNEL_WITH_SSE_ALIGN__
+/* No support for function arguments (error C2719). */
+#      define ccl_try_align(...)
+#    endif  /* __KERNEL_64_BIT__ */
+#    define ccl_may_alias
+#    define ccl_always_inline __forceinline
+#    define ccl_never_inline __declspec(noinline)
+#    define ccl_maybe_unused
+#  else  /* _WIN32 && !FREE_WINDOWS */
+#    define ccl_device_inline static inline __attribute__((always_inline))
+#    define ccl_device_forceinline static inline __attribute__((always_inline))
+#    define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
+#    ifndef FREE_WINDOWS64
+#      define __forceinline inline __attribute__((always_inline))
+#    endif
+#    define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
+#    define ccl_may_alias __attribute__((__may_alias__))
+#    define ccl_always_inline __attribute__((always_inline))
+#    define ccl_never_inline __attribute__((noinline))
+#    define ccl_maybe_unused __attribute__((used))
+#  endif  /* _WIN32 && !FREE_WINDOWS */
+
+/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */
+#  if defined(__GNUC__) && (__GNUC__ >= 7)  /* gcc7.0+ only */
+#    define ATTR_FALLTHROUGH __attribute__((fallthrough))
+#  else
+#    define ATTR_FALLTHROUGH ((void)0)
+#  endif
+#endif  /* __KERNEL_GPU__ */
+
+/* macros */
+
+/* hints for branch prediction, only use in code that runs a _lot_ */
+#if defined(__GNUC__) && defined(__KERNEL_CPU__)
+#  define LIKELY(x)       __builtin_expect(!!(x), 1)
+#  define UNLIKELY(x)     __builtin_expect(!!(x), 0)
+#else
+#  define LIKELY(x)       (x)
+#  define UNLIKELY(x)     (x)
+#endif
+
+#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800))
+#  define HAS_CPP11_FEATURES
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#  if defined(HAS_CPP11_FEATURES)
+/* Some magic to be sure we don't have reference in the type. */
+template<typename T> static inline T decltype_helper(T x) { return x; }
+#    define TYPEOF(x) decltype(decltype_helper(x))
+#  else
+#    define TYPEOF(x) typeof(x)
+#  endif
+#endif
+
+/* Causes warning:
+ * incompatible types when assigning to type 'Foo' from type 'Bar'
+ * ... the compiler optimizes away the temp var */
+#ifdef __GNUC__
+#define CHECK_TYPE(var, type)  {  \
+	TYPEOF(var) *__tmp;           \
+	__tmp = (type *)NULL;         \
+	(void)__tmp;                  \
+} (void)0
+
+#define CHECK_TYPE_PAIR(var_a, var_b)  {  \
+	TYPEOF(var_a) *__tmp;                 \
+	__tmp = (typeof(var_b) *)NULL;        \
+	(void)__tmp;                          \
+} (void)0
+#else
+#  define CHECK_TYPE(var, type)
+#  define CHECK_TYPE_PAIR(var_a, var_b)
+#endif
+
+/* can be used in simple macros */
+#define CHECK_TYPE_INLINE(val, type) \
+	((void)(((type)0) != (val)))
+
+#ifndef __KERNEL_GPU__
+#  include <cassert>
+#  define util_assert(statement)  assert(statement)
+#else
+#  define util_assert(statement)
+#endif
+
+#endif /* __UTIL_DEFINES_H__ */
+
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index b719640b19c..4d51ec5570a 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -94,6 +94,7 @@ ccl_device_inline float fminf(float a, float b)
 #ifndef __KERNEL_GPU__
 using std::isfinite;
 using std::isnan;
+using std::sqrt;
 
 ccl_device_inline int abs(int x)
 {
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
index bb04c4aa2d9..e73e5bc17a2 100644
--- a/intern/cycles/util/util_math_float3.h
+++ b/intern/cycles/util/util_math_float3.h
@@ -108,8 +108,7 @@ ccl_device_inline float3 operator*(const float3& a, const float f)
 
 ccl_device_inline float3 operator*(const float f, const float3& a)
 {
-	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
+#if defined(__KERNEL_SSE__)
 	return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
 #else
 	return make_float3(a.x*f, a.y*f, a.z*f);
@@ -118,10 +117,8 @@ ccl_device_inline float3 operator*(const float f, const float3& a)
 
 ccl_device_inline float3 operator/(const float f, const float3& a)
 {
-	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
-	__m128 rc = _mm_rcp_ps(a.m128);
-	return float3(_mm_mul_ps(_mm_set1_ps(f),rc));
+#if defined(__KERNEL_SSE__)
+	return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
 #else
 	return make_float3(f / a.x, f / a.y, f / a.z);
 #endif
@@ -135,10 +132,8 @@ ccl_device_inline float3 operator/(const float3& a, const float f)
 
 ccl_device_inline float3 operator/(const float3& a, const float3& b)
 {
-	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
-	__m128 rc = _mm_rcp_ps(b.m128);
-	return float3(_mm_mul_ps(a, rc));
+#if defined(__KERNEL_SSE__)
+	return float3(_mm_div_ps(a.m128, b.m128));
 #else
 	return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
 #endif
@@ -282,9 +277,8 @@ ccl_device_inline float3 mix(const float3& a, const float3& b, float t)
 ccl_device_inline float3 rcp(const float3& a)
 {
 #ifdef __KERNEL_SSE__
-	const float4 r(_mm_rcp_ps(a.m128));
-	return float3(_mm_sub_ps(_mm_add_ps(r, r),
-	                         _mm_mul_ps(_mm_mul_ps(r, r), a)));
+	/* Don't use _mm_rcp_ps due to poor precision. */
+	return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
 #else
 	return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z);
 #endif
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
index d89121b3a1d..adb9a76a434 100644
--- a/intern/cycles/util/util_math_float4.h
+++ b/intern/cycles/util/util_math_float4.h
@@ -48,23 +48,30 @@ ccl_device_inline bool operator==(const float4& a, const float4& b);
 ccl_device_inline float dot(const float4& a, const float4& b);
 ccl_device_inline float len_squared(const float4& a);
 ccl_device_inline float4 rcp(const float4& a);
+ccl_device_inline float4 sqrt(const float4& a);
+ccl_device_inline float4 sqr(const float4& a);
 ccl_device_inline float4 cross(const float4& a, const float4& b);
 ccl_device_inline bool is_zero(const float4& a);
-ccl_device_inline float reduce_add(const float4& a);
 ccl_device_inline float average(const float4& a);
 ccl_device_inline float len(const float4& a);
 ccl_device_inline float4 normalize(const float4& a);
 ccl_device_inline float4 safe_normalize(const float4& a);
 ccl_device_inline float4 min(const float4& a, const float4& b);
 ccl_device_inline float4 max(const float4& a, const float4& b);
+ccl_device_inline float4 fabs(const float4& a);
 #endif  /* !__KERNEL_OPENCL__*/
 
 #ifdef __KERNEL_SSE__
 template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
 __forceinline const float4 shuffle(const float4& b);
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
+__forceinline const float4 shuffle(const float4& a, const float4& b);
 
 template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b);
 
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b);
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b);
+
 #  ifdef __KERNEL_SSE3__
 template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b);
 template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b);
@@ -77,9 +84,7 @@ ccl_device_inline float4 select(const int4& mask,
                                 const float4& b);
 ccl_device_inline float4 reduce_min(const float4& a);
 ccl_device_inline float4 reduce_max(const float4& a);
-#  if 0
 ccl_device_inline float4 reduce_add(const float4& a);
-#  endif
 #endif  /* !__KERNEL_GPU__ */
 
 /*******************************************************************************
@@ -128,7 +133,7 @@ ccl_device_inline float4 operator/(const float4& a, float f)
 ccl_device_inline float4 operator/(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
-	return a * rcp(b);
+	return float4(_mm_div_ps(a.m128, b.m128));
 #else
 	return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
 #endif
@@ -224,14 +229,30 @@ ccl_device_inline float len_squared(const float4& a)
 ccl_device_inline float4 rcp(const float4& a)
 {
 #ifdef __KERNEL_SSE__
-	float4 r(_mm_rcp_ps(a.m128));
-	return float4(_mm_sub_ps(_mm_add_ps(r, r),
-	                         _mm_mul_ps(_mm_mul_ps(r, r), a)));
+	/* Don't use _mm_rcp_ps due to poor precision. */
+	return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
 #else
 	return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w);
 #endif
 }
 
+ccl_device_inline float4 sqrt(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_sqrt_ps(a.m128));
+#else
+	return make_float4(sqrtf(a.x),
+	                   sqrtf(a.y),
+	                   sqrtf(a.z),
+	                   sqrtf(a.w));
+#endif
+}
+
+ccl_device_inline float4 sqr(const float4& a)
+{
+	return a * a;
+}
+
 ccl_device_inline float4 cross(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
@@ -254,20 +275,24 @@ ccl_device_inline bool is_zero(const float4& a)
 #endif
 }
 
-ccl_device_inline float reduce_add(const float4& a)
+ccl_device_inline float4 reduce_add(const float4& a)
 {
 #ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE3__
+    float4 h(_mm_hadd_ps(a.m128, a.m128));
+    return float4( _mm_hadd_ps(h.m128, h.m128));
+#  else
 	float4 h(shuffle<1,0,3,2>(a) + a);
-	/* TODO(sergey): Investigate efficiency. */
-	return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h);
+	return  shuffle<2,3,0,1>(h) + h;
+#  endif
 #else
-	return ((a.x + a.y) + (a.z + a.w));
+	return make_float4(((a.x + a.y) + (a.z + a.w)));
 #endif
 }
 
 ccl_device_inline float average(const float4& a)
 {
-	return reduce_add(a) * 0.25f;
+	return reduce_add(a)[0] * 0.25f;
 }
 
 ccl_device_inline float len(const float4& a)
@@ -309,6 +334,18 @@ ccl_device_inline float4 max(const float4& a, const float4& b)
 	                   max(a.w, b.w));
 #endif
 }
+
+ccl_device_inline float4 fabs(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
+#else
+	return make_float4(fabsf(a.x),
+	                   fabsf(a.y),
+	                   fabsf(a.z),
+	                   fabsf(a.w));
+#endif
+}
 #endif  /* !__KERNEL_OPENCL__*/
 
 #ifdef __KERNEL_SSE__
@@ -320,11 +357,28 @@ __forceinline const float4 shuffle(const float4& b)
 	                          _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
 }
 
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
+__forceinline const float4 shuffle(const float4& a, const float4& b)
+{
+	return float4(_mm_shuffle_ps(a.m128, b.m128,
+	                             _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
+}
+
 template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b)
 {
 	return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))));
 }
 
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b)
+{
+	return float4(_mm_movelh_ps(a.m128, b.m128));
+}
+
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b)
+{
+	return float4(_mm_movehl_ps(b.m128, a.m128));
+}
+
 #  ifdef __KERNEL_SSE3__
 template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b)
 {
@@ -344,9 +398,7 @@ ccl_device_inline float4 select(const int4& mask,
                                 const float4& b)
 {
 #ifdef __KERNEL_SSE__
-	/* TODO(sergey): avoid cvt. */
-	return float4(_mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a),
-	                        _mm_andnot_ps(_mm_cvtepi32_ps(mask), b)));
+	return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128)));
 #else
 	return make_float4((mask.x)? a.x: b.x,
 	                   (mask.y)? a.y: b.y,
@@ -355,6 +407,13 @@ ccl_device_inline float4 select(const int4& mask,
 #endif
 }
 
+ccl_device_inline float4 mask(const int4& mask,
+                              const float4& a)
+{
+	/* Replace elements of x with zero where mask isn't set. */
+	return select(mask, a, make_float4(0.0f));
+}
+
 ccl_device_inline float4 reduce_min(const float4& a)
 {
 #ifdef __KERNEL_SSE__
@@ -375,17 +434,15 @@ ccl_device_inline float4 reduce_max(const float4& a)
 #endif
 }
 
-#if 0
-ccl_device_inline float4 reduce_add(const float4& a)
+ccl_device_inline float4 load_float4(const float *v)
 {
 #ifdef __KERNEL_SSE__
-	float4 h = shuffle<1,0,3,2>(a) + a;
-	return shuffle<2,3,0,1>(h) + h;
+	return float4(_mm_loadu_ps(v));
 #else
-	return make_float4((a.x + a.y) + (a.z + a.w));
+	return make_float4(v[0], v[1], v[2], v[3]);
 #endif
 }
-#endif
+
 #endif  /* !__KERNEL_GPU__ */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
index c7511f8306e..b31dbe4fc67 100644
--- a/intern/cycles/util/util_math_matrix.h
+++ b/intern/cycles/util/util_math_matrix.h
@@ -223,20 +223,20 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
 {
 	const float singular_epsilon = 1e-9f;
 
-	for (int row = 0; row < n; row++) {
-		for (int col = 0; col < n; col++) {
+	for(int row = 0; row < n; row++) {
+		for(int col = 0; col < n; col++) {
 			MATS(V, n, row, col, v_stride) = (col == row) ? 1.0f : 0.0f;
 		}
 	}
 
-	for (int sweep = 0; sweep < 8; sweep++) {
+	for(int sweep = 0; sweep < 8; sweep++) {
 		float off_diagonal = 0.0f;
-		for (int row = 1; row < n; row++) {
-			for (int col = 0; col < row; col++) {
+		for(int row = 1; row < n; row++) {
+			for(int col = 0; col < row; col++) {
 				off_diagonal += fabsf(MAT(A, n, row, col));
 			}
 		}
-		if (off_diagonal < 1e-7f) {
+		if(off_diagonal < 1e-7f) {
 			/* The matrix has nearly reached diagonal form.
 			 * Since the eigenvalues are only used to determine truncation, their exact values aren't required - a relative error of a few ULPs won't matter at all. */
 			break;
@@ -253,7 +253,7 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
 				float abs_element = fabsf(element);
 
 				/* If we're in a later sweep and the element already is very small, just set it to zero and skip the rotation. */
-				if (sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) {
+				if(sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) {
 					MAT(A, n, row, col) = 0.0f;
 					continue;
 				}
@@ -272,10 +272,10 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
 				 * Then, we compute sin(phi) and cos(phi) themselves. */
 				float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col);
 				float ratio;
-				if (abs_element > singular_epsilon*fabsf(singular_diff)) {
+				if(abs_element > singular_epsilon*fabsf(singular_diff)) {
 					float cot_2phi = 0.5f*singular_diff / element;
 					ratio = 1.0f / (fabsf(cot_2phi) + sqrtf(1.0f + cot_2phi*cot_2phi));
-					if (cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */
+					if(cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */
 				}
 				else {
 					ratio = element / singular_diff;
@@ -315,21 +315,21 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
 	}
 
 	/* Sort eigenvalues and the associated eigenvectors. */
-	for (int i = 0; i < n - 1; i++) {
+	for(int i = 0; i < n - 1; i++) {
 		float v = MAT(A, n, i, i);
 		int k = i;
-		for (int j = i; j < n; j++) {
-			if (MAT(A, n, j, j) >= v) {
+		for(int j = i; j < n; j++) {
+			if(MAT(A, n, j, j) >= v) {
 				v = MAT(A, n, j, j);
 				k = j;
 			}
 		}
-		if (k != i) {
+		if(k != i) {
 			/* Swap eigenvalues. */
 			MAT(A, n, k, k) = MAT(A, n, i, i);
 			MAT(A, n, i, i) = v;
 			/* Swap eigenvectors. */
-			for (int j = 0; j < n; j++) {
+			for(int j = 0; j < n; j++) {
 				float v = MATS(V, n, i, j, v_stride);
 				MATS(V, n, i, j, v_stride) = MATS(V, n, k, j, v_stride);
 				MATS(V, n, k, j, v_stride) = v;
@@ -339,59 +339,59 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
 }
 
 #ifdef __KERNEL_SSE3__
-ccl_device_inline void math_vector_zero_sse(__m128 *A, int n)
+ccl_device_inline void math_vector_zero_sse(float4 *A, int n)
 {
 	for(int i = 0; i < n; i++) {
-		A[i] = _mm_setzero_ps();
+		A[i] = make_float4(0.0f);
 	}
 }
 
-ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n)
+ccl_device_inline void math_matrix_zero_sse(float4 *A, int n)
 {
 	for(int row = 0; row < n; row++) {
 		for(int col = 0; col <= row; col++) {
-			MAT(A, n, row, col) = _mm_setzero_ps();
+			MAT(A, n, row, col) = make_float4(0.0f);
 		}
 	}
 }
 
 /* Add Gramian matrix of v to A.
  * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */
-ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, const __m128 *ccl_restrict v, __m128 weight)
+ccl_device_inline void math_matrix_add_gramian_sse(float4 *A, int n, const float4 *ccl_restrict v, float4 weight)
 {
 	for(int row = 0; row < n; row++) {
 		for(int col = 0; col <= row; col++) {
-			MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight));
+			MAT(A, n, row, col) = MAT(A, n, row, col) + v[row] * v[col] * weight;
 		}
 	}
 }
 
-ccl_device_inline void math_vector_add_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
+ccl_device_inline void math_vector_add_sse(float4 *V, int n, const float4 *ccl_restrict a)
 {
 	for(int i = 0; i < n; i++) {
-		V[i] = _mm_add_ps(V[i], a[i]);
+		V[i] += a[i];
 	}
 }
 
-ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
+ccl_device_inline void math_vector_mul_sse(float4 *V, int n, const float4 *ccl_restrict a)
 {
 	for(int i = 0; i < n; i++) {
-		V[i] = _mm_mul_ps(V[i], a[i]);
+		V[i] *= a[i];
 	}
 }
 
-ccl_device_inline void math_vector_max_sse(__m128 *a, const __m128 *ccl_restrict b, int n)
+ccl_device_inline void math_vector_max_sse(float4 *a, const float4 *ccl_restrict b, int n)
 {
 	for(int i = 0; i < n; i++) {
-		a[i] = _mm_max_ps(a[i], b[i]);
+		a[i] = max(a[i], b[i]);
 	}
 }
 
-ccl_device_inline void math_matrix_hsum(float *A, int n, const __m128 *ccl_restrict B)
+ccl_device_inline void math_matrix_hsum(float *A, int n, const float4 *ccl_restrict B)
 {
 	for(int row = 0; row < n; row++) {
 		for(int col = 0; col <= row; col++) {
-			MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col));
+			MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col))[0];
 		}
 	}
 }
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
index 6f70a474fe7..0382c0811dd 100644
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@@ -19,16 +19,6 @@
 
 #ifndef __KERNEL_GPU__
 
-/* quiet unused define warnings */
-#if defined(__KERNEL_SSE2__)  || \
-	defined(__KERNEL_SSE3__)  || \
-	defined(__KERNEL_SSSE3__) || \
-	defined(__KERNEL_SSE41__) || \
-	defined(__KERNEL_AVX__)   || \
-	defined(__KERNEL_AVX2__)
-	/* do nothing */
-#endif
-
 /* x86
  *
  * Compile a regular, SSE2 and SSE3 kernel. */
@@ -73,48 +63,6 @@
 
 #endif  /* defined(__x86_64__) || defined(_M_X64) */
 
-/* SSE Experiment
- *
- * This is disabled code for an experiment to use SSE types globally for types
- * such as float3 and float4. Currently this gives an overall slowdown. */
-
-#if 0
-#  define __KERNEL_SSE__
-#  ifndef __KERNEL_SSE2__
-#    define __KERNEL_SSE2__
-#  endif
-#  ifndef __KERNEL_SSE3__
-#    define __KERNEL_SSE3__
-#  endif
-#  ifndef __KERNEL_SSSE3__
-#    define __KERNEL_SSSE3__
-#  endif
-#  ifndef __KERNEL_SSE4__
-#    define __KERNEL_SSE4__
-#  endif
-#endif
-
-/* SSE Intrinsics includes
- *
- * We assume __KERNEL_SSEX__ flags to have been defined at this point */
-
-/* SSE intrinsics headers */
-#ifndef FREE_WINDOWS64
-
-#ifdef _MSC_VER
-#  include <intrin.h>
-#elif (defined(__x86_64__) || defined(__i386__))
-#  include <x86intrin.h>
-#endif
-
-#else
-
-/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
- * Since we can't avoid including <windows.h>, better only include that */
-#include "util/util_windows.h"
-
-#endif
-
 #endif
 
 #endif /* __UTIL_OPTIMIZATION_H__ */
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 587febe3e52..a2b3247b207 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -18,19 +18,38 @@
 #ifndef __UTIL_SIMD_TYPES_H__
 #define __UTIL_SIMD_TYPES_H__
 
+#ifndef __KERNEL_GPU__
+
 #include <limits>
 
 #include "util/util_debug.h"
-#include "util/util_types.h"
+#include "util/util_defines.h"
+
+/* SSE Intrinsics includes
+ *
+ * We assume __KERNEL_SSEX__ flags to have been defined at this point */
+
+/* SSE intrinsics headers */
+#ifndef FREE_WINDOWS64
+
+#ifdef _MSC_VER
+#  include <intrin.h>
+#elif (defined(__x86_64__) || defined(__i386__))
+#  include <x86intrin.h>
+#endif
+
+#else
+
+/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
+ * Since we can't avoid including <windows.h>, better only include that */
+#include "util/util_windows.h"
+
+#endif
 
 CCL_NAMESPACE_BEGIN
 
 #ifdef __KERNEL_SSE2__
 
-struct sseb;
-struct ssei;
-struct ssef;
-
 extern const __m128 _mm_lookupmask_ps[16];
 
 /* Special Types */
@@ -328,12 +347,9 @@ __forceinline size_t __bscf(size_t& v)
 
 #endif /* _WIN32 */
 
-static const unsigned int BITSCAN_NO_BIT_SET_32 = 32;
-static const size_t       BITSCAN_NO_BIT_SET_64 = 64;
+#if !(defined(__SSE4_1__) || defined(__SSE4_2__))
 
-#ifdef __KERNEL_SSE3__
-/* Emulation of SSE4 functions with SSE3 */
-#  ifndef __KERNEL_SSE41__
+/* Emulation of SSE4 functions with SSE2 */
 
 #define _MM_FROUND_TO_NEAREST_INT    0x00
 #define _MM_FROUND_TO_NEG_INF        0x01
@@ -342,48 +358,31 @@ static const size_t       BITSCAN_NO_BIT_SET_64 = 64;
 #define _MM_FROUND_CUR_DIRECTION     0x04
 
 #undef _mm_blendv_ps
-#define _mm_blendv_ps __emu_mm_blendv_ps
 __forceinline __m128 _mm_blendv_ps( __m128 value, __m128 input, __m128 mask ) { 
     return _mm_or_ps(_mm_and_ps(mask, input), _mm_andnot_ps(mask, value)); 
 }
 
 #undef _mm_blend_ps
-#define _mm_blend_ps __emu_mm_blend_ps
 __forceinline __m128 _mm_blend_ps( __m128 value, __m128 input, const int mask ) { 
     assert(mask < 0x10); return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]); 
 }
 
 #undef _mm_blendv_epi8
-#define _mm_blendv_epi8 __emu_mm_blendv_epi8
 __forceinline __m128i _mm_blendv_epi8( __m128i value, __m128i input, __m128i mask ) { 
     return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value)); 
 }
 
-#undef _mm_mullo_epi32
-#define _mm_mullo_epi32 __emu_mm_mullo_epi32
-__forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) {
-  __m128i rvalue;
-  char* _r = (char*)(&rvalue + 1);
-  char* _v = (char*)(& value + 1);
-  char* _i = (char*)(& input + 1);
-  for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32_t*)(_r + i)) = *((int32_t*)(_v + i))*  *((int32_t*)(_i + i));
-  return rvalue;
-}
-
 #undef _mm_min_epi32
-#define _mm_min_epi32 __emu_mm_min_epi32
 __forceinline __m128i _mm_min_epi32( __m128i value, __m128i input ) { 
     return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input)); 
 }
 
 #undef _mm_max_epi32
-#define _mm_max_epi32 __emu_mm_max_epi32
 __forceinline __m128i _mm_max_epi32( __m128i value, __m128i input ) { 
     return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input)); 
 }
 
 #undef _mm_extract_epi32
-#define _mm_extract_epi32 __emu_mm_extract_epi32
 __forceinline int _mm_extract_epi32( __m128i input, const int index ) {
   switch ( index ) {
   case 0: return _mm_cvtsi128_si32(input);
@@ -395,24 +394,15 @@ __forceinline int _mm_extract_epi32( __m128i input, const int index ) {
 }
 
 #undef _mm_insert_epi32
-#define _mm_insert_epi32 __emu_mm_insert_epi32
 __forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int index ) { 
     assert(index >= 0 && index < 4); ((int*)&value)[index] = input; return value; 
 }
 
-#undef _mm_extract_ps
-#define _mm_extract_ps __emu_mm_extract_ps
-__forceinline int _mm_extract_ps( __m128 input, const int index ) {
-  int32_t* ptr = (int32_t*)&input; return ptr[index];
-}
-
 #undef _mm_insert_ps
-#define _mm_insert_ps __emu_mm_insert_ps
 __forceinline __m128 _mm_insert_ps( __m128 value, __m128 input, const int index )
 { assert(index < 0x100); ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6]; return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value); }
 
 #undef _mm_round_ps
-#define _mm_round_ps __emu_mm_round_ps
 __forceinline __m128 _mm_round_ps( __m128 value, const int flags )
 {
   switch ( flags )
@@ -425,57 +415,7 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags )
   return value;
 }
 
-#    ifdef _M_X64
-#undef _mm_insert_epi64
-#define _mm_insert_epi64 __emu_mm_insert_epi64
-__forceinline __m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) { 
-    assert(size_t(index) < 4); ((__int64*)&value)[index] = input; return value; 
-}
-
-#undef _mm_extract_epi64
-#define _mm_extract_epi64 __emu_mm_extract_epi64
-__forceinline __int64 _mm_extract_epi64( __m128i input, const int index ) { 
-    assert(size_t(index) < 2); 
-    return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input)); 
-}
-#    endif
-
-#  endif
-
-#undef _mm_fabs_ps
-#define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))
-
-/* Return a __m128 with every element set to the largest element of v. */
-ccl_device_inline __m128 _mm_hmax_ps(__m128 v)
-{
-  /* v[0, 1, 2, 3] => [0, 1, 0, 1] and [2, 3, 2, 3] => v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] */
-  v = _mm_max_ps(_mm_movehl_ps(v, v), _mm_movelh_ps(v, v));
-  /* v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] => [4 times max(1, 3)] and [4 times max(0, 2)] => v[4 times max(0, 1, 2, 3)] */
-  v = _mm_max_ps(_mm_movehdup_ps(v), _mm_moveldup_ps(v));
-  return v;
-}
-
-/* Return the sum of the four elements of x. */
-ccl_device_inline float _mm_hsum_ss(__m128 x)
-{
-    __m128 a = _mm_movehdup_ps(x);
-    __m128 b = _mm_add_ps(x, a);
-    return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(a, b), b));
-}
-
-/* Return a __m128 with every element set to the sum of the four elements of x. */
-ccl_device_inline __m128 _mm_hsum_ps(__m128 x)
-{
-    x = _mm_hadd_ps(x, x);
-    x = _mm_hadd_ps(x, x);
-    return x;
-}
-
-/* Replace elements of x with zero where mask isn't set. */
-#undef _mm_mask_ps
-#define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask)
-
-#endif
+#endif /* !(defined(__SSE4_1__) || defined(__SSE4_2__)) */
 
 #else  /* __KERNEL_SSE2__ */
 
@@ -496,13 +436,19 @@ ccl_device_inline int bitscan(int value)
 
 #endif /* __KERNEL_SSE2__ */
 
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)  || \
+	defined(__KERNEL_SSE3__)  || \
+	defined(__KERNEL_SSSE3__) || \
+	defined(__KERNEL_SSE41__) || \
+	defined(__KERNEL_AVX__)   || \
+	defined(__KERNEL_AVX2__)
+	/* do nothing */
+#endif
+
 CCL_NAMESPACE_END
 
-#include "util/util_math.h"
-#include "util/util_sseb.h"
-#include "util/util_ssei.h"
-#include "util/util_ssef.h"
-#include "util/util_avxf.h"
+#endif /* __KERNEL_GPU__ */
 
 #endif /* __UTIL_SIMD_TYPES_H__ */
 
diff --git a/intern/cycles/util/util_sseb.h b/intern/cycles/util/util_sseb.h
index 6e669701f3b..93c22aafdcd 100644
--- a/intern/cycles/util/util_sseb.h
+++ b/intern/cycles/util/util_sseb.h
@@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __KERNEL_SSE2__
 
+struct ssei;
+struct ssef;
+
 /*! 4-wide SSE bool type. */
 struct sseb
 {
diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h
index cf99a08efae..bb007ff84a9 100644
--- a/intern/cycles/util/util_ssef.h
+++ b/intern/cycles/util/util_ssef.h
@@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __KERNEL_SSE2__
 
+struct sseb;
+struct ssef;
+
 /*! 4-wide SSE float type. */
 struct ssef
 {
diff --git a/intern/cycles/util/util_ssei.h b/intern/cycles/util/util_ssei.h
index 5f62569268c..ef2a9e68b7d 100644
--- a/intern/cycles/util/util_ssei.h
+++ b/intern/cycles/util/util_ssei.h
@@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __KERNEL_SSE2__
 
+struct sseb;
+struct ssef;
+
 /*! 4-wide SSE integer type. */
 struct ssei
 {
@@ -234,8 +237,10 @@ __forceinline size_t select_max(const sseb& valid, const ssei& v) { const ssei a
 
 #else
 
-__forceinline int reduce_min(const ssei& v) { return min(min(v[0],v[1]),min(v[2],v[3])); }
-__forceinline int reduce_max(const ssei& v) { return max(max(v[0],v[1]),max(v[2],v[3])); }
+__forceinline int ssei_min(int a, int b) { return (a < b)? a: b; }
+__forceinline int ssei_max(int a, int b) { return (a > b)? a: b; }
+__forceinline int reduce_min(const ssei& v) { return ssei_min(ssei_min(v[0],v[1]),ssei_min(v[2],v[3])); }
+__forceinline int reduce_max(const ssei& v) { return ssei_max(ssei_max(v[0],v[1]),ssei_max(v[2],v[3])); }
 __forceinline int reduce_add(const ssei& v) { return v[0]+v[1]+v[2]+v[3]; }
 
 #endif
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index a5d1d7152d5..d9642df8005 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -21,72 +21,17 @@
 #  include <stdlib.h>
 #endif
 
-/* Bitness */
+/* Standard Integer Types */
 
-#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
-#  define __KERNEL_64_BIT__
+#if !defined(__KERNEL_GPU__) && !defined(_WIN32)
+#  include <stdint.h>
 #endif
 
-/* Qualifiers for kernel code shared by CPU and GPU */
-
-#ifndef __KERNEL_GPU__
-#  define ccl_device static inline
-#  define ccl_device_noinline static
-#  define ccl_global
-#  define ccl_constant
-#  define ccl_local
-#  define ccl_local_param
-#  define ccl_private
-#  define ccl_restrict __restrict
-#  define __KERNEL_WITH_SSE_ALIGN__
-
-#  if defined(_WIN32) && !defined(FREE_WINDOWS)
-#    define ccl_device_inline static __forceinline
-#    define ccl_device_forceinline static __forceinline
-#    define ccl_align(...) __declspec(align(__VA_ARGS__))
-#    ifdef __KERNEL_64_BIT__
-#      define ccl_try_align(...) __declspec(align(__VA_ARGS__))
-#    else  /* __KERNEL_64_BIT__ */
-#      undef __KERNEL_WITH_SSE_ALIGN__
-/* No support for function arguments (error C2719). */
-#      define ccl_try_align(...)
-#    endif  /* __KERNEL_64_BIT__ */
-#    define ccl_may_alias
-#    define ccl_always_inline __forceinline
-#    define ccl_never_inline __declspec(noinline)
-#    define ccl_maybe_unused
-#  else  /* _WIN32 && !FREE_WINDOWS */
-#    define ccl_device_inline static inline __attribute__((always_inline))
-#    define ccl_device_forceinline static inline __attribute__((always_inline))
-#    define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
-#    ifndef FREE_WINDOWS64
-#      define __forceinline inline __attribute__((always_inline))
-#    endif
-#    define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
-#    define ccl_may_alias __attribute__((__may_alias__))
-#    define ccl_always_inline __attribute__((always_inline))
-#    define ccl_never_inline __attribute__((noinline))
-#    define ccl_maybe_unused __attribute__((used))
-#  endif  /* _WIN32 && !FREE_WINDOWS */
-
-/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */
-#  if defined(__GNUC__) && (__GNUC__ >= 7)  /* gcc7.0+ only */
-#    define ATTR_FALLTHROUGH __attribute__((fallthrough))
-#  else
-#    define ATTR_FALLTHROUGH ((void)0)
-#  endif
-#endif  /* __KERNEL_GPU__ */
-
-/* Standard Integer Types */
+#include "util/util_defines.h"
 
 #ifndef __KERNEL_GPU__
-/* int8_t, uint16_t, and friends */
-#  ifndef _WIN32
-#    include <stdint.h>
-#  endif
-/* SIMD Types */
-#  include "util/util_optimization.h"
-#endif  /* __KERNEL_GPU__ */
+#  include "util/util_simd.h"
+#endif
 
 CCL_NAMESPACE_BEGIN
 
@@ -201,65 +146,8 @@ enum ExtensionType {
 	EXTENSION_NUM_TYPES,
 };
 
-/* macros */
-
-/* hints for branch prediction, only use in code that runs a _lot_ */
-#if defined(__GNUC__) && defined(__KERNEL_CPU__)
-#  define LIKELY(x)       __builtin_expect(!!(x), 1)
-#  define UNLIKELY(x)     __builtin_expect(!!(x), 0)
-#else
-#  define LIKELY(x)       (x)
-#  define UNLIKELY(x)     (x)
-#endif
-
-#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800))
-#  define HAS_CPP11_FEATURES
-#endif
-
-#if defined(__GNUC__) || defined(__clang__)
-#  if defined(HAS_CPP11_FEATURES)
-/* Some magic to be sure we don't have reference in the type. */
-template<typename T> static inline T decltype_helper(T x) { return x; }
-#    define TYPEOF(x) decltype(decltype_helper(x))
-#  else
-#    define TYPEOF(x) typeof(x)
-#  endif
-#endif
-
-/* Causes warning:
- * incompatible types when assigning to type 'Foo' from type 'Bar'
- * ... the compiler optimizes away the temp var */
-#ifdef __GNUC__
-#define CHECK_TYPE(var, type)  {  \
-	TYPEOF(var) *__tmp;           \
-	__tmp = (type *)NULL;         \
-	(void)__tmp;                  \
-} (void)0
-
-#define CHECK_TYPE_PAIR(var_a, var_b)  {  \
-	TYPEOF(var_a) *__tmp;                 \
-	__tmp = (typeof(var_b) *)NULL;        \
-	(void)__tmp;                          \
-} (void)0
-#else
-#  define CHECK_TYPE(var, type)
-#  define CHECK_TYPE_PAIR(var_a, var_b)
-#endif
-
-/* can be used in simple macros */
-#define CHECK_TYPE_INLINE(val, type) \
-	((void)(((type)0) != (val)))
-
-
 CCL_NAMESPACE_END
 
-#ifndef __KERNEL_GPU__
-#  include <cassert>
-#  define util_assert(statement)  assert(statement)
-#else
-#  define util_assert(statement)
-#endif
-
 /* Vectorized types declaration. */
 #include "util/util_types_uchar2.h"
 #include "util/util_types_uchar3.h"
@@ -298,5 +186,13 @@ CCL_NAMESPACE_END
 
 #include "util/util_types_vector3_impl.h"
 
+/* SSE types. */
+#ifndef __KERNEL_GPU__
+#  include "util/util_sseb.h"
+#  include "util/util_ssei.h"
+#  include "util/util_ssef.h"
+#  include "util/util_avxf.h"
+#endif
+
 #endif /* __UTIL_TYPES_H__ */
author	Bastien Montagne <montagne29@wanadoo.fr>	2017-08-07 17:16:43 +0300
committer	Bastien Montagne <montagne29@wanadoo.fr>	2017-08-07 17:16:43 +0300
commit	b282716c3aae68d5fc781e7fc5df0241c3d91198 (patch)
tree	b6c50064fe3a0c07341cd70cf9745ea3092c8fb7 /intern/cycles/util
parent	459365443f62d2f8e8718c1d1b0fbaafd6d765de (diff)
parent	580741b317ae60eb3bf999d636da0325c7e67373 (diff)