diff options
author | Julian Eisel <eiseljulian@gmail.com> | 2017-05-08 01:19:22 +0300 |
---|---|---|
committer | Julian Eisel <eiseljulian@gmail.com> | 2017-05-08 01:19:22 +0300 |
commit | 9181f13af710f39e28368461f9979ce2052a55ee (patch) | |
tree | bb4dae8ee41e31d25ba980aa996c3e62539a59e0 /intern/cycles/util | |
parent | c47926542acf1fc342b4d0b8f7ccaf1e3a8be6d8 (diff) | |
parent | ce28025eaf5014f16460dd1cb798ce5292087d94 (diff) |
Merge branch 'master' into blender2.8
Diffstat (limited to 'intern/cycles/util')
-rw-r--r-- | intern/cycles/util/CMakeLists.txt | 1 | ||||
-rw-r--r-- | intern/cycles/util/util_math.h | 151 | ||||
-rw-r--r-- | intern/cycles/util/util_math_float3.h | 15 | ||||
-rw-r--r-- | intern/cycles/util/util_math_int4.h | 9 | ||||
-rw-r--r-- | intern/cycles/util/util_math_matrix.h | 379 | ||||
-rw-r--r-- | intern/cycles/util/util_simd.h | 45 | ||||
-rw-r--r-- | intern/cycles/util/util_types.h | 5 |
7 files changed, 527 insertions, 78 deletions
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 388aba65460..43f9a57d099 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -59,6 +59,7 @@ set(SRC_HEADERS util_math_int2.h util_math_int3.h util_math_int4.h + util_math_matrix.h util_md5.h util_opengl.h util_optimization.h diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index 52b4fa859b7..b719640b19c 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -160,6 +160,78 @@ ccl_device_inline float max4(float a, float b, float c, float d) } #ifndef __KERNEL_OPENCL__ +/* Int/Float conversion */ + +ccl_device_inline int as_int(uint i) +{ + union { uint ui; int i; } u; + u.ui = i; + return u.i; +} + +ccl_device_inline uint as_uint(int i) +{ + union { uint ui; int i; } u; + u.i = i; + return u.ui; +} + +ccl_device_inline uint as_uint(float f) +{ + union { uint i; float f; } u; + u.f = f; + return u.i; +} + +ccl_device_inline int __float_as_int(float f) +{ + union { int i; float f; } u; + u.f = f; + return u.i; +} + +ccl_device_inline float __int_as_float(int i) +{ + union { int i; float f; } u; + u.i = i; + return u.f; +} + +ccl_device_inline uint __float_as_uint(float f) +{ + union { uint i; float f; } u; + u.f = f; + return u.i; +} + +ccl_device_inline float __uint_as_float(uint i) +{ + union { uint i; float f; } u; + u.i = i; + return u.f; +} +#endif /* __KERNEL_OPENCL__ */ + +/* Versions of functions which are safe for fast math. */ +ccl_device_inline bool isnan_safe(float f) +{ + unsigned int x = __float_as_uint(f); + return (x << 1) > 0xff000000u; +} + +ccl_device_inline bool isfinite_safe(float f) +{ + /* By IEEE 754 rule, 2*Inf equals Inf */ + unsigned int x = __float_as_uint(f); + return (f == f) && (x == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u); +} + +ccl_device_inline float ensure_finite(float v) +{ + return isfinite_safe(v)? v : 0.0f; +} + +#ifndef __KERNEL_OPENCL__ ccl_device_inline int clamp(int a, int mn, int mx) { return min(max(a, mn), mx); @@ -250,57 +322,6 @@ CCL_NAMESPACE_END CCL_NAMESPACE_BEGIN #ifndef __KERNEL_OPENCL__ -/* Int/Float conversion */ - -ccl_device_inline int as_int(uint i) -{ - union { uint ui; int i; } u; - u.ui = i; - return u.i; -} - -ccl_device_inline uint as_uint(int i) -{ - union { uint ui; int i; } u; - u.i = i; - return u.ui; -} - -ccl_device_inline uint as_uint(float f) -{ - union { uint i; float f; } u; - u.f = f; - return u.i; -} - -ccl_device_inline int __float_as_int(float f) -{ - union { int i; float f; } u; - u.f = f; - return u.i; -} - -ccl_device_inline float __int_as_float(int i) -{ - union { int i; float f; } u; - u.i = i; - return u.f; -} - -ccl_device_inline uint __float_as_uint(float f) -{ - union { uint i; float f; } u; - u.f = f; - return u.i; -} - -ccl_device_inline float __uint_as_float(uint i) -{ - union { uint i; float f; } u; - u.i = i; - return u.f; -} - /* Interpolation */ template<class A, class B> A lerp(const A& a, const A& b, const B& t) @@ -318,20 +339,6 @@ ccl_device_inline float triangle_area(const float3& v1, } #endif /* __KERNEL_OPENCL__ */ -/* Versions of functions which are safe for fast math. */ -ccl_device_inline bool isnan_safe(float f) -{ - unsigned int x = __float_as_uint(f); - return (x << 1) > 0xff000000u; -} - -ccl_device_inline bool isfinite_safe(float f) -{ - /* By IEEE 754 rule, 2*Inf equals Inf */ - unsigned int x = __float_as_uint(f); - return (f == f) && (x == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u); -} - /* Orthonormal vectors */ ccl_device_inline void make_orthonormals(const float3 N, float3 *a, float3 *b) @@ -485,17 +492,17 @@ ccl_device float safe_powf(float a, float b) return compatible_powf(a, b); } -ccl_device float safe_logf(float a, float b) +ccl_device float safe_divide(float a, float b) { - if(UNLIKELY(a < 0.0f || b < 0.0f)) - return 0.0f; - - return logf(a)/logf(b); + return (b != 0.0f)? a/b: 0.0f; } -ccl_device float safe_divide(float a, float b) +ccl_device float safe_logf(float a, float b) { - return (b != 0.0f)? a/b: 0.0f; + if(UNLIKELY(a <= 0.0f || b <= 0.0f)) + return 0.0f; + + return safe_divide(logf(a),logf(b)); } ccl_device float safe_modulo(float a, float b) diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h index e0c6b551040..a754be413fe 100644 --- a/intern/cycles/util/util_math_float3.h +++ b/intern/cycles/util/util_math_float3.h @@ -38,6 +38,7 @@ ccl_device_inline float3 operator/(const float3& a, const float3& b); ccl_device_inline float3 operator+(const float3& a, const float3& b); ccl_device_inline float3 operator-(const float3& a, const float3& b); ccl_device_inline float3 operator+=(float3& a, const float3& b); +ccl_device_inline float3 operator-=(float3& a, const float3& b); ccl_device_inline float3 operator*=(float3& a, const float3& b); ccl_device_inline float3 operator*=(float3& a, float f); ccl_device_inline float3 operator/=(float3& a, const float3& b); @@ -166,6 +167,11 @@ ccl_device_inline float3 operator+=(float3& a, const float3& b) return a = a + b; } +ccl_device_inline float3 operator-=(float3& a, const float3& b) +{ + return a = a - b; +} + ccl_device_inline float3 operator*=(float3& a, const float3& b) { return a = a * b; @@ -360,6 +366,15 @@ ccl_device_inline bool isequal_float3(const float3 a, const float3 b) return a == b; #endif } + +ccl_device_inline float3 ensure_finite3(float3 v) +{ + if(!isfinite_safe(v.x)) v.x = 0.0; + if(!isfinite_safe(v.y)) v.y = 0.0; + if(!isfinite_safe(v.z)) v.z = 0.0; + return v; +} + CCL_NAMESPACE_END #endif /* __UTIL_MATH_FLOAT3_H__ */ diff --git a/intern/cycles/util/util_math_int4.h b/intern/cycles/util/util_math_int4.h index 4b327c90c33..79a8c0841e7 100644 --- a/intern/cycles/util/util_math_int4.h +++ b/intern/cycles/util/util_math_int4.h @@ -103,6 +103,15 @@ ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b) (mask.w)? a.w: b.w); #endif } + +ccl_device_inline int4 load_int4(const int *v) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_loadu_si128((__m128i*)v)); +#else + return make_int4(v[0], v[1], v[2], v[3]); +#endif +} #endif /* __KERNEL_GPU__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h new file mode 100644 index 00000000000..31ea10f18a8 --- /dev/null +++ b/intern/cycles/util/util_math_matrix.h @@ -0,0 +1,379 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_MATRIX_H__ +#define __UTIL_MATH_MATRIX_H__ + +CCL_NAMESPACE_BEGIN + +#define MAT(A, size, row, col) A[(row)*(size)+(col)] + +/* Variants that use a constant stride on GPUS. */ +#ifdef __KERNEL_GPU__ +#define MATS(A, n, r, c, s) A[((r)*(n)+(c))*(s)] +/* Element access when only the lower-triangular elements are stored. */ +#define MATHS(A, r, c, s) A[((r)*((r)+1)/2+(c))*(s)] +#define VECS(V, i, s) V[(i)*(s)] +#else +#define MATS(A, n, r, c, s) MAT(A, n, r, c) +#define MATHS(A, r, c, s) A[(r)*((r)+1)/2+(c)] +#define VECS(V, i, s) V[i] +#endif + +/* Zeroing helpers. */ + +ccl_device_inline void math_vector_zero(float *v, int n) +{ + for(int i = 0; i < n; i++) + v[i] = 0.0f; +} + +ccl_device_inline void math_matrix_zero(float *A, int n) +{ + for(int row = 0; row < n; row++) + for(int col = 0; col <= row; col++) + MAT(A, n, row, col) = 0.0f; +} + +/* Elementary vector operations. */ + +ccl_device_inline void math_vector_add(float *a, float ccl_restrict_ptr b, int n) +{ + for(int i = 0; i < n; i++) + a[i] += b[i]; +} + +ccl_device_inline void math_vector_mul(float *a, float ccl_restrict_ptr b, int n) +{ + for(int i = 0; i < n; i++) + a[i] *= b[i]; +} + +ccl_device_inline void math_vector_mul_strided(ccl_global float *a, float ccl_restrict_ptr b, int astride, int n) +{ + for(int i = 0; i < n; i++) + a[i*astride] *= b[i]; +} + +ccl_device_inline void math_vector_scale(float *a, float b, int n) +{ + for(int i = 0; i < n; i++) + a[i] *= b; +} + +ccl_device_inline void math_vector_max(float *a, float ccl_restrict_ptr b, int n) +{ + for(int i = 0; i < n; i++) + a[i] = max(a[i], b[i]); +} + +ccl_device_inline void math_vec3_add(float3 *v, int n, float *x, float3 w) +{ + for(int i = 0; i < n; i++) + v[i] += w*x[i]; +} + +ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float *x, float3 w, int stride) +{ + for(int i = 0; i < n; i++) + v[i*stride] += w*x[i]; +} + +/* Elementary matrix operations. + * Note: TriMatrix refers to a square matrix that is symmetric, and therefore its upper-triangular part isn't stored. */ + +ccl_device_inline void math_trimatrix_add_diagonal(ccl_global float *A, int n, float val, int stride) +{ + for(int row = 0; row < n; row++) + MATHS(A, row, row, stride) += val; +} + +/* Add Gramian matrix of v to A. + * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */ +ccl_device_inline void math_matrix_add_gramian(float *A, + int n, + float ccl_restrict_ptr v, + float weight) +{ + for(int row = 0; row < n; row++) + for(int col = 0; col <= row; col++) + MAT(A, n, row, col) += v[row]*v[col]*weight; +} + +/* Add Gramian matrix of v to A. + * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */ +ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A, + int n, + float ccl_restrict_ptr v, + float weight, + int stride) +{ + for(int row = 0; row < n; row++) + for(int col = 0; col <= row; col++) + MATHS(A, row, col, stride) += v[row]*v[col]*weight; +} + +/* Transpose matrix A inplace. */ +ccl_device_inline void math_matrix_transpose(ccl_global float *A, int n, int stride) +{ + for(int i = 0; i < n; i++) { + for(int j = 0; j < i; j++) { + float temp = MATS(A, n, i, j, stride); + MATS(A, n, i, j, stride) = MATS(A, n, j, i, stride); + MATS(A, n, j, i, stride) = temp; + } + } +} + + + + +/* Solvers for matrix problems */ + +/* In-place Cholesky-Banachiewicz decomposition of the square, positive-definite matrix A + * into a lower triangular matrix L so that A = L*L^T. A is being overwritten by L. + * Also, only the lower triangular part of A is ever accessed. */ +ccl_device void math_trimatrix_cholesky(ccl_global float *A, int n, int stride) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + float sum_col = MATHS(A, row, col, stride); + for(int k = 0; k < col; k++) { + sum_col -= MATHS(A, row, k, stride) * MATHS(A, col, k, stride); + } + if(row == col) { + sum_col = sqrtf(max(sum_col, 0.0f)); + } + else { + sum_col /= MATHS(A, col, col, stride); + } + MATHS(A, row, col, stride) = sum_col; + } + } +} + +/* Solve A*S=y for S given A and y, where A is symmetrical positive-semidefinite and both inputs are destroyed in the process. + * + * We can apply Cholesky decomposition to find a lower triangular L so that L*Lt = A. + * With that we get (L*Lt)*S = L*(Lt*S) = L*b = y, defining b as Lt*S. + * Since L is lower triangular, finding b is relatively easy since y is known. + * Then, the remaining problem is Lt*S = b, which again can be solved easily. + * + * This is useful for solving the normal equation S=inv(Xt*W*X)*Xt*W*y, since Xt*W*X is + * symmetrical positive-semidefinite by construction, so we can just use this function with A=Xt*W*X and y=Xt*W*y. */ +ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A, ccl_global float3 *y, int n, int stride) +{ + math_trimatrix_add_diagonal(A, n, 1e-4f, stride); /* Improve the numerical stability. */ + math_trimatrix_cholesky(A, n, stride); /* Replace A with L so that L*Lt = A. */ + + /* Use forward substitution to solve L*b = y, replacing y by b. */ + for(int row = 0; row < n; row++) { + float3 sum = VECS(y, row, stride); + for(int col = 0; col < row; col++) + sum -= MATHS(A, row, col, stride) * VECS(y, col, stride); + VECS(y, row, stride) = sum / MATHS(A, row, row, stride); + } + + /* Use backward substitution to solve Lt*S = b, replacing b by S. */ + for(int row = n-1; row >= 0; row--) { + float3 sum = VECS(y, row, stride); + for(int col = row+1; col < n; col++) + sum -= MATHS(A, col, row, stride) * VECS(y, col, stride); + VECS(y, row, stride) = sum / MATHS(A, row, row, stride); + } +} + + + + + +/* Perform the Jacobi Eigenvalue Methon on matrix A. + * A is assumed to be a symmetrical matrix, therefore only the lower-triangular part is ever accessed. + * The algorithm overwrites the contents of A. + * + * After returning, A will be overwritten with D, which is (almost) diagonal, + * and V will contain the eigenvectors of the original A in its rows (!), + * so that A = V^T*D*V. Therefore, the diagonal elements of D are the (sorted) eigenvalues of A. + */ +ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float *V, int n, int v_stride) +{ + const float singular_epsilon = 1e-9f; + + for (int row = 0; row < n; row++) + for (int col = 0; col < n; col++) + MATS(V, n, row, col, v_stride) = (col == row) ? 1.0f : 0.0f; + + for (int sweep = 0; sweep < 8; sweep++) { + float off_diagonal = 0.0f; + for (int row = 1; row < n; row++) + for (int col = 0; col < row; col++) + off_diagonal += fabsf(MAT(A, n, row, col)); + if (off_diagonal < 1e-7f) { + /* The matrix has nearly reached diagonal form. + * Since the eigenvalues are only used to determine truncation, their exact values aren't required - a relative error of a few ULPs won't matter at all. */ + break; + } + + /* Set the threshold for the small element rotation skip in the first sweep: + * Skip all elements that are less than a tenth of the average off-diagonal element. */ + float threshold = 0.2f*off_diagonal / (n*n); + + for(int row = 1; row < n; row++) { + for(int col = 0; col < row; col++) { + /* Perform a Jacobi rotation on this element that reduces it to zero. */ + float element = MAT(A, n, row, col); + float abs_element = fabsf(element); + + /* If we're in a later sweep and the element already is very small, just set it to zero and skip the rotation. */ + if (sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) { + MAT(A, n, row, col) = 0.0f; + continue; + } + + if(element == 0.0f) { + continue; + } + + /* If we're in one of the first sweeps and the element is smaller than the threshold, skip it. */ + if(sweep < 3 && (abs_element < threshold)) { + continue; + } + + /* Determine rotation: The rotation is characterized by its angle phi - or, in the actual implementation, sin(phi) and cos(phi). + * To find those, we first compute their ratio - that might be unstable if the angle approaches 90°, so there's a fallback for that case. + * Then, we compute sin(phi) and cos(phi) themselves. */ + float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col); + float ratio; + if (abs_element > singular_epsilon*fabsf(singular_diff)) { + float cot_2phi = 0.5f*singular_diff / element; + ratio = 1.0f / (fabsf(cot_2phi) + sqrtf(1.0f + cot_2phi*cot_2phi)); + if (cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */ + } + else { + ratio = element / singular_diff; + } + + float c = 1.0f / sqrtf(1.0f + ratio*ratio); + float s = ratio*c; + /* To improve numerical stability by avoiding cancellation, the update equations are reformulized to use sin(phi) and tan(phi/2) instead. */ + float tan_phi_2 = s / (1.0f + c); + + /* Update the singular values in the diagonal. */ + float singular_delta = ratio*element; + MAT(A, n, row, row) += singular_delta; + MAT(A, n, col, col) -= singular_delta; + + /* Set the element itself to zero. */ + MAT(A, n, row, col) = 0.0f; + + /* Perform the actual rotations on the matrices. */ +#define ROT(M, r1, c1, r2, c2, stride) \ + { \ + float M1 = MATS(M, n, r1, c1, stride); \ + float M2 = MATS(M, n, r2, c2, stride); \ + MATS(M, n, r1, c1, stride) -= s*(M2 + tan_phi_2*M1); \ + MATS(M, n, r2, c2, stride) += s*(M1 - tan_phi_2*M2); \ + } + + /* Split into three parts to ensure correct accesses since we only store the lower-triangular part of A. */ + for(int i = 0 ; i < col; i++) ROT(A, col, i, row, i, 1); + for(int i = col+1; i < row; i++) ROT(A, i, col, row, i, 1); + for(int i = row+1; i < n ; i++) ROT(A, i, col, i, row, 1); + + for(int i = 0 ; i < n ; i++) ROT(V, col, i, row, i, v_stride); +#undef ROT + } + } + } + + /* Sort eigenvalues and the associated eigenvectors. */ + for (int i = 0; i < n - 1; i++) { + float v = MAT(A, n, i, i); + int k = i; + for (int j = i; j < n; j++) { + if (MAT(A, n, j, j) >= v) { + v = MAT(A, n, j, j); + k = j; + } + } + if (k != i) { + /* Swap eigenvalues. */ + MAT(A, n, k, k) = MAT(A, n, i, i); + MAT(A, n, i, i) = v; + /* Swap eigenvectors. */ + for (int j = 0; j < n; j++) { + float v = MATS(V, n, i, j, v_stride); + MATS(V, n, i, j, v_stride) = MATS(V, n, k, j, v_stride); + MATS(V, n, k, j, v_stride) = v; + } + } + } +} + +#ifdef __KERNEL_SSE3__ + +ccl_device_inline void math_vector_zero_sse(__m128 *A, int n) +{ + for(int i = 0; i < n; i++) + A[i] = _mm_setzero_ps(); +} +ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n) +{ + for(int row = 0; row < n; row++) + for(int col = 0; col <= row; col++) + MAT(A, n, row, col) = _mm_setzero_ps(); +} + +/* Add Gramian matrix of v to A. + * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */ +ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, __m128 ccl_restrict_ptr v, __m128 weight) +{ + for(int row = 0; row < n; row++) + for(int col = 0; col <= row; col++) + MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight)); +} + +ccl_device_inline void math_vector_add_sse(__m128 *V, int n, __m128 ccl_restrict_ptr a) +{ + for(int i = 0; i < n; i++) + V[i] = _mm_add_ps(V[i], a[i]); +} + +ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, __m128 ccl_restrict_ptr a) +{ + for(int i = 0; i < n; i++) + V[i] = _mm_mul_ps(V[i], a[i]); +} + +ccl_device_inline void math_vector_max_sse(__m128 *a, __m128 ccl_restrict_ptr b, int n) +{ + for(int i = 0; i < n; i++) + a[i] = _mm_max_ps(a[i], b[i]); +} + +ccl_device_inline void math_matrix_hsum(float *A, int n, __m128 ccl_restrict_ptr B) +{ + for(int row = 0; row < n; row++) + for(int col = 0; col <= row; col++) + MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col)); +} +#endif + +#undef MAT + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_MATRIX_H__ */ diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index 557809a5719..545a3399f32 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -331,9 +331,9 @@ __forceinline size_t __bscf(size_t& v) static const unsigned int BITSCAN_NO_BIT_SET_32 = 32; static const size_t BITSCAN_NO_BIT_SET_64 = 64; +#ifdef __KERNEL_SSE3__ /* Emulation of SSE4 functions with SSE3 */ - -#if defined(__KERNEL_SSE3) && !defined(__KERNEL_SSE4__) +# ifndef __KERNEL_SSE41__ #define _MM_FROUND_TO_NEAREST_INT 0x00 #define _MM_FROUND_TO_NEG_INF 0x01 @@ -362,7 +362,7 @@ __forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) { char* _r = (char*)(&rvalue + 1); char* _v = (char*)(& value + 1); char* _i = (char*)(& input + 1); - for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32*)(_r + i)) = *((int32*)(_v + i))* *((int32*)(_i + i)); + for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32_t*)(_r + i)) = *((int32_t*)(_v + i))* *((int32_t*)(_i + i)); return rvalue; } @@ -395,7 +395,7 @@ __forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int inde #define _mm_extract_ps __emu_mm_extract_ps __forceinline int _mm_extract_ps( __m128 input, const int index ) { - int32* ptr = (int32*)&input; return ptr[index]; + int32_t* ptr = (int32_t*)&input; return ptr[index]; } #define _mm_insert_ps __emu_mm_insert_ps @@ -415,7 +415,7 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags ) return value; } -#ifdef _M_X64 +# ifdef _M_X64 #define _mm_insert_epi64 __emu_mm_insert_epi64 __forceinline __m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) { assert(size_t(index) < 4); ((__int64*)&value)[index] = input; return value; @@ -426,7 +426,40 @@ __forceinline __int64 _mm_extract_epi64( __m128i input, const int index ) { assert(size_t(index) < 2); return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input)); } -#endif +# endif + +# endif + +#define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))) + +/* Return a __m128 with every element set to the largest element of v. */ +ccl_device_inline __m128 _mm_hmax_ps(__m128 v) +{ + /* v[0, 1, 2, 3] => [0, 1, 0, 1] and [2, 3, 2, 3] => v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] */ + v = _mm_max_ps(_mm_movehl_ps(v, v), _mm_movelh_ps(v, v)); + /* v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] => [4 times max(1, 3)] and [4 times max(0, 2)] => v[4 times max(0, 1, 2, 3)] */ + v = _mm_max_ps(_mm_movehdup_ps(v), _mm_moveldup_ps(v)); + return v; +} + +/* Return the sum of the four elements of x. */ +ccl_device_inline float _mm_hsum_ss(__m128 x) +{ + __m128 a = _mm_movehdup_ps(x); + __m128 b = _mm_add_ps(x, a); + return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(a, b), b)); +} + +/* Return a __m128 with every element set to the sum of the four elements of x. */ +ccl_device_inline __m128 _mm_hsum_ps(__m128 x) +{ + x = _mm_hadd_ps(x, x); + x = _mm_hadd_ps(x, x); + return x; +} + +/* Replace elements of x with zero where mask isn't set. */ +#define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask) #endif diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index feab7996aee..0039c59ec48 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -133,6 +133,11 @@ ccl_device_inline size_t align_up(size_t offset, size_t alignment) return (offset + alignment - 1) & ~(alignment - 1); } +ccl_device_inline size_t divide_up(size_t x, size_t y) +{ + return (x + y - 1) / y; +} + ccl_device_inline size_t round_up(size_t x, size_t multiple) { return ((x + multiple - 1) / multiple) * multiple; |