From 0a07cdbe80b2999478fa0d062a846e9bcfafc872 Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Fri, 14 Apr 2017 14:05:23 +0200 Subject: Cycles: Split vectorized math utilities into dedicated files This file was an even bigger mess than the vectorized types header; clean it up to make it easier to maintain these files and extend them further. --- intern/cycles/kernel/CMakeLists.txt | 6 + intern/cycles/util/CMakeLists.txt | 6 + intern/cycles/util/util_math.h | 1111 ++------------------------- intern/cycles/util/util_math_float2.h | 227 ++++++ intern/cycles/util/util_math_float3.h | 365 +++++++++ intern/cycles/util/util_math_float4.h | 393 ++++++++++ intern/cycles/util/util_math_int2.h | 77 ++ intern/cycles/util/util_math_int3.h | 83 ++ intern/cycles/util/util_math_int4.h | 110 +++ intern/cycles/util/util_types_float2.h | 1 + intern/cycles/util/util_types_float2_impl.h | 9 + intern/cycles/util/util_types_float3.h | 1 + intern/cycles/util/util_types_float3_impl.h | 9 + intern/cycles/util/util_types_float4.h | 1 + intern/cycles/util/util_types_float4_impl.h | 11 + intern/cycles/util/util_types_int3.h | 1 + intern/cycles/util/util_types_int3_impl.h | 9 + intern/cycles/util/util_types_int4.h | 1 + intern/cycles/util/util_types_int4_impl.h | 9 + 19 files changed, 1397 insertions(+), 1033 deletions(-) create mode 100644 intern/cycles/util/util_math_float2.h create mode 100644 intern/cycles/util/util_math_float3.h create mode 100644 intern/cycles/util/util_math_float4.h create mode 100644 intern/cycles/util/util_math_int2.h create mode 100644 intern/cycles/util/util_math_int3.h create mode 100644 intern/cycles/util/util_math_int4.h (limited to 'intern') diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 10eff10d809..3750225571d 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -196,6 +196,12 @@ set(SRC_UTIL_HEADERS ../util/util_math.h ../util/util_math_fast.h ../util/util_math_intersect.h + 
../util/util_math_float2.h + ../util/util_math_float3.h + ../util/util_math_float4.h + ../util/util_math_int2.h + ../util/util_math_int3.h + ../util/util_math_int4.h ../util/util_static_assert.h ../util/util_transform.h ../util/util_texture.h diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 8d0a6c9fff9..388aba65460 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -53,6 +53,12 @@ set(SRC_HEADERS util_math_cdf.h util_math_fast.h util_math_intersect.h + util_math_float2.h + util_math_float3.h + util_math_float4.h + util_math_int2.h + util_math_int3.h + util_math_int4.h util_md5.h util_opengl.h util_optimization.h diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index 77781ed4574..52b4fa859b7 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -28,12 +28,10 @@ #ifndef __KERNEL_OPENCL__ - -#include -#include -#include - -#endif +# include +# include +# include +#endif /* __KERNEL_OPENCL__ */ #include "util/util_types.h" @@ -43,49 +41,44 @@ CCL_NAMESPACE_BEGIN /* Division */ #ifndef M_PI_F -#define M_PI_F (3.1415926535897932f) /* pi */ +# define M_PI_F (3.1415926535897932f) /* pi */ #endif #ifndef M_PI_2_F -#define M_PI_2_F (1.5707963267948966f) /* pi/2 */ +# define M_PI_2_F (1.5707963267948966f) /* pi/2 */ #endif #ifndef M_PI_4_F -#define M_PI_4_F (0.7853981633974830f) /* pi/4 */ +# define M_PI_4_F (0.7853981633974830f) /* pi/4 */ #endif #ifndef M_1_PI_F -#define M_1_PI_F (0.3183098861837067f) /* 1/pi */ +# define M_1_PI_F (0.3183098861837067f) /* 1/pi */ #endif #ifndef M_2_PI_F -#define M_2_PI_F (0.6366197723675813f) /* 2/pi */ +# define M_2_PI_F (0.6366197723675813f) /* 2/pi */ #endif /* Multiplication */ #ifndef M_2PI_F -#define M_2PI_F (6.2831853071795864f) /* 2*pi */ +# define M_2PI_F (6.2831853071795864f) /* 2*pi */ #endif #ifndef M_4PI_F -#define M_4PI_F (12.566370614359172f) /* 4*pi */ +# define M_4PI_F 
(12.566370614359172f) /* 4*pi */ #endif /* Float sqrt variations */ - #ifndef M_SQRT2_F -#define M_SQRT2_F (1.4142135623730950f) /* sqrt(2) */ +# define M_SQRT2_F (1.4142135623730950f) /* sqrt(2) */ #endif - #ifndef M_LN2_F -#define M_LN2_F (0.6931471805599453f) /* ln(2) */ +# define M_LN2_F (0.6931471805599453f) /* ln(2) */ #endif - #ifndef M_LN10_F -#define M_LN10_F (2.3025850929940457f) /* ln(10) */ +# define M_LN10_F (2.3025850929940457f) /* ln(10) */ #endif /* Scalar */ #ifdef _WIN32 - -#ifndef __KERNEL_OPENCL__ - +# ifndef __KERNEL_OPENCL__ ccl_device_inline float fmaxf(float a, float b) { return (a > b)? a: b; @@ -95,13 +88,10 @@ ccl_device_inline float fminf(float a, float b) { return (a < b)? a: b; } - -#endif - -#endif +# endif /* !__KERNEL_OPENCL__ */ +#endif /* _WIN32 */ #ifndef __KERNEL_GPU__ - using std::isfinite; using std::isnan; @@ -157,8 +147,7 @@ ccl_device_inline T max4(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); } - -#endif +#endif /* __KERNEL_GPU__ */ ccl_device_inline float min4(float a, float b, float c, float d) { @@ -170,13 +159,7 @@ ccl_device_inline float max4(float a, float b, float c, float d) return max(max(a, b), max(c, d)); } -ccl_device_inline float max3(float3 a) -{ - return max(max(a.x, a.y), a.z); -} - #ifndef __KERNEL_OPENCL__ - ccl_device_inline int clamp(int a, int mn, int mx) { return min(max(a, mn), mx); @@ -191,17 +174,14 @@ ccl_device_inline float mix(float a, float b, float t) { return a + t*(b - a); } - -#endif +#endif /* __KERNEL_OPENCL__ */ #ifndef __KERNEL_CUDA__ - ccl_device_inline float saturate(float a) { return clamp(a, 0.0f, 1.0f); } - -#endif +#endif /* __KERNEL_CUDA__ */ ccl_device_inline int float_to_int(float f) { @@ -242,1036 +222,101 @@ ccl_device_inline int mod(int x, int m) return (x % m + m) % m; } -/* Float2 Vector */ - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline bool is_zero(const float2& a) -{ - return (a.x == 0.0f && a.y == 0.0f); -} - -#endif - -#ifndef 
__KERNEL_OPENCL__ - -ccl_device_inline float average(const float2& a) -{ - return (a.x + a.y)*(1.0f/2.0f); -} - -#endif - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float2 operator-(const float2& a) -{ - return make_float2(-a.x, -a.y); -} - -ccl_device_inline float2 operator*(const float2& a, const float2& b) -{ - return make_float2(a.x*b.x, a.y*b.y); -} - -ccl_device_inline float2 operator*(const float2& a, float f) -{ - return make_float2(a.x*f, a.y*f); -} - -ccl_device_inline float2 operator*(float f, const float2& a) -{ - return make_float2(a.x*f, a.y*f); -} - -ccl_device_inline float2 operator/(float f, const float2& a) -{ - return make_float2(f/a.x, f/a.y); -} - -ccl_device_inline float2 operator/(const float2& a, float f) -{ - float invf = 1.0f/f; - return make_float2(a.x*invf, a.y*invf); -} - -ccl_device_inline float2 operator/(const float2& a, const float2& b) -{ - return make_float2(a.x/b.x, a.y/b.y); -} - -ccl_device_inline float2 operator+(const float2& a, const float2& b) -{ - return make_float2(a.x+b.x, a.y+b.y); -} - -ccl_device_inline float2 operator-(const float2& a, const float2& b) -{ - return make_float2(a.x-b.x, a.y-b.y); -} - -ccl_device_inline float2 operator+=(float2& a, const float2& b) -{ - return a = a + b; -} - -ccl_device_inline float2 operator*=(float2& a, const float2& b) -{ - return a = a * b; -} - -ccl_device_inline float2 operator*=(float2& a, float f) -{ - return a = a * f; -} - -ccl_device_inline float2 operator/=(float2& a, const float2& b) -{ - return a = a / b; -} - -ccl_device_inline float2 operator/=(float2& a, float f) -{ - float invf = 1.0f/f; - return a = a * invf; -} - - -ccl_device_inline float dot(const float2& a, const float2& b) -{ - return a.x*b.x + a.y*b.y; -} - -ccl_device_inline float cross(const float2& a, const float2& b) -{ - return (a.x*b.y - a.y*b.x); -} - -#endif - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline bool operator==(const int2 a, const int2 b) -{ - return (a.x == b.x && a.y == b.y); -} - 
-ccl_device_inline float len(const float2& a) -{ - return sqrtf(dot(a, a)); -} - -ccl_device_inline float2 normalize(const float2& a) -{ - return a/len(a); -} - -ccl_device_inline float2 normalize_len(const float2& a, float *t) -{ - *t = len(a); - return a/(*t); -} - -ccl_device_inline float2 safe_normalize(const float2& a) -{ - float t = len(a); - return (t != 0.0f)? a/t: a; -} - -ccl_device_inline bool operator==(const float2& a, const float2& b) -{ - return (a.x == b.x && a.y == b.y); -} - -ccl_device_inline bool operator!=(const float2& a, const float2& b) -{ - return !(a == b); -} - -ccl_device_inline float2 min(const float2& a, const float2& b) -{ - return make_float2(min(a.x, b.x), min(a.y, b.y)); -} - -ccl_device_inline float2 max(const float2& a, const float2& b) -{ - return make_float2(max(a.x, b.x), max(a.y, b.y)); -} - -ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx) -{ - return min(max(a, mn), mx); -} - -ccl_device_inline float2 fabs(const float2& a) +ccl_device_inline float3 float2_to_float3(const float2 a) { - return make_float2(fabsf(a.x), fabsf(a.y)); + return make_float3(a.x, a.y, 0.0f); } -ccl_device_inline float2 as_float2(const float4& a) +ccl_device_inline float3 float4_to_float3(const float4 a) { - return make_float2(a.x, a.y); + return make_float3(a.x, a.y, a.z); } -#endif - -#ifndef __KERNEL_GPU__ - -ccl_device_inline void print_float2(const char *label, const float2& a) +ccl_device_inline float4 float3_to_float4(const float3 a) { - printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y); + return make_float4(a.x, a.y, a.z, 1.0f); } -#endif - -#ifndef __KERNEL_OPENCL__ +CCL_NAMESPACE_END -ccl_device_inline float2 interp(const float2& a, const float2& b, float t) -{ - return a + t*(b - a); -} +#include "util/util_math_int2.h" +#include "util/util_math_int3.h" +#include "util/util_math_int4.h" -#endif +#include "util/util_math_float2.h" +#include "util/util_math_float3.h" +#include 
"util/util_math_float4.h" -/* Float3 Vector */ +CCL_NAMESPACE_BEGIN #ifndef __KERNEL_OPENCL__ +/* Int/Float conversion */ -ccl_device_inline float3 operator-(const float3& a) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); -#else - return make_float3(-a.x, -a.y, -a.z); -#endif -} - -ccl_device_inline float3 operator*(const float3& a, const float3& b) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_mul_ps(a.m128,b.m128)); -#else - return make_float3(a.x*b.x, a.y*b.y, a.z*b.z); -#endif -} - -ccl_device_inline float3 operator*(const float3& a, const float f) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_mul_ps(a.m128,_mm_set1_ps(f))); -#else - return make_float3(a.x*f, a.y*f, a.z*f); -#endif -} - -ccl_device_inline float3 operator*(const float f, const float3& a) -{ - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 - return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128)); -#else - return make_float3(a.x*f, a.y*f, a.z*f); -#endif -} - -ccl_device_inline float3 operator/(const float f, const float3& a) -{ - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 - __m128 rc = _mm_rcp_ps(a.m128); - return float3(_mm_mul_ps(_mm_set1_ps(f),rc)); -#else - return make_float3(f / a.x, f / a.y, f / a.z); -#endif -} - -ccl_device_inline float3 operator/(const float3& a, const float f) -{ - float invf = 1.0f/f; - return a * invf; -} - -ccl_device_inline float3 operator/(const float3& a, const float3& b) -{ - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. 
*/ -#if defined(__KERNEL_SSE__) && 0 - __m128 rc = _mm_rcp_ps(b.m128); - return float3(_mm_mul_ps(a, rc)); -#else - return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); -#endif -} - -ccl_device_inline float3 operator+(const float3& a, const float3& b) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_add_ps(a.m128, b.m128)); -#else - return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); -#endif -} - -ccl_device_inline float3 operator-(const float3& a, const float3& b) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_sub_ps(a.m128, b.m128)); -#else - return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); -#endif -} - -ccl_device_inline float3 operator+=(float3& a, const float3& b) -{ - return a = a + b; -} - -ccl_device_inline float3 operator*=(float3& a, const float3& b) -{ - return a = a * b; -} - -ccl_device_inline float3 operator*=(float3& a, float f) -{ - return a = a * f; -} - -ccl_device_inline float3 operator/=(float3& a, const float3& b) -{ - return a = a / b; -} - -ccl_device_inline float3 operator/=(float3& a, float f) -{ - float invf = 1.0f/f; - return a = a * invf; -} - -ccl_device_inline float dot(const float3& a, const float3& b) -{ -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F)); -#else - return a.x*b.x + a.y*b.y + a.z*b.z; -#endif -} - -ccl_device_inline float dot_xy(const float3& a, const float3& b) +ccl_device_inline int as_int(uint i) { -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b)); -#else - return a.x*b.x + a.y*b.y; -#endif + union { uint ui; int i; } u; + u.ui = i; + return u.i; } -ccl_device_inline float dot(const float4& a, const float4& b) +ccl_device_inline uint as_uint(int i) { -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF)); -#else - return (a.x*b.x + a.y*b.y) + (a.z*b.z + a.w*b.w); -#endif + union { uint ui; int i; } u; + u.i = i; + return u.ui; } -ccl_device_inline 
float3 cross(const float3& a, const float3& b) +ccl_device_inline uint as_uint(float f) { - float3 r = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); - return r; + union { uint i; float f; } u; + u.f = f; + return u.i; } -#endif - -ccl_device_inline float len(const float3 a) +ccl_device_inline int __float_as_int(float f) { -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F))); -#else - return sqrtf(dot(a, a)); -#endif + union { int i; float f; } u; + u.f = f; + return u.i; } -ccl_device_inline float len_squared(const float3 a) +ccl_device_inline float __int_as_float(int i) { - return dot(a, a); + union { int i; float f; } u; + u.i = i; + return u.f; } -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float len_squared(const float4& a) +ccl_device_inline uint __float_as_uint(float f) { - return dot(a, a); + union { uint i; float f; } u; + u.f = f; + return u.i; } -ccl_device_inline float3 normalize(const float3& a) +ccl_device_inline float __uint_as_float(uint i) { -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F)); - return float3(_mm_div_ps(a.m128, norm)); -#else - return a/len(a); -#endif + union { uint i; float f; } u; + u.i = i; + return u.f; } -#endif +/* Interpolation */ -ccl_device_inline float3 saturate3(float3 a) +template A lerp(const A& a, const A& b, const B& t) { - return make_float3(saturate(a.x), saturate(a.y), saturate(a.z)); + return (A)(a * ((B)1 - t) + b * t); } -ccl_device_inline float3 normalize_len(const float3 a, float *t) -{ - *t = len(a); - float x = 1.0f / *t; - return a*x; -} +/* Triangle */ -ccl_device_inline float3 safe_normalize(const float3 a) +ccl_device_inline float triangle_area(const float3& v1, + const float3& v2, + const float3& v3) { - float t = len(a); - return (t != 0.0f)? 
a * (1.0f/t) : a; + return len(cross(v3 - v2, v1 - v2))*0.5f; } - -ccl_device_inline float3 safe_normalize_len(const float3 a, float *t) -{ - *t = len(a); - return (*t != 0.0f)? a/(*t): a; -} - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline bool operator==(const float3& a, const float3& b) -{ -#ifdef __KERNEL_SSE__ - return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7; -#else - return (a.x == b.x && a.y == b.y && a.z == b.z); -#endif -} - -ccl_device_inline bool operator!=(const float3& a, const float3& b) -{ - return !(a == b); -} - -ccl_device_inline float3 min(const float3& a, const float3& b) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_min_ps(a.m128, b.m128)); -#else - return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); -#endif -} - -ccl_device_inline float3 max(const float3& a, const float3& b) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_max_ps(a.m128, b.m128)); -#else - return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); -#endif -} - -ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx) -{ - return min(max(a, mn), mx); -} - -ccl_device_inline float3 fabs(const float3& a) -{ -#ifdef __KERNEL_SSE__ - __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); - return float3(_mm_and_ps(a.m128, mask)); -#else - return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z)); -#endif -} - -#endif - -ccl_device_inline float3 float2_to_float3(const float2 a) -{ - return make_float3(a.x, a.y, 0.0f); -} - -ccl_device_inline float3 float4_to_float3(const float4 a) -{ - return make_float3(a.x, a.y, a.z); -} - -ccl_device_inline float4 float3_to_float4(const float3 a) -{ - return make_float4(a.x, a.y, a.z, 1.0f); -} - -#ifndef __KERNEL_GPU__ - -ccl_device_inline void print_float3(const char *label, const float3& a) -{ - printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z); -} - -ccl_device_inline float3 rcp(const float3& a) -{ -#ifdef __KERNEL_SSE__ - const float4 
r(_mm_rcp_ps(a.m128)); - return float3(_mm_sub_ps(_mm_add_ps(r, r), - _mm_mul_ps(_mm_mul_ps(r, r), a))); -#else - return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z); -#endif -} - -#endif - -ccl_device_inline float3 interp(float3 a, float3 b, float t) -{ - return a + t*(b - a); -} - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float3 mix(const float3& a, const float3& b, float t) -{ - return a + t*(b - a); -} - -#endif - -ccl_device_inline bool is_zero(const float3 a) -{ -#ifdef __KERNEL_SSE__ - return a == make_float3(0.0f); -#else - return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f); -#endif -} - -ccl_device_inline float reduce_add(const float3 a) -{ - return (a.x + a.y + a.z); -} - -ccl_device_inline float average(const float3 a) -{ - return reduce_add(a)*(1.0f/3.0f); -} - -ccl_device_inline bool isequal_float3(const float3 a, const float3 b) -{ -#ifdef __KERNEL_OPENCL__ - return all(a == b); -#else - return a == b; -#endif -} - -/* Float4 Vector */ - -#ifdef __KERNEL_SSE__ - -template -__forceinline const float4 shuffle(const float4& b) -{ - return float4(_mm_castsi128_ps( - _mm_shuffle_epi32(_mm_castps_si128(b), - _MM_SHUFFLE(index_3, index_2, index_1, index_0)))); -} - -#if defined(__KERNEL_SSE3__) -template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b) -{ - return float4(_mm_moveldup_ps(b)); -} - -template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b) -{ - return float4(_mm_movehdup_ps(b)); -} -#endif - -template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b) -{ - return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b)))); -} - -#endif - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float4 operator-(const float4& a) -{ -#ifdef __KERNEL_SSE__ - __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - return float4(_mm_xor_ps(a.m128, mask)); -#else - return make_float4(-a.x, -a.y, -a.z, -a.w); -#endif -} - -ccl_device_inline float4 operator*(const float4& a, const float4& b) -{ -#ifdef 
__KERNEL_SSE__ - return float4(_mm_mul_ps(a.m128, b.m128)); -#else - return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); -#endif -} - -ccl_device_inline float4 operator*(const float4& a, float f) -{ -#if defined(__KERNEL_SSE__) - return a * make_float4(f); -#else - return make_float4(a.x*f, a.y*f, a.z*f, a.w*f); -#endif -} - -ccl_device_inline float4 operator*(float f, const float4& a) -{ - return a * f; -} - -ccl_device_inline float4 rcp(const float4& a) -{ -#ifdef __KERNEL_SSE__ - float4 r(_mm_rcp_ps(a.m128)); - return float4(_mm_sub_ps(_mm_add_ps(r, r), - _mm_mul_ps(_mm_mul_ps(r, r), a))); -#else - return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w); -#endif -} - -ccl_device_inline float4 operator/(const float4& a, float f) -{ - return a * (1.0f/f); -} - -ccl_device_inline float4 operator/(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return a * rcp(b); -#else - return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); -#endif - -} - -ccl_device_inline float4 operator+(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return float4(_mm_add_ps(a.m128, b.m128)); -#else - return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); -#endif -} - -ccl_device_inline float4 operator-(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return float4(_mm_sub_ps(a.m128, b.m128)); -#else - return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); -#endif -} - -ccl_device_inline float4 operator+=(float4& a, const float4& b) -{ - return a = a + b; -} - -ccl_device_inline float4 operator*=(float4& a, const float4& b) -{ - return a = a * b; -} - -ccl_device_inline float4 operator/=(float4& a, float f) -{ - return a = a / f; -} - -ccl_device_inline int4 operator<(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - /* TODO(sergey): avoid cvt. 
*/ - return int4(_mm_cvtps_epi32(_mm_cmplt_ps(a.m128, b.m128))); -#else - return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w); -#endif -} - -ccl_device_inline int4 operator>=(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - /* TODO(sergey): avoid cvt. */ - return int4(_mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128))); -#else - return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w); -#endif -} - -ccl_device_inline int4 operator<=(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - /* TODO(sergey): avoid cvt. */ - return int4(_mm_cvtps_epi32(_mm_cmple_ps(a.m128, b.m128))); -#else - return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w); -#endif -} - -ccl_device_inline bool operator==(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15; -#else - return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w); -#endif -} - -ccl_device_inline float4 cross(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return (shuffle<1,2,0,0>(a)*shuffle<2,0,1,0>(b)) - (shuffle<2,0,1,0>(a)*shuffle<1,2,0,0>(b)); -#else - return make_float4(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0.0f); -#endif -} - -ccl_device_inline bool is_zero(const float4& a) -{ -#ifdef __KERNEL_SSE__ - return a == make_float4(0.0f); -#else - return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f); -#endif -} - -ccl_device_inline float reduce_add(const float4& a) -{ -#ifdef __KERNEL_SSE__ - float4 h(shuffle<1,0,3,2>(a) + a); - /* TODO(sergey): Investigate efficiency. 
*/ - return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); -#else - return ((a.x + a.y) + (a.z + a.w)); -#endif -} - -ccl_device_inline float average(const float4& a) -{ - return reduce_add(a) * 0.25f; -} - -ccl_device_inline float len(const float4& a) -{ - return sqrtf(dot(a, a)); -} - -ccl_device_inline float4 normalize(const float4& a) -{ - return a/len(a); -} - -ccl_device_inline float4 safe_normalize(const float4& a) -{ - float t = len(a); - return (t != 0.0f)? a/t: a; -} - -ccl_device_inline float4 min(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return float4(_mm_min_ps(a.m128, b.m128)); -#else - return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); -#endif -} - -ccl_device_inline float4 max(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return float4(_mm_max_ps(a.m128, b.m128)); -#else - return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); -#endif -} - -#endif - -#ifndef __KERNEL_GPU__ - -ccl_device_inline float4 select(const int4& mask, const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - /* TODO(sergey): avoid cvt. */ - return float4(_mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a), - _mm_andnot_ps(_mm_cvtepi32_ps(mask), b))); -#else - return make_float4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? 
a.w: b.w); -#endif -} - -ccl_device_inline float4 reduce_min(const float4& a) -{ -#ifdef __KERNEL_SSE__ - float4 h = min(shuffle<1,0,3,2>(a), a); - return min(shuffle<2,3,0,1>(h), h); -#else - return make_float4(min(min(a.x, a.y), min(a.z, a.w))); -#endif -} - -ccl_device_inline float4 reduce_max(const float4& a) -{ -#ifdef __KERNEL_SSE__ - float4 h = max(shuffle<1,0,3,2>(a), a); - return max(shuffle<2,3,0,1>(h), h); -#else - return make_float4(max(max(a.x, a.y), max(a.z, a.w))); -#endif -} - -#if 0 -ccl_device_inline float4 reduce_add(const float4& a) -{ -#ifdef __KERNEL_SSE__ - float4 h = shuffle<1,0,3,2>(a) + a; - return shuffle<2,3,0,1>(h) + h; -#else - return make_float4((a.x + a.y) + (a.z + a.w)); -#endif -} -#endif - -ccl_device_inline void print_float4(const char *label, const float4& a) -{ - printf("%s: %.8f %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z, (double)a.w); -} - -#endif - -/* Int2 */ - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline int2 operator+(const int2 &a, const int2 &b) -{ - return make_int2(a.x + b.x, a.y + b.y); -} - -ccl_device_inline int2 operator+=(int2 &a, const int2 &b) -{ - return a = a + b; -} - -ccl_device_inline int2 operator-(const int2 &a, const int2 &b) -{ - return make_int2(a.x - b.x, a.y - b.y); -} - -ccl_device_inline int2 operator*(const int2 &a, const int2 &b) -{ - return make_int2(a.x * b.x, a.y * b.y); -} - -ccl_device_inline int2 operator/(const int2 &a, const int2 &b) -{ - return make_int2(a.x / b.x, a.y / b.y); -} - -#endif - -/* Int3 */ - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline int3 min(int3 a, int3 b) -{ -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) - return int3(_mm_min_epi32(a.m128, b.m128)); -#else - return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); -#endif -} - -ccl_device_inline int3 max(int3 a, int3 b) -{ -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) - return int3(_mm_max_epi32(a.m128, b.m128)); -#else - return make_int3(max(a.x, b.x), max(a.y, 
b.y), max(a.z, b.z)); -#endif -} - -ccl_device_inline int3 clamp(const int3& a, int mn, int mx) -{ -#ifdef __KERNEL_SSE__ - return min(max(a, make_int3(mn)), make_int3(mx)); -#else - return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx)); -#endif -} - -ccl_device_inline int3 clamp(const int3& a, int3& mn, int mx) -{ -#ifdef __KERNEL_SSE__ - return min(max(a, mn), make_int3(mx)); -#else - return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx)); -#endif -} - -#endif - -#ifndef __KERNEL_GPU__ - -ccl_device_inline void print_int3(const char *label, const int3& a) -{ - printf("%s: %d %d %d\n", label, a.x, a.y, a.z); -} - -#endif - -/* Int4 */ - -#ifndef __KERNEL_GPU__ - -ccl_device_inline int4 operator+(const int4& a, const int4& b) -{ -#ifdef __KERNEL_SSE__ - return int4(_mm_add_epi32(a.m128, b.m128)); -#else - return make_int4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); -#endif -} - -ccl_device_inline int4 operator+=(int4& a, const int4& b) -{ - return a = a + b; -} - -ccl_device_inline int4 operator>>(const int4& a, int i) -{ -#ifdef __KERNEL_SSE__ - return int4(_mm_srai_epi32(a.m128, i)); -#else - return make_int4(a.x >> i, a.y >> i, a.z >> i, a.w >> i); -#endif -} - -ccl_device_inline int4 min(int4 a, int4 b) -{ -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) - return int4(_mm_min_epi32(a.m128, b.m128)); -#else - return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); -#endif -} - -ccl_device_inline int4 max(int4 a, int4 b) -{ -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) - return int4(_mm_max_epi32(a.m128, b.m128)); -#else - return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); -#endif -} - -ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx) -{ - return min(max(a, mn), mx); -} - -ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b) -{ -#ifdef __KERNEL_SSE__ - const __m128 m = _mm_cvtepi32_ps(mask); - /* 
TODO(sergey): avoid cvt. */ - return int4(_mm_castps_si128(_mm_or_ps(_mm_and_ps(m, _mm_castsi128_ps(a)), - _mm_andnot_ps(m, _mm_castsi128_ps(b))))); -#else - return make_int4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? a.w: b.w); -#endif -} - -ccl_device_inline void print_int4(const char *label, const int4& a) -{ - printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w); -} - -#endif - -/* Int/Float conversion */ - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline int as_int(uint i) -{ - union { uint ui; int i; } u; - u.ui = i; - return u.i; -} - -ccl_device_inline uint as_uint(int i) -{ - union { uint ui; int i; } u; - u.i = i; - return u.ui; -} - -ccl_device_inline uint as_uint(float f) -{ - union { uint i; float f; } u; - u.f = f; - return u.i; -} - -ccl_device_inline int __float_as_int(float f) -{ - union { int i; float f; } u; - u.f = f; - return u.i; -} - -ccl_device_inline float __int_as_float(int i) -{ - union { int i; float f; } u; - u.i = i; - return u.f; -} - -ccl_device_inline uint __float_as_uint(float f) -{ - union { uint i; float f; } u; - u.f = f; - return u.i; -} - -ccl_device_inline float __uint_as_float(uint i) -{ - union { uint i; float f; } u; - u.i = i; - return u.f; -} - - -/* Interpolation */ - -template A lerp(const A& a, const A& b, const B& t) -{ - return (A)(a * ((B)1 - t) + b * t); -} - -/* Triangle */ - -ccl_device_inline float triangle_area(const float3& v1, const float3& v2, const float3& v3) -{ - return len(cross(v3 - v2, v1 - v2))*0.5f; -} - -#endif +#endif /* __KERNEL_OPENCL__ */ /* Versions of functions which are safe for fast math. 
*/ ccl_device_inline bool isnan_safe(float f) @@ -1382,16 +427,16 @@ ccl_device_inline float3 rotate_around_axis(float3 p, float3 axis, float angle) float3 r; r.x = ((costheta + (1 - costheta) * axis.x * axis.x) * p.x) + - (((1 - costheta) * axis.x * axis.y - axis.z * sintheta) * p.y) + - (((1 - costheta) * axis.x * axis.z + axis.y * sintheta) * p.z); + (((1 - costheta) * axis.x * axis.y - axis.z * sintheta) * p.y) + + (((1 - costheta) * axis.x * axis.z + axis.y * sintheta) * p.z); r.y = (((1 - costheta) * axis.x * axis.y + axis.z * sintheta) * p.x) + - ((costheta + (1 - costheta) * axis.y * axis.y) * p.y) + - (((1 - costheta) * axis.y * axis.z - axis.x * sintheta) * p.z); + ((costheta + (1 - costheta) * axis.y * axis.y) * p.y) + + (((1 - costheta) * axis.y * axis.z - axis.x * sintheta) * p.z); r.z = (((1 - costheta) * axis.x * axis.z - axis.y * sintheta) * p.x) + - (((1 - costheta) * axis.y * axis.z + axis.x * sintheta) * p.y) + - ((costheta + (1 - costheta) * axis.z * axis.z) * p.z); + (((1 - costheta) * axis.y * axis.z + axis.x * sintheta) * p.y) + + ((costheta + (1 - costheta) * axis.z * axis.z) * p.z); return r; } diff --git a/intern/cycles/util/util_math_float2.h b/intern/cycles/util/util_math_float2.h new file mode 100644 index 00000000000..6f9d0855d50 --- /dev/null +++ b/intern/cycles/util/util_math_float2.h @@ -0,0 +1,227 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef __UTIL_MATH_FLOAT2_H__
+#define __UTIL_MATH_FLOAT2_H__
+
+/* Component-wise arithmetic and small geometric helpers for float2.
+ * Everything in this header is compiled out when __KERNEL_OPENCL__ is
+ * defined. */
+
+#ifndef __UTIL_MATH_H__
+# error "Do not include this file directly, include util_math.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline float2 operator-(const float2& a);
+ccl_device_inline float2 operator*(const float2& a, const float2& b);
+ccl_device_inline float2 operator*(const float2& a, float f);
+ccl_device_inline float2 operator*(float f, const float2& a);
+ccl_device_inline float2 operator/(float f, const float2& a);
+ccl_device_inline float2 operator/(const float2& a, float f);
+ccl_device_inline float2 operator/(const float2& a, const float2& b);
+ccl_device_inline float2 operator+(const float2& a, const float2& b);
+ccl_device_inline float2 operator-(const float2& a, const float2& b);
+ccl_device_inline float2 operator+=(float2& a, const float2& b);
+ccl_device_inline float2 operator*=(float2& a, const float2& b);
+ccl_device_inline float2 operator*=(float2& a, float f);
+ccl_device_inline float2 operator/=(float2& a, const float2& b);
+ccl_device_inline float2 operator/=(float2& a, float f);
+
+ccl_device_inline bool operator==(const float2& a, const float2& b);
+ccl_device_inline bool operator!=(const float2& a, const float2& b);
+
+ccl_device_inline bool is_zero(const float2& a);
+ccl_device_inline float average(const float2& a);
+ccl_device_inline float dot(const float2& a, const float2& b);
+ccl_device_inline float cross(const float2& a, const float2& b);
+ccl_device_inline float len(const float2& a);
+ccl_device_inline float2 normalize(const float2& a);
+ccl_device_inline float2 normalize_len(const float2& a, float *t);
+ccl_device_inline float2 safe_normalize(const float2& a);
+ccl_device_inline float2 min(const float2& a, const float2& b);
+ccl_device_inline float2 max(const float2& a, const float2& b);
+ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx);
+ccl_device_inline float2 fabs(const float2& a);
+ccl_device_inline float2 as_float2(const float4& a);
+ccl_device_inline float2 interp(const float2& a, const float2& b, float t);
+#endif /* !__KERNEL_OPENCL__ */
+
+/*******************************************************************************
+ * Definition.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline float2 operator-(const float2& a)
+{
+	return make_float2(-a.x, -a.y);
+}
+
+ccl_device_inline float2 operator*(const float2& a, const float2& b)
+{
+	return make_float2(a.x*b.x, a.y*b.y);
+}
+
+ccl_device_inline float2 operator*(const float2& a, float f)
+{
+	return make_float2(a.x*f, a.y*f);
+}
+
+ccl_device_inline float2 operator*(float f, const float2& a)
+{
+	return make_float2(a.x*f, a.y*f);
+}
+
+/* Scalar divided by each component. */
+ccl_device_inline float2 operator/(float f, const float2& a)
+{
+	return make_float2(f/a.x, f/a.y);
+}
+
+/* Division by a scalar is done as a multiplication by its reciprocal. */
+ccl_device_inline float2 operator/(const float2& a, float f)
+{
+	float invf = 1.0f/f;
+	return make_float2(a.x*invf, a.y*invf);
+}
+
+ccl_device_inline float2 operator/(const float2& a, const float2& b)
+{
+	return make_float2(a.x/b.x, a.y/b.y);
+}
+
+ccl_device_inline float2 operator+(const float2& a, const float2& b)
+{
+	return make_float2(a.x+b.x, a.y+b.y);
+}
+
+ccl_device_inline float2 operator-(const float2& a, const float2& b)
+{
+	return make_float2(a.x-b.x, a.y-b.y);
+}
+
+/* The compound assignments update `a` in place and return its new value
+ * by copy. */
+ccl_device_inline float2 operator+=(float2& a, const float2& b)
+{
+	return a = a + b;
+}
+
+ccl_device_inline float2 operator*=(float2& a, const float2& b)
+{
+	return a = a * b;
+}
+
+ccl_device_inline float2 operator*=(float2& a, float f)
+{
+	return a = a * f;
+}
+
+ccl_device_inline float2 operator/=(float2& a, const float2& b)
+{
+	return a = a / b;
+}
+
+ccl_device_inline float2 operator/=(float2& a, float f)
+{
+	float invf = 1.0f/f;
+	return a = a * invf;
+}
+
+/* Exact component-wise comparison, no tolerance. */
+ccl_device_inline bool operator==(const float2& a, const float2& b)
+{
+	return (a.x == b.x && a.y == b.y);
+}
+
+ccl_device_inline bool operator!=(const float2& a, const float2& b)
+{
+	return !(a == b);
+}
+
+ccl_device_inline bool is_zero(const float2& a)
+{
+	return (a.x == 0.0f && a.y == 0.0f);
+}
+
+/* Arithmetic mean of the two components. */
+ccl_device_inline float average(const float2& a)
+{
+	return (a.x + a.y)*(1.0f/2.0f);
+}
+
+ccl_device_inline float dot(const float2& a, const float2& b)
+{
+	return a.x*b.x + a.y*b.y;
+}
+
+/* 2D cross product: the scalar a.x*b.y - a.y*b.x. */
+ccl_device_inline float cross(const float2& a, const float2& b)
+{
+	return (a.x*b.y - a.y*b.x);
+}
+
+ccl_device_inline float len(const float2& a)
+{
+	return sqrtf(dot(a, a));
+}
+
+/* Divides by the length without checking for zero, see safe_normalize(). */
+ccl_device_inline float2 normalize(const float2& a)
+{
+	return a/len(a);
+}
+
+/* Same as normalize(), additionally storing the original length in *t. */
+ccl_device_inline float2 normalize_len(const float2& a, float *t)
+{
+	*t = len(a);
+	return a/(*t);
+}
+
+/* Returns `a` unchanged when its length is exactly zero. */
+ccl_device_inline float2 safe_normalize(const float2& a)
+{
+	float t = len(a);
+	return (t != 0.0f)? a/t: a;
+}
+
+ccl_device_inline float2 min(const float2& a, const float2& b)
+{
+	return make_float2(min(a.x, b.x), min(a.y, b.y));
+}
+
+ccl_device_inline float2 max(const float2& a, const float2& b)
+{
+	return make_float2(max(a.x, b.x), max(a.y, b.y));
+}
+
+/* Component-wise clamp: max with `mn` is applied first, then min with `mx`. */
+ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx)
+{
+	return min(max(a, mn), mx);
+}
+
+ccl_device_inline float2 fabs(const float2& a)
+{
+	return make_float2(fabsf(a.x), fabsf(a.y));
+}
+
+/* Keeps the x and y components of a float4 and drops z and w. */
+ccl_device_inline float2 as_float2(const float4& a)
+{
+	return make_float2(a.x, a.y);
+}
+
+/* Linear interpolation: gives `a` at t = 0 and `b` at t = 1. */
+ccl_device_inline float2 interp(const float2& a, const float2& b, float t)
+{
+	return a + t*(b - a);
+}
+#endif /* !__KERNEL_OPENCL__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_FLOAT2_H__ */
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
new file mode 100644
index 00000000000..e0c6b551040
--- /dev/null
+++ b/intern/cycles/util/util_math_float3.h
@@ -0,0 +1,365 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_FLOAT3_H__
+#define __UTIL_MATH_FLOAT3_H__
+
+/* Arithmetic and geometric helpers for float3. The operator overloads and
+ * the functions taking `const float3&` are compiled out when
+ * __KERNEL_OPENCL__ is defined; the by-value helpers declared after that
+ * guard are always available. */
+
+#ifndef __UTIL_MATH_H__
+# error "Do not include this file directly, include util_math.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline float3 operator-(const float3& a);
+ccl_device_inline float3 operator*(const float3& a, const float3& b);
+ccl_device_inline float3 operator*(const float3& a, const float f);
+ccl_device_inline float3 operator*(const float f, const float3& a);
+ccl_device_inline float3 operator/(const float f, const float3& a);
+ccl_device_inline float3 operator/(const float3& a, const float f);
+ccl_device_inline float3 operator/(const float3& a, const float3& b);
+ccl_device_inline float3 operator+(const float3& a, const float3& b);
+ccl_device_inline float3 operator-(const float3& a, const float3& b);
+ccl_device_inline float3 operator+=(float3& a, const float3& b);
+ccl_device_inline float3 operator*=(float3& a, const float3& b);
+ccl_device_inline float3 operator*=(float3& a, float f);
+ccl_device_inline float3 operator/=(float3& a, const float3& b);
+ccl_device_inline float3 operator/=(float3& a, float f);
+
+ccl_device_inline bool operator==(const float3& a, const float3& b);
+ccl_device_inline bool operator!=(const float3& a, const float3& b);
+
+ccl_device_inline float dot(const float3& a, const float3& b);
+ccl_device_inline float dot_xy(const float3& a, const float3& b);
+ccl_device_inline float3 cross(const float3& a, const float3& b);
+ccl_device_inline float3 normalize(const float3& a);
+ccl_device_inline float3 min(const float3& a, const float3& b);
+ccl_device_inline float3 max(const float3& a, const float3& b);
+ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx);
+ccl_device_inline float3 fabs(const float3& a);
+ccl_device_inline float3 mix(const float3& a, const float3& b, float t);
+ccl_device_inline float3 rcp(const float3& a);
+#endif /* !__KERNEL_OPENCL__ */
+
+/* The helpers below are declared outside the OpenCL guard and take their
+ * float3 arguments by value. */
+ccl_device_inline float max3(float3 a);
+ccl_device_inline float len(const float3 a);
+ccl_device_inline float len_squared(const float3 a);
+
+ccl_device_inline float3 saturate3(float3 a);
+ccl_device_inline float3 safe_normalize(const float3 a);
+ccl_device_inline float3 normalize_len(const float3 a, float *t);
+ccl_device_inline float3 safe_normalize_len(const float3 a, float *t);
+ccl_device_inline float3 interp(float3 a, float3 b, float t);
+
+ccl_device_inline bool is_zero(const float3 a);
+ccl_device_inline float reduce_add(const float3 a);
+ccl_device_inline float average(const float3 a);
+ccl_device_inline bool isequal_float3(const float3 a, const float3 b);
+
+/*******************************************************************************
+ * Definition.
+ */
+
+#ifndef __KERNEL_OPENCL__
+/* Negation; the SSE path flips the sign bit of every lane with an XOR. */
+ccl_device_inline float3 operator-(const float3& a)
+{
+#ifdef __KERNEL_SSE__
+	return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
+#else
+	return make_float3(-a.x, -a.y, -a.z);
+#endif
+}
+
+/* Component-wise product. */
+ccl_device_inline float3 operator*(const float3& a, const float3& b)
+{
+#ifdef __KERNEL_SSE__
+	return float3(_mm_mul_ps(a.m128,b.m128));
+#else
+	return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
+#endif
+}
+
+ccl_device_inline float3 operator*(const float3& a, const float f)
+{
+#ifdef __KERNEL_SSE__
+	return float3(_mm_mul_ps(a.m128,_mm_set1_ps(f)));
+#else
+	return make_float3(a.x*f, a.y*f, a.z*f);
+#endif
+}
+
+/* Scalar * vector. The `&& 0` keeps the SSE branch compiled out, so the
+ * scalar branch is always used. */
+ccl_device_inline float3 operator*(const float f, const float3& a)
+{
+	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
+#if defined(__KERNEL_SSE__) && 0
+	return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
+#else
+	return make_float3(a.x*f, a.y*f, a.z*f);
+#endif
+}
+
+/* Scalar divided by each component; the SSE branch is compiled out. */
+ccl_device_inline float3 operator/(const float f, const float3& a)
+{
+	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
+#if defined(__KERNEL_SSE__) && 0
+	__m128 rc = _mm_rcp_ps(a.m128);
+	return float3(_mm_mul_ps(_mm_set1_ps(f),rc));
+#else
+	return make_float3(f / a.x, f / a.y, f / a.z);
+#endif
+}
+
+/* Division by a scalar is done as a multiplication by its reciprocal. */
+ccl_device_inline float3 operator/(const float3& a, const float f)
+{
+	float invf = 1.0f/f;
+	return a * invf;
+}
+
+/* Component-wise division; the SSE branch is compiled out. */
+ccl_device_inline float3 operator/(const float3& a, const float3& b)
+{
+	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
+#if defined(__KERNEL_SSE__) && 0
+	__m128 rc = _mm_rcp_ps(b.m128);
+	return float3(_mm_mul_ps(a, rc));
+#else
+	return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
+#endif
+}
+
+ccl_device_inline float3 operator+(const float3& a, const float3& b)
+{
+#ifdef __KERNEL_SSE__
+	return float3(_mm_add_ps(a.m128, b.m128));
+#else
+	return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+#endif
+}
+
+ccl_device_inline float3 operator-(const float3& a, const float3& b)
+{
+#ifdef __KERNEL_SSE__
+	return float3(_mm_sub_ps(a.m128, b.m128));
+#else
+	return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
+#endif
+}
+
+/* The compound assignments update `a` in place and return its new value
+ * by copy. */
+ccl_device_inline float3 operator+=(float3& a, const float3& b)
+{
+	return a = a + b;
+}
+
+ccl_device_inline float3 operator*=(float3& a, const float3& b)
+{
+	return a = a * b;
+}
+
+ccl_device_inline float3 operator*=(float3& a, float f)
+{
+	return a = a * f;
+}
+
+ccl_device_inline float3 operator/=(float3& a, const float3& b)
+{
+	return a = a / b;
+}
+
+ccl_device_inline float3 operator/=(float3& a, float f)
+{
+	float invf = 1.0f/f;
+	return a = a * invf;
+}
+
+/* Exact equality of x, y and z. In the SSE path the `& 7` keeps only the
+ * three low bits of the comparison mask, so the fourth lane is ignored. */
+ccl_device_inline bool operator==(const float3& a, const float3& b)
+{
+#ifdef __KERNEL_SSE__
+	return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
+#else
+	return (a.x == b.x && a.y == b.y && a.z == b.z);
+#endif
+}
+
+ccl_device_inline bool operator!=(const float3& a, const float3& b)
+{
+	return !(a == b);
+}
+
+/* Dot product over x, y, z. The intrinsic path needs SSE4.1. */
+ccl_device_inline float dot(const float3& a, const float3& b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+	return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
+#else
+	return a.x*b.x + a.y*b.y + a.z*b.z;
+#endif
+}
+
+/* Dot product of the x and y components only. */
+ccl_device_inline float dot_xy(const float3& a, const float3& b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+	return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b));
+#else
+	return a.x*b.x + a.y*b.y;
+#endif
+}
+
+ccl_device_inline float3 cross(const float3& a, const float3& b)
+{
+	float3 r = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
+	return r;
+}
+
+/* Divides by the length without checking for zero, see safe_normalize(). */
+ccl_device_inline float3 normalize(const float3& a)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+	__m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
+	return float3(_mm_div_ps(a.m128, norm));
+#else
+	return a/len(a);
+#endif
+}
+
+ccl_device_inline float3 min(const float3& a, const float3& b)
+{
+#ifdef __KERNEL_SSE__
+	return float3(_mm_min_ps(a.m128, b.m128));
+#else
+	return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+#endif
+}
+
+ccl_device_inline float3 max(const float3& a, const float3& b)
+{
+#ifdef __KERNEL_SSE__
+	return float3(_mm_max_ps(a.m128, b.m128));
+#else
+	return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+#endif
+}
+
+/* Component-wise clamp: max with `mn` is applied first, then min with `mx`. */
+ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx)
+{
+	return min(max(a, mn), mx);
+}
+
+/* Absolute value; the SSE path clears the sign bit of every lane. */
+ccl_device_inline float3 fabs(const float3& a)
+{
+#ifdef __KERNEL_SSE__
+	__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+	return float3(_mm_and_ps(a.m128, mask));
+#else
+	return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
+#endif
+}
+
+/* Linear blend: gives `a` at t = 0 and `b` at t = 1. */
+ccl_device_inline float3 mix(const float3& a, const float3& b, float t)
+{
+	return a + t*(b - a);
+}
+
+/* Component-wise reciprocal. The SSE path starts from the _mm_rcp_ps
+ * estimate r and returns (r + r) - (r*r)*a, i.e. one refinement step. */
+ccl_device_inline float3 rcp(const float3& a)
+{
+#ifdef __KERNEL_SSE__
+	const float4 r(_mm_rcp_ps(a.m128));
+	return float3(_mm_sub_ps(_mm_add_ps(r, r),
+	                         _mm_mul_ps(_mm_mul_ps(r, r), a)));
+#else
+	return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z);
+#endif
+}
+#endif /* !__KERNEL_OPENCL__ */
+
+/* Largest of the three components. */
+ccl_device_inline float max3(float3 a)
+{
+	return max(max(a.x, a.y), a.z);
+}
+
+ccl_device_inline float len(const float3 a)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F)));
+#else
+	return sqrtf(dot(a, a));
+#endif
+}
+
+ccl_device_inline float len_squared(const float3 a)
+{
+	return dot(a, a);
+}
+
+/* Applies saturate() to each component. */
+ccl_device_inline float3 saturate3(float3 a)
+{
+	return make_float3(saturate(a.x), saturate(a.y), saturate(a.z));
+}
+
+/* Normalizes `a` and stores its original length in *t; no zero check. */
+ccl_device_inline float3 normalize_len(const float3 a, float *t)
+{
+	*t = len(a);
+	float x = 1.0f / *t;
+	return a*x;
+}
+
+/* Returns `a` unchanged when its length is exactly zero. */
+ccl_device_inline float3 safe_normalize(const float3 a)
+{
+	float t = len(a);
+	return (t != 0.0f)? a * (1.0f/t) : a;
+}
+
+/* Like safe_normalize(), additionally storing the length in *t. */
+ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)
+{
+	*t = len(a);
+	return (*t != 0.0f)? a/(*t): a;
+}
+
+/* Linear interpolation: gives `a` at t = 0 and `b` at t = 1. */
+ccl_device_inline float3 interp(float3 a, float3 b, float t)
+{
+	return a + t*(b - a);
+}
+
+ccl_device_inline bool is_zero(const float3 a)
+{
+#ifdef __KERNEL_SSE__
+	return a == make_float3(0.0f);
+#else
+	return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f);
+#endif
+}
+
+/* Sum of the three components. */
+ccl_device_inline float reduce_add(const float3 a)
+{
+	return (a.x + a.y + a.z);
+}
+
+/* Arithmetic mean of the three components. */
+ccl_device_inline float average(const float3 a)
+{
+	return reduce_add(a)*(1.0f/3.0f);
+}
+
+/* Equality test usable on every backend: OpenCL reduces the vector
+ * comparison with all(), other backends use operator==. */
+ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
+{
+#ifdef __KERNEL_OPENCL__
+	return all(a == b);
+#else
+	return a == b;
+#endif
+}
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_FLOAT3_H__ */
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
new file mode 100644
index 00000000000..d89121b3a1d
--- /dev/null
+++ b/intern/cycles/util/util_math_float4.h
@@ -0,0 +1,393 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_FLOAT4_H__
+#define __UTIL_MATH_FLOAT4_H__
+
+/* Arithmetic, comparison, shuffle and reduction helpers for float4.
+ * Operators and basic math are compiled out when __KERNEL_OPENCL__ is
+ * defined; shuffle() needs __KERNEL_SSE__; select() and the lane-wise
+ * reductions are compiled out when __KERNEL_GPU__ is defined. */
+
+#ifndef __UTIL_MATH_H__
+# error "Do not include this file directly, include util_math.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline float4 operator-(const float4& a);
+ccl_device_inline float4 operator*(const float4& a, const float4& b);
+ccl_device_inline float4 operator*(const float4& a, float f);
+ccl_device_inline float4 operator*(float f, const float4& a);
+ccl_device_inline float4 operator/(const float4& a, float f);
+ccl_device_inline float4 operator/(const float4& a, const float4& b);
+ccl_device_inline float4 operator+(const float4& a, const float4& b);
+ccl_device_inline float4 operator-(const float4& a, const float4& b);
+ccl_device_inline float4 operator+=(float4& a, const float4& b);
+ccl_device_inline float4 operator*=(float4& a, const float4& b);
+ccl_device_inline float4 operator/=(float4& a, float f);
+
+ccl_device_inline int4 operator<(const float4& a, const float4& b);
+ccl_device_inline int4 operator>=(const float4& a, const float4& b);
+ccl_device_inline int4 operator<=(const float4& a, const float4& b);
+ccl_device_inline bool operator==(const float4& a, const float4& b);
+
+ccl_device_inline float dot(const float4& a, const float4& b);
+ccl_device_inline float len_squared(const float4& a);
+ccl_device_inline float4 rcp(const float4& a);
+ccl_device_inline float4 cross(const float4& a, const float4& b);
+ccl_device_inline bool is_zero(const float4& a);
+ccl_device_inline float reduce_add(const float4& a);
+ccl_device_inline float average(const float4& a);
+ccl_device_inline float len(const float4& a);
+ccl_device_inline float4 normalize(const float4& a);
+ccl_device_inline float4 safe_normalize(const float4& a);
+ccl_device_inline float4 min(const float4& a, const float4& b);
+ccl_device_inline float4 max(const float4& a, const float4& b);
+#endif /* !__KERNEL_OPENCL__ */
+
+#ifdef __KERNEL_SSE__
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
+__forceinline const float4 shuffle(const float4& b);
+
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b);
+
+# ifdef __KERNEL_SSE3__
+template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b);
+template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b);
+# endif
+#endif /* __KERNEL_SSE__ */
+
+#ifndef __KERNEL_GPU__
+ccl_device_inline float4 select(const int4& mask,
+                                const float4& a,
+                                const float4& b);
+ccl_device_inline float4 reduce_min(const float4& a);
+ccl_device_inline float4 reduce_max(const float4& a);
+# if 0
+ccl_device_inline float4 reduce_add(const float4& a);
+# endif
+#endif /* !__KERNEL_GPU__ */
+
+/*******************************************************************************
+ * Definition.
+ */
+
+#ifndef __KERNEL_OPENCL__
+/* Negation; the SSE path flips the sign bit of every lane with an XOR. */
+ccl_device_inline float4 operator-(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+	return float4(_mm_xor_ps(a.m128, mask));
+#else
+	return make_float4(-a.x, -a.y, -a.z, -a.w);
+#endif
+}
+
+ccl_device_inline float4 operator*(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_mul_ps(a.m128, b.m128));
+#else
+	return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
+#endif
+}
+
+ccl_device_inline float4 operator*(const float4& a, float f)
+{
+#if defined(__KERNEL_SSE__)
+	return a * make_float4(f);
+#else
+	return make_float4(a.x*f, a.y*f, a.z*f, a.w*f);
+#endif
+}
+
+ccl_device_inline float4 operator*(float f, const float4& a)
+{
+	return a * f;
+}
+
+/* Division by a scalar is done as a multiplication by its reciprocal. */
+ccl_device_inline float4 operator/(const float4& a, float f)
+{
+	return a * (1.0f/f);
+}
+
+/* Component-wise division. The SSE path multiplies by rcp(b), which is a
+ * refined estimate rather than an exact division. */
+ccl_device_inline float4 operator/(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	return a * rcp(b);
+#else
+	return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
+#endif
+}
+
+ccl_device_inline float4 operator+(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_add_ps(a.m128, b.m128));
+#else
+	return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+#endif
+}
+
+ccl_device_inline float4 operator-(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_sub_ps(a.m128, b.m128));
+#else
+	return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
+#endif
+}
+
+/* The compound assignments update `a` in place and return its new value
+ * by copy. */
+ccl_device_inline float4 operator+=(float4& a, const float4& b)
+{
+	return a = a + b;
+}
+
+ccl_device_inline float4 operator*=(float4& a, const float4& b)
+{
+	return a = a * b;
+}
+
+ccl_device_inline float4 operator/=(float4& a, float f)
+{
+	return a = a / f;
+}
+
+/* Lane-wise comparisons returning an int4 mask.
+ *
+ * The SSE path reinterprets the comparison result bit-for-bit, so a true
+ * lane is all ones (-1) and a false lane is 0. A numeric float->int
+ * conversion must not be used here: the all-ones pattern is a NaN and
+ * would convert to 0x80000000. The scalar path yields 1 / 0 per lane;
+ * both forms are non-zero for true. */
+ccl_device_inline int4 operator<(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
+#else
+	return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
+#endif
+}
+
+ccl_device_inline int4 operator>=(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
+#else
+	return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
+#endif
+}
+
+ccl_device_inline int4 operator<=(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128)));
+#else
+	return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w);
+#endif
+}
+
+/* Exact equality of all four components, reduced to a single bool. */
+ccl_device_inline bool operator==(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
+#else
+	return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w);
+#endif
+}
+
+/* Four-component dot product. The intrinsic path needs SSE4.1. */
+ccl_device_inline float dot(const float4& a, const float4& b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+	return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
+#else
+	return (a.x*b.x + a.y*b.y) + (a.z*b.z + a.w*b.w);
+#endif
+}
+
+ccl_device_inline float len_squared(const float4& a)
+{
+	return dot(a, a);
+}
+
+/* Component-wise reciprocal. The SSE path starts from the _mm_rcp_ps
+ * estimate r and returns (r + r) - (r*r)*a, i.e. one refinement step. */
+ccl_device_inline float4 rcp(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	float4 r(_mm_rcp_ps(a.m128));
+	return float4(_mm_sub_ps(_mm_add_ps(r, r),
+	                         _mm_mul_ps(_mm_mul_ps(r, r), a)));
+#else
+	return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w);
+#endif
+}
+
+/* Cross product of the xyz parts; the scalar path sets w to 0. */
+ccl_device_inline float4 cross(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	return (shuffle<1,2,0,0>(a)*shuffle<2,0,1,0>(b)) -
+	       (shuffle<2,0,1,0>(a)*shuffle<1,2,0,0>(b));
+#else
+	return make_float4(a.y*b.z - a.z*b.y,
+	                   a.z*b.x - a.x*b.z,
+	                   a.x*b.y - a.y*b.x,
+	                   0.0f);
+#endif
+}
+
+ccl_device_inline bool is_zero(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	return a == make_float4(0.0f);
+#else
+	return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
+#endif
+}
+
+/* Sum of all four components, returned as a scalar. */
+ccl_device_inline float reduce_add(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	float4 h(shuffle<1,0,3,2>(a) + a);
+	/* TODO(sergey): Investigate efficiency. */
+	return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h);
+#else
+	return ((a.x + a.y) + (a.z + a.w));
+#endif
+}
+
+/* Arithmetic mean of the four components. */
+ccl_device_inline float average(const float4& a)
+{
+	return reduce_add(a) * 0.25f;
+}
+
+ccl_device_inline float len(const float4& a)
+{
+	return sqrtf(dot(a, a));
+}
+
+/* Divides by the length without checking for zero, see safe_normalize(). */
+ccl_device_inline float4 normalize(const float4& a)
+{
+	return a/len(a);
+}
+
+/* Returns `a` unchanged when its length is exactly zero. */
+ccl_device_inline float4 safe_normalize(const float4& a)
+{
+	float t = len(a);
+	return (t != 0.0f)? a/t: a;
+}
+
+ccl_device_inline float4 min(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_min_ps(a.m128, b.m128));
+#else
+	return make_float4(min(a.x, b.x),
+	                   min(a.y, b.y),
+	                   min(a.z, b.z),
+	                   min(a.w, b.w));
+#endif
+}
+
+ccl_device_inline float4 max(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_max_ps(a.m128, b.m128));
+#else
+	return make_float4(max(a.x, b.x),
+	                   max(a.y, b.y),
+	                   max(a.z, b.z),
+	                   max(a.w, b.w));
+#endif
+}
+#endif /* !__KERNEL_OPENCL__ */
+
+#ifdef __KERNEL_SSE__
+/* Generic lane permutation: result lane N is taken from lane index_N of b. */
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
+__forceinline const float4 shuffle(const float4& b)
+{
+	return float4(_mm_castsi128_ps(
+	        _mm_shuffle_epi32(_mm_castps_si128(b),
+	                          _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
+}
+
+/* Duplicates the low pair of lanes into the high pair. Uses the SSE1
+ * _mm_movelh_ps so that this specialization, which is only guarded by
+ * __KERNEL_SSE__, does not depend on an SSE3 instruction. */
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b)
+{
+	return float4(_mm_movelh_ps(b, b));
+}
+
+# ifdef __KERNEL_SSE3__
+template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b)
+{
+	return float4(_mm_moveldup_ps(b));
+}
+
+template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b)
+{
+	return float4(_mm_movehdup_ps(b));
+}
+# endif /* __KERNEL_SSE3__ */
+#endif /* __KERNEL_SSE__ */
+
+#ifndef __KERNEL_GPU__
+/* Per-lane choice: lane of `a` where the mask lane is set, else lane of `b`.
+ *
+ * The SSE path is a bitwise blend, so set mask lanes must be all ones
+ * (as produced by the float4 comparison operators above). The mask is
+ * reinterpreted bit-for-bit; a numeric int->float conversion would turn
+ * it into an unrelated bit pattern. The scalar path accepts any non-zero
+ * lane as set. */
+ccl_device_inline float4 select(const int4& mask,
+                                const float4& a,
+                                const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	const __m128 m = _mm_castsi128_ps(mask);
+	return float4(_mm_or_ps(_mm_and_ps(m, a),
+	                        _mm_andnot_ps(m, b)));
+#else
+	return make_float4((mask.x)? a.x: b.x,
+	                   (mask.y)? a.y: b.y,
+	                   (mask.z)? a.z: b.z,
+	                   (mask.w)? a.w: b.w);
+#endif
+}
+
+/* Smallest component, replicated into every lane of the result. */
+ccl_device_inline float4 reduce_min(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	float4 h = min(shuffle<1,0,3,2>(a), a);
+	return min(shuffle<2,3,0,1>(h), h);
+#else
+	return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
+#endif
+}
+
+/* Largest component, replicated into every lane of the result. */
+ccl_device_inline float4 reduce_max(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	float4 h = max(shuffle<1,0,3,2>(a), a);
+	return max(shuffle<2,3,0,1>(h), h);
+#else
+	return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
+#endif
+}
+
+#if 0
+ccl_device_inline float4 reduce_add(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	float4 h = shuffle<1,0,3,2>(a) + a;
+	return shuffle<2,3,0,1>(h) + h;
+#else
+	return make_float4((a.x + a.y) + (a.z + a.w));
+#endif
+}
+#endif
+#endif /* !__KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_FLOAT4_H__ */
diff --git a/intern/cycles/util/util_math_int2.h b/intern/cycles/util/util_math_int2.h
new file mode 100644
index 00000000000..828c49a131c
--- /dev/null
+++ b/intern/cycles/util/util_math_int2.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_INT2_H__
+#define __UTIL_MATH_INT2_H__
+
+/* Component-wise integer operators for int2. Everything in this header is
+ * compiled out when __KERNEL_OPENCL__ is defined. */
+
+#ifndef __UTIL_MATH_H__
+# error "Do not include this file directly, include util_math.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline bool operator==(const int2 a, const int2 b);
+ccl_device_inline int2 operator+(const int2 &a, const int2 &b);
+ccl_device_inline int2 operator+=(int2 &a, const int2 &b);
+ccl_device_inline int2 operator-(const int2 &a, const int2 &b);
+ccl_device_inline int2 operator*(const int2 &a, const int2 &b);
+ccl_device_inline int2 operator/(const int2 &a, const int2 &b);
+#endif /* !__KERNEL_OPENCL__ */
+
+/*******************************************************************************
+ * Definition.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline bool operator==(const int2 a, const int2 b)
+{
+	return (a.x == b.x && a.y == b.y);
+}
+
+ccl_device_inline int2 operator+(const int2 &a, const int2 &b)
+{
+	return make_int2(a.x + b.x, a.y + b.y);
+}
+
+/* Updates `a` in place and returns its new value by copy. */
+ccl_device_inline int2 operator+=(int2 &a, const int2 &b)
+{
+	return a = a + b;
+}
+
+ccl_device_inline int2 operator-(const int2 &a, const int2 &b)
+{
+	return make_int2(a.x - b.x, a.y - b.y);
+}
+
+ccl_device_inline int2 operator*(const int2 &a, const int2 &b)
+{
+	return make_int2(a.x * b.x, a.y * b.y);
+}
+
+/* Component-wise integer division; a zero component in `b` is not
+ * checked for. */
+ccl_device_inline int2 operator/(const int2 &a, const int2 &b)
+{
+	return make_int2(a.x / b.x, a.y / b.y);
+}
+#endif /* !__KERNEL_OPENCL__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_INT2_H__ */
diff --git a/intern/cycles/util/util_math_int3.h b/intern/cycles/util/util_math_int3.h
new file mode 100644
index 00000000000..fa7a02636de
--- /dev/null
+++ b/intern/cycles/util/util_math_int3.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in
compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_INT3_H__
+#define __UTIL_MATH_INT3_H__
+
+/* Component-wise min / max / clamp for int3. Everything in this header is
+ * compiled out when __KERNEL_OPENCL__ is defined. */
+
+#ifndef __UTIL_MATH_H__
+# error "Do not include this file directly, include util_math.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+#ifndef __KERNEL_OPENCL__
+ccl_device_inline int3 min(int3 a, int3 b);
+ccl_device_inline int3 max(int3 a, int3 b);
+ccl_device_inline int3 clamp(const int3& a, int mn, int mx);
+ccl_device_inline int3 clamp(const int3& a, const int3& mn, int mx);
+#endif /* !__KERNEL_OPENCL__ */
+
+/*******************************************************************************
+ * Definition.
+ */
+
+#ifndef __KERNEL_OPENCL__
+/* The intrinsic path needs SSE4.1 (_mm_min_epi32). */
+ccl_device_inline int3 min(int3 a, int3 b)
+{
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+	return int3(_mm_min_epi32(a.m128, b.m128));
+#else
+	return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+#endif
+}
+
+/* The intrinsic path needs SSE4.1 (_mm_max_epi32). */
+ccl_device_inline int3 max(int3 a, int3 b)
+{
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+	return int3(_mm_max_epi32(a.m128, b.m128));
+#else
+	return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+#endif
+}
+
+/* Clamps every component with the same scalar lower and upper bound. */
+ccl_device_inline int3 clamp(const int3& a, int mn, int mx)
+{
+#ifdef __KERNEL_SSE__
+	return min(max(a, make_int3(mn)), make_int3(mx));
+#else
+	return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx));
+#endif
+}
+
+/* Clamps with a per-component lower bound and a shared scalar upper bound.
+ * `mn` is only read, so it is taken by const reference; this also lets
+ * callers pass const objects and temporaries. */
+ccl_device_inline int3 clamp(const int3& a, const int3& mn, int mx)
+{
+#ifdef __KERNEL_SSE__
+	return min(max(a, mn), make_int3(mx));
+#else
+	return make_int3(clamp(a.x, mn.x, mx),
+	                 clamp(a.y, mn.y, mx),
+	                 clamp(a.z, mn.z, mx));
+#endif
+}
+#endif /* !__KERNEL_OPENCL__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_INT3_H__ */
diff --git a/intern/cycles/util/util_math_int4.h b/intern/cycles/util/util_math_int4.h
new file mode 100644
index 00000000000..4b327c90c33
--- /dev/null
+++ b/intern/cycles/util/util_math_int4.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef __UTIL_MATH_INT4_H__ +#define __UTIL_MATH_INT4_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. + */ + +#ifndef __KERNEL_GPU__ +ccl_device_inline int4 operator+(const int4& a, const int4& b); +ccl_device_inline int4 operator+=(int4& a, const int4& b); +ccl_device_inline int4 operator>>(const int4& a, int i); +ccl_device_inline int4 min(int4 a, int4 b); +ccl_device_inline int4 max(int4 a, int4 b); +ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx); +ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b); +#endif /* __KERNEL_GPU__ */ + +/******************************************************************************* + * Definition. + */ + +#ifndef __KERNEL_GPU__ +ccl_device_inline int4 operator+(const int4& a, const int4& b) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_add_epi32(a.m128, b.m128)); +#else + return make_int4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +#endif +} + +ccl_device_inline int4 operator+=(int4& a, const int4& b) +{ + return a = a + b; +} + +ccl_device_inline int4 operator>>(const int4& a, int i) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_srai_epi32(a.m128, i)); +#else + return make_int4(a.x >> i, a.y >> i, a.z >> i, a.w >> i); +#endif +} + +ccl_device_inline int4 min(int4 a, int4 b) +{ +#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) + return int4(_mm_min_epi32(a.m128, b.m128)); +#else + return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); +#endif +} + +ccl_device_inline int4 max(int4 a, int4 b) +{ +#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) + return int4(_mm_max_epi32(a.m128, b.m128)); +#else + return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); +#endif +} + +ccl_device_inline int4 clamp(const int4& a, const int4& mn, const 
int4& mx) +{ + return min(max(a, mn), mx); +} + +ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b) +{ +#ifdef __KERNEL_SSE__ + const __m128 m = _mm_cvtepi32_ps(mask); + /* TODO(sergey): avoid cvt. */ + return int4(_mm_castps_si128( + _mm_or_ps(_mm_and_ps(m, _mm_castsi128_ps(a)), + _mm_andnot_ps(m, _mm_castsi128_ps(b))))); +#else + return make_int4((mask.x)? a.x: b.x, + (mask.y)? a.y: b.y, + (mask.z)? a.z: b.z, + (mask.w)? a.w: b.w); +#endif +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_INT4_H__ */ diff --git a/intern/cycles/util/util_types_float2.h b/intern/cycles/util/util_types_float2.h index 877ea7c9a3b..ec7a1f717a1 100644 --- a/intern/cycles/util/util_types_float2.h +++ b/intern/cycles/util/util_types_float2.h @@ -32,6 +32,7 @@ struct float2 { }; ccl_device_inline float2 make_float2(float x, float y); +ccl_device_inline void print_float2(const char *label, const float2& a); #endif /* __KERNEL_GPU__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_types_float2_impl.h b/intern/cycles/util/util_types_float2_impl.h index 7ef390d7da4..782dda195eb 100644 --- a/intern/cycles/util/util_types_float2_impl.h +++ b/intern/cycles/util/util_types_float2_impl.h @@ -21,6 +21,10 @@ # error "Do not include this file directly, include util_types.h instead." 
#endif +#ifndef __KERNEL_GPU__ +# include <stdio.h> +#endif + CCL_NAMESPACE_BEGIN #ifndef __KERNEL_GPU__ @@ -43,6 +47,11 @@ ccl_device_inline float2 make_float2(float x, float y) float2 a = {x, y}; return a; } + +ccl_device_inline void print_float2(const char *label, const float2& a) +{ + printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y); +} #endif /* __KERNEL_GPU__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_types_float3.h b/intern/cycles/util/util_types_float3.h index 31646f64a1d..28146ad04f7 100644 --- a/intern/cycles/util/util_types_float3.h +++ b/intern/cycles/util/util_types_float3.h @@ -49,6 +49,7 @@ struct ccl_try_align(16) float3 { ccl_device_inline float3 make_float3(float f); ccl_device_inline float3 make_float3(float x, float y, float z); +ccl_device_inline void print_float3(const char *label, const float3& a); #endif /* __KERNEL_GPU__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_types_float3_impl.h b/intern/cycles/util/util_types_float3_impl.h index f062d970b26..45f61767d3f 100644 --- a/intern/cycles/util/util_types_float3_impl.h +++ b/intern/cycles/util/util_types_float3_impl.h @@ -21,6 +21,10 @@ # error "Do not include this file directly, include util_types.h instead." 
#endif +#ifndef __KERNEL_GPU__ +# include <stdio.h> +#endif + CCL_NAMESPACE_BEGIN #ifndef __KERNEL_GPU__ @@ -89,6 +93,11 @@ ccl_device_inline float3 make_float3(float x, float y, float z) #endif return a; } + +ccl_device_inline void print_float3(const char *label, const float3& a) +{ + printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z); +} #endif /* __KERNEL_GPU__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_types_float4.h b/intern/cycles/util/util_types_float4.h index f0fc0d8f73a..a7d9abe1b95 100644 --- a/intern/cycles/util/util_types_float4.h +++ b/intern/cycles/util/util_types_float4.h @@ -53,6 +53,7 @@ struct ccl_try_align(16) float4 { ccl_device_inline float4 make_float4(float f); ccl_device_inline float4 make_float4(float x, float y, float z, float w); ccl_device_inline float4 make_float4(const int4& i); +ccl_device_inline void print_float4(const char *label, const float4& a); #endif /* __KERNEL_GPU__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_types_float4_impl.h b/intern/cycles/util/util_types_float4_impl.h index 1c59f5fb4f9..ff3ec4d4ecf 100644 --- a/intern/cycles/util/util_types_float4_impl.h +++ b/intern/cycles/util/util_types_float4_impl.h @@ -21,6 +21,10 @@ # error "Do not include this file directly, include util_types.h instead." 
#endif +#ifndef __KERNEL_GPU__ +# include <stdio.h> +#endif + CCL_NAMESPACE_BEGIN #ifndef __KERNEL_GPU__ @@ -99,6 +103,13 @@ ccl_device_inline float4 make_float4(const int4& i) #endif return a; } + +ccl_device_inline void print_float4(const char *label, const float4& a) +{ + printf("%s: %.8f %.8f %.8f %.8f\n", + label, + (double)a.x, (double)a.y, (double)a.z, (double)a.w); +} #endif /* __KERNEL_GPU__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_types_int3.h b/intern/cycles/util/util_types_int3.h index 6179148a1b2..9d43b201c02 100644 --- a/intern/cycles/util/util_types_int3.h +++ b/intern/cycles/util/util_types_int3.h @@ -49,6 +49,7 @@ struct ccl_try_align(16) int3 { ccl_device_inline int3 make_int3(int i); ccl_device_inline int3 make_int3(int x, int y, int z); +ccl_device_inline void print_int3(const char *label, const int3& a); #endif /* __KERNEL_GPU__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_types_int3_impl.h b/intern/cycles/util/util_types_int3_impl.h index 4fc9daf5fd8..ada50c4812c 100644 --- a/intern/cycles/util/util_types_int3_impl.h +++ b/intern/cycles/util/util_types_int3_impl.h @@ -21,6 +21,10 @@ # error "Do not include this file directly, include util_types.h instead." 
#endif +#ifndef __KERNEL_GPU__ +# include <stdio.h> +#endif + CCL_NAMESPACE_BEGIN #ifndef __KERNEL_GPU__ @@ -90,6 +94,11 @@ ccl_device_inline int3 make_int3(int x, int y, int z) return a; } + +ccl_device_inline void print_int3(const char *label, const int3& a) +{ + printf("%s: %d %d %d\n", label, a.x, a.y, a.z); +} #endif /* __KERNEL_GPU__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_types_int4.h b/intern/cycles/util/util_types_int4.h index 1963893d3cc..cdd0ecbdae5 100644 --- a/intern/cycles/util/util_types_int4.h +++ b/intern/cycles/util/util_types_int4.h @@ -53,6 +53,7 @@ struct ccl_try_align(16) int4 { ccl_device_inline int4 make_int4(int i); ccl_device_inline int4 make_int4(int x, int y, int z, int w); ccl_device_inline int4 make_int4(const float3& f); +ccl_device_inline void print_int4(const char *label, const int4& a); #endif /* __KERNEL_GPU__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_types_int4_impl.h b/intern/cycles/util/util_types_int4_impl.h index 52cda1c74b5..07cdc88f2dc 100644 --- a/intern/cycles/util/util_types_int4_impl.h +++ b/intern/cycles/util/util_types_int4_impl.h @@ -21,6 +21,10 @@ # error "Do not include this file directly, include util_types.h instead." #endif +#ifndef __KERNEL_GPU__ +# include <stdio.h> +#endif + CCL_NAMESPACE_BEGIN #ifndef __KERNEL_GPU__ @@ -99,6 +103,11 @@ ccl_device_inline int4 make_int4(const float3& f) #endif return a; } + +ccl_device_inline void print_int4(const char *label, const int4& a) +{ + printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w); +} #endif /* __KERNEL_GPU__ */ CCL_NAMESPACE_END -- cgit v1.2.3