diff options
Diffstat (limited to 'intern/cycles/util')
-rw-r--r-- | intern/cycles/util/CMakeLists.txt | 2 | ||||
-rw-r--r-- | intern/cycles/util/util_boundbox.h | 86 | ||||
-rw-r--r-- | intern/cycles/util/util_math.h | 485 | ||||
-rw-r--r-- | intern/cycles/util/util_task.cpp | 223 | ||||
-rw-r--r-- | intern/cycles/util/util_task.h | 122 | ||||
-rw-r--r-- | intern/cycles/util/util_thread.h | 127 | ||||
-rw-r--r-- | intern/cycles/util/util_transform.cpp | 23 | ||||
-rw-r--r-- | intern/cycles/util/util_types.h | 268 |
8 files changed, 992 insertions, 344 deletions
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 9182ee4cbe1..87bd84b4e0f 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -15,6 +15,7 @@ set(SRC util_path.cpp util_string.cpp util_system.cpp + util_task.cpp util_time.cpp util_transform.cpp ) @@ -50,6 +51,7 @@ set(SRC_HEADERS util_set.h util_string.h util_system.h + util_task.h util_thread.h util_time.h util_transform.h diff --git a/intern/cycles/util/util_boundbox.h b/intern/cycles/util/util_boundbox.h index bb1df0b220f..9511b48e103 100644 --- a/intern/cycles/util/util_boundbox.h +++ b/intern/cycles/util/util_boundbox.h @@ -23,6 +23,7 @@ #include <float.h> #include "util_math.h" +#include "util_string.h" #include "util_transform.h" #include "util_types.h" @@ -35,45 +36,81 @@ class BoundBox public: float3 min, max; - BoundBox(void) + __forceinline BoundBox() { - min = make_float3(FLT_MAX, FLT_MAX, FLT_MAX); - max = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX); } - BoundBox(const float3& min_, const float3& max_) + __forceinline BoundBox(const float3& pt) + : min(pt), max(pt) + { + } + + __forceinline BoundBox(const float3& min_, const float3& max_) : min(min_), max(max_) { } - void grow(const float3& pt) + static struct empty_t {} empty; + + __forceinline BoundBox(empty_t) + : min(make_float3(FLT_MAX, FLT_MAX, FLT_MAX)), max(make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX)) + { + } + + __forceinline void grow(const float3& pt) { min = ccl::min(min, pt); max = ccl::max(max, pt); } - void grow(const BoundBox& bbox) + __forceinline void grow(const BoundBox& bbox) { grow(bbox.min); grow(bbox.max); } - void intersect(const BoundBox& bbox) + __forceinline void intersect(const BoundBox& bbox) { min = ccl::max(min, bbox.min); max = ccl::min(max, bbox.max); } - float area(void) const + /* todo: avoid using this */ + __forceinline float safe_area() const { - if(!valid()) + if(!((min.x <= max.x) && (min.y <= max.y) && (min.z <= max.z))) return 0.0f; + 
return area(); + } + + __forceinline float area() const + { + return half_area()*2.0f; + } + + __forceinline float half_area() const + { float3 d = max - min; - return dot(d, d)*2.0f; + return (d.x*d.z + d.y*d.z + d.x*d.y); + } + + __forceinline float3 center() const + { + return 0.5f*(min + max); } - bool valid(void) const + __forceinline float3 center2() const + { + return min + max; + } + + __forceinline float3 size() const + { + return max - min; + } + + __forceinline bool valid() const { return (min.x <= max.x) && (min.y <= max.y) && (min.z <= max.z) && (isfinite(min.x) && isfinite(min.y) && isfinite(min.z)) && @@ -82,7 +119,7 @@ public: BoundBox transformed(const Transform *tfm) { - BoundBox result; + BoundBox result = BoundBox::empty; for(int i = 0; i < 8; i++) { float3 p; @@ -98,6 +135,31 @@ public: } }; +__forceinline BoundBox merge(const BoundBox& bbox, const float3& pt) +{ + return BoundBox(min(bbox.min, pt), max(bbox.max, pt)); +} + +__forceinline BoundBox merge(const BoundBox& a, const BoundBox& b) +{ + return BoundBox(min(a.min, b.min), max(a.max, b.max)); +} + +__forceinline BoundBox merge(const BoundBox& a, const BoundBox& b, const BoundBox& c, const BoundBox& d) +{ + return merge(merge(a, b), merge(c, d)); +} + +__forceinline BoundBox intersect(const BoundBox& a, const BoundBox& b) +{ + return BoundBox(max(a.min, b.min), min(a.max, b.max)); +} + +__forceinline BoundBox intersect(const BoundBox& a, const BoundBox& b, const BoundBox& c) +{ + return intersect(a, intersect(b, c)); +} + CCL_NAMESPACE_END #endif /* __UTIL_BOUNDBOX_H__ */ diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index 019dede07fa..33e351c74e9 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -182,93 +182,74 @@ __device_inline float average(const float2 a) __device_inline float2 operator-(const float2 a) { - float2 r = {-a.x, -a.y}; - return r; + return make_float2(-a.x, -a.y); } __device_inline float2 operator*(const 
float2 a, const float2 b) { - float2 r = {a.x*b.x, a.y*b.y}; - return r; + return make_float2(a.x*b.x, a.y*b.y); } __device_inline float2 operator*(const float2 a, float f) { - float2 r = {a.x*f, a.y*f}; - return r; + return make_float2(a.x*f, a.y*f); } __device_inline float2 operator*(float f, const float2 a) { - float2 r = {a.x*f, a.y*f}; - return r; + return make_float2(a.x*f, a.y*f); } __device_inline float2 operator/(float f, const float2 a) { - float2 r = {f/a.x, f/a.y}; - return r; + return make_float2(f/a.x, f/a.y); } __device_inline float2 operator/(const float2 a, float f) { float invf = 1.0f/f; - float2 r = {a.x*invf, a.y*invf}; - return r; + return make_float2(a.x*invf, a.y*invf); } __device_inline float2 operator/(const float2 a, const float2 b) { - float2 r = {a.x/b.x, a.y/b.y}; - return r; + return make_float2(a.x/b.x, a.y/b.y); } __device_inline float2 operator+(const float2 a, const float2 b) { - float2 r = {a.x+b.x, a.y+b.y}; - return r; + return make_float2(a.x+b.x, a.y+b.y); } __device_inline float2 operator-(const float2 a, const float2 b) { - float2 r = {a.x-b.x, a.y-b.y}; - return r; + return make_float2(a.x-b.x, a.y-b.y); } __device_inline float2 operator+=(float2& a, const float2 b) { - a.x += b.x; - a.y += b.y; - return a; + return a = a + b; } __device_inline float2 operator*=(float2& a, const float2 b) { - a.x *= b.x; - a.y *= b.y; - return a; + return a = a * b; } __device_inline float2 operator*=(float2& a, float f) { - a.x *= f; - a.y *= f; - return a; + return a = a * f; } __device_inline float2 operator/=(float2& a, const float2 b) { - a.x /= b.x; - a.y /= b.y; - return a; + return a = a / b; } __device_inline float2 operator/=(float2& a, float f) { float invf = 1.0f/f; - a.x *= invf; - a.y *= invf; - return a; + return a = a * invf; } @@ -314,14 +295,12 @@ __device_inline bool operator!=(const float2 a, const float2 b) __device_inline float2 min(float2 a, float2 b) { - float2 r = {min(a.x, b.x), min(a.y, b.y)}; - return r; + return 
make_float2(min(a.x, b.x), min(a.y, b.y)); } __device_inline float2 max(float2 a, float2 b) { - float2 r = {max(a.x, b.x), max(a.y, b.y)}; - return r; + return make_float2(max(a.x, b.x), max(a.y, b.y)); } __device_inline float2 clamp(float2 a, float2 mn, float2 mx) @@ -361,112 +340,78 @@ __device_inline float2 interp(float2 a, float2 b, float t) /* Float3 Vector */ -__device_inline bool is_zero(const float3 a) -{ - return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f); -} - -__device_inline float average(const float3 a) -{ - return (a.x + a.y + a.z)*(1.0f/3.0f); -} - #ifndef __KERNEL_OPENCL__ __device_inline float3 operator-(const float3 a) { - float3 r = make_float3(-a.x, -a.y, -a.z); - return r; + return make_float3(-a.x, -a.y, -a.z); } __device_inline float3 operator*(const float3 a, const float3 b) { - float3 r = make_float3(a.x*b.x, a.y*b.y, a.z*b.z); - return r; + return make_float3(a.x*b.x, a.y*b.y, a.z*b.z); } __device_inline float3 operator*(const float3 a, float f) { - float3 r = make_float3(a.x*f, a.y*f, a.z*f); - return r; + return make_float3(a.x*f, a.y*f, a.z*f); } __device_inline float3 operator*(float f, const float3 a) { - float3 r = make_float3(a.x*f, a.y*f, a.z*f); - return r; + return make_float3(a.x*f, a.y*f, a.z*f); } __device_inline float3 operator/(float f, const float3 a) { - float3 r = make_float3(f/a.x, f/a.y, f/a.z); - return r; + return make_float3(f/a.x, f/a.y, f/a.z); } __device_inline float3 operator/(const float3 a, float f) { float invf = 1.0f/f; - float3 r = make_float3(a.x*invf, a.y*invf, a.z*invf); - return r; + return make_float3(a.x*invf, a.y*invf, a.z*invf); } __device_inline float3 operator/(const float3 a, const float3 b) { - float3 r = make_float3(a.x/b.x, a.y/b.y, a.z/b.z); - return r; + return make_float3(a.x/b.x, a.y/b.y, a.z/b.z); } __device_inline float3 operator+(const float3 a, const float3 b) { - float3 r = make_float3(a.x+b.x, a.y+b.y, a.z+b.z); - return r; + return make_float3(a.x+b.x, a.y+b.y, a.z+b.z); } 
__device_inline float3 operator-(const float3 a, const float3 b) { - float3 r = make_float3(a.x-b.x, a.y-b.y, a.z-b.z); - return r; + return make_float3(a.x-b.x, a.y-b.y, a.z-b.z); } __device_inline float3 operator+=(float3& a, const float3 b) { - a.x += b.x; - a.y += b.y; - a.z += b.z; - return a; + return a = a + b; } __device_inline float3 operator*=(float3& a, const float3 b) { - a.x *= b.x; - a.y *= b.y; - a.z *= b.z; - return a; + return a = a * b; } __device_inline float3 operator*=(float3& a, float f) { - a.x *= f; - a.y *= f; - a.z *= f; - return a; + return a = a * f; } __device_inline float3 operator/=(float3& a, const float3 b) { - a.x /= b.x; - a.y /= b.y; - a.z /= b.z; - return a; + return a = a / b; } __device_inline float3 operator/=(float3& a, float f) { float invf = 1.0f/f; - a.x *= invf; - a.y *= invf; - a.z *= invf; - return a; + return a = a * invf; } __device_inline float dot(const float3 a, const float3 b) @@ -506,7 +451,11 @@ __device_inline float3 normalize_len(const float3 a, float *t) __device_inline bool operator==(const float3 a, const float3 b) { +#ifdef __KERNEL_SSE__ + return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7; +#else return (a.x == b.x && a.y == b.y && a.z == b.z); +#endif } __device_inline bool operator!=(const float3 a, const float3 b) @@ -516,14 +465,20 @@ __device_inline bool operator!=(const float3 a, const float3 b) __device_inline float3 min(float3 a, float3 b) { - float3 r = make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); - return r; +#ifdef __KERNEL_SSE__ + return _mm_min_ps(a.m128, b.m128); +#else + return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); +#endif } __device_inline float3 max(float3 a, float3 b) { - float3 r = make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); - return r; +#ifdef __KERNEL_SSE__ + return _mm_max_ps(a.m128, b.m128); +#else + return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); +#endif } __device_inline float3 clamp(float3 a, float3 
mn, float3 mx) @@ -533,7 +488,12 @@ __device_inline float3 clamp(float3 a, float3 mn, float3 mx) __device_inline float3 fabs(float3 a) { +#ifdef __KERNEL_SSE__ + __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); +#else return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z)); +#endif } #endif @@ -555,6 +515,16 @@ __device_inline void print_float3(const char *label, const float3& a) printf("%s: %.8f %.8f %.8f\n", label, a.x, a.y, a.z); } +__device_inline float3 rcp(const float3& a) +{ +#ifdef __KERNEL_SSE__ + float4 r = _mm_rcp_ps(a.m128); + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +#else + return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z); +#endif +} + #endif __device_inline float3 interp(float3 a, float3 b, float t) @@ -562,122 +532,257 @@ __device_inline float3 interp(float3 a, float3 b, float t) return a + t*(b - a); } +__device_inline bool is_zero(const float3 a) +{ +#ifdef __KERNEL_SSE__ + return a == make_float3(0.0f); +#else + return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f); +#endif +} + +__device_inline float reduce_add(const float3& a) +{ +#ifdef __KERNEL_SSE__ + return (a.x + a.y + a.z); +#else + return (a.x + a.y + a.z); +#endif +} + +__device_inline float average(const float3 a) +{ + return reduce_add(a)*(1.0f/3.0f); +} + /* Float4 Vector */ -#ifndef __KERNEL_OPENCL__ +#ifdef __KERNEL_SSE__ -__device_inline bool is_zero(const float4& a) +template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __forceinline const float4 shuffle(const float4& b) { - return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f); + return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0))); } -__device_inline float average(const float4& a) +template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b) { - return (a.x + a.y + a.z + a.w)*(1.0f/4.0f); + return _mm_moveldup_ps(b); } +template<> __forceinline const 
float4 shuffle<1, 1, 3, 3>(const float4& b) +{ + return _mm_movehdup_ps(b); +} + +template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b) +{ + return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))); +} + +#endif + +#ifndef __KERNEL_OPENCL__ + __device_inline float4 operator-(const float4& a) { - float4 r = {-a.x, -a.y, -a.z, -a.w}; - return r; +#ifdef __KERNEL_SSE__ + __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return _mm_xor_ps(a.m128, mask); +#else + return make_float4(-a.x, -a.y, -a.z, -a.w); +#endif } __device_inline float4 operator*(const float4& a, const float4& b) { - float4 r = {a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w}; - return r; +#ifdef __KERNEL_SSE__ + return _mm_mul_ps(a.m128, b.m128); +#else + return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); +#endif } __device_inline float4 operator*(const float4& a, float f) { - float4 r = {a.x*f, a.y*f, a.z*f, a.w*f}; - return r; +#ifdef __KERNEL_SSE__ + return a * make_float4(f); +#else + return make_float4(a.x*f, a.y*f, a.z*f, a.w*f); +#endif } __device_inline float4 operator*(float f, const float4& a) { - float4 r = {a.x*f, a.y*f, a.z*f, a.w*f}; - return r; + return a * f; +} + +__device_inline float4 rcp(const float4& a) +{ +#ifdef __KERNEL_SSE__ + float4 r = _mm_rcp_ps(a.m128); + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +#else + return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w); +#endif } __device_inline float4 operator/(const float4& a, float f) { - float invf = 1.0f/f; - float4 r = {a.x*invf, a.y*invf, a.z*invf, a.w*invf}; - return r; + return a * (1.0f/f); } __device_inline float4 operator/(const float4& a, const float4& b) { - float4 r = {a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w}; - return r; +#ifdef __KERNEL_SSE__ + return a * rcp(b); +#else + return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); +#endif + } __device_inline float4 operator+(const float4& a, const float4& b) { - float4 r = {a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w}; - return 
r; +#ifdef __KERNEL_SSE__ + return _mm_add_ps(a.m128, b.m128); +#else + return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +#endif } __device_inline float4 operator-(const float4& a, const float4& b) { - float4 r = {a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w}; - return r; +#ifdef __KERNEL_SSE__ + return _mm_sub_ps(a.m128, b.m128); +#else + return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); +#endif } __device_inline float4 operator+=(float4& a, const float4& b) { - a.x += b.x; - a.y += b.y; - a.z += b.z; - a.w += b.w; - return a; + return a = a + b; } __device_inline float4 operator*=(float4& a, const float4& b) { - a.x *= b.x; - a.y *= b.y; - a.z *= b.z; - a.w *= b.w; - return a; + return a = a * b; } __device_inline float4 operator/=(float4& a, float f) { - float invf = 1.0f/f; - a.x *= invf; - a.y *= invf; - a.z *= invf; - a.w *= invf; - return a; + return a = a / f; } -__device_inline float dot(const float4& a, const float4& b) +__device_inline int4 operator<(const float4& a, const float4& b) { - return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w; +#ifdef __KERNEL_SSE__ + return _mm_cvtps_epi32(_mm_cmplt_ps(a.m128, b.m128)); /* todo: avoid cvt */ +#else + return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w); +#endif +} + +__device_inline int4 operator>=(float4 a, float4 b) +{ +#ifdef __KERNEL_SSE__ + return _mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128)); /* todo: avoid cvt */ +#else + return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w); +#endif +} + +__device_inline int4 operator<=(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return _mm_cvtps_epi32(_mm_cmple_ps(a.m128, b.m128)); /* todo: avoid cvt */ +#else + return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w); +#endif +} + +__device_inline bool operator==(const float4 a, const float4 b) +{ +#ifdef __KERNEL_SSE__ + return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15; +#else + return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w); +#endif } 
__device_inline float4 cross(const float4& a, const float4& b) { - float4 r = {a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0.0f}; - return r; +#ifdef __KERNEL_SSE__ + return (shuffle<1,2,0,0>(a)*shuffle<2,0,1,0>(b)) - (shuffle<2,0,1,0>(a)*shuffle<1,2,0,0>(b)); +#else + return make_float4(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0.0f); +#endif } __device_inline float4 min(float4 a, float4 b) { +#ifdef __KERNEL_SSE__ + return _mm_min_ps(a.m128, b.m128); +#else return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); +#endif } __device_inline float4 max(float4 a, float4 b) { +#ifdef __KERNEL_SSE__ + return _mm_max_ps(a.m128, b.m128); +#else return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); +#endif } #endif #ifndef __KERNEL_GPU__ +__device_inline float4 select(const int4& mask, const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + /* blendv is sse4, and apparently broken on vs2008 */ + return _mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a), _mm_andnot_ps(_mm_cvtepi32_ps(mask), b)); /* todo: avoid cvt */ +#else + return make_float4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? 
a.w: b.w); +#endif +} + +__device_inline float4 reduce_min(const float4& a) +{ +#ifdef __KERNEL_SSE__ + float4 h = min(shuffle<1,0,3,2>(a), a); + return min(shuffle<2,3,0,1>(h), h); +#else + return make_float4(min(min(a.x, a.y), min(a.z, a.w))); +#endif +} + +__device_inline float4 reduce_max(const float4& a) +{ +#ifdef __KERNEL_SSE__ + float4 h = max(shuffle<1,0,3,2>(a), a); + return max(shuffle<2,3,0,1>(h), h); +#else + return make_float4(max(max(a.x, a.y), max(a.z, a.w))); +#endif +} + +#if 0 +__device_inline float4 reduce_add(const float4& a) +{ +#ifdef __KERNEL_SSE__ + float4 h = shuffle<1,0,3,2>(a) + a; + return shuffle<2,3,0,1>(h) + h; +#else + return make_float4((a.x + a.y) + (a.z + a.w)); +#endif +} +#endif + __device_inline void print_float4(const char *label, const float4& a) { printf("%s: %.8f %.8f %.8f %.8f\n", label, a.x, a.y, a.z, a.w); @@ -685,26 +790,77 @@ __device_inline void print_float4(const char *label, const float4& a) #endif +#ifndef __KERNEL_OPENCL__ + +__device_inline bool is_zero(const float4& a) +{ +#ifdef __KERNEL_SSE__ + return a == make_float4(0.0f); +#else + return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f); +#endif +} + +__device_inline float reduce_add(const float4& a) +{ +#ifdef __KERNEL_SSE__ + float4 h = shuffle<1,0,3,2>(a) + a; + return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); /* todo: efficiency? 
*/ +#else + return ((a.x + a.y) + (a.z + a.w)); +#endif +} + +__device_inline float average(const float4& a) +{ + return reduce_add(a) * 0.25f; +} + +__device_inline float dot(const float4& a, const float4& b) +{ + return reduce_add(a * b); +} + +#endif + /* Int3 */ #ifndef __KERNEL_OPENCL__ +__device_inline int3 min(int3 a, int3 b) +{ +#ifdef __KERNEL_SSE__ + return _mm_min_epi32(a.m128, b.m128); +#else + return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); +#endif +} + __device_inline int3 max(int3 a, int3 b) { - int3 r = {max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)}; - return r; +#ifdef __KERNEL_SSE__ + return _mm_max_epi32(a.m128, b.m128); +#else + return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); +#endif } __device_inline int3 clamp(const int3& a, int mn, int mx) { - int3 r = {clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx)}; - return r; +#ifdef __KERNEL_SSE__ + return min(max(a, make_int3(mn)), make_int3(mx)); +#else + return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx)); +#endif } __device_inline int3 clamp(const int3& a, int3& mn, int mx) { - int3 r = {clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx)}; - return r; +#ifdef __KERNEL_SSE__ + return min(max(a, mn), make_int3(mx)); +#else + return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx)); +#endif } #endif @@ -720,16 +876,63 @@ __device_inline void print_int3(const char *label, const int3& a) /* Int4 */ -#ifndef __KERNEL_OPENCL__ +#ifndef __KERNEL_GPU__ -__device_inline int4 operator>=(float4 a, float4 b) +__device_inline int4 operator+(const int4& a, const int4& b) { - return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w); +#ifdef __KERNEL_SSE__ + return _mm_add_epi32(a.m128, b.m128); +#else + return make_int4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +#endif +} + +__device_inline int4 operator+=(int4& a, const int4& b) +{ + return a = a + b; } +__device_inline int4 operator>>(const int4& a, int i) 
+{ +#ifdef __KERNEL_SSE__ + return _mm_srai_epi32(a.m128, i); +#else + return make_int4(a.x >> i, a.y >> i, a.z >> i, a.w >> i); #endif +} -#ifndef __KERNEL_GPU__ +__device_inline int4 min(int4 a, int4 b) +{ +#ifdef __KERNEL_SSE__ + return _mm_min_epi32(a.m128, b.m128); +#else + return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); +#endif +} + +__device_inline int4 max(int4 a, int4 b) +{ +#ifdef __KERNEL_SSE__ + return _mm_max_epi32(a.m128, b.m128); +#else + return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); +#endif +} + +__device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx) +{ + return min(max(a, mn), mx); +} + +__device_inline int4 select(const int4& mask, const int4& a, const int4& b) +{ +#ifdef __KERNEL_SSE__ + __m128 m = _mm_cvtepi32_ps(mask); + return _mm_castps_si128(_mm_or_ps(_mm_and_ps(m, _mm_castsi128_ps(a)), _mm_andnot_ps(m, _mm_castsi128_ps(b)))); /* todo: avoid cvt */ +#else + return make_int4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? a.w: b.w); +#endif +} __device_inline void print_int4(const char *label, const int4& a) { diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp new file mode 100644 index 00000000000..6da9a70ec0c --- /dev/null +++ b/intern/cycles/util/util_task.cpp @@ -0,0 +1,223 @@ +/* + * Copyright 2011, Blender Foundation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include "util_debug.h" +#include "util_foreach.h" +#include "util_system.h" +#include "util_task.h" + +CCL_NAMESPACE_BEGIN + +/* Task Pool */ + +TaskPool::TaskPool(const TaskRunFunction& run_) +{ + num = 0; + num_done = 0; + + do_cancel = false; + + run = run_; +} + +TaskPool::~TaskPool() +{ + stop(); +} + +void TaskPool::push(Task *task, bool front) +{ + TaskScheduler::Entry entry; + + entry.task = task; + entry.pool = this; + + TaskScheduler::push(entry, front); +} + +void TaskPool::wait() +{ + thread_scoped_lock lock(done_mutex); + + while(num_done != num) + done_cond.wait(lock); +} + +void TaskPool::cancel() +{ + TaskScheduler::clear(this); + + do_cancel = true; + wait(); + do_cancel = false; +} + +void TaskPool::stop() +{ + TaskScheduler::clear(this); + + assert(num_done == num); +} + +bool TaskPool::cancelled() +{ + return do_cancel; +} + +void TaskPool::done_increase(int done) +{ + done_mutex.lock(); + num_done += done; + done_mutex.unlock(); + + assert(num_done <= num); + done_cond.notify_all(); +} + +/* Task Scheduler */ + +thread_mutex TaskScheduler::mutex; +int TaskScheduler::users = 0; +vector<thread*> TaskScheduler::threads; +volatile bool TaskScheduler::do_exit = false; + +list<TaskScheduler::Entry> TaskScheduler::queue; +thread_mutex TaskScheduler::queue_mutex; +thread_condition_variable TaskScheduler::queue_cond; + +void TaskScheduler::init(int num_threads) +{ + thread_scoped_lock lock(mutex); + + /* multiple cycles instances can use this task scheduler, sharing the same + threads, so we keep track of the number of users. 
*/ + if(users == 0) { + do_exit = false; + + /* launch threads that will be waiting for work */ + if(num_threads == 0) + num_threads = system_cpu_thread_count(); + + threads.resize(num_threads); + + for(size_t i = 0; i < threads.size(); i++) + threads[i] = new thread(function_bind(&TaskScheduler::thread_run, i)); + } + + users++; +} + +void TaskScheduler::exit() +{ + thread_scoped_lock lock(mutex); + + users--; + + if(users == 0) { + /* stop all waiting threads */ + do_exit = true; + TaskScheduler::queue_cond.notify_all(); + + /* delete threads */ + foreach(thread *t, threads) { + t->join(); + delete t; + } + + threads.clear(); + } +} + +bool TaskScheduler::thread_wait_pop(Entry& entry) +{ + thread_scoped_lock lock(queue_mutex); + + while(queue.empty() && !do_exit) + queue_cond.wait(lock); + + if(queue.empty()) { + assert(do_exit); + return false; + } + + entry = queue.front(); + queue.pop_front(); + + return true; +} + +void TaskScheduler::thread_run(int thread_id) +{ + Entry entry; + + /* todo: test affinity/denormal mask */ + + /* keep popping off tasks */ + while(thread_wait_pop(entry)) { + /* run task */ + entry.pool->run(entry.task, thread_id); + + /* delete task */ + delete entry.task; + + /* notify pool task was done */ + entry.pool->done_increase(1); + } +} + +void TaskScheduler::push(Entry& entry, bool front) +{ + /* add entry to queue */ + TaskScheduler::queue_mutex.lock(); + if(front) + TaskScheduler::queue.push_front(entry); + else + TaskScheduler::queue.push_back(entry); + entry.pool->num++; + TaskScheduler::queue_mutex.unlock(); + + TaskScheduler::queue_cond.notify_one(); +} + +void TaskScheduler::clear(TaskPool *pool) +{ + thread_scoped_lock lock(TaskScheduler::queue_mutex); + + /* erase all tasks from this pool from the queue */ + list<TaskScheduler::Entry>::iterator it = TaskScheduler::queue.begin(); + int done = 0; + + while(it != TaskScheduler::queue.end()) { + TaskScheduler::Entry& entry = *it; + + if(entry.pool == pool) { + done++; + delete 
entry.task; + + it = TaskScheduler::queue.erase(it); + } + else + it++; + } + + /* notify done */ + pool->done_increase(done); +} + +CCL_NAMESPACE_END + + diff --git a/intern/cycles/util/util_task.h b/intern/cycles/util/util_task.h new file mode 100644 index 00000000000..acdb2cb50a2 --- /dev/null +++ b/intern/cycles/util/util_task.h @@ -0,0 +1,122 @@ +/* + * Copyright 2011, Blender Foundation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef __UTIL_TASK_H__ +#define __UTIL_TASK_H__ + +#include "util_list.h" +#include "util_thread.h" +#include "util_vector.h" + +CCL_NAMESPACE_BEGIN + +class Task; +class TaskPool; +class TaskScheduler; + +typedef boost::function<void(Task*,int)> TaskRunFunction; + +/* Task + * + * Base class for tasks to be executed in threads. */ + +class Task +{ +public: + Task() {}; + virtual ~Task() {} +}; + +/* Task Pool + * + * Pool of tasks that will be executed by the central TaskScheduler. For each + * pool, we can wait for all tasks to be done, or cancel them before they are + * done. 
+ * + * The run callback that actually executes the task may be created like this: + * function_bind(&MyClass::task_execute, this, _1, _2) */ + +class TaskPool +{ +public: + TaskPool(const TaskRunFunction& run); + ~TaskPool(); + + void push(Task *task, bool front = false); + + void wait(); /* wait until all tasks are done */ + void cancel(); /* cancel all tasks, keep worker threads running */ + void stop(); /* stop all worker threads */ + + bool cancelled(); /* for worker threads, test if cancelled */ + +protected: + friend class TaskScheduler; + + void done_increase(int done); + + TaskRunFunction run; + + thread_mutex done_mutex; + thread_condition_variable done_cond; + + volatile int num, num_done; + volatile bool do_cancel; +}; + +/* Task Scheduler + * + * Central scheduler that holds running threads ready to execute tasks. A single + * queue holds the tasks from all pools. */ + +class TaskScheduler +{ +public: + static void init(int num_threads = 0); + static void exit(); + + static int num_threads() { return threads.size(); } + +protected: + friend class TaskPool; + + struct Entry { + Task *task; + TaskPool *pool; + }; + + static thread_mutex mutex; + static int users; + static vector<thread*> threads; + static volatile bool do_exit; + + static list<Entry> queue; + static thread_mutex queue_mutex; + static thread_condition_variable queue_cond; + + static void thread_run(int thread_id); + static bool thread_wait_pop(Entry& entry); + + static void push(Entry& entry, bool front); + static void clear(TaskPool *pool); +}; + +CCL_NAMESPACE_END + +#endif + diff --git a/intern/cycles/util/util_thread.h b/intern/cycles/util/util_thread.h index 6836be203f5..3d15b342fe5 100644 --- a/intern/cycles/util/util_thread.h +++ b/intern/cycles/util/util_thread.h @@ -69,133 +69,6 @@ protected: bool joined; }; -/* Thread Safe Queue to pass tasks from one thread to another. Tasks should be - * pushed into the queue, while the worker thread waits to pop the next task - * off the queue. 
Once all tasks are into the queue, calling stop() will stop - * the worker threads from waiting for more tasks once all tasks are done. */ - -template<typename T> class ThreadQueue -{ -public: - ThreadQueue() - { - tot = 0; - tot_done = 0; - do_stop = false; - do_cancel = false; - } - - /* Main thread functions */ - - /* push a task to be executed */ - void push(const T& value) - { - thread_scoped_lock lock(queue_mutex); - queue.push(value); - tot++; - lock.unlock(); - - queue_cond.notify_one(); - } - - /* wait until all tasks are done */ - void wait_done() - { - thread_scoped_lock lock(done_mutex); - - while(tot_done != tot) - done_cond.wait(lock); - } - - /* stop all worker threads */ - void stop() - { - clear(); - do_stop = true; - queue_cond.notify_all(); - } - - /* cancel all tasks, but keep worker threads running */ - void cancel() - { - clear(); - do_cancel = true; - wait_done(); - do_cancel = false; - } - - /* Worker thread functions - * - * while(queue.worker_wait_pop(task)) { - * for(..) { - * ... do work ... 
- * - * if(queue.worker_cancel()) - * break; - * } - * - * queue.worker_done(); - * } - */ - - bool worker_wait_pop(T& value) - { - thread_scoped_lock lock(queue_mutex); - - while(queue.empty() && !do_stop) - queue_cond.wait(lock); - - if(queue.empty()) - return false; - - value = queue.front(); - queue.pop(); - - return true; - } - - void worker_done() - { - thread_scoped_lock lock(done_mutex); - tot_done++; - lock.unlock(); - - assert(tot_done <= tot); - - done_cond.notify_all(); - } - - bool worker_cancel() - { - return do_cancel; - } - -protected: - void clear() - { - thread_scoped_lock lock(queue_mutex); - - while(!queue.empty()) { - thread_scoped_lock done_lock(done_mutex); - tot_done++; - done_lock.unlock(); - - queue.pop(); - } - - done_cond.notify_all(); - } - - std::queue<T> queue; - thread_mutex queue_mutex; - thread_mutex done_mutex; - thread_condition_variable queue_cond; - thread_condition_variable done_cond; - volatile bool do_stop; - volatile bool do_cancel; - volatile int tot, tot_done; -}; - /* Thread Local Storage * * Boost implementation is a bit slow, and Mac OS X __thread is not supported diff --git a/intern/cycles/util/util_transform.cpp b/intern/cycles/util/util_transform.cpp index 61bc36ae888..0fd26825911 100644 --- a/intern/cycles/util/util_transform.cpp +++ b/intern/cycles/util/util_transform.cpp @@ -129,23 +129,26 @@ static bool transform_matrix4_gj_inverse(float R[][4], float M[][4]) Transform transform_inverse(const Transform& tfm) { - union { Transform T; float M[4][4]; } R, M; - - R.T = transform_identity(); - M.T = tfm; + Transform tfmR = transform_identity(); + float M[4][4], R[4][4]; - if(!transform_matrix4_gj_inverse(R.M, M.M)) { + memcpy(R, &tfmR, sizeof(R)); + memcpy(M, &tfm, sizeof(M)); + + if(!transform_matrix4_gj_inverse(R, M)) { /* matrix is degenerate (e.g. 
0 scale on some axis), ideally we should never be in this situation, but try to invert it anyway with tweak */ - M.M[0][0] += 1e-8f; - M.M[1][1] += 1e-8f; - M.M[2][2] += 1e-8f; + M[0][0] += 1e-8f; + M[1][1] += 1e-8f; + M[2][2] += 1e-8f; - if(!transform_matrix4_gj_inverse(R.M, M.M)) + if(!transform_matrix4_gj_inverse(R, M)) return transform_identity(); } - return R.T; + memcpy(&tfmR, R, sizeof(R)); + + return tfmR; } CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index efdda98571a..cf167707e47 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -36,23 +36,37 @@ #define __shared #define __constant -#ifdef __GNUC__ -#define __device_inline static inline __attribute__((always_inline)) -#else +#ifdef _WIN32 #define __device_inline static __forceinline +#define __align(...) __declspec(align(__VA_ARGS__)) +#else +#define __device_inline static inline __attribute__((always_inline)) +#define __forceinline inline __attribute__((always_inline)) +#define __align(...) __attribute__((aligned(__VA_ARGS__))) #endif #endif +/* Bitness */ + +#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) +#define __KERNEL_64_BIT__ +#endif + /* SIMD Types */ -/* not needed yet, will be for qbvh -#ifndef __KERNEL_GPU__ +/* not enabled, globally applying it just gives slowdown, + * but useful for testing. 
*/ +//#define __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ -#include <emmintrin.h> -#include <xmmintrin.h> +#include <xmmintrin.h> /* SSE 1 */ +#include <emmintrin.h> /* SSE 2 */ +#include <pmmintrin.h> /* SSE 3 */ +#include <tmmintrin.h> /* SSE 3 */ +#include <smmintrin.h> /* SSE 4 */ -#endif*/ +#endif #ifndef _WIN32 #ifndef __KERNEL_GPU__ @@ -97,6 +111,12 @@ typedef unsigned int uint32_t; typedef long long int64_t; typedef unsigned long long uint64_t; +#ifdef __KERNEL_64_BIT__ +typedef int64_t ssize_t; +#else +typedef int32_t ssize_t; +#endif + #endif /* Generic Memory Pointer */ @@ -108,89 +128,137 @@ typedef uint64_t device_ptr; struct uchar2 { uchar x, y; - uchar operator[](int i) const { return *(&x + i); } - uchar& operator[](int i) { return *(&x + i); } + __forceinline uchar operator[](int i) const { return *(&x + i); } + __forceinline uchar& operator[](int i) { return *(&x + i); } }; struct uchar3 { uchar x, y, z; - uchar operator[](int i) const { return *(&x + i); } - uchar& operator[](int i) { return *(&x + i); } + __forceinline uchar operator[](int i) const { return *(&x + i); } + __forceinline uchar& operator[](int i) { return *(&x + i); } }; struct uchar4 { uchar x, y, z, w; - uchar operator[](int i) const { return *(&x + i); } - uchar& operator[](int i) { return *(&x + i); } + __forceinline uchar operator[](int i) const { return *(&x + i); } + __forceinline uchar& operator[](int i) { return *(&x + i); } }; struct int2 { int x, y; - int operator[](int i) const { return *(&x + i); } - int& operator[](int i) { return *(&x + i); } + __forceinline int operator[](int i) const { return *(&x + i); } + __forceinline int& operator[](int i) { return *(&x + i); } }; +#ifdef __KERNEL_SSE__ +struct __align(16) int3 { + union { + __m128i m128; + struct { int x, y, z, w; }; + }; + + __forceinline int3() {} + __forceinline int3(const __m128i a) : m128(a) {} + __forceinline operator const __m128i&(void) const { return m128; } + __forceinline operator __m128i&(void) { return 
m128; } +#else struct int3 { - int x, y, z; + int x, y, z, w; +#endif - int operator[](int i) const { return *(&x + i); } - int& operator[](int i) { return *(&x + i); } + __forceinline int operator[](int i) const { return *(&x + i); } + __forceinline int& operator[](int i) { return *(&x + i); } }; +#ifdef __KERNEL_SSE__ +struct __align(16) int4 { + union { + __m128i m128; + struct { int x, y, z, w; }; + }; + + __forceinline int4() {} + __forceinline int4(const __m128i a) : m128(a) {} + __forceinline operator const __m128i&(void) const { return m128; } + __forceinline operator __m128i&(void) { return m128; } +#else struct int4 { int x, y, z, w; +#endif - int operator[](int i) const { return *(&x + i); } - int& operator[](int i) { return *(&x + i); } + __forceinline int operator[](int i) const { return *(&x + i); } + __forceinline int& operator[](int i) { return *(&x + i); } }; struct uint2 { uint x, y; - uint operator[](int i) const { return *(&x + i); } - uint& operator[](int i) { return *(&x + i); } + __forceinline uint operator[](uint i) const { return *(&x + i); } + __forceinline uint& operator[](uint i) { return *(&x + i); } }; struct uint3 { uint x, y, z; - uint operator[](int i) const { return *(&x + i); } - uint& operator[](int i) { return *(&x + i); } + __forceinline uint operator[](uint i) const { return *(&x + i); } + __forceinline uint& operator[](uint i) { return *(&x + i); } }; struct uint4 { uint x, y, z, w; - uint operator[](int i) const { return *(&x + i); } - uint& operator[](int i) { return *(&x + i); } + __forceinline uint operator[](uint i) const { return *(&x + i); } + __forceinline uint& operator[](uint i) { return *(&x + i); } }; struct float2 { float x, y; - float operator[](int i) const { return *(&x + i); } - float& operator[](int i) { return *(&x + i); } + __forceinline float operator[](int i) const { return *(&x + i); } + __forceinline float& operator[](int i) { return *(&x + i); } }; +#ifdef __KERNEL_SSE__ +struct __align(16) float3 { + 
union { + __m128 m128; + struct { float x, y, z, w; }; + }; + + __forceinline float3() {} + __forceinline float3(const __m128 a) : m128(a) {} + __forceinline operator const __m128&(void) const { return m128; } + __forceinline operator __m128&(void) { return m128; } +#else struct float3 { - float x, y, z; - -#ifdef WITH_OPENCL - float w; + float x, y, z, w; #endif - float operator[](int i) const { return *(&x + i); } - float& operator[](int i) { return *(&x + i); } + __forceinline float operator[](int i) const { return *(&x + i); } + __forceinline float& operator[](int i) { return *(&x + i); } }; +#ifdef __KERNEL_SSE__ +struct __align(16) float4 { + union { + __m128 m128; + struct { float x, y, z, w; }; + }; + + __forceinline float4() {} + __forceinline float4(const __m128 a) : m128(a) {} + __forceinline operator const __m128&(void) const { return m128; } + __forceinline operator __m128&(void) { return m128; } +#else struct float4 { float x, y, z, w; +#endif - float operator[](int i) const { return *(&x + i); } - float& operator[](int i) { return *(&x + i); } + __forceinline float operator[](int i) const { return *(&x + i); } + __forceinline float& operator[](int i) { return *(&x + i); } }; #endif @@ -201,87 +269,179 @@ struct float4 { * * OpenCL does not support C++ class, so we use these instead. 
*/ -__device uchar2 make_uchar2(uchar x, uchar y) +__device_inline uchar2 make_uchar2(uchar x, uchar y) { uchar2 a = {x, y}; return a; } -__device uchar3 make_uchar3(uchar x, uchar y, uchar z) +__device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z) { uchar3 a = {x, y, z}; return a; } -__device uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w) +__device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w) { uchar4 a = {x, y, z, w}; return a; } -__device int2 make_int2(int x, int y) +__device_inline int2 make_int2(int x, int y) { int2 a = {x, y}; return a; } -__device int3 make_int3(int x, int y, int z) +__device_inline int3 make_int3(int x, int y, int z) { - int3 a = {x, y, z}; +#ifdef __KERNEL_SSE__ + int3 a; + a.m128 = _mm_set_epi32(0, z, y, x); +#else + int3 a = {x, y, z, 0}; +#endif + return a; } -__device int4 make_int4(int x, int y, int z, int w) +__device_inline int4 make_int4(int x, int y, int z, int w) { +#ifdef __KERNEL_SSE__ + int4 a; + a.m128 = _mm_set_epi32(w, z, y, x); +#else int4 a = {x, y, z, w}; +#endif + return a; } -__device uint2 make_uint2(uint x, uint y) +__device_inline uint2 make_uint2(uint x, uint y) { uint2 a = {x, y}; return a; } -__device uint3 make_uint3(uint x, uint y, uint z) +__device_inline uint3 make_uint3(uint x, uint y, uint z) { uint3 a = {x, y, z}; return a; } -__device uint4 make_uint4(uint x, uint y, uint z, uint w) +__device_inline uint4 make_uint4(uint x, uint y, uint z, uint w) { uint4 a = {x, y, z, w}; return a; } -__device float2 make_float2(float x, float y) +__device_inline float2 make_float2(float x, float y) { float2 a = {x, y}; return a; } -__device float3 make_float3(float x, float y, float z) +__device_inline float3 make_float3(float x, float y, float z) { -#ifdef WITH_OPENCL - float3 a = {x, y, z, 0.0f}; +#ifdef __KERNEL_SSE__ + float3 a; + a.m128 = _mm_set_ps(0.0f, z, y, x); #else - float3 a = {x, y, z}; + float3 a = {x, y, z, 0.0f}; #endif + return a; } -__device float4 make_float4(float x, 
float y, float z, float w) +__device_inline float4 make_float4(float x, float y, float z, float w) { +#ifdef __KERNEL_SSE__ + float4 a; + a.m128 = _mm_set_ps(w, z, y, x); +#else float4 a = {x, y, z, w}; +#endif + return a; } -__device int align_up(int offset, int alignment) +__device_inline int align_up(int offset, int alignment) { return (offset + alignment - 1) & ~(alignment - 1); } +__device_inline int3 make_int3(int i) +{ +#ifdef __KERNEL_SSE__ + int3 a; + a.m128 = _mm_set1_epi32(i); +#else + int3 a = {i, i, i, i}; +#endif + + return a; +} + +__device_inline int4 make_int4(int i) +{ +#ifdef __KERNEL_SSE__ + int4 a; + a.m128 = _mm_set1_epi32(i); +#else + int4 a = {i, i, i, i}; +#endif + + return a; +} + +__device_inline float3 make_float3(float f) +{ +#ifdef __KERNEL_SSE__ + float3 a; + a.m128 = _mm_set1_ps(f); +#else + float3 a = {f, f, f, f}; +#endif + + return a; +} + +__device_inline float4 make_float4(float f) +{ +#ifdef __KERNEL_SSE__ + float4 a; + a.m128 = _mm_set1_ps(f); +#else + float4 a = {f, f, f, f}; +#endif + + return a; +} + +__device_inline float4 make_float4(const int4& i) +{ +#ifdef __KERNEL_SSE__ + float4 a; + a.m128 = _mm_cvtepi32_ps(i.m128); +#else + float4 a = {(float)i.x, (float)i.y, (float)i.z, (float)i.w}; +#endif + + return a; +} + +__device_inline int4 make_int4(const float3& f) +{ +#ifdef __KERNEL_SSE__ + int4 a; + a.m128 = _mm_cvtps_epi32(f.m128); +#else + int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w}; +#endif + + return a; +} + #endif CCL_NAMESPACE_END |