Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/util')
-rw-r--r--intern/cycles/util/CMakeLists.txt2
-rw-r--r--intern/cycles/util/util_boundbox.h86
-rw-r--r--intern/cycles/util/util_math.h485
-rw-r--r--intern/cycles/util/util_task.cpp223
-rw-r--r--intern/cycles/util/util_task.h122
-rw-r--r--intern/cycles/util/util_thread.h127
-rw-r--r--intern/cycles/util/util_transform.cpp23
-rw-r--r--intern/cycles/util/util_types.h268
8 files changed, 992 insertions, 344 deletions
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 9182ee4cbe1..87bd84b4e0f 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -15,6 +15,7 @@ set(SRC
util_path.cpp
util_string.cpp
util_system.cpp
+ util_task.cpp
util_time.cpp
util_transform.cpp
)
@@ -50,6 +51,7 @@ set(SRC_HEADERS
util_set.h
util_string.h
util_system.h
+ util_task.h
util_thread.h
util_time.h
util_transform.h
diff --git a/intern/cycles/util/util_boundbox.h b/intern/cycles/util/util_boundbox.h
index bb1df0b220f..9511b48e103 100644
--- a/intern/cycles/util/util_boundbox.h
+++ b/intern/cycles/util/util_boundbox.h
@@ -23,6 +23,7 @@
#include <float.h>
#include "util_math.h"
+#include "util_string.h"
#include "util_transform.h"
#include "util_types.h"
@@ -35,45 +36,81 @@ class BoundBox
public:
float3 min, max;
- BoundBox(void)
+ __forceinline BoundBox()
{
- min = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
- max = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
}
- BoundBox(const float3& min_, const float3& max_)
+ __forceinline BoundBox(const float3& pt)
+ : min(pt), max(pt)
+ {
+ }
+
+ __forceinline BoundBox(const float3& min_, const float3& max_)
: min(min_), max(max_)
{
}
- void grow(const float3& pt)
+ static struct empty_t {} empty;
+
+ __forceinline BoundBox(empty_t)
+ : min(make_float3(FLT_MAX, FLT_MAX, FLT_MAX)), max(make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX))
+ {
+ }
+
+ __forceinline void grow(const float3& pt)
{
min = ccl::min(min, pt);
max = ccl::max(max, pt);
}
- void grow(const BoundBox& bbox)
+ __forceinline void grow(const BoundBox& bbox)
{
grow(bbox.min);
grow(bbox.max);
}
- void intersect(const BoundBox& bbox)
+ __forceinline void intersect(const BoundBox& bbox)
{
min = ccl::max(min, bbox.min);
max = ccl::min(max, bbox.max);
}
- float area(void) const
+ /* todo: avoid using this */
+ __forceinline float safe_area() const
{
- if(!valid())
+ if(!((min.x <= max.x) && (min.y <= max.y) && (min.z <= max.z)))
return 0.0f;
+ return area();
+ }
+
+ __forceinline float area() const
+ {
+ return half_area()*2.0f;
+ }
+
+ __forceinline float half_area() const
+ {
float3 d = max - min;
- return dot(d, d)*2.0f;
+ return (d.x*d.z + d.y*d.z + d.x*d.y);
+ }
+
+ __forceinline float3 center() const
+ {
+ return 0.5f*(min + max);
}
- bool valid(void) const
+ __forceinline float3 center2() const
+ {
+ return min + max;
+ }
+
+ __forceinline float3 size() const
+ {
+ return max - min;
+ }
+
+ __forceinline bool valid() const
{
return (min.x <= max.x) && (min.y <= max.y) && (min.z <= max.z) &&
(isfinite(min.x) && isfinite(min.y) && isfinite(min.z)) &&
@@ -82,7 +119,7 @@ public:
BoundBox transformed(const Transform *tfm)
{
- BoundBox result;
+ BoundBox result = BoundBox::empty;
for(int i = 0; i < 8; i++) {
float3 p;
@@ -98,6 +135,31 @@ public:
}
};
+__forceinline BoundBox merge(const BoundBox& bbox, const float3& pt)
+{
+ return BoundBox(min(bbox.min, pt), max(bbox.max, pt));
+}
+
+__forceinline BoundBox merge(const BoundBox& a, const BoundBox& b)
+{
+ return BoundBox(min(a.min, b.min), max(a.max, b.max));
+}
+
+__forceinline BoundBox merge(const BoundBox& a, const BoundBox& b, const BoundBox& c, const BoundBox& d)
+{
+ return merge(merge(a, b), merge(c, d));
+}
+
+__forceinline BoundBox intersect(const BoundBox& a, const BoundBox& b)
+{
+ return BoundBox(max(a.min, b.min), min(a.max, b.max));
+}
+
+__forceinline BoundBox intersect(const BoundBox& a, const BoundBox& b, const BoundBox& c)
+{
+ return intersect(a, intersect(b, c));
+}
+
CCL_NAMESPACE_END
#endif /* __UTIL_BOUNDBOX_H__ */
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 019dede07fa..33e351c74e9 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -182,93 +182,74 @@ __device_inline float average(const float2 a)
__device_inline float2 operator-(const float2 a)
{
- float2 r = {-a.x, -a.y};
- return r;
+ return make_float2(-a.x, -a.y);
}
__device_inline float2 operator*(const float2 a, const float2 b)
{
- float2 r = {a.x*b.x, a.y*b.y};
- return r;
+ return make_float2(a.x*b.x, a.y*b.y);
}
__device_inline float2 operator*(const float2 a, float f)
{
- float2 r = {a.x*f, a.y*f};
- return r;
+ return make_float2(a.x*f, a.y*f);
}
__device_inline float2 operator*(float f, const float2 a)
{
- float2 r = {a.x*f, a.y*f};
- return r;
+ return make_float2(a.x*f, a.y*f);
}
__device_inline float2 operator/(float f, const float2 a)
{
- float2 r = {f/a.x, f/a.y};
- return r;
+ return make_float2(f/a.x, f/a.y);
}
__device_inline float2 operator/(const float2 a, float f)
{
float invf = 1.0f/f;
- float2 r = {a.x*invf, a.y*invf};
- return r;
+ return make_float2(a.x*invf, a.y*invf);
}
__device_inline float2 operator/(const float2 a, const float2 b)
{
- float2 r = {a.x/b.x, a.y/b.y};
- return r;
+ return make_float2(a.x/b.x, a.y/b.y);
}
__device_inline float2 operator+(const float2 a, const float2 b)
{
- float2 r = {a.x+b.x, a.y+b.y};
- return r;
+ return make_float2(a.x+b.x, a.y+b.y);
}
__device_inline float2 operator-(const float2 a, const float2 b)
{
- float2 r = {a.x-b.x, a.y-b.y};
- return r;
+ return make_float2(a.x-b.x, a.y-b.y);
}
__device_inline float2 operator+=(float2& a, const float2 b)
{
- a.x += b.x;
- a.y += b.y;
- return a;
+ return a = a + b;
}
__device_inline float2 operator*=(float2& a, const float2 b)
{
- a.x *= b.x;
- a.y *= b.y;
- return a;
+ return a = a * b;
}
__device_inline float2 operator*=(float2& a, float f)
{
- a.x *= f;
- a.y *= f;
- return a;
+ return a = a * f;
}
__device_inline float2 operator/=(float2& a, const float2 b)
{
- a.x /= b.x;
- a.y /= b.y;
- return a;
+ return a = a / b;
}
__device_inline float2 operator/=(float2& a, float f)
{
float invf = 1.0f/f;
- a.x *= invf;
- a.y *= invf;
- return a;
+ return a = a * invf;
}
@@ -314,14 +295,12 @@ __device_inline bool operator!=(const float2 a, const float2 b)
__device_inline float2 min(float2 a, float2 b)
{
- float2 r = {min(a.x, b.x), min(a.y, b.y)};
- return r;
+ return make_float2(min(a.x, b.x), min(a.y, b.y));
}
__device_inline float2 max(float2 a, float2 b)
{
- float2 r = {max(a.x, b.x), max(a.y, b.y)};
- return r;
+ return make_float2(max(a.x, b.x), max(a.y, b.y));
}
__device_inline float2 clamp(float2 a, float2 mn, float2 mx)
@@ -361,112 +340,78 @@ __device_inline float2 interp(float2 a, float2 b, float t)
/* Float3 Vector */
-__device_inline bool is_zero(const float3 a)
-{
- return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f);
-}
-
-__device_inline float average(const float3 a)
-{
- return (a.x + a.y + a.z)*(1.0f/3.0f);
-}
-
#ifndef __KERNEL_OPENCL__
__device_inline float3 operator-(const float3 a)
{
- float3 r = make_float3(-a.x, -a.y, -a.z);
- return r;
+ return make_float3(-a.x, -a.y, -a.z);
}
__device_inline float3 operator*(const float3 a, const float3 b)
{
- float3 r = make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
- return r;
+ return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
}
__device_inline float3 operator*(const float3 a, float f)
{
- float3 r = make_float3(a.x*f, a.y*f, a.z*f);
- return r;
+ return make_float3(a.x*f, a.y*f, a.z*f);
}
__device_inline float3 operator*(float f, const float3 a)
{
- float3 r = make_float3(a.x*f, a.y*f, a.z*f);
- return r;
+ return make_float3(a.x*f, a.y*f, a.z*f);
}
__device_inline float3 operator/(float f, const float3 a)
{
- float3 r = make_float3(f/a.x, f/a.y, f/a.z);
- return r;
+ return make_float3(f/a.x, f/a.y, f/a.z);
}
__device_inline float3 operator/(const float3 a, float f)
{
float invf = 1.0f/f;
- float3 r = make_float3(a.x*invf, a.y*invf, a.z*invf);
- return r;
+ return make_float3(a.x*invf, a.y*invf, a.z*invf);
}
__device_inline float3 operator/(const float3 a, const float3 b)
{
- float3 r = make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
- return r;
+ return make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
}
__device_inline float3 operator+(const float3 a, const float3 b)
{
- float3 r = make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
- return r;
+ return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
}
__device_inline float3 operator-(const float3 a, const float3 b)
{
- float3 r = make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
- return r;
+ return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
}
__device_inline float3 operator+=(float3& a, const float3 b)
{
- a.x += b.x;
- a.y += b.y;
- a.z += b.z;
- return a;
+ return a = a + b;
}
__device_inline float3 operator*=(float3& a, const float3 b)
{
- a.x *= b.x;
- a.y *= b.y;
- a.z *= b.z;
- return a;
+ return a = a * b;
}
__device_inline float3 operator*=(float3& a, float f)
{
- a.x *= f;
- a.y *= f;
- a.z *= f;
- return a;
+ return a = a * f;
}
__device_inline float3 operator/=(float3& a, const float3 b)
{
- a.x /= b.x;
- a.y /= b.y;
- a.z /= b.z;
- return a;
+ return a = a / b;
}
__device_inline float3 operator/=(float3& a, float f)
{
float invf = 1.0f/f;
- a.x *= invf;
- a.y *= invf;
- a.z *= invf;
- return a;
+ return a = a * invf;
}
__device_inline float dot(const float3 a, const float3 b)
@@ -506,7 +451,11 @@ __device_inline float3 normalize_len(const float3 a, float *t)
__device_inline bool operator==(const float3 a, const float3 b)
{
+#ifdef __KERNEL_SSE__
+ return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
+#else
return (a.x == b.x && a.y == b.y && a.z == b.z);
+#endif
}
__device_inline bool operator!=(const float3 a, const float3 b)
@@ -516,14 +465,20 @@ __device_inline bool operator!=(const float3 a, const float3 b)
__device_inline float3 min(float3 a, float3 b)
{
- float3 r = make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
- return r;
+#ifdef __KERNEL_SSE__
+ return _mm_min_ps(a.m128, b.m128);
+#else
+ return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+#endif
}
__device_inline float3 max(float3 a, float3 b)
{
- float3 r = make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
- return r;
+#ifdef __KERNEL_SSE__
+ return _mm_max_ps(a.m128, b.m128);
+#else
+ return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+#endif
}
__device_inline float3 clamp(float3 a, float3 mn, float3 mx)
@@ -533,7 +488,12 @@ __device_inline float3 clamp(float3 a, float3 mn, float3 mx)
__device_inline float3 fabs(float3 a)
{
+#ifdef __KERNEL_SSE__
+ __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+ return _mm_and_ps(a.m128, mask);
+#else
return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
+#endif
}
#endif
@@ -555,6 +515,16 @@ __device_inline void print_float3(const char *label, const float3& a)
printf("%s: %.8f %.8f %.8f\n", label, a.x, a.y, a.z);
}
+__device_inline float3 rcp(const float3& a)
+{
+#ifdef __KERNEL_SSE__
+ float4 r = _mm_rcp_ps(a.m128);
+ return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+#else
+ return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z);
+#endif
+}
+
#endif
__device_inline float3 interp(float3 a, float3 b, float t)
@@ -562,122 +532,257 @@ __device_inline float3 interp(float3 a, float3 b, float t)
return a + t*(b - a);
}
+__device_inline bool is_zero(const float3 a)
+{
+#ifdef __KERNEL_SSE__
+ return a == make_float3(0.0f);
+#else
+ return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f);
+#endif
+}
+
+__device_inline float reduce_add(const float3& a)
+{
+#ifdef __KERNEL_SSE__
+ return (a.x + a.y + a.z);
+#else
+ return (a.x + a.y + a.z);
+#endif
+}
+
+__device_inline float average(const float3 a)
+{
+ return reduce_add(a)*(1.0f/3.0f);
+}
+
/* Float4 Vector */
-#ifndef __KERNEL_OPENCL__
+#ifdef __KERNEL_SSE__
-__device_inline bool is_zero(const float4& a)
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __forceinline const float4 shuffle(const float4& b)
{
- return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
+ return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
}
-__device_inline float average(const float4& a)
+template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b)
{
- return (a.x + a.y + a.z + a.w)*(1.0f/4.0f);
+ return _mm_moveldup_ps(b);
}
+template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b)
+{
+ return _mm_movehdup_ps(b);
+}
+
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b)
+{
+ return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b)));
+}
+
+#endif
+
+#ifndef __KERNEL_OPENCL__
+
__device_inline float4 operator-(const float4& a)
{
- float4 r = {-a.x, -a.y, -a.z, -a.w};
- return r;
+#ifdef __KERNEL_SSE__
+ __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+ return _mm_xor_ps(a.m128, mask);
+#else
+ return make_float4(-a.x, -a.y, -a.z, -a.w);
+#endif
}
__device_inline float4 operator*(const float4& a, const float4& b)
{
- float4 r = {a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w};
- return r;
+#ifdef __KERNEL_SSE__
+ return _mm_mul_ps(a.m128, b.m128);
+#else
+ return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
+#endif
}
__device_inline float4 operator*(const float4& a, float f)
{
- float4 r = {a.x*f, a.y*f, a.z*f, a.w*f};
- return r;
+#ifdef __KERNEL_SSE__
+ return a * make_float4(f);
+#else
+ return make_float4(a.x*f, a.y*f, a.z*f, a.w*f);
+#endif
}
__device_inline float4 operator*(float f, const float4& a)
{
- float4 r = {a.x*f, a.y*f, a.z*f, a.w*f};
- return r;
+ return a * f;
+}
+
+__device_inline float4 rcp(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+ float4 r = _mm_rcp_ps(a.m128);
+ return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+#else
+ return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w);
+#endif
}
__device_inline float4 operator/(const float4& a, float f)
{
- float invf = 1.0f/f;
- float4 r = {a.x*invf, a.y*invf, a.z*invf, a.w*invf};
- return r;
+ return a * (1.0f/f);
}
__device_inline float4 operator/(const float4& a, const float4& b)
{
- float4 r = {a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w};
- return r;
+#ifdef __KERNEL_SSE__
+ return a * rcp(b);
+#else
+ return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
+#endif
+
}
__device_inline float4 operator+(const float4& a, const float4& b)
{
- float4 r = {a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w};
- return r;
+#ifdef __KERNEL_SSE__
+ return _mm_add_ps(a.m128, b.m128);
+#else
+ return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+#endif
}
__device_inline float4 operator-(const float4& a, const float4& b)
{
- float4 r = {a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w};
- return r;
+#ifdef __KERNEL_SSE__
+ return _mm_sub_ps(a.m128, b.m128);
+#else
+ return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
+#endif
}
__device_inline float4 operator+=(float4& a, const float4& b)
{
- a.x += b.x;
- a.y += b.y;
- a.z += b.z;
- a.w += b.w;
- return a;
+ return a = a + b;
}
__device_inline float4 operator*=(float4& a, const float4& b)
{
- a.x *= b.x;
- a.y *= b.y;
- a.z *= b.z;
- a.w *= b.w;
- return a;
+ return a = a * b;
}
__device_inline float4 operator/=(float4& a, float f)
{
- float invf = 1.0f/f;
- a.x *= invf;
- a.y *= invf;
- a.z *= invf;
- a.w *= invf;
- return a;
+ return a = a / f;
}
-__device_inline float dot(const float4& a, const float4& b)
+__device_inline int4 operator<(const float4& a, const float4& b)
{
- return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
+#ifdef __KERNEL_SSE__
+ return _mm_cvtps_epi32(_mm_cmplt_ps(a.m128, b.m128)); /* todo: avoid cvt */
+#else
+ return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
+#endif
+}
+
+__device_inline int4 operator>=(float4 a, float4 b)
+{
+#ifdef __KERNEL_SSE__
+ return _mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128)); /* todo: avoid cvt */
+#else
+ return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
+#endif
+}
+
+__device_inline int4 operator<=(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+ return _mm_cvtps_epi32(_mm_cmple_ps(a.m128, b.m128)); /* todo: avoid cvt */
+#else
+ return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w);
+#endif
+}
+
+__device_inline bool operator==(const float4 a, const float4 b)
+{
+#ifdef __KERNEL_SSE__
+ return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
+#else
+ return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w);
+#endif
}
__device_inline float4 cross(const float4& a, const float4& b)
{
- float4 r = {a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0.0f};
- return r;
+#ifdef __KERNEL_SSE__
+ return (shuffle<1,2,0,0>(a)*shuffle<2,0,1,0>(b)) - (shuffle<2,0,1,0>(a)*shuffle<1,2,0,0>(b));
+#else
+ return make_float4(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0.0f);
+#endif
}
__device_inline float4 min(float4 a, float4 b)
{
+#ifdef __KERNEL_SSE__
+ return _mm_min_ps(a.m128, b.m128);
+#else
return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+#endif
}
__device_inline float4 max(float4 a, float4 b)
{
+#ifdef __KERNEL_SSE__
+ return _mm_max_ps(a.m128, b.m128);
+#else
return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+#endif
}
#endif
#ifndef __KERNEL_GPU__
+__device_inline float4 select(const int4& mask, const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+ /* blendv is sse4, and apparently broken on vs2008 */
+ return _mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a), _mm_andnot_ps(_mm_cvtepi32_ps(mask), b)); /* todo: avoid cvt */
+#else
+ return make_float4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? a.w: b.w);
+#endif
+}
+
+__device_inline float4 reduce_min(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+ float4 h = min(shuffle<1,0,3,2>(a), a);
+ return min(shuffle<2,3,0,1>(h), h);
+#else
+ return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
+#endif
+}
+
+__device_inline float4 reduce_max(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+ float4 h = max(shuffle<1,0,3,2>(a), a);
+ return max(shuffle<2,3,0,1>(h), h);
+#else
+ return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
+#endif
+}
+
+#if 0
+__device_inline float4 reduce_add(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+ float4 h = shuffle<1,0,3,2>(a) + a;
+ return shuffle<2,3,0,1>(h) + h;
+#else
+ return make_float4((a.x + a.y) + (a.z + a.w));
+#endif
+}
+#endif
+
__device_inline void print_float4(const char *label, const float4& a)
{
printf("%s: %.8f %.8f %.8f %.8f\n", label, a.x, a.y, a.z, a.w);
@@ -685,26 +790,77 @@ __device_inline void print_float4(const char *label, const float4& a)
#endif
+#ifndef __KERNEL_OPENCL__
+
+__device_inline bool is_zero(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+ return a == make_float4(0.0f);
+#else
+ return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
+#endif
+}
+
+__device_inline float reduce_add(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+ float4 h = shuffle<1,0,3,2>(a) + a;
+ return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); /* todo: efficiency? */
+#else
+ return ((a.x + a.y) + (a.z + a.w));
+#endif
+}
+
+__device_inline float average(const float4& a)
+{
+ return reduce_add(a) * 0.25f;
+}
+
+__device_inline float dot(const float4& a, const float4& b)
+{
+ return reduce_add(a * b);
+}
+
+#endif
+
/* Int3 */
#ifndef __KERNEL_OPENCL__
+__device_inline int3 min(int3 a, int3 b)
+{
+#ifdef __KERNEL_SSE__
+ return _mm_min_epi32(a.m128, b.m128);
+#else
+ return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+#endif
+}
+
__device_inline int3 max(int3 a, int3 b)
{
- int3 r = {max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)};
- return r;
+#ifdef __KERNEL_SSE__
+ return _mm_max_epi32(a.m128, b.m128);
+#else
+ return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+#endif
}
__device_inline int3 clamp(const int3& a, int mn, int mx)
{
- int3 r = {clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx)};
- return r;
+#ifdef __KERNEL_SSE__
+ return min(max(a, make_int3(mn)), make_int3(mx));
+#else
+ return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx));
+#endif
}
__device_inline int3 clamp(const int3& a, int3& mn, int mx)
{
- int3 r = {clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx)};
- return r;
+#ifdef __KERNEL_SSE__
+ return min(max(a, mn), make_int3(mx));
+#else
+ return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx));
+#endif
}
#endif
@@ -720,16 +876,63 @@ __device_inline void print_int3(const char *label, const int3& a)
/* Int4 */
-#ifndef __KERNEL_OPENCL__
+#ifndef __KERNEL_GPU__
-__device_inline int4 operator>=(float4 a, float4 b)
+__device_inline int4 operator+(const int4& a, const int4& b)
{
- return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
+#ifdef __KERNEL_SSE__
+ return _mm_add_epi32(a.m128, b.m128);
+#else
+ return make_int4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+#endif
+}
+
+__device_inline int4 operator+=(int4& a, const int4& b)
+{
+ return a = a + b;
}
+__device_inline int4 operator>>(const int4& a, int i)
+{
+#ifdef __KERNEL_SSE__
+ return _mm_srai_epi32(a.m128, i);
+#else
+ return make_int4(a.x >> i, a.y >> i, a.z >> i, a.w >> i);
#endif
+}
-#ifndef __KERNEL_GPU__
+__device_inline int4 min(int4 a, int4 b)
+{
+#ifdef __KERNEL_SSE__
+ return _mm_min_epi32(a.m128, b.m128);
+#else
+ return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+#endif
+}
+
+__device_inline int4 max(int4 a, int4 b)
+{
+#ifdef __KERNEL_SSE__
+ return _mm_max_epi32(a.m128, b.m128);
+#else
+ return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+#endif
+}
+
+__device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx)
+{
+ return min(max(a, mn), mx);
+}
+
+__device_inline int4 select(const int4& mask, const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+ __m128 m = _mm_cvtepi32_ps(mask);
+ return _mm_castps_si128(_mm_or_ps(_mm_and_ps(m, _mm_castsi128_ps(a)), _mm_andnot_ps(m, _mm_castsi128_ps(b)))); /* todo: avoid cvt */
+#else
+ return make_int4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? a.w: b.w);
+#endif
+}
__device_inline void print_int4(const char *label, const int4& a)
{
diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp
new file mode 100644
index 00000000000..6da9a70ec0c
--- /dev/null
+++ b/intern/cycles/util/util_task.cpp
@@ -0,0 +1,223 @@
+/*
+ * Copyright 2011, Blender Foundation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "util_debug.h"
+#include "util_foreach.h"
+#include "util_system.h"
+#include "util_task.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Task Pool */
+
+TaskPool::TaskPool(const TaskRunFunction& run_)
+{
+ num = 0;
+ num_done = 0;
+
+ do_cancel = false;
+
+ run = run_;
+}
+
+TaskPool::~TaskPool()
+{
+ stop();
+}
+
+void TaskPool::push(Task *task, bool front)
+{
+ TaskScheduler::Entry entry;
+
+ entry.task = task;
+ entry.pool = this;
+
+ TaskScheduler::push(entry, front);
+}
+
+void TaskPool::wait()
+{
+ thread_scoped_lock lock(done_mutex);
+
+ while(num_done != num)
+ done_cond.wait(lock);
+}
+
+void TaskPool::cancel()
+{
+ TaskScheduler::clear(this);
+
+ do_cancel = true;
+ wait();
+ do_cancel = false;
+}
+
+void TaskPool::stop()
+{
+ TaskScheduler::clear(this);
+
+ assert(num_done == num);
+}
+
+bool TaskPool::cancelled()
+{
+ return do_cancel;
+}
+
+void TaskPool::done_increase(int done)
+{
+ done_mutex.lock();
+ num_done += done;
+ done_mutex.unlock();
+
+ assert(num_done <= num);
+ done_cond.notify_all();
+}
+
+/* Task Scheduler */
+
+thread_mutex TaskScheduler::mutex;
+int TaskScheduler::users = 0;
+vector<thread*> TaskScheduler::threads;
+volatile bool TaskScheduler::do_exit = false;
+
+list<TaskScheduler::Entry> TaskScheduler::queue;
+thread_mutex TaskScheduler::queue_mutex;
+thread_condition_variable TaskScheduler::queue_cond;
+
+void TaskScheduler::init(int num_threads)
+{
+ thread_scoped_lock lock(mutex);
+
+ /* multiple cycles instances can use this task scheduler, sharing the same
+ threads, so we keep track of the number of users. */
+ if(users == 0) {
+ do_exit = false;
+
+ /* launch threads that will be waiting for work */
+ if(num_threads == 0)
+ num_threads = system_cpu_thread_count();
+
+ threads.resize(num_threads);
+
+ for(size_t i = 0; i < threads.size(); i++)
+ threads[i] = new thread(function_bind(&TaskScheduler::thread_run, i));
+ }
+
+ users++;
+}
+
+void TaskScheduler::exit()
+{
+ thread_scoped_lock lock(mutex);
+
+ users--;
+
+ if(users == 0) {
+ /* stop all waiting threads */
+ do_exit = true;
+ TaskScheduler::queue_cond.notify_all();
+
+ /* delete threads */
+ foreach(thread *t, threads) {
+ t->join();
+ delete t;
+ }
+
+ threads.clear();
+ }
+}
+
+bool TaskScheduler::thread_wait_pop(Entry& entry)
+{
+ thread_scoped_lock lock(queue_mutex);
+
+ while(queue.empty() && !do_exit)
+ queue_cond.wait(lock);
+
+ if(queue.empty()) {
+ assert(do_exit);
+ return false;
+ }
+
+ entry = queue.front();
+ queue.pop_front();
+
+ return true;
+}
+
+void TaskScheduler::thread_run(int thread_id)
+{
+ Entry entry;
+
+ /* todo: test affinity/denormal mask */
+
+ /* keep popping off tasks */
+ while(thread_wait_pop(entry)) {
+ /* run task */
+ entry.pool->run(entry.task, thread_id);
+
+ /* delete task */
+ delete entry.task;
+
+ /* notify pool task was done */
+ entry.pool->done_increase(1);
+ }
+}
+
+void TaskScheduler::push(Entry& entry, bool front)
+{
+ /* add entry to queue */
+ TaskScheduler::queue_mutex.lock();
+ if(front)
+ TaskScheduler::queue.push_front(entry);
+ else
+ TaskScheduler::queue.push_back(entry);
+ entry.pool->num++;
+ TaskScheduler::queue_mutex.unlock();
+
+ TaskScheduler::queue_cond.notify_one();
+}
+
+void TaskScheduler::clear(TaskPool *pool)
+{
+ thread_scoped_lock lock(TaskScheduler::queue_mutex);
+
+ /* erase all tasks from this pool from the queue */
+ list<TaskScheduler::Entry>::iterator it = TaskScheduler::queue.begin();
+ int done = 0;
+
+ while(it != TaskScheduler::queue.end()) {
+ TaskScheduler::Entry& entry = *it;
+
+ if(entry.pool == pool) {
+ done++;
+ delete entry.task;
+
+ it = TaskScheduler::queue.erase(it);
+ }
+ else
+ it++;
+ }
+
+ /* notify done */
+ pool->done_increase(done);
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/util/util_task.h b/intern/cycles/util/util_task.h
new file mode 100644
index 00000000000..acdb2cb50a2
--- /dev/null
+++ b/intern/cycles/util/util_task.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2011, Blender Foundation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __UTIL_TASK_H__
+#define __UTIL_TASK_H__
+
+#include "util_list.h"
+#include "util_thread.h"
+#include "util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Task;
+class TaskPool;
+class TaskScheduler;
+
+typedef boost::function<void(Task*,int)> TaskRunFunction;
+
+/* Task
+ *
+ * Base class for tasks to be executed in threads. */
+
+class Task
+{
+public:
+ Task() {};
+ virtual ~Task() {}
+};
+
+/* Task Pool
+ *
+ * Pool of tasks that will be executed by the central TaskScheduler.For each
+ * pool, we can wait for all tasks to be done, or cancel them before they are
+ * done.
+ *
+ * The run callback that actually executes the task may be create like this:
+ * function_bind(&MyClass::task_execute, this, _1, _2) */
+
+class TaskPool
+{
+public:
+ TaskPool(const TaskRunFunction& run);
+ ~TaskPool();
+
+ void push(Task *task, bool front = false);
+
+ void wait(); /* wait until all tasks are done */
+ void cancel(); /* cancel all tasks, keep worker threads running */
+ void stop(); /* stop all worker threads */
+
+ bool cancelled(); /* for worker threads, test if cancelled */
+
+protected:
+ friend class TaskScheduler;
+
+ void done_increase(int done);
+
+ TaskRunFunction run;
+
+ thread_mutex done_mutex;
+ thread_condition_variable done_cond;
+
+ volatile int num, num_done;
+ volatile bool do_cancel;
+};
+
+/* Task Scheduler
+ *
+ * Central scheduler that holds running threads ready to execute tasks. A singe
+ * queue holds the task from all pools. */
+
+class TaskScheduler
+{
+public:
+ static void init(int num_threads = 0);
+ static void exit();
+
+ static int num_threads() { return threads.size(); }
+
+protected:
+ friend class TaskPool;
+
+ struct Entry {
+ Task *task;
+ TaskPool *pool;
+ };
+
+ static thread_mutex mutex;
+ static int users;
+ static vector<thread*> threads;
+ static volatile bool do_exit;
+
+ static list<Entry> queue;
+ static thread_mutex queue_mutex;
+ static thread_condition_variable queue_cond;
+
+ static void thread_run(int thread_id);
+ static bool thread_wait_pop(Entry& entry);
+
+ static void push(Entry& entry, bool front);
+ static void clear(TaskPool *pool);
+};
+
+CCL_NAMESPACE_END
+
+#endif
+
diff --git a/intern/cycles/util/util_thread.h b/intern/cycles/util/util_thread.h
index 6836be203f5..3d15b342fe5 100644
--- a/intern/cycles/util/util_thread.h
+++ b/intern/cycles/util/util_thread.h
@@ -69,133 +69,6 @@ protected:
bool joined;
};
-/* Thread Safe Queue to pass tasks from one thread to another. Tasks should be
- * pushed into the queue, while the worker thread waits to pop the next task
- * off the queue. Once all tasks are into the queue, calling stop() will stop
- * the worker threads from waiting for more tasks once all tasks are done. */
-
-template<typename T> class ThreadQueue
-{
-public:
- ThreadQueue()
- {
- tot = 0;
- tot_done = 0;
- do_stop = false;
- do_cancel = false;
- }
-
- /* Main thread functions */
-
- /* push a task to be executed */
- void push(const T& value)
- {
- thread_scoped_lock lock(queue_mutex);
- queue.push(value);
- tot++;
- lock.unlock();
-
- queue_cond.notify_one();
- }
-
- /* wait until all tasks are done */
- void wait_done()
- {
- thread_scoped_lock lock(done_mutex);
-
- while(tot_done != tot)
- done_cond.wait(lock);
- }
-
- /* stop all worker threads */
- void stop()
- {
- clear();
- do_stop = true;
- queue_cond.notify_all();
- }
-
- /* cancel all tasks, but keep worker threads running */
- void cancel()
- {
- clear();
- do_cancel = true;
- wait_done();
- do_cancel = false;
- }
-
- /* Worker thread functions
- *
- * while(queue.worker_wait_pop(task)) {
- * for(..) {
- * ... do work ...
- *
- * if(queue.worker_cancel())
- * break;
- * }
- *
- * queue.worker_done();
- * }
- */
-
- bool worker_wait_pop(T& value)
- {
- thread_scoped_lock lock(queue_mutex);
-
- while(queue.empty() && !do_stop)
- queue_cond.wait(lock);
-
- if(queue.empty())
- return false;
-
- value = queue.front();
- queue.pop();
-
- return true;
- }
-
- void worker_done()
- {
- thread_scoped_lock lock(done_mutex);
- tot_done++;
- lock.unlock();
-
- assert(tot_done <= tot);
-
- done_cond.notify_all();
- }
-
- bool worker_cancel()
- {
- return do_cancel;
- }
-
-protected:
- void clear()
- {
- thread_scoped_lock lock(queue_mutex);
-
- while(!queue.empty()) {
- thread_scoped_lock done_lock(done_mutex);
- tot_done++;
- done_lock.unlock();
-
- queue.pop();
- }
-
- done_cond.notify_all();
- }
-
- std::queue<T> queue;
- thread_mutex queue_mutex;
- thread_mutex done_mutex;
- thread_condition_variable queue_cond;
- thread_condition_variable done_cond;
- volatile bool do_stop;
- volatile bool do_cancel;
- volatile int tot, tot_done;
-};
-
/* Thread Local Storage
*
* Boost implementation is a bit slow, and Mac OS X __thread is not supported
diff --git a/intern/cycles/util/util_transform.cpp b/intern/cycles/util/util_transform.cpp
index 61bc36ae888..0fd26825911 100644
--- a/intern/cycles/util/util_transform.cpp
+++ b/intern/cycles/util/util_transform.cpp
@@ -129,23 +129,26 @@ static bool transform_matrix4_gj_inverse(float R[][4], float M[][4])
Transform transform_inverse(const Transform& tfm)
{
- union { Transform T; float M[4][4]; } R, M;
-
- R.T = transform_identity();
- M.T = tfm;
+ Transform tfmR = transform_identity();
+ float M[4][4], R[4][4];
- if(!transform_matrix4_gj_inverse(R.M, M.M)) {
+ memcpy(R, &tfmR, sizeof(R));
+ memcpy(M, &tfm, sizeof(M));
+
+ if(!transform_matrix4_gj_inverse(R, M)) {
/* matrix is degenerate (e.g. 0 scale on some axis), ideally we should
never be in this situation, but try to invert it anyway with tweak */
- M.M[0][0] += 1e-8f;
- M.M[1][1] += 1e-8f;
- M.M[2][2] += 1e-8f;
+ M[0][0] += 1e-8f;
+ M[1][1] += 1e-8f;
+ M[2][2] += 1e-8f;
- if(!transform_matrix4_gj_inverse(R.M, M.M))
+ if(!transform_matrix4_gj_inverse(R, M))
return transform_identity();
}
- return R.T;
+ memcpy(&tfmR, R, sizeof(R));
+
+ return tfmR;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index efdda98571a..cf167707e47 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -36,23 +36,37 @@
#define __shared
#define __constant
-#ifdef __GNUC__
-#define __device_inline static inline __attribute__((always_inline))
-#else
+#ifdef _WIN32
#define __device_inline static __forceinline
+#define __align(...) __declspec(align(__VA_ARGS__))
+#else
+#define __device_inline static inline __attribute__((always_inline))
+#define __forceinline inline __attribute__((always_inline))
+#define __align(...) __attribute__((aligned(__VA_ARGS__)))
#endif
#endif
+/* Bitness */
+
+#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
+#define __KERNEL_64_BIT__
+#endif
+
/* SIMD Types */
-/* not needed yet, will be for qbvh
-#ifndef __KERNEL_GPU__
+/* not enabled, globally applying it just gives slowdown,
+ * but useful for testing. */
+//#define __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
-#include <emmintrin.h>
-#include <xmmintrin.h>
+#include <xmmintrin.h> /* SSE 1 */
+#include <emmintrin.h> /* SSE 2 */
+#include <pmmintrin.h> /* SSE 3 */
+#include <tmmintrin.h> /* SSE 3 */
+#include <smmintrin.h> /* SSE 4 */
-#endif*/
+#endif
#ifndef _WIN32
#ifndef __KERNEL_GPU__
@@ -97,6 +111,12 @@ typedef unsigned int uint32_t;
typedef long long int64_t;
typedef unsigned long long uint64_t;
+#ifdef __KERNEL_64_BIT__
+typedef int64_t ssize_t;
+#else
+typedef int32_t ssize_t;
+#endif
+
#endif
/* Generic Memory Pointer */
@@ -108,89 +128,137 @@ typedef uint64_t device_ptr;
struct uchar2 {
uchar x, y;
- uchar operator[](int i) const { return *(&x + i); }
- uchar& operator[](int i) { return *(&x + i); }
+ __forceinline uchar operator[](int i) const { return *(&x + i); }
+ __forceinline uchar& operator[](int i) { return *(&x + i); }
};
struct uchar3 {
uchar x, y, z;
- uchar operator[](int i) const { return *(&x + i); }
- uchar& operator[](int i) { return *(&x + i); }
+ __forceinline uchar operator[](int i) const { return *(&x + i); }
+ __forceinline uchar& operator[](int i) { return *(&x + i); }
};
struct uchar4 {
uchar x, y, z, w;
- uchar operator[](int i) const { return *(&x + i); }
- uchar& operator[](int i) { return *(&x + i); }
+ __forceinline uchar operator[](int i) const { return *(&x + i); }
+ __forceinline uchar& operator[](int i) { return *(&x + i); }
};
struct int2 {
int x, y;
- int operator[](int i) const { return *(&x + i); }
- int& operator[](int i) { return *(&x + i); }
+ __forceinline int operator[](int i) const { return *(&x + i); }
+ __forceinline int& operator[](int i) { return *(&x + i); }
};
+#ifdef __KERNEL_SSE__
+struct __align(16) int3 {
+ union {
+ __m128i m128;
+ struct { int x, y, z, w; };
+ };
+
+ __forceinline int3() {}
+ __forceinline int3(const __m128i a) : m128(a) {}
+ __forceinline operator const __m128i&(void) const { return m128; }
+ __forceinline operator __m128i&(void) { return m128; }
+#else
struct int3 {
- int x, y, z;
+ int x, y, z, w;
+#endif
- int operator[](int i) const { return *(&x + i); }
- int& operator[](int i) { return *(&x + i); }
+ __forceinline int operator[](int i) const { return *(&x + i); }
+ __forceinline int& operator[](int i) { return *(&x + i); }
};
+#ifdef __KERNEL_SSE__
+struct __align(16) int4 {
+ union {
+ __m128i m128;
+ struct { int x, y, z, w; };
+ };
+
+ __forceinline int4() {}
+ __forceinline int4(const __m128i a) : m128(a) {}
+ __forceinline operator const __m128i&(void) const { return m128; }
+ __forceinline operator __m128i&(void) { return m128; }
+#else
struct int4 {
int x, y, z, w;
+#endif
- int operator[](int i) const { return *(&x + i); }
- int& operator[](int i) { return *(&x + i); }
+ __forceinline int operator[](int i) const { return *(&x + i); }
+ __forceinline int& operator[](int i) { return *(&x + i); }
};
struct uint2 {
uint x, y;
- uint operator[](int i) const { return *(&x + i); }
- uint& operator[](int i) { return *(&x + i); }
+ __forceinline uint operator[](uint i) const { return *(&x + i); }
+ __forceinline uint& operator[](uint i) { return *(&x + i); }
};
struct uint3 {
uint x, y, z;
- uint operator[](int i) const { return *(&x + i); }
- uint& operator[](int i) { return *(&x + i); }
+ __forceinline uint operator[](uint i) const { return *(&x + i); }
+ __forceinline uint& operator[](uint i) { return *(&x + i); }
};
struct uint4 {
uint x, y, z, w;
- uint operator[](int i) const { return *(&x + i); }
- uint& operator[](int i) { return *(&x + i); }
+ __forceinline uint operator[](uint i) const { return *(&x + i); }
+ __forceinline uint& operator[](uint i) { return *(&x + i); }
};
struct float2 {
float x, y;
- float operator[](int i) const { return *(&x + i); }
- float& operator[](int i) { return *(&x + i); }
+ __forceinline float operator[](int i) const { return *(&x + i); }
+ __forceinline float& operator[](int i) { return *(&x + i); }
};
+#ifdef __KERNEL_SSE__
+struct __align(16) float3 {
+ union {
+ __m128 m128;
+ struct { float x, y, z, w; };
+ };
+
+ __forceinline float3() {}
+ __forceinline float3(const __m128 a) : m128(a) {}
+ __forceinline operator const __m128&(void) const { return m128; }
+ __forceinline operator __m128&(void) { return m128; }
+#else
struct float3 {
- float x, y, z;
-
-#ifdef WITH_OPENCL
- float w;
+ float x, y, z, w;
#endif
- float operator[](int i) const { return *(&x + i); }
- float& operator[](int i) { return *(&x + i); }
+ __forceinline float operator[](int i) const { return *(&x + i); }
+ __forceinline float& operator[](int i) { return *(&x + i); }
};
+#ifdef __KERNEL_SSE__
+struct __align(16) float4 {
+ union {
+ __m128 m128;
+ struct { float x, y, z, w; };
+ };
+
+ __forceinline float4() {}
+ __forceinline float4(const __m128 a) : m128(a) {}
+ __forceinline operator const __m128&(void) const { return m128; }
+ __forceinline operator __m128&(void) { return m128; }
+#else
struct float4 {
float x, y, z, w;
+#endif
- float operator[](int i) const { return *(&x + i); }
- float& operator[](int i) { return *(&x + i); }
+ __forceinline float operator[](int i) const { return *(&x + i); }
+ __forceinline float& operator[](int i) { return *(&x + i); }
};
#endif
@@ -201,87 +269,179 @@ struct float4 {
*
* OpenCL does not support C++ class, so we use these instead. */
-__device uchar2 make_uchar2(uchar x, uchar y)
+__device_inline uchar2 make_uchar2(uchar x, uchar y)
{
uchar2 a = {x, y};
return a;
}
-__device uchar3 make_uchar3(uchar x, uchar y, uchar z)
+__device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z)
{
uchar3 a = {x, y, z};
return a;
}
-__device uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w)
+__device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w)
{
uchar4 a = {x, y, z, w};
return a;
}
-__device int2 make_int2(int x, int y)
+__device_inline int2 make_int2(int x, int y)
{
int2 a = {x, y};
return a;
}
-__device int3 make_int3(int x, int y, int z)
+__device_inline int3 make_int3(int x, int y, int z)
{
- int3 a = {x, y, z};
+#ifdef __KERNEL_SSE__
+ int3 a;
+ a.m128 = _mm_set_epi32(0, z, y, x);
+#else
+ int3 a = {x, y, z, 0};
+#endif
+
return a;
}
-__device int4 make_int4(int x, int y, int z, int w)
+__device_inline int4 make_int4(int x, int y, int z, int w)
{
+#ifdef __KERNEL_SSE__
+ int4 a;
+ a.m128 = _mm_set_epi32(w, z, y, x);
+#else
int4 a = {x, y, z, w};
+#endif
+
return a;
}
-__device uint2 make_uint2(uint x, uint y)
+__device_inline uint2 make_uint2(uint x, uint y)
{
uint2 a = {x, y};
return a;
}
-__device uint3 make_uint3(uint x, uint y, uint z)
+__device_inline uint3 make_uint3(uint x, uint y, uint z)
{
uint3 a = {x, y, z};
return a;
}
-__device uint4 make_uint4(uint x, uint y, uint z, uint w)
+__device_inline uint4 make_uint4(uint x, uint y, uint z, uint w)
{
uint4 a = {x, y, z, w};
return a;
}
-__device float2 make_float2(float x, float y)
+__device_inline float2 make_float2(float x, float y)
{
float2 a = {x, y};
return a;
}
-__device float3 make_float3(float x, float y, float z)
+__device_inline float3 make_float3(float x, float y, float z)
{
-#ifdef WITH_OPENCL
- float3 a = {x, y, z, 0.0f};
+#ifdef __KERNEL_SSE__
+ float3 a;
+ a.m128 = _mm_set_ps(0.0f, z, y, x);
#else
- float3 a = {x, y, z};
+ float3 a = {x, y, z, 0.0f};
#endif
+
return a;
}
-__device float4 make_float4(float x, float y, float z, float w)
+__device_inline float4 make_float4(float x, float y, float z, float w)
{
+#ifdef __KERNEL_SSE__
+ float4 a;
+ a.m128 = _mm_set_ps(w, z, y, x);
+#else
float4 a = {x, y, z, w};
+#endif
+
return a;
}
-__device int align_up(int offset, int alignment)
+__device_inline int align_up(int offset, int alignment)
{
return (offset + alignment - 1) & ~(alignment - 1);
}
+__device_inline int3 make_int3(int i)
+{
+#ifdef __KERNEL_SSE__
+ int3 a;
+ a.m128 = _mm_set1_epi32(i);
+#else
+ int3 a = {i, i, i, i};
+#endif
+
+ return a;
+}
+
+__device_inline int4 make_int4(int i)
+{
+#ifdef __KERNEL_SSE__
+ int4 a;
+ a.m128 = _mm_set1_epi32(i);
+#else
+ int4 a = {i, i, i, i};
+#endif
+
+ return a;
+}
+
+__device_inline float3 make_float3(float f)
+{
+#ifdef __KERNEL_SSE__
+ float3 a;
+ a.m128 = _mm_set1_ps(f);
+#else
+ float3 a = {f, f, f, f};
+#endif
+
+ return a;
+}
+
+__device_inline float4 make_float4(float f)
+{
+#ifdef __KERNEL_SSE__
+ float4 a;
+ a.m128 = _mm_set1_ps(f);
+#else
+ float4 a = {f, f, f, f};
+#endif
+
+ return a;
+}
+
+__device_inline float4 make_float4(const int4& i)
+{
+#ifdef __KERNEL_SSE__
+ float4 a;
+ a.m128 = _mm_cvtepi32_ps(i.m128);
+#else
+ float4 a = {(float)i.x, (float)i.y, (float)i.z, (float)i.w};
+#endif
+
+ return a;
+}
+
+__device_inline int4 make_int4(const float3& f)
+{
+#ifdef __KERNEL_SSE__
+ int4 a;
+ a.m128 = _mm_cvtps_epi32(f.m128);
+#else
+ int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w};
+#endif
+
+ return a;
+}
+
#endif
CCL_NAMESPACE_END