8 files changed, 992 insertions, 344 deletions
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 9182ee4cbe1..87bd84b4e0f 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -15,6 +15,7 @@ set(SRC
 	util_path.cpp
 	util_string.cpp
 	util_system.cpp
+	util_task.cpp
 	util_time.cpp
 	util_transform.cpp
 )
@@ -50,6 +51,7 @@ set(SRC_HEADERS
 	util_set.h
 	util_string.h
 	util_system.h
+	util_task.h
 	util_thread.h
 	util_time.h
 	util_transform.h
diff --git a/intern/cycles/util/util_boundbox.h b/intern/cycles/util/util_boundbox.h
index bb1df0b220f..9511b48e103 100644
--- a/intern/cycles/util/util_boundbox.h
+++ b/intern/cycles/util/util_boundbox.h
@@ -23,6 +23,7 @@
 #include <float.h>
 
 #include "util_math.h"
+#include "util_string.h"
 #include "util_transform.h"
 #include "util_types.h"
 
@@ -35,45 +36,81 @@ class BoundBox
 public:
 	float3 min, max;
 
-	BoundBox(void)
+	__forceinline BoundBox()
 	{
-		min = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
-		max = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
 	}
 
-	BoundBox(const float3& min_, const float3& max_)
+	__forceinline BoundBox(const float3& pt)
+	: min(pt), max(pt)
+	{
+	}
+
+	__forceinline BoundBox(const float3& min_, const float3& max_)
 	: min(min_), max(max_)
 	{
 	}
 
-	void grow(const float3& pt)  
+	static struct empty_t {} empty;
+
+	__forceinline BoundBox(empty_t)
+	: min(make_float3(FLT_MAX, FLT_MAX, FLT_MAX)), max(make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX))
+	{
+	}
+
+	__forceinline void grow(const float3& pt)  
 	{
 		min = ccl::min(min, pt);
 		max = ccl::max(max, pt);
 	}
 
-	void grow(const BoundBox& bbox)
+	__forceinline void grow(const BoundBox& bbox)
 	{
 		grow(bbox.min);
 		grow(bbox.max);
 	}
 
-	void intersect(const BoundBox& bbox) 
+	__forceinline void intersect(const BoundBox& bbox) 
 	{
 		min = ccl::max(min, bbox.min);
 		max = ccl::min(max, bbox.max);
 	}
 
-	float area(void) const
+	/* todo: avoid using this */
+	__forceinline float safe_area() const
 	{
-		if(!valid())
+		if(!((min.x <= max.x) && (min.y <= max.y) && (min.z <= max.z)))
 			return 0.0f;
 
+		return area();
+	}
+
+	__forceinline float area() const
+	{
+		return half_area()*2.0f;
+	}
+
+	__forceinline float half_area() const
+	{
 		float3 d = max - min;
-		return dot(d, d)*2.0f;
+		return (d.x*d.z + d.y*d.z + d.x*d.y);
+	}
+
+	__forceinline float3 center() const
+	{
+		return 0.5f*(min + max);
 	}
 
-	bool valid(void) const
+	__forceinline float3 center2() const
+	{
+		return min + max;
+	}
+
+	__forceinline float3 size() const
+	{
+		return max - min;
+	}
+	
+	__forceinline bool valid() const
 	{
 		return (min.x <= max.x) && (min.y <= max.y) && (min.z <= max.z) &&
 		       (isfinite(min.x) && isfinite(min.y) && isfinite(min.z)) &&
@@ -82,7 +119,7 @@ public:
 
 	BoundBox transformed(const Transform *tfm)
 	{
-		BoundBox result;
+		BoundBox result = BoundBox::empty;
 
 		for(int i = 0; i < 8; i++) {
 			float3 p;
@@ -98,6 +135,31 @@ public:
 	}
 };
 
+__forceinline BoundBox merge(const BoundBox& bbox, const float3& pt)
+{
+	return BoundBox(min(bbox.min, pt), max(bbox.max, pt));
+}
+
+__forceinline BoundBox merge(const BoundBox& a, const BoundBox& b)
+{
+	return BoundBox(min(a.min, b.min), max(a.max, b.max));
+}
+
+__forceinline BoundBox merge(const BoundBox& a, const BoundBox& b, const BoundBox& c, const BoundBox& d)
+{
+	return merge(merge(a, b), merge(c, d));
+}
+
+__forceinline BoundBox intersect(const BoundBox& a, const BoundBox& b)
+{
+	return BoundBox(max(a.min, b.min), min(a.max, b.max));
+}
+
+__forceinline BoundBox intersect(const BoundBox& a, const BoundBox& b, const BoundBox& c)
+{
+	return intersect(a, intersect(b, c));
+}
+
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_BOUNDBOX_H__ */
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 019dede07fa..33e351c74e9 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -182,93 +182,74 @@ __device_inline float average(const float2 a)
 
 __device_inline float2 operator-(const float2 a)
 {
-	float2 r = {-a.x, -a.y};
-	return r;
+	return make_float2(-a.x, -a.y);
 }
 
 __device_inline float2 operator*(const float2 a, const float2 b)
 {
-	float2 r = {a.x*b.x, a.y*b.y};
-	return r;
+	return make_float2(a.x*b.x, a.y*b.y);
 }
 
 __device_inline float2 operator*(const float2 a, float f)
 {
-	float2 r = {a.x*f, a.y*f};
-	return r;
+	return make_float2(a.x*f, a.y*f);
 }
 
 __device_inline float2 operator*(float f, const float2 a)
 {
-	float2 r = {a.x*f, a.y*f};
-	return r;
+	return make_float2(a.x*f, a.y*f);
 }
 
 __device_inline float2 operator/(float f, const float2 a)
 {
-	float2 r = {f/a.x, f/a.y};
-	return r;
+	return make_float2(f/a.x, f/a.y);
 }
 
 __device_inline float2 operator/(const float2 a, float f)
 {
 	float invf = 1.0f/f;
-	float2 r = {a.x*invf, a.y*invf};
-	return r;
+	return make_float2(a.x*invf, a.y*invf);
 }
 
 __device_inline float2 operator/(const float2 a, const float2 b)
 {
-	float2 r = {a.x/b.x, a.y/b.y};
-	return r;
+	return make_float2(a.x/b.x, a.y/b.y);
 }
 
 __device_inline float2 operator+(const float2 a, const float2 b)
 {
-	float2 r = {a.x+b.x, a.y+b.y};
-	return r;
+	return make_float2(a.x+b.x, a.y+b.y);
 }
 
 __device_inline float2 operator-(const float2 a, const float2 b)
 {
-	float2 r = {a.x-b.x, a.y-b.y};
-	return r;
+	return make_float2(a.x-b.x, a.y-b.y);
 }
 
 __device_inline float2 operator+=(float2& a, const float2 b)
 {
-	a.x += b.x;
-	a.y += b.y;
-	return a;
+	return a = a + b;
 }
 
 __device_inline float2 operator*=(float2& a, const float2 b)
 {
-	a.x *= b.x;
-	a.y *= b.y;
-	return a;
+	return a = a * b;
 }
 
 __device_inline float2 operator*=(float2& a, float f)
 {
-	a.x *= f;
-	a.y *= f;
-	return a;
+	return a = a * f;
 }
 
 __device_inline float2 operator/=(float2& a, const float2 b)
 {
-	a.x /= b.x;
-	a.y /= b.y;
-	return a;
+	return a = a / b;
 }
 
 __device_inline float2 operator/=(float2& a, float f)
 {
 	float invf = 1.0f/f;
-	a.x *= invf;
-	a.y *= invf;
-	return a;
+	return a = a * invf;
 }
 
 
@@ -314,14 +295,12 @@ __device_inline bool operator!=(const float2 a, const float2 b)
 
 __device_inline float2 min(float2 a, float2 b)
 {
-	float2 r = {min(a.x, b.x), min(a.y, b.y)};
-	return r;
+	return make_float2(min(a.x, b.x), min(a.y, b.y));
 }
 
 __device_inline float2 max(float2 a, float2 b)
 {
-	float2 r = {max(a.x, b.x), max(a.y, b.y)};
-	return r;
+	return make_float2(max(a.x, b.x), max(a.y, b.y));
 }
 
 __device_inline float2 clamp(float2 a, float2 mn, float2 mx)
@@ -361,112 +340,78 @@ __device_inline float2 interp(float2 a, float2 b, float t)
 
 /* Float3 Vector */
 
-__device_inline bool is_zero(const float3 a)
-{
-	return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f);
-}
-
-__device_inline float average(const float3 a)
-{
-	return (a.x + a.y + a.z)*(1.0f/3.0f);
-}
-
 #ifndef __KERNEL_OPENCL__
 
 __device_inline float3 operator-(const float3 a)
 {
-	float3 r = make_float3(-a.x, -a.y, -a.z);
-	return r;
+	return make_float3(-a.x, -a.y, -a.z);
 }
 
 __device_inline float3 operator*(const float3 a, const float3 b)
 {
-	float3 r = make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
-	return r;
+	return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
 }
 
 __device_inline float3 operator*(const float3 a, float f)
 {
-	float3 r = make_float3(a.x*f, a.y*f, a.z*f);
-	return r;
+	return make_float3(a.x*f, a.y*f, a.z*f);
 }
 
 __device_inline float3 operator*(float f, const float3 a)
 {
-	float3 r = make_float3(a.x*f, a.y*f, a.z*f);
-	return r;
+	return make_float3(a.x*f, a.y*f, a.z*f);
 }
 
 __device_inline float3 operator/(float f, const float3 a)
 {
-	float3 r = make_float3(f/a.x, f/a.y, f/a.z);
-	return r;
+	return make_float3(f/a.x, f/a.y, f/a.z);
 }
 
 __device_inline float3 operator/(const float3 a, float f)
 {
 	float invf = 1.0f/f;
-	float3 r = make_float3(a.x*invf, a.y*invf, a.z*invf);
-	return r;
+	return make_float3(a.x*invf, a.y*invf, a.z*invf);
 }
 
 __device_inline float3 operator/(const float3 a, const float3 b)
 {
-	float3 r = make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
-	return r;
+	return make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
 }
 
 __device_inline float3 operator+(const float3 a, const float3 b)
 {
-	float3 r = make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
-	return r;
+	return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
 }
 
 __device_inline float3 operator-(const float3 a, const float3 b)
 {
-	float3 r = make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
-	return r;
+	return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
 }
 
 __device_inline float3 operator+=(float3& a, const float3 b)
 {
-	a.x += b.x;
-	a.y += b.y;
-	a.z += b.z;
-	return a;
+	return a = a + b;
 }
 
 __device_inline float3 operator*=(float3& a, const float3 b)
 {
-	a.x *= b.x;
-	a.y *= b.y;
-	a.z *= b.z;
-	return a;
+	return a = a * b;
 }
 
 __device_inline float3 operator*=(float3& a, float f)
 {
-	a.x *= f;
-	a.y *= f;
-	a.z *= f;
-	return a;
+	return a = a * f;
 }
 
 __device_inline float3 operator/=(float3& a, const float3 b)
 {
-	a.x /= b.x;
-	a.y /= b.y;
-	a.z /= b.z;
-	return a;
+	return a = a / b;
 }
 
 __device_inline float3 operator/=(float3& a, float f)
 {
 	float invf = 1.0f/f;
-	a.x *= invf;
-	a.y *= invf;
-	a.z *= invf;
-	return a;
+	return a = a * invf;
 }
 
 __device_inline float dot(const float3 a, const float3 b)
@@ -506,7 +451,11 @@ __device_inline float3 normalize_len(const float3 a, float *t)
 
 __device_inline bool operator==(const float3 a, const float3 b)
 {
+#ifdef __KERNEL_SSE__
+	return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
+#else
 	return (a.x == b.x && a.y == b.y && a.z == b.z);
+#endif
 }
 
 __device_inline bool operator!=(const float3 a, const float3 b)
@@ -516,14 +465,20 @@ __device_inline bool operator!=(const float3 a, const float3 b)
 
 __device_inline float3 min(float3 a, float3 b)
 {
-	float3 r = make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-	return r;
+#ifdef __KERNEL_SSE__
+	return _mm_min_ps(a.m128, b.m128);
+#else
+	return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+#endif
 }
 
 __device_inline float3 max(float3 a, float3 b)
 {
-	float3 r = make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-	return r;
+#ifdef __KERNEL_SSE__
+	return _mm_max_ps(a.m128, b.m128);
+#else
+	return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+#endif
 }
 
 __device_inline float3 clamp(float3 a, float3 mn, float3 mx)
@@ -533,7 +488,12 @@ __device_inline float3 clamp(float3 a, float3 mn, float3 mx)
 
 __device_inline float3 fabs(float3 a)
 {
+#ifdef __KERNEL_SSE__
+	__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+	return _mm_and_ps(a.m128, mask);
+#else
 	return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
+#endif
 }
 
 #endif
@@ -555,6 +515,16 @@ __device_inline void print_float3(const char *label, const float3& a)
 	printf("%s: %.8f %.8f %.8f\n", label, a.x, a.y, a.z);
 }
 
+__device_inline float3 rcp(const float3& a)
+{
+#ifdef __KERNEL_SSE__
+	float4 r = _mm_rcp_ps(a.m128);
+	return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+#else
+	return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z);
+#endif
+}
+
 #endif
 
 __device_inline float3 interp(float3 a, float3 b, float t)
@@ -562,122 +532,257 @@ __device_inline float3 interp(float3 a, float3 b, float t)
 	return a + t*(b - a);
 }
 
+__device_inline bool is_zero(const float3 a)
+{
+#ifdef __KERNEL_SSE__
+	return a == make_float3(0.0f);
+#else
+	return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f);
+#endif
+}
+
+__device_inline float reduce_add(const float3& a)
+{
+#ifdef __KERNEL_SSE__
+	return (a.x + a.y + a.z);
+#else
+	return (a.x + a.y + a.z);
+#endif
+}
+
+__device_inline float average(const float3 a)
+{
+	return reduce_add(a)*(1.0f/3.0f);
+}
+
 /* Float4 Vector */
 
-#ifndef __KERNEL_OPENCL__
+#ifdef __KERNEL_SSE__
 
-__device_inline bool is_zero(const float4& a)
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __forceinline const float4 shuffle(const float4& b)
 {
-	return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
+	return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
 }
 
-__device_inline float average(const float4& a)
+template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b)
 {
-	return (a.x + a.y + a.z + a.w)*(1.0f/4.0f);
+	return _mm_moveldup_ps(b);
 }
 
+template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b)
+{
+	return _mm_movehdup_ps(b);
+}
+
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b)
+{
+	return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b)));
+}
+
+#endif
+
+#ifndef __KERNEL_OPENCL__
+
 __device_inline float4 operator-(const float4& a)
 {
-	float4 r = {-a.x, -a.y, -a.z, -a.w};
-	return r;
+#ifdef __KERNEL_SSE__
+	__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+	return _mm_xor_ps(a.m128, mask);
+#else
+	return make_float4(-a.x, -a.y, -a.z, -a.w);
+#endif
 }
 
 __device_inline float4 operator*(const float4& a, const float4& b)
 {
-	float4 r = {a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w};
-	return r;
+#ifdef __KERNEL_SSE__
+	return _mm_mul_ps(a.m128, b.m128);
+#else
+	return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
+#endif
 }
 
 __device_inline float4 operator*(const float4& a, float f)
 {
-	float4 r = {a.x*f, a.y*f, a.z*f, a.w*f};
-	return r;
+#ifdef __KERNEL_SSE__
+	return a * make_float4(f);
+#else
+	return make_float4(a.x*f, a.y*f, a.z*f, a.w*f);
+#endif
 }
 
 __device_inline float4 operator*(float f, const float4& a)
 {
-	float4 r = {a.x*f, a.y*f, a.z*f, a.w*f};
-	return r;
+	return a * f;
+}
+
+__device_inline float4 rcp(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	float4 r = _mm_rcp_ps(a.m128);
+	return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+#else
+	return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w);
+#endif
 }
 
 __device_inline float4 operator/(const float4& a, float f)
 {
-	float invf = 1.0f/f;
-	float4 r = {a.x*invf, a.y*invf, a.z*invf, a.w*invf};
-	return r;
+	return a * (1.0f/f);
 }
 
 __device_inline float4 operator/(const float4& a, const float4& b)
 {
-	float4 r = {a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w};
-	return r;
+#ifdef __KERNEL_SSE__
+	return a * rcp(b);
+#else
+	return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
+#endif
+
 }
 
 __device_inline float4 operator+(const float4& a, const float4& b)
 {
-	float4 r = {a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w};
-	return r;
+#ifdef __KERNEL_SSE__
+	return _mm_add_ps(a.m128, b.m128);
+#else
+	return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+#endif
 }
 
 __device_inline float4 operator-(const float4& a, const float4& b)
 {
-	float4 r = {a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w};
-	return r;
+#ifdef __KERNEL_SSE__
+	return _mm_sub_ps(a.m128, b.m128);
+#else
+	return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
+#endif
 }
 
 __device_inline float4 operator+=(float4& a, const float4& b)
 {
-	a.x += b.x;
-	a.y += b.y;
-	a.z += b.z;
-	a.w += b.w;
-	return a;
+	return a = a + b;
 }
 
 __device_inline float4 operator*=(float4& a, const float4& b)
 {
-	a.x *= b.x;
-	a.y *= b.y;
-	a.z *= b.z;
-	a.w *= b.w;
-	return a;
+	return a = a * b;
 }
 
 __device_inline float4 operator/=(float4& a, float f)
 {
-	float invf = 1.0f/f;
-	a.x *= invf;
-	a.y *= invf;
-	a.z *= invf;
-	a.w *= invf;
-	return a;
+	return a = a / f;
 }
 
-__device_inline float dot(const float4& a, const float4& b)
+__device_inline int4 operator<(const float4& a, const float4& b)
 {
-	return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
+#ifdef __KERNEL_SSE__
+	return _mm_cvtps_epi32(_mm_cmplt_ps(a.m128, b.m128)); /* todo: avoid cvt */
+#else
+	return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
+#endif
+}
+
+__device_inline int4 operator>=(float4 a, float4 b)
+{
+#ifdef __KERNEL_SSE__
+	return _mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128)); /* todo: avoid cvt */
+#else
+	return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
+#endif
+}
+
+__device_inline int4 operator<=(const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	return _mm_cvtps_epi32(_mm_cmple_ps(a.m128, b.m128)); /* todo: avoid cvt */
+#else
+	return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w);
+#endif
+}
+
+__device_inline bool operator==(const float4 a, const float4 b)
+{
+#ifdef __KERNEL_SSE__
+	return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
+#else
+	return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w);
+#endif
 }
 
 __device_inline float4 cross(const float4& a, const float4& b)
 {
-	float4 r = {a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0.0f};
-	return r;
+#ifdef __KERNEL_SSE__
+	return (shuffle<1,2,0,0>(a)*shuffle<2,0,1,0>(b)) - (shuffle<2,0,1,0>(a)*shuffle<1,2,0,0>(b));
+#else
+	return make_float4(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0.0f);
+#endif
 }
 
 __device_inline float4 min(float4 a, float4 b)
 {
+#ifdef __KERNEL_SSE__
+	return _mm_min_ps(a.m128, b.m128);
+#else
 	return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+#endif
 }
 
 __device_inline float4 max(float4 a, float4 b)
 {
+#ifdef __KERNEL_SSE__
+	return _mm_max_ps(a.m128, b.m128);
+#else
 	return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+#endif
 }
 
 #endif
 
 #ifndef __KERNEL_GPU__
 
+__device_inline float4 select(const int4& mask, const float4& a, const float4& b)
+{
+#ifdef __KERNEL_SSE__
+	__m128 zero = _mm_castsi128_ps(_mm_cmpeq_epi32(mask.m128, _mm_setzero_si128())); /* all-ones in lanes where mask == 0 */
+	return _mm_or_ps(_mm_and_ps(zero, b.m128), _mm_andnot_ps(zero, a.m128)); /* mask? a: b per lane, as in the scalar path; blendv is sse4, and apparently broken on vs2008 */
+#else
+	return make_float4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? a.w: b.w);
+#endif
+}
+
+__device_inline float4 reduce_min(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	float4 h = min(shuffle<1,0,3,2>(a), a);
+	return min(shuffle<2,3,0,1>(h), h);
+#else
+	return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
+#endif
+}
+
+__device_inline float4 reduce_max(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	float4 h = max(shuffle<1,0,3,2>(a), a);
+	return max(shuffle<2,3,0,1>(h), h);
+#else
+	return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
+#endif
+}
+
+#if 0
+__device_inline float4 reduce_add(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	float4 h = shuffle<1,0,3,2>(a) + a;
+	return shuffle<2,3,0,1>(h) + h;
+#else
+	return make_float4((a.x + a.y) + (a.z + a.w));
+#endif
+}
+#endif
+
 __device_inline void print_float4(const char *label, const float4& a)
 {
 	printf("%s: %.8f %.8f %.8f %.8f\n", label, a.x, a.y, a.z, a.w);
@@ -685,26 +790,77 @@ __device_inline void print_float4(const char *label, const float4& a)
 
 #endif
 
+#ifndef __KERNEL_OPENCL__
+
+__device_inline bool is_zero(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	return a == make_float4(0.0f);
+#else
+	return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
+#endif
+}
+
+__device_inline float reduce_add(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	float4 h = shuffle<1,0,3,2>(a) + a;
+	return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); /* todo: efficiency? */
+#else
+	return ((a.x + a.y) + (a.z + a.w));
+#endif
+}
+
+__device_inline float average(const float4& a)
+{
+	return reduce_add(a) * 0.25f;
+}
+
+__device_inline float dot(const float4& a, const float4& b)
+{
+	return reduce_add(a * b);
+}
+
+#endif
+
 /* Int3 */
 
 #ifndef __KERNEL_OPENCL__
 
+__device_inline int3 min(int3 a, int3 b)
+{
+#ifdef __KERNEL_SSE__
+	return _mm_min_epi32(a.m128, b.m128);
+#else
+	return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+#endif
+}
+
 __device_inline int3 max(int3 a, int3 b)
 {
-	int3 r = {max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)};
-	return r;
+#ifdef __KERNEL_SSE__
+	return _mm_max_epi32(a.m128, b.m128);
+#else
+	return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+#endif
 }
 
 __device_inline int3 clamp(const int3& a, int mn, int mx)
 {
-	int3 r = {clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx)};
-	return r;
+#ifdef __KERNEL_SSE__
+	return min(max(a, make_int3(mn)), make_int3(mx));
+#else
+	return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx));
+#endif
 }
 
 __device_inline int3 clamp(const int3& a, int3& mn, int mx)
 {
-	int3 r = {clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx)};
-	return r;
+#ifdef __KERNEL_SSE__
+	return min(max(a, mn), make_int3(mx));
+#else
+	return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx));
+#endif
 }
 
 #endif
@@ -720,16 +876,63 @@ __device_inline void print_int3(const char *label, const int3& a)
 
 /* Int4 */
 
-#ifndef __KERNEL_OPENCL__
+#ifndef __KERNEL_GPU__
 
-__device_inline int4 operator>=(float4 a, float4 b)
+__device_inline int4 operator+(const int4& a, const int4& b)
 {
-	return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
+#ifdef __KERNEL_SSE__
+	return _mm_add_epi32(a.m128, b.m128);
+#else
+	return make_int4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+#endif
+}
+
+__device_inline int4 operator+=(int4& a, const int4& b)
+{
+	return a = a + b;
 }
 
+__device_inline int4 operator>>(const int4& a, int i)
+{
+#ifdef __KERNEL_SSE__
+	return _mm_srai_epi32(a.m128, i);
+#else
+	return make_int4(a.x >> i, a.y >> i, a.z >> i, a.w >> i);
 #endif
+}
 
-#ifndef __KERNEL_GPU__
+__device_inline int4 min(int4 a, int4 b)
+{
+#ifdef __KERNEL_SSE__
+	return _mm_min_epi32(a.m128, b.m128);
+#else
+	return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+#endif
+}
+
+__device_inline int4 max(int4 a, int4 b)
+{
+#ifdef __KERNEL_SSE__
+	return _mm_max_epi32(a.m128, b.m128);
+#else
+	return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+#endif
+}
+
+__device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx)
+{
+	return min(max(a, mn), mx);
+}
+
+__device_inline int4 select(const int4& mask, const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+	__m128i zero = _mm_cmpeq_epi32(mask.m128, _mm_setzero_si128()); /* all-ones in lanes where mask == 0 */
+	return _mm_or_si128(_mm_and_si128(zero, b.m128), _mm_andnot_si128(zero, a.m128)); /* mask? a: b per lane, as in the scalar path */
+#else
+	return make_int4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? a.w: b.w);
+#endif
+}
 
 __device_inline void print_int4(const char *label, const int4& a)
 {
diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp
new file mode 100644
index 00000000000..6da9a70ec0c
--- /dev/null
+++ b/intern/cycles/util/util_task.cpp
@@ -0,0 +1,223 @@
+/*
+ * Copyright 2011, Blender Foundation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "util_debug.h"
+#include "util_foreach.h"
+#include "util_system.h"
+#include "util_task.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Task Pool */
+
+TaskPool::TaskPool(const TaskRunFunction& run_)
+{
+	num = 0;
+	num_done = 0;
+
+	do_cancel = false;
+
+	run = run_;
+}
+
+TaskPool::~TaskPool()
+{
+	stop();
+}
+
+void TaskPool::push(Task *task, bool front)
+{
+	TaskScheduler::Entry entry;
+
+	entry.task = task;
+	entry.pool = this;
+
+	TaskScheduler::push(entry, front);
+}
+
+void TaskPool::wait()
+{
+	thread_scoped_lock lock(done_mutex);
+
+	while(num_done != num)
+		done_cond.wait(lock);
+}
+
+void TaskPool::cancel()
+{
+	TaskScheduler::clear(this);
+
+	do_cancel = true;
+	wait();
+	do_cancel = false;
+}
+
+void TaskPool::stop()
+{
+	TaskScheduler::clear(this);
+
+	assert(num_done == num);
+}
+
+bool TaskPool::cancelled()
+{
+	return do_cancel;
+}
+
+/* Add `done` to the pool's finished-task count and wake up all waiters. */
+void TaskPool::done_increase(int done)
+{
+	done_mutex.lock();
+	num_done += done;
+	assert(num_done <= num); /* checked under done_mutex, where num_done cannot change */
+	done_mutex.unlock();
+	done_cond.notify_all();
+}
+
+/* Task Scheduler */
+
+thread_mutex TaskScheduler::mutex;
+int TaskScheduler::users = 0;
+vector<thread*> TaskScheduler::threads;
+volatile bool TaskScheduler::do_exit = false;
+
+list<TaskScheduler::Entry> TaskScheduler::queue;
+thread_mutex TaskScheduler::queue_mutex;
+thread_condition_variable TaskScheduler::queue_cond;
+
+void TaskScheduler::init(int num_threads)
+{
+	thread_scoped_lock lock(mutex);
+
+	/* multiple cycles instances can use this task scheduler, sharing the same
+	   threads, so we keep track of the number of users. */
+	if(users == 0) {
+		do_exit = false;
+
+		/* launch threads that will be waiting for work */
+		if(num_threads == 0)
+			num_threads = system_cpu_thread_count();
+
+		threads.resize(num_threads);
+
+		for(size_t i = 0; i < threads.size(); i++)
+			threads[i] = new thread(function_bind(&TaskScheduler::thread_run, i));
+	}
+	
+	users++;
+}
+
+/* Drop one user; the last one stops, joins and deletes all worker threads. */
+void TaskScheduler::exit()
+{
+	thread_scoped_lock lock(mutex);
+	users--;
+
+	if(users == 0) {
+		/* set do_exit under queue_mutex: a worker that already tested it but
+		 * has not blocked in queue_cond.wait() yet would miss the wakeup */
+		queue_mutex.lock();
+		do_exit = true;
+		queue_mutex.unlock();
+		queue_cond.notify_all();
+		foreach(thread *t, threads) {
+			t->join();
+			delete t;
+		}
+		threads.clear();
+	}
+}
+
+bool TaskScheduler::thread_wait_pop(Entry& entry)
+{
+	thread_scoped_lock lock(queue_mutex);
+
+	while(queue.empty() && !do_exit)
+		queue_cond.wait(lock);
+
+	if(queue.empty()) {
+		assert(do_exit);
+		return false;
+	}
+	
+	entry = queue.front();
+	queue.pop_front();
+
+	return true;
+}
+
+void TaskScheduler::thread_run(int thread_id)
+{
+	Entry entry;
+
+	/* todo: test affinity/denormal mask */
+
+	/* keep popping off tasks */
+	while(thread_wait_pop(entry)) {
+		/* run task */
+		entry.pool->run(entry.task, thread_id);
+
+		/* delete task */
+		delete entry.task;
+
+		/* notify pool task was done */
+		entry.pool->done_increase(1);
+	}
+}
+
+void TaskScheduler::push(Entry& entry, bool front)
+{
+	/* add entry to queue */
+	TaskScheduler::queue_mutex.lock();
+	if(front)
+		TaskScheduler::queue.push_front(entry);
+	else
+		TaskScheduler::queue.push_back(entry);
+	entry.pool->num++;
+	TaskScheduler::queue_mutex.unlock();
+
+	TaskScheduler::queue_cond.notify_one();
+}
+
+void TaskScheduler::clear(TaskPool *pool)
+{
+	thread_scoped_lock lock(TaskScheduler::queue_mutex);
+
+	/* erase all tasks from this pool from the queue */
+	list<TaskScheduler::Entry>::iterator it = TaskScheduler::queue.begin();
+	int done = 0;
+
+	while(it != TaskScheduler::queue.end()) {
+		TaskScheduler::Entry& entry = *it;
+
+		if(entry.pool == pool) {
+			done++;
+			delete entry.task;
+
+			it = TaskScheduler::queue.erase(it);
+		}
+		else
+			it++;
+	}
+
+	/* notify done */
+	pool->done_increase(done);
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/util/util_task.h b/intern/cycles/util/util_task.h
new file mode 100644
index 00000000000..acdb2cb50a2
--- /dev/null
+++ b/intern/cycles/util/util_task.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2011, Blender Foundation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __UTIL_TASK_H__
+#define __UTIL_TASK_H__
+
+#include "util_list.h"
+#include "util_thread.h"
+#include "util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Task;
+class TaskPool;
+class TaskScheduler;
+
+typedef boost::function<void(Task*,int)> TaskRunFunction;
+
+/* Task
+ *
+ * Base class for tasks to be executed in threads. */
+
+class Task
+{
+public:
+	Task() {};
+	virtual ~Task() {}
+};
+
+/* Task Pool
+ *
+ * Pool of tasks that will be executed by the central TaskScheduler. For each
+ * pool, we can wait for all tasks to be done, or cancel them before they are
+ * done.
+ *
+ * The run callback that actually executes the task may be created like this:
+ * function_bind(&MyClass::task_execute, this, _1, _2) */
+
+class TaskPool
+{
+public:
+	TaskPool(const TaskRunFunction& run);
+	~TaskPool();
+
+	void push(Task *task, bool front = false);
+
+	void wait();		/* wait until all tasks are done */
+	void cancel();		/* cancel all tasks, keep worker threads running */
+	void stop();		/* discard queued tasks; asserts that no task is still pending */
+
+	bool cancelled();	/* for worker threads, test if cancelled */
+
+protected:
+	friend class TaskScheduler;
+
+	void done_increase(int done);
+
+	TaskRunFunction run;	/* callback the scheduler invokes with (task, thread id) */
+
+	thread_mutex done_mutex;	/* held while num_done is updated and while wait() tests it */
+	thread_condition_variable done_cond;	/* notified on every done_increase() */
+
+	volatile int num, num_done;	/* tasks pushed / tasks finished or discarded */
+	volatile bool do_cancel;	/* true only while cancel() is waiting */
+};
+
+/* Task Scheduler
+ * 
+ * Central scheduler that holds running threads ready to execute tasks. A single
+ * queue holds the tasks from all pools. */
+
+class TaskScheduler
+{
+public:
+	static void init(int num_threads = 0);
+	static void exit();
+
+	static int num_threads() { return threads.size(); }
+
+protected:
+	friend class TaskPool;
+
+	struct Entry {
+		Task *task;
+		TaskPool *pool;
+	};
+
+	static thread_mutex mutex;
+	static int users;
+	static vector<thread*> threads;
+	static volatile bool do_exit;
+
+	static list<Entry> queue;
+	static thread_mutex queue_mutex;
+	static thread_condition_variable queue_cond;
+
+	static void thread_run(int thread_id);
+	static bool thread_wait_pop(Entry& entry);
+
+	static void push(Entry& entry, bool front);
+	static void clear(TaskPool *pool);
+};
+
+CCL_NAMESPACE_END
+
+#endif
+
diff --git a/intern/cycles/util/util_thread.h b/intern/cycles/util/util_thread.h
index 6836be203f5..3d15b342fe5 100644
--- a/intern/cycles/util/util_thread.h
+++ b/intern/cycles/util/util_thread.h
@@ -69,133 +69,6 @@ protected:
 	bool joined;
 };
 
-/* Thread Safe Queue to pass tasks from one thread to another. Tasks should be
- * pushed into the queue, while the worker thread waits to pop the next task
- * off the queue. Once all tasks are into the queue, calling stop() will stop
- * the worker threads from waiting for more tasks once all tasks are done. */
-
-template<typename T> class ThreadQueue
-{
-public:
-	ThreadQueue()
-	{
-		tot = 0;
-		tot_done = 0;
-		do_stop = false;
-		do_cancel = false;
-	}
-
-	/* Main thread functions */
-
-	/* push a task to be executed */
-	void push(const T& value)
-	{
-		thread_scoped_lock lock(queue_mutex);
-		queue.push(value);
-		tot++;
-		lock.unlock();
-
-		queue_cond.notify_one();
-	}
-
-	/* wait until all tasks are done */
-	void wait_done()
-	{
-		thread_scoped_lock lock(done_mutex);
-
-		while(tot_done != tot)
-			done_cond.wait(lock);
-	}
-
-	/* stop all worker threads */
-	void stop()
-	{
-		clear();
-		do_stop = true;
-		queue_cond.notify_all();
-	}
-
-	/* cancel all tasks, but keep worker threads running */
-	void cancel()
-	{
-		clear();
-		do_cancel = true;
-		wait_done();
-		do_cancel = false;
-	}
-
-	/* Worker thread functions
-     *
-	 * while(queue.worker_wait_pop(task)) {
-	 *		for(..) {
-	 *			... do work ...
-	 *
-	 *			if(queue.worker_cancel())
-	 *				break;
-	 *      }
-	 *		
-	 *		queue.worker_done();
-	 * }
-	 */
-
-	bool worker_wait_pop(T& value)
-	{
-		thread_scoped_lock lock(queue_mutex);
-
-		while(queue.empty() && !do_stop)
-			queue_cond.wait(lock);
-
-		if(queue.empty())
-			return false;
-		
-		value = queue.front();
-		queue.pop();
-
-		return true;
-	}
-
-	void worker_done()
-	{
-		thread_scoped_lock lock(done_mutex);
-		tot_done++;
-		lock.unlock();
-
-		assert(tot_done <= tot);
-
-		done_cond.notify_all();
-	}
-
-	bool worker_cancel()
-	{
-		return do_cancel;
-	}
-
-protected:
-	void clear()
-	{
-		thread_scoped_lock lock(queue_mutex);
-
-		while(!queue.empty()) {
-			thread_scoped_lock done_lock(done_mutex);
-			tot_done++;
-			done_lock.unlock();
-
-			queue.pop();
-		}
-
-		done_cond.notify_all();
-	}
-
-	std::queue<T> queue;
-	thread_mutex queue_mutex;
-	thread_mutex done_mutex;
-	thread_condition_variable queue_cond;
-	thread_condition_variable done_cond;
-	volatile bool do_stop;
-	volatile bool do_cancel;
-	volatile int tot, tot_done;
-};
-
 /* Thread Local Storage
  *
  * Boost implementation is a bit slow, and Mac OS X __thread is not supported
diff --git a/intern/cycles/util/util_transform.cpp b/intern/cycles/util/util_transform.cpp
index 61bc36ae888..0fd26825911 100644
--- a/intern/cycles/util/util_transform.cpp
+++ b/intern/cycles/util/util_transform.cpp
@@ -129,23 +129,26 @@ static bool transform_matrix4_gj_inverse(float R[][4], float M[][4])
 
 Transform transform_inverse(const Transform& tfm)
 {
-	union { Transform T; float M[4][4]; } R, M;
-	
-	R.T = transform_identity();
-	M.T = tfm;
+	Transform tfmR = transform_identity();
+	float M[4][4], R[4][4];
 
-	if(!transform_matrix4_gj_inverse(R.M, M.M)) {
+	memcpy(R, &tfmR, sizeof(R));
+	memcpy(M, &tfm, sizeof(M));
+
+	if(!transform_matrix4_gj_inverse(R, M)) {
 		/* matrix is degenerate (e.g. 0 scale on some axis), ideally we should
 		   never be in this situation, but try to invert it anyway with tweak */
-		M.M[0][0] += 1e-8f;
-		M.M[1][1] += 1e-8f;
-		M.M[2][2] += 1e-8f;
+		M[0][0] += 1e-8f;
+		M[1][1] += 1e-8f;
+		M[2][2] += 1e-8f;
 
-		if(!transform_matrix4_gj_inverse(R.M, M.M))
+		if(!transform_matrix4_gj_inverse(R, M))
 			return transform_identity();
 	}
 
-	return R.T;
+	memcpy(&tfmR, R, sizeof(R));
+
+	return tfmR;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index efdda98571a..cf167707e47 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -36,23 +36,37 @@
 #define __shared
 #define __constant
 
-#ifdef __GNUC__
-#define __device_inline static inline __attribute__((always_inline))
-#else
+#ifdef _WIN32
 #define __device_inline static __forceinline
+#define __align(...) __declspec(align(__VA_ARGS__))
+#else
+#define __device_inline static inline __attribute__((always_inline))
+#define __forceinline inline __attribute__((always_inline))
+#define __align(...) __attribute__((aligned(__VA_ARGS__)))
 #endif
 
 #endif
 
+/* Bitness */
+
+#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
+#define __KERNEL_64_BIT__
+#endif
+
 /* SIMD Types */
 
-/* not needed yet, will be for qbvh
-#ifndef __KERNEL_GPU__
+/* not enabled, globally applying it just gives slowdown,
+ * but useful for testing. */
+//#define __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
 
-#include <emmintrin.h>
-#include <xmmintrin.h>
+#include <xmmintrin.h> /* SSE 1 */
+#include <emmintrin.h> /* SSE 2 */
+#include <pmmintrin.h> /* SSE 3 */
+#include <tmmintrin.h> /* SSSE 3 */
+#include <smmintrin.h> /* SSE 4.1 */
 
-#endif*/
+#endif
 
 #ifndef _WIN32
 #ifndef __KERNEL_GPU__
@@ -97,6 +111,12 @@ typedef unsigned int uint32_t;
 typedef long long int64_t;
 typedef unsigned long long uint64_t;
 
+#ifdef __KERNEL_64_BIT__
+typedef int64_t ssize_t;
+#else
+typedef int32_t ssize_t;
+#endif
+
 #endif
 
 /* Generic Memory Pointer */
@@ -108,89 +128,137 @@ typedef uint64_t device_ptr;
 struct uchar2 {
 	uchar x, y;
 
-	uchar operator[](int i) const { return *(&x + i); }
-	uchar& operator[](int i) { return *(&x + i); }
+	__forceinline uchar operator[](int i) const { return *(&x + i); }
+	__forceinline uchar& operator[](int i) { return *(&x + i); }
 };
 
 struct uchar3 {
 	uchar x, y, z;
 
-	uchar operator[](int i) const { return *(&x + i); }
-	uchar& operator[](int i) { return *(&x + i); }
+	__forceinline uchar operator[](int i) const { return *(&x + i); }
+	__forceinline uchar& operator[](int i) { return *(&x + i); }
 };
 
 struct uchar4 {
 	uchar x, y, z, w;
 
-	uchar operator[](int i) const { return *(&x + i); }
-	uchar& operator[](int i) { return *(&x + i); }
+	__forceinline uchar operator[](int i) const { return *(&x + i); }
+	__forceinline uchar& operator[](int i) { return *(&x + i); }
 };
 
 struct int2 {
 	int x, y;
 
-	int operator[](int i) const { return *(&x + i); }
-	int& operator[](int i) { return *(&x + i); }
+	__forceinline int operator[](int i) const { return *(&x + i); }
+	__forceinline int& operator[](int i) { return *(&x + i); }
 };
 
+#ifdef __KERNEL_SSE__
+struct __align(16) int3 {
+	union {
+		__m128i m128;
+		struct { int x, y, z, w; };
+	};
+
+	__forceinline int3() {}
+	__forceinline int3(const __m128i a) : m128(a) {}
+	__forceinline operator const __m128i&(void) const { return m128; }
+	__forceinline operator __m128i&(void) { return m128; }
+#else
 struct int3 {
-	int x, y, z;
+	int x, y, z, w;
+#endif
 
-	int operator[](int i) const { return *(&x + i); }
-	int& operator[](int i) { return *(&x + i); }
+	__forceinline int operator[](int i) const { return *(&x + i); }
+	__forceinline int& operator[](int i) { return *(&x + i); }
 };
 
+#ifdef __KERNEL_SSE__
+struct __align(16) int4 {
+	union {
+		__m128i m128;
+		struct { int x, y, z, w; };
+	};
+
+	__forceinline int4() {}
+	__forceinline int4(const __m128i a) : m128(a) {}
+	__forceinline operator const __m128i&(void) const { return m128; }
+	__forceinline operator __m128i&(void) { return m128; }
+#else
 struct int4 {
 	int x, y, z, w;
+#endif
 
-	int operator[](int i) const { return *(&x + i); }
-	int& operator[](int i) { return *(&x + i); }
+	__forceinline int operator[](int i) const { return *(&x + i); }
+	__forceinline int& operator[](int i) { return *(&x + i); }
 };
 
 struct uint2 {
 	uint x, y;
 
-	uint operator[](int i) const { return *(&x + i); }
-	uint& operator[](int i) { return *(&x + i); }
+	__forceinline uint operator[](uint i) const { return *(&x + i); }
+	__forceinline uint& operator[](uint i) { return *(&x + i); }
 };
 
 struct uint3 {
 	uint x, y, z;
 
-	uint operator[](int i) const { return *(&x + i); }
-	uint& operator[](int i) { return *(&x + i); }
+	__forceinline uint operator[](uint i) const { return *(&x + i); }
+	__forceinline uint& operator[](uint i) { return *(&x + i); }
 };
 
 struct uint4 {
 	uint x, y, z, w;
 
-	uint operator[](int i) const { return *(&x + i); }
-	uint& operator[](int i) { return *(&x + i); }
+	__forceinline uint operator[](uint i) const { return *(&x + i); }
+	__forceinline uint& operator[](uint i) { return *(&x + i); }
 };
 
 struct float2 {
 	float x, y;
 
-	float operator[](int i) const { return *(&x + i); }
-	float& operator[](int i) { return *(&x + i); }
+	__forceinline float operator[](int i) const { return *(&x + i); }
+	__forceinline float& operator[](int i) { return *(&x + i); }
 };
 
+#ifdef __KERNEL_SSE__
+struct __align(16) float3 {
+	union {
+		__m128 m128;
+		struct { float x, y, z, w; };
+	};
+
+	__forceinline float3() {}
+	__forceinline float3(const __m128 a) : m128(a) {}
+	__forceinline operator const __m128&(void) const { return m128; }
+	__forceinline operator __m128&(void) { return m128; }
+#else
 struct float3 {
-	float x, y, z;
-
-#ifdef WITH_OPENCL
-	float w;
+	float x, y, z, w;
 #endif
 
-	float operator[](int i) const { return *(&x + i); }
-	float& operator[](int i) { return *(&x + i); }
+	__forceinline float operator[](int i) const { return *(&x + i); }
+	__forceinline float& operator[](int i) { return *(&x + i); }
 };
 
+#ifdef __KERNEL_SSE__
+struct __align(16) float4 {
+	union {
+		__m128 m128;
+		struct { float x, y, z, w; };
+	};
+
+	__forceinline float4() {}
+	__forceinline float4(const __m128 a) : m128(a) {}
+	__forceinline operator const __m128&(void) const { return m128; }
+	__forceinline operator __m128&(void) { return m128; }
+#else
 struct float4 {
 	float x, y, z, w;
+#endif
 
-	float operator[](int i) const { return *(&x + i); }
-	float& operator[](int i) { return *(&x + i); }
+	__forceinline float operator[](int i) const { return *(&x + i); }
+	__forceinline float& operator[](int i) { return *(&x + i); }
 };
 
 #endif
@@ -201,87 +269,179 @@ struct float4 {
  * 
  * OpenCL does not support C++ class, so we use these instead. */
 
-__device uchar2 make_uchar2(uchar x, uchar y)
+__device_inline uchar2 make_uchar2(uchar x, uchar y)
 {
 	uchar2 a = {x, y};
 	return a;
 }
 
-__device uchar3 make_uchar3(uchar x, uchar y, uchar z)
+__device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z)
 {
 	uchar3 a = {x, y, z};
 	return a;
 }
 
-__device uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w)
+__device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w)
 {
 	uchar4 a = {x, y, z, w};
 	return a;
 }
 
-__device int2 make_int2(int x, int y)
+__device_inline int2 make_int2(int x, int y)
 {
 	int2 a = {x, y};
 	return a;
 }
 
-__device int3 make_int3(int x, int y, int z)
+__device_inline int3 make_int3(int x, int y, int z)
 {
-	int3 a = {x, y, z};
+#ifdef __KERNEL_SSE__
+	int3 a;
+	a.m128 = _mm_set_epi32(0, z, y, x);
+#else
+	int3 a = {x, y, z, 0};
+#endif
+
 	return a;
 }
 
-__device int4 make_int4(int x, int y, int z, int w)
+__device_inline int4 make_int4(int x, int y, int z, int w)
 {
+#ifdef __KERNEL_SSE__
+	int4 a;
+	a.m128 = _mm_set_epi32(w, z, y, x);
+#else
 	int4 a = {x, y, z, w};
+#endif
+
 	return a;
 }
 
-__device uint2 make_uint2(uint x, uint y)
+__device_inline uint2 make_uint2(uint x, uint y)
 {
 	uint2 a = {x, y};
 	return a;
 }
 
-__device uint3 make_uint3(uint x, uint y, uint z)
+__device_inline uint3 make_uint3(uint x, uint y, uint z)
 {
 	uint3 a = {x, y, z};
 	return a;
 }
 
-__device uint4 make_uint4(uint x, uint y, uint z, uint w)
+__device_inline uint4 make_uint4(uint x, uint y, uint z, uint w)
 {
 	uint4 a = {x, y, z, w};
 	return a;
 }
 
-__device float2 make_float2(float x, float y)
+__device_inline float2 make_float2(float x, float y)
 {
 	float2 a = {x, y};
 	return a;
 }
 
-__device float3 make_float3(float x, float y, float z)
+__device_inline float3 make_float3(float x, float y, float z)
 {
-#ifdef WITH_OPENCL
-	float3 a = {x, y, z, 0.0f};
+#ifdef __KERNEL_SSE__
+	float3 a;
+	a.m128 = _mm_set_ps(0.0f, z, y, x);
 #else
-	float3 a = {x, y, z};
+	float3 a = {x, y, z, 0.0f};
 #endif
+
 	return a;
 }
 
-__device float4 make_float4(float x, float y, float z, float w)
+__device_inline float4 make_float4(float x, float y, float z, float w)
 {
+#ifdef __KERNEL_SSE__
+	float4 a;
+	a.m128 = _mm_set_ps(w, z, y, x);
+#else
 	float4 a = {x, y, z, w};
+#endif
+
 	return a;
 }
 
-__device int align_up(int offset, int alignment)
+__device_inline int align_up(int offset, int alignment)
 {
 	return (offset + alignment - 1) & ~(alignment - 1);
 }
 
+__device_inline int3 make_int3(int i)
+{
+#ifdef __KERNEL_SSE__
+	int3 a;
+	a.m128 = _mm_set1_epi32(i);
+#else
+	int3 a = {i, i, i, i};
+#endif
+
+	return a;
+}
+
+__device_inline int4 make_int4(int i)
+{
+#ifdef __KERNEL_SSE__
+	int4 a;
+	a.m128 = _mm_set1_epi32(i);
+#else
+	int4 a = {i, i, i, i};
+#endif
+
+	return a;
+}
+
+__device_inline float3 make_float3(float f)
+{
+#ifdef __KERNEL_SSE__
+	float3 a;
+	a.m128 = _mm_set1_ps(f);
+#else
+	float3 a = {f, f, f, f};
+#endif
+
+	return a;
+}
+
+__device_inline float4 make_float4(float f)
+{
+#ifdef __KERNEL_SSE__
+	float4 a;
+	a.m128 = _mm_set1_ps(f);
+#else
+	float4 a = {f, f, f, f};
+#endif
+
+	return a;
+}
+
+__device_inline float4 make_float4(const int4& i)
+{
+#ifdef __KERNEL_SSE__
+	float4 a;
+	a.m128 = _mm_cvtepi32_ps(i.m128);
+#else
+	float4 a = {(float)i.x, (float)i.y, (float)i.z, (float)i.w};
+#endif
+
+	return a;
+}
+
+__device_inline int4 make_int4(const float3& f)
+{
+#ifdef __KERNEL_SSE__
+	int4 a;
+	a.m128 = _mm_cvttps_epi32(f.m128); /* truncate toward zero, same as the (int) casts in the scalar path */
+#else
+	int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w};
+#endif
+
+	return a;
+}
+
 #endif
 
 CCL_NAMESPACE_END