1 files changed, 104 insertions, 94 deletions
diff --git a/extern/bullet2/src/BulletCollision/BroadphaseCollision/btDbvt.h b/extern/bullet2/src/BulletCollision/BroadphaseCollision/btDbvt.h
index 10f94627c37..da296445e81 100644
--- a/extern/bullet2/src/BulletCollision/BroadphaseCollision/btDbvt.h
+++ b/extern/bullet2/src/BulletCollision/BroadphaseCollision/btDbvt.h
@@ -31,7 +31,7 @@ subject to the following restrictions:
 #define DBVT_IMPL_SSE			1	// SSE
 
 // Template implementation of ICollide
-#ifdef WIN32_AVOID_WHEN_EMBEDDED_INSIDE_BLENDER
+#ifdef WIN32_AVOID_SSE_WHEN_EMBEDDED_INSIDE_BLENDER //there is always some weird compiler that breaks SSE builds
 	#if (defined (_MSC_VER) && _MSC_VER >= 1400)
 	#define	DBVT_USE_TEMPLATE		1
 	#else
@@ -41,6 +41,9 @@ subject to the following restrictions:
 #define	DBVT_USE_TEMPLATE		0
 #endif
 
+// Use only intrinsics instead of inline asm
+#define DBVT_USE_INTRINSIC_SSE	1
+
 // Using memmov for collideOCL
 #define DBVT_USE_MEMMOVE		1
 
@@ -57,14 +60,21 @@ subject to the following restrictions:
 #endif
 
 // Specific methods implementation
-#ifdef WIN32_AVOID_WHEN_EMBEDDED_INSIDE_BLENDER
-#define DBVT_PROXIMITY_IMPL		DBVT_IMPL_SSE
+
+#ifdef WIN32_AVOID_SSE_WHEN_EMBEDDED_INSIDE_BLENDER //there is always some weird compiler that breaks SSE builds
 #define DBVT_SELECT_IMPL		DBVT_IMPL_SSE
 #define DBVT_MERGE_IMPL			DBVT_IMPL_SSE
+#define DBVT_INT0_IMPL			DBVT_IMPL_SSE
 #else
-#define DBVT_PROXIMITY_IMPL		DBVT_IMPL_GENERIC
 #define DBVT_SELECT_IMPL		DBVT_IMPL_GENERIC
 #define DBVT_MERGE_IMPL			DBVT_IMPL_GENERIC
+#define DBVT_INT0_IMPL			DBVT_IMPL_GENERIC
+#endif
+
+#if	(DBVT_SELECT_IMPL==DBVT_IMPL_SSE)||	\
+	(DBVT_MERGE_IMPL==DBVT_IMPL_SSE)||	\
+	(DBVT_INT0_IMPL==DBVT_IMPL_SSE)
+#include <emmintrin.h>
 #endif
 
 //
@@ -104,10 +114,6 @@ subject to the following restrictions:
 #error "DBVT_ENABLE_BENCHMARK undefined"
 #endif
 
-#ifndef DBVT_PROXIMITY_IMPL
-#error "DBVT_PROXIMITY_IMPL undefined"
-#endif
-
 #ifndef DBVT_SELECT_IMPL
 #error "DBVT_SELECT_IMPL undefined"
 #endif
@@ -116,6 +122,10 @@ subject to the following restrictions:
 #error "DBVT_MERGE_IMPL undefined"
 #endif
 
+#ifndef DBVT_INT0_IMPL
+#error "DBVT_INT0_IMPL undefined"
+#endif
+
 //
 // Defaults volumes
 //
@@ -133,8 +143,8 @@ static inline btDbvtAabbMm		FromCR(const btVector3& c,btScalar r);
 static inline btDbvtAabbMm		FromMM(const btVector3& mi,const btVector3& mx);
 static inline btDbvtAabbMm		FromPoints(const btVector3* pts,int n);
 static inline btDbvtAabbMm		FromPoints(const btVector3** ppts,int n);
-DBVT_INLINE void				Expand(const btVector3 e);
-DBVT_INLINE void				SignedExpand(const btVector3 e);
+DBVT_INLINE void				Expand(const btVector3& e);
+DBVT_INLINE void				SignedExpand(const btVector3& e);
 DBVT_INLINE bool				Contain(const btDbvtAabbMm& a) const;
 DBVT_INLINE int					Classify(const btVector3& n,btScalar o,int s) const;
 DBVT_INLINE btScalar			ProjectMinimum(const btVector3& v,unsigned signs) const;
@@ -173,12 +183,12 @@ struct	btDbvtNode
 {
 	btDbvtVolume	volume;
 	btDbvtNode*		parent;
-	bool	isleaf() const		{ return(childs[1]==0); }
-	bool	isinternal() const	{ return(!isleaf()); }
+	DBVT_INLINE bool	isleaf() const		{ return(childs[1]==0); }
+	DBVT_INLINE bool	isinternal() const	{ return(!isleaf()); }
 	union	{
-		btDbvtNode*	childs[2];
-		void*	data;
-		};
+			btDbvtNode*	childs[2];
+			void*	data;
+			};
 };
 
 ///The btDbvt class implements a fast dynamic bounding volume tree based on axis aligned bounding boxes (aabb tree).
@@ -186,8 +196,6 @@ struct	btDbvtNode
 ///Unlike the btQuantizedBvh, nodes can be dynamically moved around, which allows for change in topology of the underlying data structure.
 struct	btDbvt
 	{
-	
-	
 	/* Stack element	*/ 
 	struct	sStkNN
 		{
@@ -250,8 +258,8 @@ struct	btDbvt
 			};
 		
 	// Fields
-	btDbvtNode*			m_root;
-	btDbvtNode*			m_free;
+	btDbvtNode*		m_root;
+	btDbvtNode*		m_free;
 	int				m_lkhd;
 	int				m_leaves;
 	unsigned		m_opath;
@@ -408,17 +416,17 @@ return(box);
 }
 
 //
-DBVT_INLINE void		btDbvtAabbMm::Expand(const btVector3 e)
+DBVT_INLINE void		btDbvtAabbMm::Expand(const btVector3& e)
 {
 mi-=e;mx+=e;
 }
 	
 //
-DBVT_INLINE void		btDbvtAabbMm::SignedExpand(const btVector3 e)
+DBVT_INLINE void		btDbvtAabbMm::SignedExpand(const btVector3& e)
 {
-if(e.x()>0) mx.setX(mx.x()+e.x()); else mi.setX(mi.x()+e.x());
-if(e.y()>0) mx.setY(mx.y()+e.y()); else mi.setY(mi.y()+e.y());
-if(e.z()>0) mx.setZ(mx.z()+e.z()); else mi.setZ(mi.z()+e.z());
+if(e.x()>0) mx.setX(mx.x()+e[0]); else mi.setX(mi.x()+e[0]);
+if(e.y()>0) mx.setY(mx.y()+e[1]); else mi.setY(mi.y()+e[1]);
+if(e.z()>0) mx.setZ(mx.z()+e[2]); else mi.setZ(mi.z()+e[2]);
 }
 	
 //
@@ -486,12 +494,19 @@ for(int i=0;i<3;++i)
 DBVT_INLINE bool		Intersect(	const btDbvtAabbMm& a,
 									const btDbvtAabbMm& b)
 {
+#if	DBVT_INT0_IMPL == DBVT_IMPL_SSE
+const __m128	rt(_mm_or_ps(	_mm_cmplt_ps(_mm_load_ps(b.mx),_mm_load_ps(a.mi)),
+								_mm_cmplt_ps(_mm_load_ps(a.mx),_mm_load_ps(b.mi))));
+const __int32*	pu((const __int32*)&rt);
+return((pu[0]|pu[1]|pu[2])==0);
+#else
 return(	(a.mi.x()<=b.mx.x())&&
 		(a.mx.x()>=b.mi.x())&&
 		(a.mi.y()<=b.mx.y())&&
 		(a.mx.y()>=b.mi.y())&&
 		(a.mi.z()<=b.mx.z())&&		
 		(a.mx.z()>=b.mi.z()));
+#endif
 }
 
 //
@@ -558,32 +573,8 @@ return(txmax>0);
 DBVT_INLINE btScalar	Proximity(	const btDbvtAabbMm& a,
 									const btDbvtAabbMm& b)
 {
-#if	DBVT_PROXIMITY_IMPL == DBVT_IMPL_SSE
-DBVT_ALIGN btScalar							r[1];
-static DBVT_ALIGN const unsigned __int32	mask[]={0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff};
-__asm
-	{
-	mov		eax,a
-	mov		ecx,b
-	movaps	xmm0,[eax]
-	movaps	xmm2,[ecx]
-	movaps	xmm1,[eax+16]
-	movaps	xmm3,[ecx+16]
-	addps	xmm0,xmm1
-	addps	xmm2,xmm3
-	subps	xmm0,xmm2
-	andps	xmm0,mask
-	movhlps	xmm1,xmm0
-	addps	xmm0,xmm1
-	pshufd	xmm1,xmm0,1
-	addss	xmm0,xmm1
-	movss	r,xmm0
-	}
-return(r[0]);
-#else
 const btVector3	d=(a.mi+a.mx)-(b.mi+b.mx);
 return(btFabs(d.x())+btFabs(d.y())+btFabs(d.z()));
-#endif
 }
 
 //
@@ -592,36 +583,57 @@ DBVT_INLINE int			Select(	const btDbvtAabbMm& o,
 								const btDbvtAabbMm& b)
 {
 #if	DBVT_SELECT_IMPL == DBVT_IMPL_SSE
-DBVT_ALIGN __int32							r[1];
 static DBVT_ALIGN const unsigned __int32	mask[]={0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff};
-__asm
-	{
-	mov		eax,o
-	mov		ecx,a
-	mov		edx,b
-	movaps	xmm0,[eax]
-	movaps	xmm5,mask
-	addps	xmm0,[eax+16]	
-	movaps	xmm1,[ecx]
-	movaps	xmm2,[edx]
-	addps	xmm1,[ecx+16]
-	addps	xmm2,[edx+16]
-	subps	xmm1,xmm0
-	subps	xmm2,xmm0
-	andps	xmm1,xmm5
-	andps	xmm2,xmm5
-	movhlps	xmm3,xmm1
-	movhlps	xmm4,xmm2
-	addps	xmm1,xmm3
-	addps	xmm2,xmm4
-	pshufd	xmm3,xmm1,1
-	pshufd	xmm4,xmm2,1
-	addss	xmm1,xmm3
-	addss	xmm2,xmm4
-	cmpless	xmm2,xmm1
-	movss	r,xmm2
-	}
-return(r[0]&1);
+	// TODO: the intrinsic version is 11% slower
+	#if DBVT_USE_INTRINSIC_SSE
+	__m128	omi(_mm_load_ps(o.mi));
+	omi=_mm_add_ps(omi,_mm_load_ps(o.mx));
+	__m128	ami(_mm_load_ps(a.mi));
+	ami=_mm_add_ps(ami,_mm_load_ps(a.mx));
+	ami=_mm_sub_ps(ami,omi);
+	ami=_mm_and_ps(ami,_mm_load_ps((const float*)mask));
+	__m128	bmi(_mm_load_ps(b.mi));
+	bmi=_mm_add_ps(bmi,_mm_load_ps(b.mx));
+	bmi=_mm_sub_ps(bmi,omi);
+	bmi=_mm_and_ps(bmi,_mm_load_ps((const float*)mask));
+	__m128	t0(_mm_movehl_ps(ami,ami));
+	ami=_mm_add_ps(ami,t0);
+	ami=_mm_add_ss(ami,_mm_shuffle_ps(ami,ami,1));
+	__m128	t1(_mm_movehl_ps(bmi,bmi));
+	bmi=_mm_add_ps(bmi,t1);
+	bmi=_mm_add_ss(bmi,_mm_shuffle_ps(bmi,bmi,1));
+	return(_mm_cmple_ss(bmi,ami).m128_u32[0]&1);
+	#else
+	DBVT_ALIGN __int32	r[1];
+	__asm
+		{
+		mov		eax,o
+		mov		ecx,a
+		mov		edx,b
+		movaps	xmm0,[eax]
+		movaps	xmm5,mask
+		addps	xmm0,[eax+16]	
+		movaps	xmm1,[ecx]
+		movaps	xmm2,[edx]
+		addps	xmm1,[ecx+16]
+		addps	xmm2,[edx+16]
+		subps	xmm1,xmm0
+		subps	xmm2,xmm0
+		andps	xmm1,xmm5
+		andps	xmm2,xmm5
+		movhlps	xmm3,xmm1
+		movhlps	xmm4,xmm2
+		addps	xmm1,xmm3
+		addps	xmm2,xmm4
+		pshufd	xmm3,xmm1,1
+		pshufd	xmm4,xmm2,1
+		addss	xmm1,xmm3
+		addss	xmm2,xmm4
+		cmpless	xmm2,xmm1
+		movss	r,xmm2
+		}
+	return(r[0]&1);
+	#endif
 #else
 return(Proximity(o,a)<Proximity(o,b)?0:1);
 #endif
@@ -633,20 +645,14 @@ DBVT_INLINE void		Merge(	const btDbvtAabbMm& a,
 								btDbvtAabbMm& r)
 {
 #if DBVT_MERGE_IMPL==DBVT_IMPL_SSE
-__asm
-	{
-	mov		eax,a
-	mov		edx,b
-	mov		ecx,r
-	movaps	xmm0,[eax+0]
-	movaps	xmm1,[edx+0]
-	movaps	xmm2,[eax+16]
-	movaps	xmm3,[edx+16]
-	minps	xmm0,xmm1
-	maxps	xmm2,xmm3
-	movaps	[ecx+0],xmm0
-	movaps	[ecx+16],xmm2
-	}
+__m128	ami(_mm_load_ps(a.mi));
+__m128	amx(_mm_load_ps(a.mx));
+__m128	bmi(_mm_load_ps(b.mi));
+__m128	bmx(_mm_load_ps(b.mx));
+ami=_mm_min_ps(ami,bmi);
+amx=_mm_max_ps(amx,bmx);
+_mm_store_ps(r.mi,ami);
+_mm_store_ps(r.mx,amx);
 #else
 for(int i=0;i<3;++i)
 	{
@@ -717,7 +723,7 @@ if(root0&&root1)
 	int								treshold=DOUBLE_STACKSIZE-4;
 	stack.resize(DOUBLE_STACKSIZE);
 	stack[0]=sStkNN(root0,root1);
-	do	{
+	do	{		
 		sStkNN	p=stack[--depth];
 		if(depth>treshold)
 			{
@@ -838,12 +844,13 @@ collideTT(root0,root1,xform,policy);
 //
 DBVT_PREFIX
 inline void		btDbvt::collideTV(	const btDbvtNode* root,
-									const btDbvtVolume& volume,
+									const btDbvtVolume& vol,
 									DBVT_IPOLICY)
 {
 DBVT_CHECKTYPE
 if(root)
 	{
+	ATTRIBUTE_ALIGNED16(btDbvtVolume)		volume(vol);
 	btAlignedObjectArray<const btDbvtNode*>	stack;
 	stack.reserve(SIMPLE_STACKSIZE);
 	stack.push_back(root);
@@ -1095,7 +1102,10 @@ if(root)
 #undef DBVT_IPOLICY
 #undef DBVT_CHECKTYPE
 #undef DBVT_IMPL_GENERIC
-#undef DBVT_IMPL_FPU0x86
 #undef DBVT_IMPL_SSE
+#undef DBVT_USE_INTRINSIC_SSE
+#undef DBVT_SELECT_IMPL
+#undef DBVT_MERGE_IMPL
+#undef DBVT_INT0_IMPL
 
 #endif