From 7f293488d12b5d5076b4bbf3d6c9248867c447a0 Mon Sep 17 00:00:00 2001 From: Erwin Coumans Date: Sat, 13 Sep 2008 07:06:43 +0000 Subject: Upgrade to latest Bullet trunk, that is in sync with Blender/extern/bullet2. (except for one define 'WIN32_AVOID_SSE_WHEN_EMBEDDED_INSIDE_BLENDER') In case someone reads those SVN logs: you can enable some extra broadphase SSE optimizations by replacing WIN32_AVOID_SSE_WHEN_EMBEDDED_INSIDE_BLENDER by WIN32 in extern/bullet2/src/BulletCollision/BroadphaseCollision/btDbvt.h Thanks to Benoit Bolsee for the upstream patch/contribution. Removed some obsolete files, they were just intended for comparison/testing. --- .../BroadphaseCollision/btAxisSweep3.h | 85 ++++----- .../BulletCollision/BroadphaseCollision/btDbvt.cpp | 91 ++++------ .../BulletCollision/BroadphaseCollision/btDbvt.h | 198 +++++++++++---------- .../BroadphaseCollision/btSimpleBroadphase.cpp | 66 +++---- .../BroadphaseCollision/btSimpleBroadphase.h | 21 +-- 5 files changed, 211 insertions(+), 250 deletions(-) (limited to 'extern/bullet2/src/BulletCollision/BroadphaseCollision') diff --git a/extern/bullet2/src/BulletCollision/BroadphaseCollision/btAxisSweep3.h b/extern/bullet2/src/BulletCollision/BroadphaseCollision/btAxisSweep3.h index e7c5fb5b6cf..d0ad09a385a 100644 --- a/extern/bullet2/src/BulletCollision/BroadphaseCollision/btAxisSweep3.h +++ b/extern/bullet2/src/BulletCollision/BroadphaseCollision/btAxisSweep3.h @@ -27,6 +27,7 @@ #include "btOverlappingPairCallback.h" //#define DEBUG_BROADPHASE 1 +#define USE_OVERLAP_TEST_ON_REMOVES 1 /// The internal templace class btAxisSweep3Internal implements the sweep and prune broadphase. /// It uses quantized integers to represent the begin and end points for each of the 3 axis. @@ -52,9 +53,7 @@ public: }; public: - //This breaks the Intel compiler, see http://softwarecommunity.intel.com/isn/Community/en-US/forums/thread/30253577.aspx - class Handle : public btBroadphaseProxy - //ATTRIBUTE_ALIGNED16(class) Handle : public btBroadphaseProxy + class Handle : public btBroadphaseProxy { public: BT_DECLARE_ALIGNED_ALLOCATOR(); @@ -80,7 +79,7 @@ protected: BP_FP_INT_TYPE m_numHandles; // number of active handles BP_FP_INT_TYPE m_maxHandles; // max number of handles Handle* m_pHandles; // handles pool - void* m_pHandlesRawPtr; + BP_FP_INT_TYPE m_firstFreeHandle; // free handles list Edge* m_pEdges[3]; // edge arrays for the 3 axes (each array has m_maxHandles * 2 + 2 sentinel entries) @@ -100,7 +99,7 @@ protected: void freeHandle(BP_FP_INT_TYPE handle); - bool testOverlap(int ignoreAxis,const Handle* pHandleA, const Handle* pHandleB); + bool testOverlap2D(const Handle* pHandleA, const Handle* pHandleB,int axis0,int axis1); #ifdef DEBUG_BROADPHASE void debugPrintAxis(int axis,bool checkCardinality=true); @@ -273,10 +272,9 @@ m_invalidPair(0) m_quantize = btVector3(btScalar(maxInt),btScalar(maxInt),btScalar(maxInt)) / aabbSize; - // allocate handles buffer and put all handles on free list - m_pHandlesRawPtr = btAlignedAlloc(sizeof(Handle)*maxHandles,16); - m_pHandles = new(m_pHandlesRawPtr) Handle[maxHandles]; - + // allocate handles buffer, using btAlignedAlloc, and put all handles on free list + m_pHandles = new Handle[maxHandles]; + m_maxHandles = maxHandles; m_numHandles = 0; @@ -327,7 +325,7 @@ btAxisSweep3Internal::~btAxisSweep3Internal() { btAlignedFree(m_pEdgesRawPtr[i]); } - btAlignedFree(m_pHandlesRawPtr); + delete [] m_pHandles; if (m_ownsPairCache) { @@ -603,34 +601,17 @@ bool btAxisSweep3Internal::testAabbOverlap(btBroadphaseProxy* pr } template -bool btAxisSweep3Internal::testOverlap(int ignoreAxis,const Handle* pHandleA, const Handle* pHandleB) +bool btAxisSweep3Internal::testOverlap2D(const Handle* pHandleA, const Handle* pHandleB,int axis0,int axis1) { //optimization 1: check the array index (memory address), instead of the m_pos - for (int axis = 0; axis < 3; axis++) + if (pHandleA->m_maxEdges[axis0] < pHandleB->m_minEdges[axis0] || + pHandleB->m_maxEdges[axis0] < pHandleA->m_minEdges[axis0] || + pHandleA->m_maxEdges[axis1] < pHandleB->m_minEdges[axis1] || + pHandleB->m_maxEdges[axis1] < pHandleA->m_minEdges[axis1]) { - if (axis != ignoreAxis) - { - if (pHandleA->m_maxEdges[axis] < pHandleB->m_minEdges[axis] || - pHandleB->m_maxEdges[axis] < pHandleA->m_minEdges[axis]) - { - return false; - } - } + return false; } - - //optimization 2: only 2 axis need to be tested (conflicts with 'delayed removal' optimization) - - /*for (int axis = 0; axis < 3; axis++) - { - if (m_pEdges[axis][pHandleA->m_maxEdges[axis]].m_pos < m_pEdges[axis][pHandleB->m_minEdges[axis]].m_pos || - m_pEdges[axis][pHandleB->m_maxEdges[axis]].m_pos < m_pEdges[axis][pHandleA->m_minEdges[axis]].m_pos) - { - return false; - } - } - */ - return true; } @@ -700,7 +681,9 @@ void btAxisSweep3Internal::sortMinDown(int axis, BP_FP_INT_TYPE if (pPrev->IsMax()) { // if previous edge is a maximum check the bounds and add an overlap if necessary - if (updateOverlaps && testOverlap(axis,pHandleEdge, pHandlePrev)) + const int axis1 = (1 << axis) & 3; + const int axis2 = (1 << axis1) & 3; + if (updateOverlaps && testOverlap2D(pHandleEdge, pHandlePrev,axis1,axis2)) { m_pairCache->addOverlappingPair(pHandleEdge,pHandlePrev); if (m_userPairCallback) @@ -748,12 +731,19 @@ void btAxisSweep3Internal::sortMinUp(int axis, BP_FP_INT_TYPE ed if (pNext->IsMax()) { - + Handle* handle0 = getHandle(pEdge->m_handle); + Handle* handle1 = getHandle(pNext->m_handle); + const int axis1 = (1 << axis) & 3; + const int axis2 = (1 << axis1) & 3; + // if next edge is maximum remove any overlap between the two handles - if (updateOverlaps) + if (updateOverlaps +#ifdef USE_OVERLAP_TEST_ON_REMOVES + && testOverlap2D(handle0,handle1,axis1,axis2) +#endif //USE_OVERLAP_TEST_ON_REMOVES + ) { - Handle* handle0 = getHandle(pEdge->m_handle); - Handle* handle1 = getHandle(pNext->m_handle); + m_pairCache->removeOverlappingPair(handle0,handle1,dispatcher); if (m_userPairCallback) @@ -799,12 +789,20 @@ void btAxisSweep3Internal::sortMaxDown(int axis, BP_FP_INT_TYPE if (!pPrev->IsMax()) { // if previous edge was a minimum remove any overlap between the two handles - if (updateOverlaps) + Handle* handle0 = getHandle(pEdge->m_handle); + Handle* handle1 = getHandle(pPrev->m_handle); + const int axis1 = (1 << axis) & 3; + const int axis2 = (1 << axis1) & 3; + + if (updateOverlaps +#ifdef USE_OVERLAP_TEST_ON_REMOVES + && testOverlap2D(handle0,handle1,axis1,axis2) +#endif //USE_OVERLAP_TEST_ON_REMOVES + ) { //this is done during the overlappingpairarray iteration/narrowphase collision - Handle* handle0 = getHandle(pEdge->m_handle); - Handle* handle1 = getHandle(pPrev->m_handle); + m_pairCache->removeOverlappingPair(handle0,handle1,dispatcher); if (m_userPairCallback) m_userPairCallback->removeOverlappingPair(handle0,handle1,dispatcher); @@ -850,10 +848,13 @@ void btAxisSweep3Internal::sortMaxUp(int axis, BP_FP_INT_TYPE ed { Handle* pHandleNext = getHandle(pNext->m_handle); + const int axis1 = (1 << axis) & 3; + const int axis2 = (1 << axis1) & 3; + if (!pNext->IsMax()) { // if next edge is a minimum check the bounds and add an overlap if necessary - if (updateOverlaps && testOverlap(axis, pHandleEdge, pHandleNext)) + if (updateOverlaps && testOverlap2D(pHandleEdge, pHandleNext,axis1,axis2)) { Handle* handle0 = getHandle(pEdge->m_handle); Handle* handle1 = getHandle(pNext->m_handle); diff --git a/extern/bullet2/src/BulletCollision/BroadphaseCollision/btDbvt.cpp b/extern/bullet2/src/BulletCollision/BroadphaseCollision/btDbvt.cpp index fade71179e6..7c41c8d8f71 100644 --- a/extern/bullet2/src/BulletCollision/BroadphaseCollision/btDbvt.cpp +++ b/extern/bullet2/src/BulletCollision/BroadphaseCollision/btDbvt.cpp @@ -663,24 +663,23 @@ Benchmarking dbvt... Leaves: 8192 sizeof(btDbvtVolume): 32 bytes sizeof(btDbvtNode): 44 bytes -[1] btDbvtVolume intersections: 3537 ms (0%) -[2] btDbvtVolume merges: 1945 ms (0%) -[3] btDbvt::collideTT: 6646 ms (0%) -[4] btDbvt::collideTT self: 3389 ms (0%) -[5] btDbvt::collideTT xform: 7505 ms (0%) -[6] btDbvt::collideTT xform,self: 7480 ms (0%) -[7] btDbvt::collideRAY: 6307 ms (0%),(332511 r/s) -[8] insert/remove: 2105 ms (-3%),(996271 ir/s) -[9] updates (teleport): 1943 ms (0%),(1079337 u/s) -[10] updates (jitter): 1301 ms (0%),(1611953 u/s) -[11] optimize (incremental): 2510 ms (0%),(1671000 o/s) -[12] btDbvtVolume notequal: 3677 ms (0%) -[13] culling(OCL+fullsort): 2231 ms (0%),(458 t/s) -[14] culling(OCL+qsort): 3500 ms (0%),(2340 t/s) -[15] culling(KDOP+qsort): 1151 ms (0%),(7117 t/s) -[16] insert/remove batch(256): 5138 ms (0%),(816330 bir/s) -[17] btDbvtVolume proximity: 2842 ms (0%) -[18] btDbvtVolume select: 3390 ms (0%) +[1] btDbvtVolume intersections: 3499 ms (-1%) +[2] btDbvtVolume merges: 1934 ms (0%) +[3] btDbvt::collideTT: 5485 ms (-21%) +[4] btDbvt::collideTT self: 2814 ms (-20%) +[5] btDbvt::collideTT xform: 7379 ms (-1%) +[6] btDbvt::collideTT xform,self: 7270 ms (-2%) +[7] btDbvt::collideRAY: 6314 ms (0%),(332143 r/s) +[8] insert/remove: 2093 ms (0%),(1001983 ir/s) +[9] updates (teleport): 1879 ms (-3%),(1116100 u/s) +[10] updates (jitter): 1244 ms (-4%),(1685813 u/s) +[11] optimize (incremental): 2514 ms (0%),(1668000 o/s) +[12] btDbvtVolume notequal: 3659 ms (0%) +[13] culling(OCL+fullsort): 2218 ms (0%),(461 t/s) +[14] culling(OCL+qsort): 3688 ms (5%),(2221 t/s) +[15] culling(KDOP+qsort): 1139 ms (-1%),(7192 t/s) +[16] insert/remove batch(256): 5092 ms (0%),(823704 bir/s) +[17] btDbvtVolume select: 3419 ms (0%) */ struct btDbvtBenchmark @@ -787,7 +786,7 @@ static const bool cfgEnable = true; //[1] btDbvtVolume intersections bool cfgBenchmark1_Enable = cfgEnable; static const int cfgBenchmark1_Iterations = 8; -static const int cfgBenchmark1_Reference = 3537; +static const int cfgBenchmark1_Reference = 3499; //[2] btDbvtVolume merges bool cfgBenchmark2_Enable = cfgEnable; static const int cfgBenchmark2_Iterations = 4; @@ -795,21 +794,21 @@ static const int cfgBenchmark2_Reference = 1945; //[3] btDbvt::collideTT bool cfgBenchmark3_Enable = cfgEnable; static const int cfgBenchmark3_Iterations = 512; -static const int cfgBenchmark3_Reference = 6646; +static const int cfgBenchmark3_Reference = 5485; //[4] btDbvt::collideTT self bool cfgBenchmark4_Enable = cfgEnable; static const int cfgBenchmark4_Iterations = 512; -static const int cfgBenchmark4_Reference = 3389; +static const int cfgBenchmark4_Reference = 2814; //[5] btDbvt::collideTT xform bool cfgBenchmark5_Enable = cfgEnable; static const int cfgBenchmark5_Iterations = 512; static const btScalar cfgBenchmark5_OffsetScale = 2; -static const int cfgBenchmark5_Reference = 7505; +static const int cfgBenchmark5_Reference = 7379; //[6] btDbvt::collideTT xform,self bool cfgBenchmark6_Enable = cfgEnable; static const int cfgBenchmark6_Iterations = 512; static const btScalar cfgBenchmark6_OffsetScale = 2; -static const int cfgBenchmark6_Reference = 7480; +static const int cfgBenchmark6_Reference = 7270; //[7] btDbvt::collideRAY bool cfgBenchmark7_Enable = cfgEnable; static const int cfgBenchmark7_Passes = 32; @@ -824,13 +823,13 @@ static const int cfgBenchmark8_Reference = 2105; bool cfgBenchmark9_Enable = cfgEnable; static const int cfgBenchmark9_Passes = 32; static const int cfgBenchmark9_Iterations = 65536; -static const int cfgBenchmark9_Reference = 1943; +static const int cfgBenchmark9_Reference = 1879; //[10] updates (jitter) bool cfgBenchmark10_Enable = cfgEnable; static const btScalar cfgBenchmark10_Scale = cfgVolumeCenterScale/10000; static const int cfgBenchmark10_Passes = 32; static const int cfgBenchmark10_Iterations = 65536; -static const int cfgBenchmark10_Reference = 1301; +static const int cfgBenchmark10_Reference = 1244; //[11] optimize (incremental) bool cfgBenchmark11_Enable = cfgEnable; static const int cfgBenchmark11_Passes = 64; @@ -857,14 +856,10 @@ bool cfgBenchmark16_Enable = cfgEnable; static const int cfgBenchmark16_BatchCount = 256; static const int cfgBenchmark16_Passes = 16384; static const int cfgBenchmark16_Reference = 5138; -//[17] proximity +//[17] select bool cfgBenchmark17_Enable = cfgEnable; -static const int cfgBenchmark17_Iterations = 8; -static const int cfgBenchmark17_Reference = 2842; -//[18] select -bool cfgBenchmark18_Enable = cfgEnable; -static const int cfgBenchmark18_Iterations = 4; -static const int cfgBenchmark18_Reference = 3390; +static const int cfgBenchmark17_Iterations = 4; +static const int cfgBenchmark17_Reference = 3390; btClock wallclock; printf("Benchmarking dbvt...\r\n"); @@ -1259,32 +1254,6 @@ if(cfgBenchmark17_Enable) {// Benchmark 17 srand(380843); btAlignedObjectArray volumes; - btAlignedObjectArray results; - volumes.resize(cfgLeaves); - results.resize(cfgLeaves); - for(int i=0;i volumes; btAlignedObjectArray results; btAlignedObjectArray indices; volumes.resize(cfgLeaves); @@ -1299,9 +1268,9 @@ if(cfgBenchmark18_Enable) { btSwap(indices[i],indices[rand()%cfgLeaves]); } - printf("[18] btDbvtVolume select: "); + printf("[17] btDbvtVolume select: "); wallclock.reset(); - for(int i=0;i= 1400) #define DBVT_USE_TEMPLATE 1 #else @@ -41,6 +41,9 @@ subject to the following restrictions: #define DBVT_USE_TEMPLATE 0 #endif +// Use only intrinsics instead of inline asm +#define DBVT_USE_INTRINSIC_SSE 1 + // Using memmov for collideOCL #define DBVT_USE_MEMMOVE 1 @@ -57,14 +60,21 @@ subject to the following restrictions: #endif // Specific methods implementation -#ifdef WIN32_AVOID_WHEN_EMBEDDED_INSIDE_BLENDER -#define DBVT_PROXIMITY_IMPL DBVT_IMPL_SSE + +#ifdef WIN32_AVOID_SSE_WHEN_EMBEDDED_INSIDE_BLENDER //there is always some weird compiler that breaks SSE builds #define DBVT_SELECT_IMPL DBVT_IMPL_SSE #define DBVT_MERGE_IMPL DBVT_IMPL_SSE +#define DBVT_INT0_IMPL DBVT_IMPL_SSE #else -#define DBVT_PROXIMITY_IMPL DBVT_IMPL_GENERIC #define DBVT_SELECT_IMPL DBVT_IMPL_GENERIC #define DBVT_MERGE_IMPL DBVT_IMPL_GENERIC +#define DBVT_INT0_IMPL DBVT_IMPL_GENERIC +#endif + +#if (DBVT_SELECT_IMPL==DBVT_IMPL_SSE)|| \ + (DBVT_MERGE_IMPL==DBVT_IMPL_SSE)|| \ + (DBVT_INT0_IMPL==DBVT_IMPL_SSE) +#include #endif // @@ -104,10 +114,6 @@ subject to the following restrictions: #error "DBVT_ENABLE_BENCHMARK undefined" #endif -#ifndef DBVT_PROXIMITY_IMPL -#error "DBVT_PROXIMITY_IMPL undefined" -#endif - #ifndef DBVT_SELECT_IMPL #error "DBVT_SELECT_IMPL undefined" #endif @@ -116,6 +122,10 @@ subject to the following restrictions: #error "DBVT_MERGE_IMPL undefined" #endif +#ifndef DBVT_INT0_IMPL +#error "DBVT_INT0_IMPL undefined" +#endif + // // Defaults volumes // @@ -133,8 +143,8 @@ static inline btDbvtAabbMm FromCR(const btVector3& c,btScalar r); static inline btDbvtAabbMm FromMM(const btVector3& mi,const btVector3& mx); static inline btDbvtAabbMm FromPoints(const btVector3* pts,int n); static inline btDbvtAabbMm FromPoints(const btVector3** ppts,int n); -DBVT_INLINE void Expand(const btVector3 e); -DBVT_INLINE void SignedExpand(const btVector3 e); +DBVT_INLINE void Expand(const btVector3& e); +DBVT_INLINE void SignedExpand(const btVector3& e); DBVT_INLINE bool Contain(const btDbvtAabbMm& a) const; DBVT_INLINE int Classify(const btVector3& n,btScalar o,int s) const; DBVT_INLINE btScalar ProjectMinimum(const btVector3& v,unsigned signs) const; @@ -173,12 +183,12 @@ struct btDbvtNode { btDbvtVolume volume; btDbvtNode* parent; - bool isleaf() const { return(childs[1]==0); } - bool isinternal() const { return(!isleaf()); } + DBVT_INLINE bool isleaf() const { return(childs[1]==0); } + DBVT_INLINE bool isinternal() const { return(!isleaf()); } union { - btDbvtNode* childs[2]; - void* data; - }; + btDbvtNode* childs[2]; + void* data; + }; }; ///The btDbvt class implements a fast dynamic bounding volume tree based on axis aligned bounding boxes (aabb tree). @@ -186,8 +196,6 @@ struct btDbvtNode ///Unlike the btQuantizedBvh, nodes can be dynamically moved around, which allows for change in topology of the underlying data structure. struct btDbvt { - - /* Stack element */ struct sStkNN { @@ -250,8 +258,8 @@ struct btDbvt }; // Fields - btDbvtNode* m_root; - btDbvtNode* m_free; + btDbvtNode* m_root; + btDbvtNode* m_free; int m_lkhd; int m_leaves; unsigned m_opath; @@ -408,17 +416,17 @@ return(box); } // -DBVT_INLINE void btDbvtAabbMm::Expand(const btVector3 e) +DBVT_INLINE void btDbvtAabbMm::Expand(const btVector3& e) { mi-=e;mx+=e; } // -DBVT_INLINE void btDbvtAabbMm::SignedExpand(const btVector3 e) +DBVT_INLINE void btDbvtAabbMm::SignedExpand(const btVector3& e) { -if(e.x()>0) mx.setX(mx.x()+e.x()); else mi.setX(mi.x()+e.x()); -if(e.y()>0) mx.setY(mx.y()+e.y()); else mi.setY(mi.y()+e.y()); -if(e.z()>0) mx.setZ(mx.z()+e.z()); else mi.setZ(mi.z()+e.z()); +if(e.x()>0) mx.setX(mx.x()+e[0]); else mi.setX(mi.x()+e[0]); +if(e.y()>0) mx.setY(mx.y()+e[1]); else mi.setY(mi.y()+e[1]); +if(e.z()>0) mx.setZ(mx.z()+e[2]); else mi.setZ(mi.z()+e[2]); } // @@ -486,12 +494,19 @@ for(int i=0;i<3;++i) DBVT_INLINE bool Intersect( const btDbvtAabbMm& a, const btDbvtAabbMm& b) { +#if DBVT_INT0_IMPL == DBVT_IMPL_SSE +const __m128 rt(_mm_or_ps( _mm_cmplt_ps(_mm_load_ps(b.mx),_mm_load_ps(a.mi)), + _mm_cmplt_ps(_mm_load_ps(a.mx),_mm_load_ps(b.mi)))); +const __int32* pu((const __int32*)&rt); +return((pu[0]|pu[1]|pu[2])==0); +#else return( (a.mi.x()<=b.mx.x())&& (a.mx.x()>=b.mi.x())&& (a.mi.y()<=b.mx.y())&& (a.mx.y()>=b.mi.y())&& (a.mi.z()<=b.mx.z())&& (a.mx.z()>=b.mi.z())); +#endif } // @@ -558,32 +573,8 @@ return(txmax>0); DBVT_INLINE btScalar Proximity( const btDbvtAabbMm& a, const btDbvtAabbMm& b) { -#if DBVT_PROXIMITY_IMPL == DBVT_IMPL_SSE -DBVT_ALIGN btScalar r[1]; -static DBVT_ALIGN const unsigned __int32 mask[]={0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff}; -__asm - { - mov eax,a - mov ecx,b - movaps xmm0,[eax] - movaps xmm2,[ecx] - movaps xmm1,[eax+16] - movaps xmm3,[ecx+16] - addps xmm0,xmm1 - addps xmm2,xmm3 - subps xmm0,xmm2 - andps xmm0,mask - movhlps xmm1,xmm0 - addps xmm0,xmm1 - pshufd xmm1,xmm0,1 - addss xmm0,xmm1 - movss r,xmm0 - } -return(r[0]); -#else const btVector3 d=(a.mi+a.mx)-(b.mi+b.mx); return(btFabs(d.x())+btFabs(d.y())+btFabs(d.z())); -#endif } // @@ -592,36 +583,57 @@ DBVT_INLINE int Select( const btDbvtAabbMm& o, const btDbvtAabbMm& b) { #if DBVT_SELECT_IMPL == DBVT_IMPL_SSE -DBVT_ALIGN __int32 r[1]; static DBVT_ALIGN const unsigned __int32 mask[]={0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff}; -__asm - { - mov eax,o - mov ecx,a - mov edx,b - movaps xmm0,[eax] - movaps xmm5,mask - addps xmm0,[eax+16] - movaps xmm1,[ecx] - movaps xmm2,[edx] - addps xmm1,[ecx+16] - addps xmm2,[edx+16] - subps xmm1,xmm0 - subps xmm2,xmm0 - andps xmm1,xmm5 - andps xmm2,xmm5 - movhlps xmm3,xmm1 - movhlps xmm4,xmm2 - addps xmm1,xmm3 - addps xmm2,xmm4 - pshufd xmm3,xmm1,1 - pshufd xmm4,xmm2,1 - addss xmm1,xmm3 - addss xmm2,xmm4 - cmpless xmm2,xmm1 - movss r,xmm2 - } -return(r[0]&1); + // TODO: the intrinsic version is 11% slower + #if DBVT_USE_INTRINSIC_SSE + __m128 omi(_mm_load_ps(o.mi)); + omi=_mm_add_ps(omi,_mm_load_ps(o.mx)); + __m128 ami(_mm_load_ps(a.mi)); + ami=_mm_add_ps(ami,_mm_load_ps(a.mx)); + ami=_mm_sub_ps(ami,omi); + ami=_mm_and_ps(ami,_mm_load_ps((const float*)mask)); + __m128 bmi(_mm_load_ps(b.mi)); + bmi=_mm_add_ps(bmi,_mm_load_ps(b.mx)); + bmi=_mm_sub_ps(bmi,omi); + bmi=_mm_and_ps(bmi,_mm_load_ps((const float*)mask)); + __m128 t0(_mm_movehl_ps(ami,ami)); + ami=_mm_add_ps(ami,t0); + ami=_mm_add_ss(ami,_mm_shuffle_ps(ami,ami,1)); + __m128 t1(_mm_movehl_ps(bmi,bmi)); + bmi=_mm_add_ps(bmi,t1); + bmi=_mm_add_ss(bmi,_mm_shuffle_ps(bmi,bmi,1)); + return(_mm_cmple_ss(bmi,ami).m128_u32[0]&1); + #else + DBVT_ALIGN __int32 r[1]; + __asm + { + mov eax,o + mov ecx,a + mov edx,b + movaps xmm0,[eax] + movaps xmm5,mask + addps xmm0,[eax+16] + movaps xmm1,[ecx] + movaps xmm2,[edx] + addps xmm1,[ecx+16] + addps xmm2,[edx+16] + subps xmm1,xmm0 + subps xmm2,xmm0 + andps xmm1,xmm5 + andps xmm2,xmm5 + movhlps xmm3,xmm1 + movhlps xmm4,xmm2 + addps xmm1,xmm3 + addps xmm2,xmm4 + pshufd xmm3,xmm1,1 + pshufd xmm4,xmm2,1 + addss xmm1,xmm3 + addss xmm2,xmm4 + cmpless xmm2,xmm1 + movss r,xmm2 + } + return(r[0]&1); + #endif #else return(Proximity(o,a)treshold) { @@ -838,12 +844,13 @@ collideTT(root0,root1,xform,policy); // DBVT_PREFIX inline void btDbvt::collideTV( const btDbvtNode* root, - const btDbvtVolume& volume, + const btDbvtVolume& vol, DBVT_IPOLICY) { DBVT_CHECKTYPE if(root) { + ATTRIBUTE_ALIGNED16(btDbvtVolume) volume(vol); btAlignedObjectArray stack; stack.reserve(SIMPLE_STACKSIZE); stack.push_back(root); @@ -1095,7 +1102,10 @@ if(root) #undef DBVT_IPOLICY #undef DBVT_CHECKTYPE #undef DBVT_IMPL_GENERIC -#undef DBVT_IMPL_FPU0x86 #undef DBVT_IMPL_SSE +#undef DBVT_USE_INTRINSIC_SSE +#undef DBVT_SELECT_IMPL +#undef DBVT_MERGE_IMPL +#undef DBVT_INT0_IMPL #endif diff --git a/extern/bullet2/src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp b/extern/bullet2/src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp index 2d27f22567f..a57952ffa06 100644 --- a/extern/bullet2/src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp +++ b/extern/bullet2/src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp @@ -55,17 +55,15 @@ btSimpleBroadphase::btSimpleBroadphase(int maxProxies, btOverlappingPairCache* o m_maxHandles = maxProxies; m_numHandles = 0; m_firstFreeHandle = 0; - m_firstAllocatedHandle = -1; + { for (int i = m_firstFreeHandle; i < maxProxies; i++) { m_pHandles[i].SetNextFree(i + 1); m_pHandles[i].m_uniqueId = i+2;//any UID will do, we just avoid too trivial values (0,1) for debugging purposes - m_pHandles[i].SetNextAllocated(-1); } m_pHandles[maxProxies - 1].SetNextFree(0); - m_pHandles[maxProxies - 1].SetNextAllocated(-1); } @@ -179,31 +177,29 @@ void btSimpleBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher) //first check for new overlapping pairs int i,j; - if (m_firstAllocatedHandle >= 0) + if (m_numHandles >= 0) { - btSimpleBroadphaseProxy* proxy0 = &m_pHandles[m_firstAllocatedHandle]; - for (i=0;ifindPair(proxy0,proxy1)) - { - m_pairCache->addOverlappingPair(proxy0,proxy1); - } - } else + btSimpleBroadphaseProxy* p0 = getSimpleProxyFromProxy(proxy0); + btSimpleBroadphaseProxy* p1 = getSimpleProxyFromProxy(proxy1); + + if (aabbOverlap(p0,p1)) + { + if ( !m_pairCache->findPair(proxy0,proxy1)) { + m_pairCache->addOverlappingPair(proxy0,proxy1); + } + } else + { if (!m_pairCache->hasDeferredRemoval()) { if ( m_pairCache->findPair(proxy0,proxy1)) @@ -211,19 +207,13 @@ void btSimpleBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher) m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher); } } - - } } - proxy1 = &m_pHandles[proxy1->GetNextAllocated()]; - } - proxy0 = &m_pHandles[proxy0->GetNextAllocated()]; - } if (m_ownsPairCache && m_pairCache->hasDeferredRemoval()) { - + btBroadphasePairArray& overlappingPairArray = m_pairCache->getOverlappingPairArray(); //perform a sort, to find duplicates and to sort 'invalid' pairs to the end @@ -237,11 +227,11 @@ void btSimpleBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher) previousPair.m_pProxy0 = 0; previousPair.m_pProxy1 = 0; previousPair.m_algorithm = 0; - - + + for (i=0;icleanOverlappingPair(pair,dispatcher); - // m_overlappingPairArray.swap(i,m_overlappingPairArray.size()-1); - // m_overlappingPairArray.pop_back(); + // m_overlappingPairArray.swap(i,m_overlappingPairArray.size()-1); + // m_overlappingPairArray.pop_back(); pair.m_pProxy0 = 0; pair.m_pProxy1 = 0; m_invalidPair++; gOverlappingPairs--; } - + } - ///if you don't like to skip the invalid pairs in the array, execute following code: - #define CLEAN_INVALID_PAIRS 1 - #ifdef CLEAN_INVALID_PAIRS + ///if you don't like to skip the invalid pairs in the array, execute following code: +#define CLEAN_INVALID_PAIRS 1 +#ifdef CLEAN_INVALID_PAIRS //perform a sort, to sort 'invalid' pairs to the end overlappingPairArray.quickSort(btBroadphasePairSortPredicate()); overlappingPairArray.resize(overlappingPairArray.size() - m_invalidPair); m_invalidPair = 0; - #endif//CLEAN_INVALID_PAIRS +#endif//CLEAN_INVALID_PAIRS } } diff --git a/extern/bullet2/src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.h b/extern/bullet2/src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.h index 49dfeb84900..e2ebb825725 100644 --- a/extern/bullet2/src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.h +++ b/extern/bullet2/src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.h @@ -25,7 +25,7 @@ struct btSimpleBroadphaseProxy : public btBroadphaseProxy btVector3 m_min; btVector3 m_max; int m_nextFree; - int m_nextAllocated; + // int m_handleId; @@ -42,8 +42,7 @@ struct btSimpleBroadphaseProxy : public btBroadphaseProxy SIMD_FORCE_INLINE void SetNextFree(int next) {m_nextFree = next;} SIMD_FORCE_INLINE int GetNextFree() const {return m_nextFree;} - SIMD_FORCE_INLINE void SetNextAllocated(int next) {m_nextAllocated = next;} - SIMD_FORCE_INLINE int GetNextAllocated() const {return m_nextAllocated;} + }; @@ -57,22 +56,18 @@ protected: int m_numHandles; // number of active handles int m_maxHandles; // max number of handles + btSimpleBroadphaseProxy* m_pHandles; // handles pool + void* m_pHandlesRawPtr; int m_firstFreeHandle; // free handles list - int m_firstAllocatedHandle; - + int allocHandle() { - + btAssert(m_numHandles < m_maxHandles); int freeHandle = m_firstFreeHandle; m_firstFreeHandle = m_pHandles[freeHandle].GetNextFree(); - - m_pHandles[freeHandle].SetNextAllocated(m_firstAllocatedHandle); - m_firstAllocatedHandle = freeHandle; - m_numHandles++; - return freeHandle; } @@ -84,13 +79,9 @@ protected: proxy->SetNextFree(m_firstFreeHandle); m_firstFreeHandle = handle; - m_firstAllocatedHandle = proxy->GetNextAllocated(); - proxy->SetNextAllocated(-1); - m_numHandles--; } - btOverlappingPairCache* m_pairCache; bool m_ownsPairCache; -- cgit v1.2.3