git.blender.org/blender.git
Diffstat (limited to 'extern/bullet/BulletDynamics/ConstraintSolver/SorLcp.cpp')
-rw-r--r--	extern/bullet/BulletDynamics/ConstraintSolver/SorLcp.cpp	821
1 file changed, 821 insertions, 0 deletions
diff --git a/extern/bullet/BulletDynamics/ConstraintSolver/SorLcp.cpp b/extern/bullet/BulletDynamics/ConstraintSolver/SorLcp.cpp
new file mode 100644
index 00000000000..d5c6d80552d
--- /dev/null
+++ b/extern/bullet/BulletDynamics/ConstraintSolver/SorLcp.cpp
@@ -0,0 +1,821 @@
+/*
+ * Copyright (c) 2005 Erwin Coumans http://www.erwincoumans.com
+ *
+ * Permission to use, copy, modify, distribute and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appear in all copies.
+ * Erwin Coumans makes no representations about the suitability
+ * of this software for any purpose.
+ * It is provided "as is" without express or implied warranty.
+*/
+#include "SorLcp.h"
+
+#ifdef USE_SOR_SOLVER
+
+// SOR LCP solver taken from ODE's quickstep solver.
+// todo: write our own successive over-relaxation Gauss-Seidel or Jacobi iterative solver
+
+
+#include "SimdScalar.h"
+
+#include "Dynamics/RigidBody.h"
+#include <math.h>
+#include <float.h>//FLT_MAX
+#ifdef WIN32
+#include <memory.h>
+#endif
+#include <string.h>
+#include <stdio.h>
+
+#ifdef WIN32
+#include <malloc.h>
+#else
+#include <alloca.h>
+#endif
+
+#include "Dynamics/BU_Joint.h"
+#include "ContactSolverInfo.h"
+
+////////////////////////////////////////////////////////////////////
+//math stuff
+
+typedef SimdScalar dVector4[4];
+typedef SimdScalar dMatrix3[4*3];
+#define dInfinity FLT_MAX
+
+
+
+#define dRecip(x) ((float)(1.0f/(x))) /* reciprocal */
+
+
+
+#define dMULTIPLY0_331NEW(A,op,B,C) \
+{\
+ float tmp[3];\
+ tmp[0] = C.getX();\
+ tmp[1] = C.getY();\
+ tmp[2] = C.getZ();\
+ dMULTIPLYOP0_331(A,op,B,tmp);\
+}
+
+#define dMULTIPLY0_331(A,B,C) dMULTIPLYOP0_331(A,=,B,C)
+#define dMULTIPLYOP0_331(A,op,B,C) \
+ (A)[0] op dDOT1((B),(C)); \
+ (A)[1] op dDOT1((B+4),(C)); \
+ (A)[2] op dDOT1((B+8),(C));
+
+#define dAASSERT ASSERT
+#define dIASSERT ASSERT
+
+#define REAL float
+#define dDOTpq(a,b,p,q) ((a)[0]*(b)[0] + (a)[p]*(b)[q] + (a)[2*(p)]*(b)[2*(q)])
+SimdScalar dDOT1 (const SimdScalar *a, const SimdScalar *b) { return dDOTpq(a,b,1,1); }
+#define dDOT14(a,b) dDOTpq(a,b,1,4)
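+// dDOTpq dots two 3-vectors using strides p and q. dDOT1 is a plain 3-vector dot
+// product; dDOT14 strides its second operand by 4, i.e. it dots against a column
+// of a row-major 3x4 matrix.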
+
+#define dCROSS(a,op,b,c) \
+ (a)[0] op ((b)[1]*(c)[2] - (b)[2]*(c)[1]); \
+ (a)[1] op ((b)[2]*(c)[0] - (b)[0]*(c)[2]); \
+ (a)[2] op ((b)[0]*(c)[1] - (b)[1]*(c)[0]);
+
+
+#define dMULTIPLYOP2_333(A,op,B,C) \
+ (A)[0] op dDOT1((B),(C)); \
+ (A)[1] op dDOT1((B),(C+4)); \
+ (A)[2] op dDOT1((B),(C+8)); \
+ (A)[4] op dDOT1((B+4),(C)); \
+ (A)[5] op dDOT1((B+4),(C+4)); \
+ (A)[6] op dDOT1((B+4),(C+8)); \
+ (A)[8] op dDOT1((B+8),(C)); \
+ (A)[9] op dDOT1((B+8),(C+4)); \
+ (A)[10] op dDOT1((B+8),(C+8));
+#define dMULTIPLYOP0_333(A,op,B,C) \
+ (A)[0] op dDOT14((B),(C)); \
+ (A)[1] op dDOT14((B),(C+1)); \
+ (A)[2] op dDOT14((B),(C+2)); \
+ (A)[4] op dDOT14((B+4),(C)); \
+ (A)[5] op dDOT14((B+4),(C+1)); \
+ (A)[6] op dDOT14((B+4),(C+2)); \
+ (A)[8] op dDOT14((B+8),(C)); \
+ (A)[9] op dDOT14((B+8),(C+1)); \
+ (A)[10] op dDOT14((B+8),(C+2));
+
+#define dMULTIPLY2_333(A,B,C) dMULTIPLYOP2_333(A,=,B,C)
+#define dMULTIPLY0_333(A,B,C) dMULTIPLYOP0_333(A,=,B,C)
+#define dMULTIPLYADD0_331(A,B,C) dMULTIPLYOP0_331(A,+=,B,C)
+
+
+////////////////////////////////////////////////////////////////////
+#define EFFICIENT_ALIGNMENT 16
+#define dEFFICIENT_SIZE(x) ((((x)-1)|(EFFICIENT_ALIGNMENT-1))+1)
+/* alloca aligned to the EFFICIENT_ALIGNMENT. note that this can waste
+ * up to 15 bytes per allocation, depending on what alloca() returns.
+ */
+
+#define dALLOCA16(n) \
+ ((char*)dEFFICIENT_SIZE(((size_t)(alloca((n)+(EFFICIENT_ALIGNMENT-1))))))
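+// dEFFICIENT_SIZE rounds its argument up to the next multiple of EFFICIENT_ALIGNMENT,
+// so dALLOCA16 over-allocates by 15 bytes and rounds the pointer returned by alloca()
+// up to a 16-byte boundary.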
+
+
+
+/////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////
+
+#ifdef DEBUG
+#define ANSI_FTOL 1
+
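+// MSVC/x86 only: a hand-written replacement for the compiler's _ftol2
+// float-to-integer conversion helper; ANSI_FTOL selects the variant that forces
+// truncation via the FPU control word, as required by C semantics.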
+extern "C" {
+ __declspec(naked) void _ftol2() {
+ __asm {
+#if ANSI_FTOL
+ fnstcw WORD PTR [esp-2]
+ mov ax, WORD PTR [esp-2]
+
+ OR AX, 0C00h
+
+ mov WORD PTR [esp-4], ax
+ fldcw WORD PTR [esp-4]
+ fistp QWORD PTR [esp-12]
+ fldcw WORD PTR [esp-2]
+ mov eax, DWORD PTR [esp-12]
+ mov edx, DWORD PTR [esp-8]
+#else
+ fistp DWORD PTR [esp-12]
+ mov eax, DWORD PTR [esp-12]
+ mov ecx, DWORD PTR [esp-8]
+#endif
+ ret
+ }
+ }
+}
+#endif //DEBUG
+
+
+
+
+
+
+#define ALLOCA dALLOCA16
+
+typedef const SimdScalar *dRealPtr;
+typedef SimdScalar *dRealMutablePtr;
+#define dRealArray(name,n) SimdScalar name[n];
+#define dRealAllocaArray(name,n) SimdScalar *name = (SimdScalar*) ALLOCA ((n)*sizeof(SimdScalar));
+
+void dSetZero1 (SimdScalar *a, int n)
+{
+ dAASSERT (a && n >= 0);
+ while (n > 0) {
+ *(a++) = 0;
+ n--;
+ }
+}
+
+void dSetValue1 (SimdScalar *a, int n, SimdScalar value)
+{
+ dAASSERT (a && n >= 0);
+ while (n > 0) {
+ *(a++) = value;
+ n--;
+ }
+}
+
+
+//***************************************************************************
+// configuration
+
+// for the SOR and CG methods:
+// uncomment the following line to use warm starting. this definitely
+// helps for motor-driven joints. unfortunately it appears to hurt
+// with high-friction contacts using the SOR method. use with care.
+
+//#define WARM_STARTING 1
+
+// for the SOR method:
+// uncomment the following line to randomly reorder constraint rows
+// during the solution. depending on the situation, this can help a lot
+// or hardly at all, but it doesn't seem to hurt.
+
+//#define RANDOMLY_REORDER_CONSTRAINTS 1
+
+
+
+//***************************************************************************
+// various common computations involving the matrix J
+
+// compute iMJ = inv(M)*J'
+
+static void compute_invM_JT (int m, dRealMutablePtr J, dRealMutablePtr iMJ, int *jb,
+ RigidBody * const *body, dRealPtr invI)
+{
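+	// each constraint row occupies 12 scalars: 3 linear + 3 angular entries for
+	// body 1 followed by 3 linear + 3 angular entries for body 2. the linear parts
+	// are scaled by the body's inverse mass; the angular parts are multiplied by
+	// its world-frame inverse inertia (a 3x4 block at invI + 12*body).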
+ int i,j;
+ dRealMutablePtr iMJ_ptr = iMJ;
+ dRealMutablePtr J_ptr = J;
+ for (i=0; i<m; i++) {
+ int b1 = jb[i*2];
+ int b2 = jb[i*2+1];
+ SimdScalar k = body[b1]->getInvMass();
+ for (j=0; j<3; j++) iMJ_ptr[j] = k*J_ptr[j];
+ dMULTIPLY0_331 (iMJ_ptr + 3, invI + 12*b1, J_ptr + 3);
+ if (b2 >= 0) {
+ k = body[b2]->getInvMass();
+ for (j=0; j<3; j++) iMJ_ptr[j+6] = k*J_ptr[j+6];
+ dMULTIPLY0_331 (iMJ_ptr + 9, invI + 12*b2, J_ptr + 9);
+ }
+ J_ptr += 12;
+ iMJ_ptr += 12;
+ }
+}
+
+static void multiply_invM_JTSpecial (int m, int nb, dRealMutablePtr iMJ, int *jb,
+ dRealMutablePtr in, dRealMutablePtr out,int onlyBody1,int onlyBody2)
+{
+ int i,j;
+
+
+
+ dRealMutablePtr out_ptr1 = out + onlyBody1*6;
+
+ for (j=0; j<6; j++)
+ out_ptr1[j] = 0;
+
+ if (onlyBody2 >= 0)
+ {
+ out_ptr1 = out + onlyBody2*6;
+
+ for (j=0; j<6; j++)
+ out_ptr1[j] = 0;
+ }
+
+ dRealPtr iMJ_ptr = iMJ;
+ for (i=0; i<m; i++) {
+
+ int b1 = jb[i*2];
+
+ dRealMutablePtr out_ptr = out + b1*6;
+ if ((b1 == onlyBody1) || (b1 == onlyBody2))
+ {
+ for (j=0; j<6; j++)
+ out_ptr[j] += iMJ_ptr[j] * in[i] ;
+ }
+
+ iMJ_ptr += 6;
+
+ int b2 = jb[i*2+1];
+ if ((b2 == onlyBody1) || (b2 == onlyBody2))
+ {
+ if (b2 >= 0)
+ {
+ out_ptr = out + b2*6;
+ for (j=0; j<6; j++)
+ out_ptr[j] += iMJ_ptr[j] * in[i];
+ }
+ }
+
+ iMJ_ptr += 6;
+
+ }
+}
+
+
+// compute out = inv(M)*J'*in.
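+// 'in' has one entry per constraint row (length m); 'out' is nb stacked 6-vectors
+// (3 linear + 3 angular components per body).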
+
+static void multiply_invM_JT (int m, int nb, dRealMutablePtr iMJ, int *jb,
+ dRealMutablePtr in, dRealMutablePtr out)
+{
+ int i,j;
+ dSetZero1 (out,6*nb);
+ dRealPtr iMJ_ptr = iMJ;
+ for (i=0; i<m; i++) {
+ int b1 = jb[i*2];
+ int b2 = jb[i*2+1];
+ dRealMutablePtr out_ptr = out + b1*6;
+ for (j=0; j<6; j++)
+ out_ptr[j] += iMJ_ptr[j] * in[i];
+ iMJ_ptr += 6;
+ if (b2 >= 0) {
+ out_ptr = out + b2*6;
+ for (j=0; j<6; j++) out_ptr[j] += iMJ_ptr[j] * in[i];
+ }
+ iMJ_ptr += 6;
+ }
+}
+
+
+// compute out = J*in.
+
+static void multiply_J (int m, dRealMutablePtr J, int *jb,
+ dRealMutablePtr in, dRealMutablePtr out)
+{
+ int i,j;
+ dRealPtr J_ptr = J;
+ for (i=0; i<m; i++) {
+ int b1 = jb[i*2];
+ int b2 = jb[i*2+1];
+ SimdScalar sum = 0;
+ dRealMutablePtr in_ptr = in + b1*6;
+ for (j=0; j<6; j++) sum += J_ptr[j] * in_ptr[j];
+ J_ptr += 6;
+ if (b2 >= 0) {
+ in_ptr = in + b2*6;
+ for (j=0; j<6; j++) sum += J_ptr[j] * in_ptr[j];
+ }
+ J_ptr += 6;
+ out[i] = sum;
+ }
+}
+
+//***************************************************************************
+// SOR-LCP method
+
+// nb is the number of bodies in the body array.
+// J is an m*12 matrix of constraint rows
+// jb is an array of first and second body numbers for each constraint row
+// invI is the global frame inverse inertia for each body (stacked 3x3 matrices)
+//
+// this returns lambda and fc (the constraint force, here named invMforce).
+// note: fc is returned as inv(M)*J'*lambda; the actual constraint force is J'*lambda
+//
+// b, lo and hi are modified on exit
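+//
+// in matrix terms this iteratively solves the mixed LCP
+//   A*lambda = rhs,  lo <= lambda <= hi,  with A = J*inv(M)*J' + diag(cfm),
+// by projected Gauss-Seidel with over-relaxation; findex couples a row's bounds
+// to another row's lambda (used for friction limits).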
+
+
+struct IndexError {
+ SimdScalar error; // error to sort on
+ int findex;
+ int index; // row index
+};
+
+static unsigned long seed2 = 0;
+
+unsigned long dRand2()
+{
+ seed2 = (1664525L*seed2 + 1013904223L) & 0xffffffff;
+ return seed2;
+}
+
+int dRandInt2 (int n)
+{
+ float a = float(n) / 4294967296.0f;
+ return (int) (float(dRand2()) * a);
+}
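+// dRand2 above is a 32-bit linear congruential generator; dRandInt2 maps its
+// output to an (approximately uniform) integer in [0,n).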
+
+
+static void SOR_LCP (int m, int nb, dRealMutablePtr J, int *jb, RigidBody * const *body,
+ dRealPtr invI, dRealMutablePtr lambda, dRealMutablePtr invMforce, dRealMutablePtr rhs,
+ dRealMutablePtr lo, dRealMutablePtr hi, dRealPtr cfm, int *findex,
+ int numiter,float overRelax)
+{
+ const int num_iterations = numiter;
+ const float sor_w = overRelax; // SOR over-relaxation parameter
+
+ int i,j;
+
+#ifdef WARM_STARTING
+ // for warm starting, this seems to be necessary to prevent
+ // jerkiness in motor-driven joints. i have no idea why this works.
+ for (i=0; i<m; i++) lambda[i] *= 0.9;
+#else
+ dSetZero1 (lambda,m);
+#endif
+
+ // the lambda computed at the previous iteration.
+ // this is used to measure error for when we are reordering the indexes.
+ dRealAllocaArray (last_lambda,m);
+
+ // a copy of the 'hi' vector in case findex[] is being used
+ dRealAllocaArray (hicopy,m);
+ memcpy (hicopy,hi,m*sizeof(float));
+
+ // precompute iMJ = inv(M)*J'
+ dRealAllocaArray (iMJ,m*12);
+ compute_invM_JT (m,J,iMJ,jb,body,invI);
+
+ // compute fc=(inv(M)*J')*lambda. we will incrementally maintain fc
+ // as we change lambda.
+#ifdef WARM_STARTING
+	multiply_invM_JT (m,nb,iMJ,jb,lambda,invMforce);	// invMforce plays the role of 'fc' in the original ODE code
+#else
+ dSetZero1 (invMforce,nb*6);
+#endif
+
+ // precompute 1 / diagonals of A
+ dRealAllocaArray (Ad,m);
+ dRealPtr iMJ_ptr = iMJ;
+ dRealMutablePtr J_ptr = J;
+ for (i=0; i<m; i++) {
+ float sum = 0;
+ for (j=0; j<6; j++) sum += iMJ_ptr[j] * J_ptr[j];
+ if (jb[i*2+1] >= 0) {
+ for (j=6; j<12; j++) sum += iMJ_ptr[j] * J_ptr[j];
+ }
+ iMJ_ptr += 12;
+ J_ptr += 12;
+ Ad[i] = sor_w / (sum + cfm[i]);
+ }
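+	// at this point Ad[i] = w / ((J*inv(M)*J')_ii + cfm[i]), the over-relaxed
+	// inverse diagonal used to scale row i.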
+
+ // scale J and b by Ad
+ J_ptr = J;
+ for (i=0; i<m; i++) {
+ for (j=0; j<12; j++) {
+ J_ptr[0] *= Ad[i];
+ J_ptr++;
+ }
+ rhs[i] *= Ad[i];
+ }
+
+ // scale Ad by CFM
+ for (i=0; i<m; i++) Ad[i] *= cfm[i];
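+	// after this scaling Ad[i] = w*cfm[i] / ((J*inv(M)*J')_ii + cfm[i]); it supplies
+	// the CFM diagonal contribution via the lambda[index]*Ad[index] term below.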
+
+ // order to solve constraint rows in
+ IndexError *order = (IndexError*) alloca (m*sizeof(IndexError));
+
+#ifndef REORDER_CONSTRAINTS
+ // make sure constraints with findex < 0 come first.
+ j=0;
+ for (i=0; i<m; i++) if (findex[i] < 0) order[j++].index = i;
+ for (i=0; i<m; i++) if (findex[i] >= 0) order[j++].index = i;
+ dIASSERT (j==m);
+#endif
+
+ for (int iteration=0; iteration < num_iterations; iteration++) {
+
+#ifdef REORDER_CONSTRAINTS
+ // constraints with findex < 0 always come first.
+ if (iteration < 2) {
+ // for the first two iterations, solve the constraints in
+ // the given order
+ for (i=0; i<m; i++) {
+ order[i].error = i;
+ order[i].findex = findex[i];
+ order[i].index = i;
+ }
+ }
+ else {
+ // sort the constraints so that the ones converging slowest
+ // get solved last. use the absolute (not relative) error.
+ for (i=0; i<m; i++) {
+ float v1 = dFabs (lambda[i]);
+ float v2 = dFabs (last_lambda[i]);
+ float max = (v1 > v2) ? v1 : v2;
+ if (max > 0) {
+ //@@@ relative error: order[i].error = dFabs(lambda[i]-last_lambda[i])/max;
+ order[i].error = dFabs(lambda[i]-last_lambda[i]);
+ }
+ else {
+ order[i].error = dInfinity;
+ }
+ order[i].findex = findex[i];
+ order[i].index = i;
+ }
+ }
+ qsort (order,m,sizeof(IndexError),&compare_index_error);
+#endif
+#ifdef RANDOMLY_REORDER_CONSTRAINTS
+ if ((iteration & 7) == 0) {
+ for (i=1; i<m; ++i) {
+ IndexError tmp = order[i];
+ int swapi = dRandInt2(i+1);
+ order[i] = order[swapi];
+ order[swapi] = tmp;
+ }
+ }
+#endif
+
+ //@@@ potential optimization: swap lambda and last_lambda pointers rather
+ // than copying the data. we must make sure lambda is properly
+ // returned to the caller
+ memcpy (last_lambda,lambda,m*sizeof(float));
+
+ for (int i=0; i<m; i++) {
+ // @@@ potential optimization: we could pre-sort J and iMJ, thereby
+ // linearizing access to those arrays. hmmm, this does not seem
+ // like a win, but we should think carefully about our memory
+ // access pattern.
+
+ int index = order[i].index;
+ J_ptr = J + index*12;
+ iMJ_ptr = iMJ + index*12;
+
+ // set the limits for this constraint. note that 'hicopy' is used.
+ // this is the place where the QuickStep method differs from the
+ // direct LCP solving method, since that method only performs this
+ // limit adjustment once per time step, whereas this method performs
+ // once per iteration per constraint row.
+ // the constraints are ordered so that all lambda[] values needed have
+ // already been computed.
+ if (findex[index] >= 0) {
+ hi[index] = fabsf (hicopy[index] * lambda[findex[index]]);
+ lo[index] = -hi[index];
+ }
+
+ int b1 = jb[index*2];
+ int b2 = jb[index*2+1];
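+			// delta accumulates w/A_ii * (rhs_i - (A*lambda)_i), the scaled row residual:
+			// the J*inv(M)*J'*lambda part is evaluated incrementally through
+			// invMforce = inv(M)*J'*lambda, and the cfm diagonal part comes from
+			// lambda[index]*Ad[index].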
+ float delta = rhs[index] - lambda[index]*Ad[index];
+ dRealMutablePtr fc_ptr = invMforce + 6*b1;
+
+ // @@@ potential optimization: SIMD-ize this and the b2 >= 0 case
+ delta -=fc_ptr[0] * J_ptr[0] + fc_ptr[1] * J_ptr[1] +
+ fc_ptr[2] * J_ptr[2] + fc_ptr[3] * J_ptr[3] +
+ fc_ptr[4] * J_ptr[4] + fc_ptr[5] * J_ptr[5];
+ // @@@ potential optimization: handle 1-body constraints in a separate
+ // loop to avoid the cost of test & jump?
+ if (b2 >= 0) {
+ fc_ptr = invMforce + 6*b2;
+ delta -=fc_ptr[0] * J_ptr[6] + fc_ptr[1] * J_ptr[7] +
+ fc_ptr[2] * J_ptr[8] + fc_ptr[3] * J_ptr[9] +
+ fc_ptr[4] * J_ptr[10] + fc_ptr[5] * J_ptr[11];
+ }
+
+ // compute lambda and clamp it to [lo,hi].
+ // @@@ potential optimization: does SSE have clamping instructions
+ // to save test+jump penalties here?
+ float new_lambda = lambda[index] + delta;
+ if (new_lambda < lo[index]) {
+ delta = lo[index]-lambda[index];
+ lambda[index] = lo[index];
+ }
+ else if (new_lambda > hi[index]) {
+ delta = hi[index]-lambda[index];
+ lambda[index] = hi[index];
+ }
+ else {
+ lambda[index] = new_lambda;
+ }
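+			// delta now equals the change actually applied to lambda[index], so the
+			// updates below keep invMforce == inv(M)*J'*lambda consistent with the
+			// clamped lambda.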
+
+ //@@@ a trick that may or may not help
+ //float ramp = (1-((float)(iteration+1)/(float)num_iterations));
+ //delta *= ramp;
+
+ // update invMforce.
+ // @@@ potential optimization: SIMD for this and the b2 >= 0 case
+ fc_ptr = invMforce + 6*b1;
+ fc_ptr[0] += delta * iMJ_ptr[0];
+ fc_ptr[1] += delta * iMJ_ptr[1];
+ fc_ptr[2] += delta * iMJ_ptr[2];
+ fc_ptr[3] += delta * iMJ_ptr[3];
+ fc_ptr[4] += delta * iMJ_ptr[4];
+ fc_ptr[5] += delta * iMJ_ptr[5];
+ // @@@ potential optimization: handle 1-body constraints in a separate
+ // loop to avoid the cost of test & jump?
+ if (b2 >= 0) {
+ fc_ptr = invMforce + 6*b2;
+ fc_ptr[0] += delta * iMJ_ptr[6];
+ fc_ptr[1] += delta * iMJ_ptr[7];
+ fc_ptr[2] += delta * iMJ_ptr[8];
+ fc_ptr[3] += delta * iMJ_ptr[9];
+ fc_ptr[4] += delta * iMJ_ptr[10];
+ fc_ptr[5] += delta * iMJ_ptr[11];
+ }
+ }
+ }
+}
+
+
+
+void SolveInternal1 (float global_cfm,
+ float global_erp,
+ RigidBody * const *body, int nb,
+ BU_Joint * const *_joint,
+ int nj,
+ const ContactSolverInfo& solverInfo)
+{
+
+ int numIter = 30;
+ float sOr = 1.3f;
+
+ int i,j;
+
+ SimdScalar stepsize1 = dRecip(solverInfo.m_timeStep);
+
+ // number all bodies in the body list - set their tag values
+ for (i=0; i<nb; i++)
+ body[i]->m_odeTag = i;
+
+ // make a local copy of the joint array, because we might want to modify it.
+ // (the "BU_Joint *const*" declaration says we're allowed to modify the joints
+ // but not the joint array, because the caller might need it unchanged).
+ //@@@ do we really need to do this? we'll be sorting constraint rows individually, not joints
+ BU_Joint **joint = (BU_Joint**) alloca (nj * sizeof(BU_Joint*));
+ memcpy (joint,_joint,nj * sizeof(BU_Joint*));
+
+ // for all bodies, compute the inertia tensor and its inverse in the global
+ // frame, and compute the rotational force and add it to the torque
+ // accumulator. I and invI are a vertical stack of 3x4 matrices, one per body.
+ dRealAllocaArray (I,3*4*nb);
+ dRealAllocaArray (invI,3*4*nb);
+/* for (i=0; i<nb; i++) {
+ dMatrix3 tmp;
+ // compute inertia tensor in global frame
+ dMULTIPLY2_333 (tmp,body[i]->m_I,body[i]->m_R);
+ // compute inverse inertia tensor in global frame
+ dMULTIPLY2_333 (tmp,body[i]->m_invI,body[i]->m_R);
+ dMULTIPLY0_333 (invI+i*12,body[i]->m_R,tmp);
+ // compute rotational force
+ dCROSS (body[i]->m_tacc,-=,body[i]->getAngularVelocity(),tmp);
+ }
+*/
+ for (i=0; i<nb; i++) {
+ dMatrix3 tmp;
+ // compute inertia tensor in global frame
+ dMULTIPLY2_333 (tmp,body[i]->m_I,body[i]->m_R);
+ dMULTIPLY0_333 (I+i*12,body[i]->m_R,tmp);
+ // compute inverse inertia tensor in global frame
+ dMULTIPLY2_333 (tmp,body[i]->m_invI,body[i]->m_R);
+ dMULTIPLY0_333 (invI+i*12,body[i]->m_R,tmp);
+ // compute rotational force
+ dMULTIPLY0_331 (tmp,I+i*12,body[i]->getAngularVelocity());
+ //dCROSS (body[i]->tacc,-=,body[i]->avel,tmp);
+ dCROSS (body[i]->m_tacc,-=,body[i]->getAngularVelocity(),tmp);
+
+ }
+
+
+ // get joint information (m = total constraint dimension, nub = number of unbounded variables).
+ // joints with m=0 are inactive and are removed from the joints array
+ // entirely, so that the code that follows does not consider them.
+ //@@@ do we really need to save all the info1's
+ BU_Joint::Info1 *info = (BU_Joint::Info1*) alloca (nj*sizeof(BU_Joint::Info1));
+ for (i=0, j=0; j<nj; j++) { // i=dest, j=src
+ joint[j]->GetInfo1 (info+i);
+ dIASSERT (info[i].m >= 0 && info[i].m <= 6 && info[i].nub >= 0 && info[i].nub <= info[i].m);
+ if (info[i].m > 0) {
+ joint[i] = joint[j];
+ i++;
+ }
+ }
+ nj = i;
+
+ // create the row offset array
+ int m = 0;
+ int *ofs = (int*) alloca (nj*sizeof(int));
+ for (i=0; i<nj; i++) {
+ ofs[i] = m;
+ m += info[i].m;
+ }
+
+ // if there are constraints, compute the constraint force
+ dRealAllocaArray (J,m*12);
+ int *jb = (int*) alloca (m*2*sizeof(int));
+ if (m > 0) {
+ // create a constraint equation right hand side vector `c', a constraint
+ // force mixing vector `cfm', and LCP low and high bound vectors, and an
+ // 'findex' vector.
+ dRealAllocaArray (c,m);
+ dRealAllocaArray (cfm,m);
+ dRealAllocaArray (lo,m);
+ dRealAllocaArray (hi,m);
+ int *findex = (int*) alloca (m*sizeof(int));
+ dSetZero1 (c,m);
+ dSetValue1 (cfm,m,global_cfm);
+ dSetValue1 (lo,m,-dInfinity);
+ dSetValue1 (hi,m, dInfinity);
+ for (i=0; i<m; i++) findex[i] = -1;
+
+ // get jacobian data from constraints. an m*12 matrix will be created
+ // to store the two jacobian blocks from each constraint. it has this
+ // format:
+ //
+ // l1 l1 l1 a1 a1 a1 l2 l2 l2 a2 a2 a2 \ .
+ // l1 l1 l1 a1 a1 a1 l2 l2 l2 a2 a2 a2 }-- jacobian for joint 0, body 1 and body 2 (3 rows)
+ // l1 l1 l1 a1 a1 a1 l2 l2 l2 a2 a2 a2 /
+ // l1 l1 l1 a1 a1 a1 l2 l2 l2 a2 a2 a2 }--- jacobian for joint 1, body 1 and body 2 (3 rows)
+ // etc...
+ //
+ // (lll) = linear jacobian data
+ // (aaa) = angular jacobian data
+ //
+ dSetZero1 (J,m*12);
+ BU_Joint::Info2 Jinfo;
+ Jinfo.rowskip = 12;
+ Jinfo.fps = stepsize1;
+ Jinfo.erp = global_erp;
+ for (i=0; i<nj; i++) {
+ Jinfo.J1l = J + ofs[i]*12;
+ Jinfo.J1a = Jinfo.J1l + 3;
+ Jinfo.J2l = Jinfo.J1l + 6;
+ Jinfo.J2a = Jinfo.J1l + 9;
+ Jinfo.c = c + ofs[i];
+ Jinfo.cfm = cfm + ofs[i];
+ Jinfo.lo = lo + ofs[i];
+ Jinfo.hi = hi + ofs[i];
+ Jinfo.findex = findex + ofs[i];
+ joint[i]->GetInfo2 (&Jinfo);
+
+ if (Jinfo.c[0] > solverInfo.m_maxErrorReduction)
+ Jinfo.c[0] = solverInfo.m_maxErrorReduction;
+
+
+
+
+ // adjust returned findex values for global index numbering
+ for (j=0; j<info[i].m; j++) {
+ if (findex[ofs[i] + j] >= 0)
+ findex[ofs[i] + j] += ofs[i];
+ }
+ }
+
+ // create an array of body numbers for each joint row
+ int *jb_ptr = jb;
+ for (i=0; i<nj; i++) {
+ int b1 = (joint[i]->node[0].body) ? (joint[i]->node[0].body->m_odeTag) : -1;
+ int b2 = (joint[i]->node[1].body) ? (joint[i]->node[1].body->m_odeTag) : -1;
+ for (j=0; j<info[i].m; j++) {
+ jb_ptr[0] = b1;
+ jb_ptr[1] = b2;
+ jb_ptr += 2;
+ }
+ }
+ dIASSERT (jb_ptr == jb+2*m);
+
+ // compute the right hand side `rhs'
+ dRealAllocaArray (tmp1,nb*6);
+ // put v/h + invM*fe into tmp1
+ for (i=0; i<nb; i++) {
+ SimdScalar body_invMass = body[i]->getInvMass();
+ for (j=0; j<3; j++) tmp1[i*6+j] = body[i]->m_facc[j] * body_invMass + body[i]->getLinearVelocity()[j] * stepsize1;
+ dMULTIPLY0_331NEW (tmp1 + i*6 + 3,=,invI + i*12,body[i]->m_tacc);
+ for (j=0; j<3; j++) tmp1[i*6+3+j] += body[i]->getAngularVelocity()[j] * stepsize1;
+ }
+
+ // put J*tmp1 into rhs
+ dRealAllocaArray (rhs,m);
+ multiply_J (m,J,jb,tmp1,rhs);
+
+ // complete rhs
+ for (i=0; i<m; i++) rhs[i] = c[i]*stepsize1 - rhs[i];
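+		// i.e. rhs = c/h - J*(v/h + inv(M)*f_ext), where h is the time step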
+
+ // scale CFM
+ for (i=0; i<m; i++)
+ cfm[i] =0;//*= stepsize1;
+
+ // load lambda from the value saved on the previous iteration
+ dRealAllocaArray (lambda,m);
+#ifdef WARM_STARTING
+ dSetZero1 (lambda,m); //@@@ shouldn't be necessary
+ for (i=0; i<nj; i++) {
+ memcpy (lambda+ofs[i],joint[i]->lambda,info[i].m * sizeof(SimdScalar));
+ }
+#endif
+
+ // solve the LCP problem and get lambda and invM*constraint_force
+ dRealAllocaArray (cforce,nb*6);
+
+ SOR_LCP (m,nb,J,jb,body,invI,lambda,cforce,rhs,lo,hi,cfm,findex,numIter,sOr);
+
+#ifdef WARM_STARTING
+ // save lambda for the next iteration
+ //@@@ note that this doesn't work for contact joints yet, as they are
+ // recreated every iteration
+ for (i=0; i<nj; i++) {
+ memcpy (joint[i]->lambda,lambda+ofs[i],info[i].m * sizeof(SimdScalar));
+ }
+#endif
+
+
+ // note that the SOR method overwrites rhs and J at this point, so
+ // they should not be used again.
+
+ // add stepsize * cforce to the body velocity
+ for (i=0; i<nb; i++) {
+ SimdVector3 linvel = body[i]->getLinearVelocity();
+ SimdVector3 angvel = body[i]->getAngularVelocity();
+
+ for (j=0; j<3; j++)
+ linvel[j] += solverInfo.m_timeStep* cforce[i*6+j];
+ for (j=0; j<3; j++)
+ angvel[j] += solverInfo.m_timeStep* cforce[i*6+3+j];
+
+ body[i]->setLinearVelocity(linvel);
+ body[i]->setAngularVelocity(angvel);
+
+ }
+
+ }
+
+
+
+ // compute the velocity update:
+ // add stepsize * invM * fe to the body velocity
+
+ for (i=0; i<nb; i++) {
+ SimdScalar body_invMass = body[i]->getInvMass();
+ SimdVector3 linvel = body[i]->getLinearVelocity();
+ SimdVector3 angvel = body[i]->getAngularVelocity();
+
+ for (j=0; j<3; j++)
+ {
+ linvel[j] += solverInfo.m_timeStep * body_invMass * body[i]->m_facc[j];
+ }
+ for (j=0; j<3; j++)
+ {
+ body[i]->m_tacc[j] *= solverInfo.m_timeStep;
+ }
+ dMULTIPLY0_331NEW(angvel,+=,invI + i*12,body[i]->m_tacc);
+ body[i]->setAngularVelocity(angvel);
+
+ }
+
+
+
+}
+
+
+#endif //USE_SOR_SOLVER
\ No newline at end of file