1 files changed, 162 insertions, 0 deletions
diff --git a/extern/bullet2/src/BulletSoftBody/btSoftBodyHelpers.cpp b/extern/bullet2/src/BulletSoftBody/btSoftBodyHelpers.cpp
index 36f675a6c0c..293a393e55e 100644
--- a/extern/bullet2/src/BulletSoftBody/btSoftBodyHelpers.cpp
+++ b/extern/bullet2/src/BulletSoftBody/btSoftBodyHelpers.cpp
@@ -480,6 +480,168 @@ void			btSoftBodyHelpers::DrawClusterTree(	btSoftBody* psb,
 	drawTree(idraw,psb->m_cdbvt.m_root,0,btVector3(0,1,1),btVector3(1,0,0),mindepth,maxdepth);
 }
 
+
+//The btSoftBody object from the BulletSDK includes an array of Nodes and Links. These links appear
+// to be first set up to connect a node to between 5 and 6 of its neighbors [480 links], 
+//and then to the rest of the nodes after the execution of the Floyd-Warshall graph algorithm 
+//[another 930 links]. 
+//The way the links are stored by default, we have a number of cases where adjacent links share a node in common
+// - this leads to the creation of a data dependency through memory. 
+//The PSolve_Links() function reads and writes nodes as it iterates over each link. 
+//So, we now have the possibility of a data dependency between iteration X 
+//that processes link L with iteration X+1 that processes link L+1 
+//because L and L+1 have one node in common, and iteration X updates the positions of that node, 
+//and iteration X+1 reads in the position of that shared node.
+//
+//Such a memory dependency limits the ability of a modern CPU to speculate beyond 
+//a certain point because it has to respect a possible dependency 
+//- this prevents the CPU from making full use of its out-of-order resources. 
+//If we re-order the links such that we minimize the cases where a link L and L+1 share a common node, 
+//we create a temporal gap between when the node position is written, 
+//and when it is subsequently read. This in turn allows the CPU to continue execution without 
+//risking a dependency violation. Such a reordering would result in significant speedups on 
+//modern CPUs with lots of execution resources. 
+//In our testing, we see it have a tremendous impact not only on the A7, 
+//but also on all x86 cores that ship with modern Macs. 
+//The attached source file includes a single function (ReoptimizeLinkOrder) which can be called on a 
+//btSoftBody object in the solveConstraints() function before the actual solver is invoked, 
+//or right after generateBendingConstraints() once we have all 1410 links.
+
+
+//===================================================================
+//
+//
+// This function takes in a list of interdependent Links and tries 
+// to maximize the distance between calculation
+// of dependent links.  This increases the amount of parallelism that can
+// be exploited by out-of-order instruction processors with large but
+// (inevitably) finite instruction windows.
+//
+//===================================================================
+
+// A small structure to track lists of dependent link calculations
+class LinkDeps_t {
+	public:
+	int value;			// A link calculation that is dependent on this one
+		// Positive values = "input A" while negative values = "input B"
+	LinkDeps_t *next;	// Next dependence in the list
+};
+typedef LinkDeps_t *LinkDepsPtr_t;
+
+// Dependency list constants
+#define REOP_NOT_DEPENDENT	-1
+#define REOP_NODE_COMPLETE	-2	// Must be less than REOP_NOT_DEPENDENT
+
+
+void btSoftBodyHelpers::ReoptimizeLinkOrder(btSoftBody *psb /* This can be replaced by a btSoftBody pointer */)
+{
+	int i, nLinks=psb->m_links.size(), nNodes=psb->m_nodes.size();
+	btSoftBody::Link *lr;
+	int ar, br;
+	btSoftBody::Node *node0 = &(psb->m_nodes[0]);
+	btSoftBody::Node *node1 = &(psb->m_nodes[1]);
+	LinkDepsPtr_t linkDep;
+	int readyListHead, readyListTail, linkNum, linkDepFrees, depLink;
+	
+	// Allocate temporary buffers
+	int *nodeWrittenAt = new int[nNodes+1];	// What link calculation produced this node's current values?
+	int *linkDepA = new int[nLinks];			// Link calculation input is dependent upon prior calculation #N
+	int *linkDepB = new int[nLinks];
+	int *readyList = new int[nLinks];		// List of ready-to-process link calculations (# of links, maximum)
+	LinkDeps_t *linkDepFreeList = new LinkDeps_t[2*nLinks];		// Dependent-on-me list elements (2x# of links, maximum)
+	LinkDepsPtr_t *linkDepListStarts = new LinkDepsPtr_t[nLinks];	// Start nodes of dependent-on-me lists, one for each link
+		
+	// Copy the original, unsorted links to a side buffer
+	btSoftBody::Link *linkBuffer = new btSoftBody::Link[nLinks];
+	memcpy(linkBuffer, &(psb->m_links[0]), sizeof(btSoftBody::Link)*nLinks);
+
+	// Clear out the node setup and ready list
+	for (i=0; i < nNodes+1; i++) {
+		nodeWrittenAt[i] = REOP_NOT_DEPENDENT;
+	}
+	for (i=0; i < nLinks; i++) {
+		linkDepListStarts[i] = NULL;
+	}
+	readyListHead = readyListTail = linkDepFrees = 0;
+
+	// Initial link analysis to set up data structures
+	for (i=0; i < nLinks; i++) {
+	
+		// Note which prior link calculations we are dependent upon & build up dependence lists
+		lr = &(psb->m_links[i]);
+		ar = (lr->m_n[0] - node0)/(node1 - node0);
+		br = (lr->m_n[1] - node0)/(node1 - node0);
+		if (nodeWrittenAt[ar] > REOP_NOT_DEPENDENT) {
+			linkDepA[i] = nodeWrittenAt[ar];
+			linkDep = &linkDepFreeList[linkDepFrees++];
+			linkDep->value = i;
+			linkDep->next = linkDepListStarts[nodeWrittenAt[ar]];
+			linkDepListStarts[nodeWrittenAt[ar]] = linkDep;
+		} else {
+			linkDepA[i] = REOP_NOT_DEPENDENT;
+		}
+		if (nodeWrittenAt[br] > REOP_NOT_DEPENDENT) {
+			linkDepB[i] = nodeWrittenAt[br];
+			linkDep = &linkDepFreeList[linkDepFrees++];
+			linkDep->value = -(i+1);
+			linkDep->next = linkDepListStarts[nodeWrittenAt[br]];
+			linkDepListStarts[nodeWrittenAt[br]] = linkDep;
+		} else {
+			linkDepB[i] = REOP_NOT_DEPENDENT;
+		}
+		
+		// Add this link to the initial ready list, if it is not dependent on any other links
+		if ((linkDepA[i] == REOP_NOT_DEPENDENT) && (linkDepB[i] == REOP_NOT_DEPENDENT)) {
+			readyList[readyListTail++] = i;
+			linkDepA[i] = linkDepB[i] = REOP_NODE_COMPLETE;	// Probably not needed now
+		}
+		
+		// Update the nodes to mark which ones are calculated by this link
+		nodeWrittenAt[ar] = nodeWrittenAt[br] = i;
+	}
+	
+	// Process the ready list and create the sorted list of links
+	// -- By treating the ready list as a queue, we maximize the distance between any
+	//    inter-dependent node calculations
+	// -- All other (non-related) nodes in the ready list will automatically be inserted
+	//    in between each set of inter-dependent link calculations by this loop
+	i = 0;
+	while (readyListHead != readyListTail) {
+		// Use ready list to select the next link to process
+		linkNum = readyList[readyListHead++];
+		// Copy the next-to-calculate link back into the original link array
+		psb->m_links[i++] = linkBuffer[linkNum];
+
+		// Free up any link inputs that are dependent on this one
+		linkDep = linkDepListStarts[linkNum];
+		while (linkDep) {
+			depLink = linkDep->value;
+			if (depLink >= 0) {
+				linkDepA[depLink] = REOP_NOT_DEPENDENT;
+			} else {
+				depLink = -depLink - 1;
+				linkDepB[depLink] = REOP_NOT_DEPENDENT;
+			}
+			// Add this dependent link calculation to the ready list if *both* inputs are clear
+			if ((linkDepA[depLink] == REOP_NOT_DEPENDENT) && (linkDepB[depLink] == REOP_NOT_DEPENDENT)) {
+				readyList[readyListTail++] = depLink;
+				linkDepA[depLink] = linkDepB[depLink] = REOP_NODE_COMPLETE;	// Probably not needed now
+			}
+			linkDep = linkDep->next;
+		}
+	}
+
+	// Delete the temporary buffers
+	delete [] nodeWrittenAt;
+	delete [] linkDepA;
+	delete [] linkDepB;
+	delete [] readyList;
+	delete [] linkDepFreeList;
+	delete [] linkDepListStarts;
+	delete [] linkBuffer;
+}
+
+
 //
 void			btSoftBodyHelpers::DrawFrame(		btSoftBody* psb,
 											 btIDebugDraw* idraw)