diff options
Diffstat (limited to 'extern/bullet2/src/LinearMath/TaskScheduler')
4 files changed, 1661 insertions, 0 deletions
diff --git a/extern/bullet2/src/LinearMath/TaskScheduler/btTaskScheduler.cpp b/extern/bullet2/src/LinearMath/TaskScheduler/btTaskScheduler.cpp new file mode 100644 index 00000000000..5f1115c4027 --- /dev/null +++ b/extern/bullet2/src/LinearMath/TaskScheduler/btTaskScheduler.cpp @@ -0,0 +1,792 @@ + +#include "LinearMath/btMinMax.h" +#include "LinearMath/btAlignedObjectArray.h" +#include "LinearMath/btThreads.h" +#include "LinearMath/btQuickprof.h" +#include <stdio.h> +#include <algorithm> + +#if BT_THREADSAFE + +#include "btThreadSupportInterface.h" + +#if defined(_WIN32) + +#define WIN32_LEAN_AND_MEAN + +#include <windows.h> + +#endif + +typedef unsigned long long btU64; +static const int kCacheLineSize = 64; + +void btSpinPause() +{ +#if defined(_WIN32) + YieldProcessor(); +#endif +} + +struct WorkerThreadStatus +{ + enum Type + { + kInvalid, + kWaitingForWork, + kWorking, + kSleeping, + }; +}; + +ATTRIBUTE_ALIGNED64(class) +WorkerThreadDirectives +{ + static const int kMaxThreadCount = BT_MAX_THREAD_COUNT; + // directives for all worker threads packed into a single cacheline + char m_threadDirs[kMaxThreadCount]; + +public: + enum Type + { + kInvalid, + kGoToSleep, // go to sleep + kStayAwakeButIdle, // wait for not checking job queue + kScanForJobs, // actively scan job queue for jobs + }; + WorkerThreadDirectives() + { + for (int i = 0; i < kMaxThreadCount; ++i) + { + m_threadDirs[i] = 0; + } + } + + Type getDirective(int threadId) + { + btAssert(threadId < kMaxThreadCount); + return static_cast<Type>(m_threadDirs[threadId]); + } + + void setDirectiveByRange(int threadBegin, int threadEnd, Type dir) + { + btAssert(threadBegin < threadEnd); + btAssert(threadEnd <= kMaxThreadCount); + char dirChar = static_cast<char>(dir); + for (int i = threadBegin; i < threadEnd; ++i) + { + m_threadDirs[i] = dirChar; + } + } +}; + +class JobQueue; + +ATTRIBUTE_ALIGNED64(struct) +ThreadLocalStorage +{ + int m_threadId; + WorkerThreadStatus::Type m_status; + int m_numJobsFinished; + btSpinMutex m_mutex; + btScalar m_sumResult; + WorkerThreadDirectives* m_directive; + JobQueue* m_queue; + btClock* m_clock; + unsigned int m_cooldownTime; +}; + +struct IJob +{ + virtual void executeJob(int threadId) = 0; +}; + +class ParallelForJob : public IJob +{ + const btIParallelForBody* m_body; + int m_begin; + int m_end; + +public: + ParallelForJob(int iBegin, int iEnd, const btIParallelForBody& body) + { + m_body = &body; + m_begin = iBegin; + m_end = iEnd; + } + virtual void executeJob(int threadId) BT_OVERRIDE + { + BT_PROFILE("executeJob"); + + // call the functor body to do the work + m_body->forLoop(m_begin, m_end); + } +}; + +class ParallelSumJob : public IJob +{ + const btIParallelSumBody* m_body; + ThreadLocalStorage* m_threadLocalStoreArray; + int m_begin; + int m_end; + +public: + ParallelSumJob(int iBegin, int iEnd, const btIParallelSumBody& body, ThreadLocalStorage* tls) + { + m_body = &body; + m_threadLocalStoreArray = tls; + m_begin = iBegin; + m_end = iEnd; + } + virtual void executeJob(int threadId) BT_OVERRIDE + { + BT_PROFILE("executeJob"); + + // call the functor body to do the work + btScalar val = m_body->sumLoop(m_begin, m_end); +#if BT_PARALLEL_SUM_DETERMINISTISM + // by truncating bits of the result, we can make the parallelSum deterministic (at the expense of precision) + const float TRUNC_SCALE = float(1 << 19); + val = floor(val * TRUNC_SCALE + 0.5f) / TRUNC_SCALE; // truncate some bits +#endif + m_threadLocalStoreArray[threadId].m_sumResult += val; + } +}; + +ATTRIBUTE_ALIGNED64(class) +JobQueue +{ + btThreadSupportInterface* m_threadSupport; + btCriticalSection* m_queueLock; + btSpinMutex m_mutex; + + btAlignedObjectArray<IJob*> m_jobQueue; + char* m_jobMem; + int m_jobMemSize; + bool m_queueIsEmpty; + int m_tailIndex; + int m_headIndex; + int m_allocSize; + bool m_useSpinMutex; + btAlignedObjectArray<JobQueue*> m_neighborContexts; + char m_cachePadding[kCacheLineSize]; // prevent false sharing + + void freeJobMem() + { + if (m_jobMem) + { + // free old + btAlignedFree(m_jobMem); + m_jobMem = NULL; + } + } + void resizeJobMem(int newSize) + { + if (newSize > m_jobMemSize) + { + freeJobMem(); + m_jobMem = static_cast<char*>(btAlignedAlloc(newSize, kCacheLineSize)); + m_jobMemSize = newSize; + } + } + +public: + JobQueue() + { + m_jobMem = NULL; + m_jobMemSize = 0; + m_threadSupport = NULL; + m_queueLock = NULL; + m_headIndex = 0; + m_tailIndex = 0; + m_useSpinMutex = false; + } + ~JobQueue() + { + exit(); + } + void exit() + { + freeJobMem(); + if (m_queueLock && m_threadSupport) + { + m_threadSupport->deleteCriticalSection(m_queueLock); + m_queueLock = NULL; + m_threadSupport = 0; + } + } + + void init(btThreadSupportInterface * threadSup, btAlignedObjectArray<JobQueue> * contextArray) + { + m_threadSupport = threadSup; + if (threadSup) + { + m_queueLock = m_threadSupport->createCriticalSection(); + } + setupJobStealing(contextArray, contextArray->size()); + } + void setupJobStealing(btAlignedObjectArray<JobQueue> * contextArray, int numActiveContexts) + { + btAlignedObjectArray<JobQueue>& contexts = *contextArray; + int selfIndex = 0; + for (int i = 0; i < contexts.size(); ++i) + { + if (this == &contexts[i]) + { + selfIndex = i; + break; + } + } + int numNeighbors = btMin(2, contexts.size() - 1); + int neighborOffsets[] = {-1, 1, -2, 2, -3, 3}; + int numOffsets = sizeof(neighborOffsets) / sizeof(neighborOffsets[0]); + m_neighborContexts.reserve(numNeighbors); + m_neighborContexts.resizeNoInitialize(0); + for (int i = 0; i < numOffsets && m_neighborContexts.size() < numNeighbors; i++) + { + int neighborIndex = selfIndex + neighborOffsets[i]; + if (neighborIndex >= 0 && neighborIndex < numActiveContexts) + { + m_neighborContexts.push_back(&contexts[neighborIndex]); + } + } + } + + bool isQueueEmpty() const { return m_queueIsEmpty; } + void lockQueue() + { + if (m_useSpinMutex) + { + m_mutex.lock(); + } + else + { + m_queueLock->lock(); + } + } + void unlockQueue() + { + if (m_useSpinMutex) + { + m_mutex.unlock(); + } + else + { + m_queueLock->unlock(); + } + } + void clearQueue(int jobCount, int jobSize) + { + lockQueue(); + m_headIndex = 0; + m_tailIndex = 0; + m_allocSize = 0; + m_queueIsEmpty = true; + int jobBufSize = jobSize * jobCount; + // make sure we have enough memory allocated to store jobs + if (jobBufSize > m_jobMemSize) + { + resizeJobMem(jobBufSize); + } + // make sure job queue is big enough + if (jobCount > m_jobQueue.capacity()) + { + m_jobQueue.reserve(jobCount); + } + unlockQueue(); + m_jobQueue.resizeNoInitialize(0); + } + void* allocJobMem(int jobSize) + { + btAssert(m_jobMemSize >= (m_allocSize + jobSize)); + void* jobMem = &m_jobMem[m_allocSize]; + m_allocSize += jobSize; + return jobMem; + } + void submitJob(IJob * job) + { + btAssert(reinterpret_cast<char*>(job) >= &m_jobMem[0] && reinterpret_cast<char*>(job) < &m_jobMem[0] + m_allocSize); + m_jobQueue.push_back(job); + lockQueue(); + m_tailIndex++; + m_queueIsEmpty = false; + unlockQueue(); + } + IJob* consumeJobFromOwnQueue() + { + if (m_queueIsEmpty) + { + // lock free path. even if this is taken erroneously it isn't harmful + return NULL; + } + IJob* job = NULL; + lockQueue(); + if (!m_queueIsEmpty) + { + job = m_jobQueue[m_headIndex++]; + btAssert(reinterpret_cast<char*>(job) >= &m_jobMem[0] && reinterpret_cast<char*>(job) < &m_jobMem[0] + m_allocSize); + if (m_headIndex == m_tailIndex) + { + m_queueIsEmpty = true; + } + } + unlockQueue(); + return job; + } + IJob* consumeJob() + { + if (IJob* job = consumeJobFromOwnQueue()) + { + return job; + } + // own queue is empty, try to steal from neighbor + for (int i = 0; i < m_neighborContexts.size(); ++i) + { + JobQueue* otherContext = m_neighborContexts[i]; + if (IJob* job = otherContext->consumeJobFromOwnQueue()) + { + return job; + } + } + return NULL; + } +}; + +static void WorkerThreadFunc(void* userPtr) +{ + BT_PROFILE("WorkerThreadFunc"); + ThreadLocalStorage* localStorage = (ThreadLocalStorage*)userPtr; + JobQueue* jobQueue = localStorage->m_queue; + + bool shouldSleep = false; + int threadId = localStorage->m_threadId; + while (!shouldSleep) + { + // do work + localStorage->m_mutex.lock(); + while (IJob* job = jobQueue->consumeJob()) + { + localStorage->m_status = WorkerThreadStatus::kWorking; + job->executeJob(threadId); + localStorage->m_numJobsFinished++; + } + localStorage->m_status = WorkerThreadStatus::kWaitingForWork; + localStorage->m_mutex.unlock(); + btU64 clockStart = localStorage->m_clock->getTimeMicroseconds(); + // while queue is empty, + while (jobQueue->isQueueEmpty()) + { + // todo: spin wait a bit to avoid hammering the empty queue + btSpinPause(); + if (localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kGoToSleep) + { + shouldSleep = true; + break; + } + // if jobs are incoming, + if (localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kScanForJobs) + { + clockStart = localStorage->m_clock->getTimeMicroseconds(); // reset clock + } + else + { + for (int i = 0; i < 50; ++i) + { + btSpinPause(); + btSpinPause(); + btSpinPause(); + btSpinPause(); + if (localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kScanForJobs || !jobQueue->isQueueEmpty()) + { + break; + } + } + // if no jobs incoming and queue has been empty for the cooldown time, sleep + btU64 timeElapsed = localStorage->m_clock->getTimeMicroseconds() - clockStart; + if (timeElapsed > localStorage->m_cooldownTime) + { + shouldSleep = true; + break; + } + } + } + } + { + BT_PROFILE("sleep"); + // go sleep + localStorage->m_mutex.lock(); + localStorage->m_status = WorkerThreadStatus::kSleeping; + localStorage->m_mutex.unlock(); + } +} + +class btTaskSchedulerDefault : public btITaskScheduler +{ + btThreadSupportInterface* m_threadSupport; + WorkerThreadDirectives* m_workerDirective; + btAlignedObjectArray<JobQueue> m_jobQueues; + btAlignedObjectArray<JobQueue*> m_perThreadJobQueues; + btAlignedObjectArray<ThreadLocalStorage> m_threadLocalStorage; + btSpinMutex m_antiNestingLock; // prevent nested parallel-for + btClock m_clock; + int m_numThreads; + int m_numWorkerThreads; + int m_numActiveJobQueues; + int m_maxNumThreads; + int m_numJobs; + static const int kFirstWorkerThreadId = 1; + +public: + btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport") + { + m_threadSupport = NULL; + m_workerDirective = NULL; + } + + virtual ~btTaskSchedulerDefault() + { + waitForWorkersToSleep(); + + for (int i = 0; i < m_jobQueues.size(); ++i) + { + m_jobQueues[i].exit(); + } + + if (m_threadSupport) + { + delete m_threadSupport; + m_threadSupport = NULL; + } + if (m_workerDirective) + { + btAlignedFree(m_workerDirective); + m_workerDirective = NULL; + } + } + + void init() + { + btThreadSupportInterface::ConstructionInfo constructionInfo("TaskScheduler", WorkerThreadFunc); + m_threadSupport = btThreadSupportInterface::create(constructionInfo); + m_workerDirective = static_cast<WorkerThreadDirectives*>(btAlignedAlloc(sizeof(*m_workerDirective), 64)); + + m_numWorkerThreads = m_threadSupport->getNumWorkerThreads(); + m_maxNumThreads = m_threadSupport->getNumWorkerThreads() + 1; + m_numThreads = m_maxNumThreads; + // ideal to have one job queue for each physical processor (except for the main thread which needs no queue) + int numThreadsPerQueue = m_threadSupport->getLogicalToPhysicalCoreRatio(); + int numJobQueues = (numThreadsPerQueue == 1) ? (m_maxNumThreads - 1) : (m_maxNumThreads / numThreadsPerQueue); + m_jobQueues.resize(numJobQueues); + m_numActiveJobQueues = numJobQueues; + for (int i = 0; i < m_jobQueues.size(); ++i) + { + m_jobQueues[i].init(m_threadSupport, &m_jobQueues); + } + m_perThreadJobQueues.resize(m_numThreads); + for (int i = 0; i < m_numThreads; i++) + { + JobQueue* jq = NULL; + // only worker threads get a job queue + if (i > 0) + { + if (numThreadsPerQueue == 1) + { + // one queue per worker thread + jq = &m_jobQueues[i - kFirstWorkerThreadId]; + } + else + { + // 2 threads share each queue + jq = &m_jobQueues[i / numThreadsPerQueue]; + } + } + m_perThreadJobQueues[i] = jq; + } + m_threadLocalStorage.resize(m_numThreads); + for (int i = 0; i < m_numThreads; i++) + { + ThreadLocalStorage& storage = m_threadLocalStorage[i]; + storage.m_threadId = i; + storage.m_directive = m_workerDirective; + storage.m_status = WorkerThreadStatus::kSleeping; + storage.m_cooldownTime = 100; // 100 microseconds, threads go to sleep after this long if they have nothing to do + storage.m_clock = &m_clock; + storage.m_queue = m_perThreadJobQueues[i]; + } + setWorkerDirectives(WorkerThreadDirectives::kGoToSleep); // no work for them yet + setNumThreads(m_threadSupport->getCacheFriendlyNumThreads()); + } + + void setWorkerDirectives(WorkerThreadDirectives::Type dir) + { + m_workerDirective->setDirectiveByRange(kFirstWorkerThreadId, m_numThreads, dir); + } + + virtual int getMaxNumThreads() const BT_OVERRIDE + { + return m_maxNumThreads; + } + + virtual int getNumThreads() const BT_OVERRIDE + { + return m_numThreads; + } + + virtual void setNumThreads(int numThreads) BT_OVERRIDE + { + m_numThreads = btMax(btMin(numThreads, int(m_maxNumThreads)), 1); + m_numWorkerThreads = m_numThreads - 1; + m_numActiveJobQueues = 0; + // if there is at least 1 worker, + if (m_numWorkerThreads > 0) + { + // re-setup job stealing between queues to avoid attempting to steal from an inactive job queue + JobQueue* lastActiveContext = m_perThreadJobQueues[m_numThreads - 1]; + int iLastActiveContext = lastActiveContext - &m_jobQueues[0]; + m_numActiveJobQueues = iLastActiveContext + 1; + for (int i = 0; i < m_jobQueues.size(); ++i) + { + m_jobQueues[i].setupJobStealing(&m_jobQueues, m_numActiveJobQueues); + } + } + m_workerDirective->setDirectiveByRange(m_numThreads, BT_MAX_THREAD_COUNT, WorkerThreadDirectives::kGoToSleep); + } + + void waitJobs() + { + BT_PROFILE("waitJobs"); + // have the main thread work until the job queues are empty + int numMainThreadJobsFinished = 0; + for (int i = 0; i < m_numActiveJobQueues; ++i) + { + while (IJob* job = m_jobQueues[i].consumeJob()) + { + job->executeJob(0); + numMainThreadJobsFinished++; + } + } + + // done with jobs for now, tell workers to rest (but not sleep) + setWorkerDirectives(WorkerThreadDirectives::kStayAwakeButIdle); + + btU64 clockStart = m_clock.getTimeMicroseconds(); + // wait for workers to finish any jobs in progress + while (true) + { + int numWorkerJobsFinished = 0; + for (int iThread = kFirstWorkerThreadId; iThread < m_numThreads; ++iThread) + { + ThreadLocalStorage* storage = &m_threadLocalStorage[iThread]; + storage->m_mutex.lock(); + numWorkerJobsFinished += storage->m_numJobsFinished; + storage->m_mutex.unlock(); + } + if (numWorkerJobsFinished + numMainThreadJobsFinished == m_numJobs) + { + break; + } + btU64 timeElapsed = m_clock.getTimeMicroseconds() - clockStart; + btAssert(timeElapsed < 1000); + if (timeElapsed > 100000) + { + break; + } + btSpinPause(); + } + } + + void wakeWorkers(int numWorkersToWake) + { + BT_PROFILE("wakeWorkers"); + btAssert(m_workerDirective->getDirective(1) == WorkerThreadDirectives::kScanForJobs); + int numDesiredWorkers = btMin(numWorkersToWake, m_numWorkerThreads); + int numActiveWorkers = 0; + for (int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker) + { + // note this count of active workers is not necessarily totally reliable, because a worker thread could be + // just about to put itself to sleep. So we may on occasion fail to wake up all the workers. It should be rare. + ThreadLocalStorage& storage = m_threadLocalStorage[kFirstWorkerThreadId + iWorker]; + if (storage.m_status != WorkerThreadStatus::kSleeping) + { + numActiveWorkers++; + } + } + for (int iWorker = 0; iWorker < m_numWorkerThreads && numActiveWorkers < numDesiredWorkers; ++iWorker) + { + ThreadLocalStorage& storage = m_threadLocalStorage[kFirstWorkerThreadId + iWorker]; + if (storage.m_status == WorkerThreadStatus::kSleeping) + { + m_threadSupport->runTask(iWorker, &storage); + numActiveWorkers++; + } + } + } + + void waitForWorkersToSleep() + { + BT_PROFILE("waitForWorkersToSleep"); + setWorkerDirectives(WorkerThreadDirectives::kGoToSleep); + m_threadSupport->waitForAllTasks(); + for (int i = kFirstWorkerThreadId; i < m_numThreads; i++) + { + ThreadLocalStorage& storage = m_threadLocalStorage[i]; + btAssert(storage.m_status == WorkerThreadStatus::kSleeping); + } + } + + virtual void sleepWorkerThreadsHint() BT_OVERRIDE + { + BT_PROFILE("sleepWorkerThreadsHint"); + // hint the task scheduler that we may not be using these threads for a little while + setWorkerDirectives(WorkerThreadDirectives::kGoToSleep); + } + + void prepareWorkerThreads() + { + for (int i = kFirstWorkerThreadId; i < m_numThreads; ++i) + { + ThreadLocalStorage& storage = m_threadLocalStorage[i]; + storage.m_mutex.lock(); + storage.m_numJobsFinished = 0; + storage.m_mutex.unlock(); + } + setWorkerDirectives(WorkerThreadDirectives::kScanForJobs); + } + + virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE + { + BT_PROFILE("parallelFor_ThreadSupport"); + btAssert(iEnd >= iBegin); + btAssert(grainSize >= 1); + int iterationCount = iEnd - iBegin; + if (iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock()) + { + typedef ParallelForJob JobType; + int jobCount = (iterationCount + grainSize - 1) / grainSize; + m_numJobs = jobCount; + btAssert(jobCount >= 2); // need more than one job for multithreading + int jobSize = sizeof(JobType); + + for (int i = 0; i < m_numActiveJobQueues; ++i) + { + m_jobQueues[i].clearQueue(jobCount, jobSize); + } + // prepare worker threads for incoming work + prepareWorkerThreads(); + // submit all of the jobs + int iJob = 0; + int iThread = kFirstWorkerThreadId; // first worker thread + for (int i = iBegin; i < iEnd; i += grainSize) + { + btAssert(iJob < jobCount); + int iE = btMin(i + grainSize, iEnd); + JobQueue* jq = m_perThreadJobQueues[iThread]; + btAssert(jq); + btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues); + void* jobMem = jq->allocJobMem(jobSize); + JobType* job = new (jobMem) ParallelForJob(i, iE, body); // placement new + jq->submitJob(job); + iJob++; + iThread++; + if (iThread >= m_numThreads) + { + iThread = kFirstWorkerThreadId; // first worker thread + } + } + wakeWorkers(jobCount - 1); + + // put the main thread to work on emptying the job queue and then wait for all workers to finish + waitJobs(); + m_antiNestingLock.unlock(); + } + else + { + BT_PROFILE("parallelFor_mainThread"); + // just run on main thread + body.forLoop(iBegin, iEnd); + } + } + virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE + { + BT_PROFILE("parallelSum_ThreadSupport"); + btAssert(iEnd >= iBegin); + btAssert(grainSize >= 1); + int iterationCount = iEnd - iBegin; + if (iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock()) + { + typedef ParallelSumJob JobType; + int jobCount = (iterationCount + grainSize - 1) / grainSize; + m_numJobs = jobCount; + btAssert(jobCount >= 2); // need more than one job for multithreading + int jobSize = sizeof(JobType); + for (int i = 0; i < m_numActiveJobQueues; ++i) + { + m_jobQueues[i].clearQueue(jobCount, jobSize); + } + + // initialize summation + for (int iThread = 0; iThread < m_numThreads; ++iThread) + { + m_threadLocalStorage[iThread].m_sumResult = btScalar(0); + } + + // prepare worker threads for incoming work + prepareWorkerThreads(); + // submit all of the jobs + int iJob = 0; + int iThread = kFirstWorkerThreadId; // first worker thread + for (int i = iBegin; i < iEnd; i += grainSize) + { + btAssert(iJob < jobCount); + int iE = btMin(i + grainSize, iEnd); + JobQueue* jq = m_perThreadJobQueues[iThread]; + btAssert(jq); + btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues); + void* jobMem = jq->allocJobMem(jobSize); + JobType* job = new (jobMem) ParallelSumJob(i, iE, body, &m_threadLocalStorage[0]); // placement new + jq->submitJob(job); + iJob++; + iThread++; + if (iThread >= m_numThreads) + { + iThread = kFirstWorkerThreadId; // first worker thread + } + } + wakeWorkers(jobCount - 1); + + // put the main thread to work on emptying the job queue and then wait for all workers to finish + waitJobs(); + + // add up all the thread sums + btScalar sum = btScalar(0); + for (int iThread = 0; iThread < m_numThreads; ++iThread) + { + sum += m_threadLocalStorage[iThread].m_sumResult; + } + m_antiNestingLock.unlock(); + return sum; + } + else + { + BT_PROFILE("parallelSum_mainThread"); + // just run on main thread + return body.sumLoop(iBegin, iEnd); + } + } +}; + +btITaskScheduler* btCreateDefaultTaskScheduler() +{ + btTaskSchedulerDefault* ts = new btTaskSchedulerDefault(); + ts->init(); + return ts; +} + +#else // #if BT_THREADSAFE + +btITaskScheduler* btCreateDefaultTaskScheduler() +{ + return NULL; +} + +#endif // #else // #if BT_THREADSAFE diff --git a/extern/bullet2/src/LinearMath/TaskScheduler/btThreadSupportInterface.h b/extern/bullet2/src/LinearMath/TaskScheduler/btThreadSupportInterface.h new file mode 100644 index 00000000000..1fe49335a19 --- /dev/null +++ b/extern/bullet2/src/LinearMath/TaskScheduler/btThreadSupportInterface.h @@ -0,0 +1,64 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2018 Erwin Coumans http://bulletphysics.com + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#ifndef BT_THREAD_SUPPORT_INTERFACE_H +#define BT_THREAD_SUPPORT_INTERFACE_H + +class btCriticalSection +{ +public: + btCriticalSection() {} + virtual ~btCriticalSection() {} + + virtual void lock() = 0; + virtual void unlock() = 0; +}; + +class btThreadSupportInterface +{ +public: + virtual ~btThreadSupportInterface() {} + + virtual int getNumWorkerThreads() const = 0; // number of worker threads (total number of logical processors - 1) + virtual int getCacheFriendlyNumThreads() const = 0; // the number of logical processors sharing a single L3 cache + virtual int getLogicalToPhysicalCoreRatio() const = 0; // the number of logical processors per physical processor (usually 1 or 2) + virtual void runTask(int threadIndex, void* userData) = 0; + virtual void waitForAllTasks() = 0; + + virtual btCriticalSection* createCriticalSection() = 0; + virtual void deleteCriticalSection(btCriticalSection* criticalSection) = 0; + + typedef void (*ThreadFunc)(void* userPtr); + + struct ConstructionInfo + { + ConstructionInfo(const char* uniqueName, + ThreadFunc userThreadFunc, + int threadStackSize = 65535) + : m_uniqueName(uniqueName), + m_userThreadFunc(userThreadFunc), + m_threadStackSize(threadStackSize) + { + } + + const char* m_uniqueName; + ThreadFunc m_userThreadFunc; + int m_threadStackSize; + }; + + static btThreadSupportInterface* create(const ConstructionInfo& info); +}; + +#endif //BT_THREAD_SUPPORT_INTERFACE_H diff --git a/extern/bullet2/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp b/extern/bullet2/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp new file mode 100644 index 00000000000..a03f6dc5703 --- /dev/null +++ b/extern/bullet2/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp @@ -0,0 +1,353 @@ + +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2018 Erwin Coumans http://bulletphysics.com + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#if BT_THREADSAFE && !defined(_WIN32) + +#include "LinearMath/btScalar.h" +#include "LinearMath/btAlignedObjectArray.h" +#include "LinearMath/btThreads.h" +#include "LinearMath/btMinMax.h" +#include "btThreadSupportInterface.h" + +#include <stdio.h> +#include <errno.h> +#include <unistd.h> + +#ifndef _XOPEN_SOURCE +#define _XOPEN_SOURCE 600 //for definition of pthread_barrier_t, see http://pages.cs.wisc.edu/~travitch/pthreads_primer.html +#endif //_XOPEN_SOURCE +#include <pthread.h> +#include <semaphore.h> +#include <unistd.h> //for sysconf + +/// +/// getNumHardwareThreads() +/// +/// +/// https://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine +/// +#if __cplusplus >= 201103L + +#include <thread> + +int btGetNumHardwareThreads() +{ + return btMax(1u, btMin(BT_MAX_THREAD_COUNT, std::thread::hardware_concurrency())); +} + +#else + +int btGetNumHardwareThreads() +{ + return btMax(1, btMin<int>(BT_MAX_THREAD_COUNT, sysconf(_SC_NPROCESSORS_ONLN))); +} + +#endif + +// btThreadSupportPosix helps to initialize/shutdown libspe2, start/stop SPU tasks and communication +class btThreadSupportPosix : public btThreadSupportInterface +{ +public: + struct btThreadStatus + { + int m_taskId; + int m_commandId; + int m_status; + + ThreadFunc m_userThreadFunc; + void* m_userPtr; //for taskDesc etc + + pthread_t thread; + //each tread will wait until this signal to start its work + sem_t* startSemaphore; + btCriticalSection* m_cs; + // this is a copy of m_mainSemaphore, + //each tread will signal once it is finished with its work + sem_t* m_mainSemaphore; + unsigned long threadUsed; + }; + +private: + typedef unsigned long long UINT64; + + btAlignedObjectArray<btThreadStatus> m_activeThreadStatus; + // m_mainSemaphoresemaphore will signal, if and how many threads are finished with their work + sem_t* m_mainSemaphore; + int m_numThreads; + UINT64 m_startedThreadsMask; + void startThreads(const ConstructionInfo& threadInfo); + void stopThreads(); + int waitForResponse(); + btCriticalSection* m_cs; +public: + btThreadSupportPosix(const ConstructionInfo& threadConstructionInfo); + virtual ~btThreadSupportPosix(); + + virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; } + // TODO: return the number of logical processors sharing the first L3 cache + virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return m_numThreads + 1; } + // TODO: detect if CPU has hyperthreading enabled + virtual int getLogicalToPhysicalCoreRatio() const BT_OVERRIDE { return 1; } + + virtual void runTask(int threadIndex, void* userData) BT_OVERRIDE; + virtual void waitForAllTasks() BT_OVERRIDE; + + virtual btCriticalSection* createCriticalSection() BT_OVERRIDE; + virtual void deleteCriticalSection(btCriticalSection* criticalSection) BT_OVERRIDE; +}; + +#define checkPThreadFunction(returnValue) \ + if (0 != returnValue) \ + { \ + printf("PThread problem at line %i in file %s: %i %d\n", __LINE__, __FILE__, returnValue, errno); \ + } + +// The number of threads should be equal to the number of available cores +// Todo: each worker should be linked to a single core, using SetThreadIdealProcessor. + +btThreadSupportPosix::btThreadSupportPosix(const ConstructionInfo& threadConstructionInfo) +{ + m_cs = createCriticalSection(); + startThreads(threadConstructionInfo); +} + +// cleanup/shutdown Libspe2 +btThreadSupportPosix::~btThreadSupportPosix() +{ + stopThreads(); + deleteCriticalSection(m_cs); + m_cs=0; +} + +#if (defined(__APPLE__)) +#define NAMED_SEMAPHORES +#endif + +static sem_t* createSem(const char* baseName) +{ + static int semCount = 0; +#ifdef NAMED_SEMAPHORES + /// Named semaphore begin + char name[32]; + snprintf(name, 32, "/%8.s-%4.d-%4.4d", baseName, getpid(), semCount++); + sem_t* tempSem = sem_open(name, O_CREAT, 0600, 0); + + if (tempSem != reinterpret_cast<sem_t*>(SEM_FAILED)) + { + // printf("Created \"%s\" Semaphore %p\n", name, tempSem); + } + else + { + //printf("Error creating Semaphore %d\n", errno); + exit(-1); + } + /// Named semaphore end +#else + sem_t* tempSem = new sem_t; + checkPThreadFunction(sem_init(tempSem, 0, 0)); +#endif + return tempSem; +} + +static void destroySem(sem_t* semaphore) +{ +#ifdef NAMED_SEMAPHORES + checkPThreadFunction(sem_close(semaphore)); +#else + checkPThreadFunction(sem_destroy(semaphore)); + delete semaphore; +#endif +} + +static void* threadFunction(void* argument) +{ + btThreadSupportPosix::btThreadStatus* status = (btThreadSupportPosix::btThreadStatus*)argument; + + while (1) + { + checkPThreadFunction(sem_wait(status->startSemaphore)); + void* userPtr = status->m_userPtr; + + if (userPtr) + { + btAssert(status->m_status); + status->m_userThreadFunc(userPtr); + status->m_cs->lock(); + status->m_status = 2; + status->m_cs->unlock(); + checkPThreadFunction(sem_post(status->m_mainSemaphore)); + status->threadUsed++; + } + else + { + //exit Thread + status->m_cs->lock(); + status->m_status = 3; + status->m_cs->unlock(); + checkPThreadFunction(sem_post(status->m_mainSemaphore)); + break; + } + } + + return 0; +} + +///send messages to SPUs +void btThreadSupportPosix::runTask(int threadIndex, void* userData) +{ + ///we should spawn an SPU task here, and in 'waitForResponse' it should wait for response of the (one of) the first tasks that finished + btThreadStatus& threadStatus = m_activeThreadStatus[threadIndex]; + btAssert(threadIndex >= 0); + btAssert(threadIndex < m_activeThreadStatus.size()); + threadStatus.m_cs = m_cs; + threadStatus.m_commandId = 1; + threadStatus.m_status = 1; + threadStatus.m_userPtr = userData; + m_startedThreadsMask |= UINT64(1) << threadIndex; + + // fire event to start new task + checkPThreadFunction(sem_post(threadStatus.startSemaphore)); +} + +///check for messages from SPUs +int btThreadSupportPosix::waitForResponse() +{ + ///We should wait for (one of) the first tasks to finish (or other SPU messages), and report its response + ///A possible response can be 'yes, SPU handled it', or 'no, please do a PPU fallback' + + btAssert(m_activeThreadStatus.size()); + + // wait for any of the threads to finish + checkPThreadFunction(sem_wait(m_mainSemaphore)); + // get at least one thread which has finished + size_t last = -1; + + for (size_t t = 0; t < size_t(m_activeThreadStatus.size()); ++t) + { + m_cs->lock(); + bool hasFinished = (2 == m_activeThreadStatus[t].m_status); + m_cs->unlock(); + if (hasFinished) + { + last = t; + break; + } + } + + btThreadStatus& threadStatus = m_activeThreadStatus[last]; + + btAssert(threadStatus.m_status > 1); + threadStatus.m_status = 0; + + // need to find an active spu + btAssert(last >= 0); + m_startedThreadsMask &= ~(UINT64(1) << last); + + return last; +} + +void btThreadSupportPosix::waitForAllTasks() +{ + while (m_startedThreadsMask) + { + waitForResponse(); + } +} + +void btThreadSupportPosix::startThreads(const ConstructionInfo& threadConstructionInfo) +{ + m_numThreads = btGetNumHardwareThreads() - 1; // main thread exists already + m_activeThreadStatus.resize(m_numThreads); + m_startedThreadsMask = 0; + + m_mainSemaphore = createSem("main"); + //checkPThreadFunction(sem_wait(mainSemaphore)); + + for (int i = 0; i < m_numThreads; i++) + { + btThreadStatus& threadStatus = m_activeThreadStatus[i]; + threadStatus.startSemaphore = createSem("threadLocal"); + threadStatus.m_userPtr = 0; + threadStatus.m_cs = m_cs; + threadStatus.m_taskId = i; + threadStatus.m_commandId = 0; + threadStatus.m_status = 0; + threadStatus.m_mainSemaphore = m_mainSemaphore; + threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc; + threadStatus.threadUsed = 0; + checkPThreadFunction(pthread_create(&threadStatus.thread, NULL, &threadFunction, (void*)&threadStatus)); + + } +} + +///tell the task scheduler we are done with the SPU tasks +void btThreadSupportPosix::stopThreads() +{ + for (size_t t = 0; t < size_t(m_activeThreadStatus.size()); ++t) + { + btThreadStatus& threadStatus = m_activeThreadStatus[t]; + + threadStatus.m_userPtr = 0; + checkPThreadFunction(sem_post(threadStatus.startSemaphore)); + checkPThreadFunction(sem_wait(m_mainSemaphore)); + + checkPThreadFunction(pthread_join(threadStatus.thread, 0)); + destroySem(threadStatus.startSemaphore); + } + destroySem(m_mainSemaphore); + m_activeThreadStatus.clear(); +} + +class btCriticalSectionPosix : public btCriticalSection +{ + pthread_mutex_t m_mutex; + +public: + btCriticalSectionPosix() + { + pthread_mutex_init(&m_mutex, NULL); + } + virtual ~btCriticalSectionPosix() + { + pthread_mutex_destroy(&m_mutex); + } + + virtual void lock() + { + pthread_mutex_lock(&m_mutex); + } + virtual void unlock() + { + pthread_mutex_unlock(&m_mutex); + } +}; + +btCriticalSection* btThreadSupportPosix::createCriticalSection() +{ + return new btCriticalSectionPosix(); +} + +void btThreadSupportPosix::deleteCriticalSection(btCriticalSection* cs) +{ + delete cs; +} + +btThreadSupportInterface* btThreadSupportInterface::create(const ConstructionInfo& info) +{ + return new btThreadSupportPosix(info); +} + +#endif // BT_THREADSAFE && !defined( _WIN32 ) diff --git a/extern/bullet2/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp b/extern/bullet2/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp new file mode 100644 index 00000000000..922e449cce2 --- /dev/null +++ b/extern/bullet2/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp @@ -0,0 +1,452 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2018 Erwin Coumans http://bulletphysics.com + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#if defined(_WIN32) && BT_THREADSAFE + +#include "LinearMath/btScalar.h" +#include "LinearMath/btMinMax.h" +#include "LinearMath/btAlignedObjectArray.h" +#include "LinearMath/btThreads.h" +#include "btThreadSupportInterface.h" +#include <windows.h> +#include <stdio.h> + +struct btProcessorInfo +{ + int numLogicalProcessors; + int numCores; + int numNumaNodes; + int numL1Cache; + int numL2Cache; + int numL3Cache; + int numPhysicalPackages; + static const int maxNumTeamMasks = 32; + int numTeamMasks; + UINT64 processorTeamMasks[maxNumTeamMasks]; +}; + +UINT64 getProcessorTeamMask(const btProcessorInfo& procInfo, int procId) +{ + UINT64 procMask = UINT64(1) << procId; + for (int i = 0; i < procInfo.numTeamMasks; ++i) + { + if (procMask & procInfo.processorTeamMasks[i]) + { + return procInfo.processorTeamMasks[i]; + } + } + return 0; +} + +int getProcessorTeamIndex(const btProcessorInfo& procInfo, int procId) +{ + UINT64 procMask = UINT64(1) << procId; + for (int i = 0; i < procInfo.numTeamMasks; ++i) + { + if (procMask & procInfo.processorTeamMasks[i]) + { + return i; + } + } + return -1; +} + +int countSetBits(ULONG64 bits) +{ + int count = 0; + while (bits) + { + if (bits & 1) + { + count++; + } + bits >>= 1; + } + return count; +} + +typedef BOOL(WINAPI* Pfn_GetLogicalProcessorInformation)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); + +void getProcessorInformation(btProcessorInfo* procInfo) +{ + memset(procInfo, 0, sizeof(*procInfo)); + Pfn_GetLogicalProcessorInformation getLogicalProcInfo = + (Pfn_GetLogicalProcessorInformation)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); + if (getLogicalProcInfo == NULL) + { + // no info + return; + } + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buf = NULL; + DWORD bufSize = 0; + while (true) + { + if (getLogicalProcInfo(buf, &bufSize)) + { + break; + } + else + { + if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) + { + if (buf) + { + free(buf); + } + buf = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(bufSize); + } + } + } + + int len = bufSize / sizeof(*buf); + for (int i = 0; i < len; ++i) + { + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION info = buf + i; + switch (info->Relationship) + { + case RelationNumaNode: + procInfo->numNumaNodes++; + break; + + case RelationProcessorCore: + procInfo->numCores++; + procInfo->numLogicalProcessors += countSetBits(info->ProcessorMask); + break; + + case RelationCache: + if (info->Cache.Level == 1) + { + procInfo->numL1Cache++; + } + else if (info->Cache.Level == 2) + { + procInfo->numL2Cache++; + } + else if (info->Cache.Level == 3) + { + procInfo->numL3Cache++; + // processors that share L3 cache are considered to be on the same team + // because they can more easily work together on the same data. + // Large performance penalties will occur if 2 or more threads from different + // teams attempt to frequently read and modify the same cache lines. + // + // On the AMD Ryzen 7 CPU for example, the 8 cores on the CPU are split into + // 2 CCX units of 4 cores each. Each CCX has a separate L3 cache, so if both + // CCXs are operating on the same data, many cycles will be spent keeping the + // two caches coherent. + if (procInfo->numTeamMasks < btProcessorInfo::maxNumTeamMasks) + { + procInfo->processorTeamMasks[procInfo->numTeamMasks] = info->ProcessorMask; + procInfo->numTeamMasks++; + } + } + break; + + case RelationProcessorPackage: + procInfo->numPhysicalPackages++; + break; + } + } + free(buf); +} + +///btThreadSupportWin32 helps to initialize/shutdown libspe2, start/stop SPU tasks and communication +class btThreadSupportWin32 : public btThreadSupportInterface +{ +public: + struct btThreadStatus + { + int m_taskId; + int m_commandId; + int m_status; + + ThreadFunc m_userThreadFunc; + void* m_userPtr; //for taskDesc etc + + void* m_threadHandle; //this one is calling 'Win32ThreadFunc' + + void* m_eventStartHandle; + char m_eventStartHandleName[32]; + + void* m_eventCompleteHandle; + char m_eventCompleteHandleName[32]; + }; + +private: + btAlignedObjectArray<btThreadStatus> m_activeThreadStatus; + btAlignedObjectArray<void*> m_completeHandles; + int m_numThreads; + DWORD_PTR m_startedThreadMask; + btProcessorInfo m_processorInfo; + + void startThreads(const ConstructionInfo& threadInfo); + void stopThreads(); + int waitForResponse(); + +public: + btThreadSupportWin32(const ConstructionInfo& threadConstructionInfo); + virtual ~btThreadSupportWin32(); + + virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; } + virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return countSetBits(m_processorInfo.processorTeamMasks[0]); } + virtual int getLogicalToPhysicalCoreRatio() const BT_OVERRIDE { return m_processorInfo.numLogicalProcessors / m_processorInfo.numCores; } + + virtual void runTask(int threadIndex, void* userData) BT_OVERRIDE; + virtual void waitForAllTasks() BT_OVERRIDE; + + virtual btCriticalSection* createCriticalSection() BT_OVERRIDE; + virtual void deleteCriticalSection(btCriticalSection* criticalSection) BT_OVERRIDE; +}; + +btThreadSupportWin32::btThreadSupportWin32(const ConstructionInfo& threadConstructionInfo) +{ + startThreads(threadConstructionInfo); +} + +btThreadSupportWin32::~btThreadSupportWin32() +{ + stopThreads(); +} + +DWORD WINAPI win32threadStartFunc(LPVOID lpParam) +{ + btThreadSupportWin32::btThreadStatus* status = (btThreadSupportWin32::btThreadStatus*)lpParam; + + while (1) + { + WaitForSingleObject(status->m_eventStartHandle, INFINITE); + void* userPtr = status->m_userPtr; + + if (userPtr) + { + btAssert(status->m_status); + status->m_userThreadFunc(userPtr); + status->m_status = 2; + SetEvent(status->m_eventCompleteHandle); + } + else + { + //exit Thread + status->m_status = 3; + printf("Thread with taskId %i with handle %p exiting\n", status->m_taskId, status->m_threadHandle); + SetEvent(status->m_eventCompleteHandle); + break; + } + } + printf("Thread TERMINATED\n"); + return 0; +} + +void btThreadSupportWin32::runTask(int threadIndex, void* userData) +{ + btThreadStatus& threadStatus = m_activeThreadStatus[threadIndex]; + btAssert(threadIndex >= 0); + btAssert(int(threadIndex) < m_activeThreadStatus.size()); + + threadStatus.m_commandId = 1; + threadStatus.m_status = 1; + threadStatus.m_userPtr = userData; + m_startedThreadMask |= DWORD_PTR(1) << threadIndex; + + ///fire event to start new task + SetEvent(threadStatus.m_eventStartHandle); +} + +int btThreadSupportWin32::waitForResponse() +{ + btAssert(m_activeThreadStatus.size()); + + int last = -1; + DWORD res = WaitForMultipleObjects(m_completeHandles.size(), &m_completeHandles[0], FALSE, INFINITE); + btAssert(res != WAIT_FAILED); + last = res - WAIT_OBJECT_0; + + btThreadStatus& threadStatus = m_activeThreadStatus[last]; + btAssert(threadStatus.m_threadHandle); + btAssert(threadStatus.m_eventCompleteHandle); + + //WaitForSingleObject(threadStatus.m_eventCompleteHandle, INFINITE); + btAssert(threadStatus.m_status > 1); + threadStatus.m_status = 0; + + ///need to find an active spu + btAssert(last >= 0); + m_startedThreadMask &= ~(DWORD_PTR(1) << last); + + return last; +} + +void btThreadSupportWin32::waitForAllTasks() +{ + while (m_startedThreadMask) + { + waitForResponse(); + } +} + +void btThreadSupportWin32::startThreads(const ConstructionInfo& threadConstructionInfo) +{ + static int uniqueId = 0; + uniqueId++; + btProcessorInfo& procInfo = m_processorInfo; + getProcessorInformation(&procInfo); + DWORD_PTR dwProcessAffinityMask = 0; + DWORD_PTR dwSystemAffinityMask = 0; + if (!GetProcessAffinityMask(GetCurrentProcess(), &dwProcessAffinityMask, &dwSystemAffinityMask)) + { + dwProcessAffinityMask = 0; + } + ///The number of threads should be equal to the number of available cores - 1 + m_numThreads = btMin(procInfo.numLogicalProcessors, int(BT_MAX_THREAD_COUNT)) - 1; // cap to max thread count (-1 because main thread already exists) + + m_activeThreadStatus.resize(m_numThreads); + m_completeHandles.resize(m_numThreads); + m_startedThreadMask = 0; + + // set main thread affinity + if (DWORD_PTR mask = dwProcessAffinityMask & getProcessorTeamMask(procInfo, 0)) + { + SetThreadAffinityMask(GetCurrentThread(), mask); + SetThreadIdealProcessor(GetCurrentThread(), 0); + } + + for (int i = 0; i < m_numThreads; i++) + { + printf("starting thread %d\n", i); + + btThreadStatus& threadStatus = m_activeThreadStatus[i]; + + LPSECURITY_ATTRIBUTES lpThreadAttributes = NULL; + SIZE_T dwStackSize = threadConstructionInfo.m_threadStackSize; + LPTHREAD_START_ROUTINE lpStartAddress = &win32threadStartFunc; + LPVOID lpParameter = &threadStatus; + DWORD dwCreationFlags = 0; + LPDWORD lpThreadId = 0; + + threadStatus.m_userPtr = 0; + + sprintf(threadStatus.m_eventStartHandleName, "es%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i); + threadStatus.m_eventStartHandle = CreateEventA(0, false, false, threadStatus.m_eventStartHandleName); + + sprintf(threadStatus.m_eventCompleteHandleName, "ec%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i); + threadStatus.m_eventCompleteHandle = CreateEventA(0, false, false, threadStatus.m_eventCompleteHandleName); + + m_completeHandles[i] = threadStatus.m_eventCompleteHandle; + + HANDLE handle = CreateThread(lpThreadAttributes, dwStackSize, lpStartAddress, lpParameter, dwCreationFlags, lpThreadId); + //SetThreadPriority( handle, THREAD_PRIORITY_HIGHEST ); + // highest priority -- can cause erratic performance when numThreads > numCores + // we don't want worker threads to be higher priority than the main thread or the main thread could get + // totally shut out and unable to tell the workers to stop + //SetThreadPriority( handle, THREAD_PRIORITY_BELOW_NORMAL ); + + { + int processorId = i + 1; // leave processor 0 for main thread + DWORD_PTR teamMask = getProcessorTeamMask(procInfo, processorId); + if (teamMask) + { + // bind each thread to only execute on processors of it's assigned team + // - for single-socket Intel x86 CPUs this has no effect (only a single, shared L3 cache so there is only 1 team) + // - for multi-socket Intel this will keep threads from migrating from one socket to another + // - for AMD Ryzen this will keep threads from migrating from one CCX to another + DWORD_PTR mask = teamMask & dwProcessAffinityMask; + if (mask) + { + SetThreadAffinityMask(handle, mask); + } + } + SetThreadIdealProcessor(handle, processorId); + } + + threadStatus.m_taskId = i; + threadStatus.m_commandId = 0; + threadStatus.m_status = 0; + threadStatus.m_threadHandle = handle; + threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc; + + printf("started %s thread %d with threadHandle %p\n", threadConstructionInfo.m_uniqueName, i, handle); + } +} + +///tell the task scheduler we are done with the SPU tasks +void btThreadSupportWin32::stopThreads() +{ + for (int i = 0; i < m_activeThreadStatus.size(); i++) + { + btThreadStatus& threadStatus = m_activeThreadStatus[i]; + if (threadStatus.m_status > 0) + { + WaitForSingleObject(threadStatus.m_eventCompleteHandle, INFINITE); + } + + threadStatus.m_userPtr = NULL; + SetEvent(threadStatus.m_eventStartHandle); + WaitForSingleObject(threadStatus.m_eventCompleteHandle, INFINITE); + + CloseHandle(threadStatus.m_eventCompleteHandle); + CloseHandle(threadStatus.m_eventStartHandle); + CloseHandle(threadStatus.m_threadHandle); + } + + m_activeThreadStatus.clear(); + m_completeHandles.clear(); +} + +class btWin32CriticalSection : public btCriticalSection +{ +private: + CRITICAL_SECTION mCriticalSection; + +public: + btWin32CriticalSection() + { + InitializeCriticalSection(&mCriticalSection); + } + + ~btWin32CriticalSection() + { + DeleteCriticalSection(&mCriticalSection); + } + + void lock() + { + EnterCriticalSection(&mCriticalSection); + } + + void unlock() + { + LeaveCriticalSection(&mCriticalSection); + } +}; + +btCriticalSection* btThreadSupportWin32::createCriticalSection() +{ + unsigned char* mem = (unsigned char*)btAlignedAlloc(sizeof(btWin32CriticalSection), 16); + btWin32CriticalSection* cs = new (mem) btWin32CriticalSection(); + return cs; +} + +void btThreadSupportWin32::deleteCriticalSection(btCriticalSection* criticalSection) +{ + criticalSection->~btCriticalSection(); + btAlignedFree(criticalSection); +} + +btThreadSupportInterface* btThreadSupportInterface::create(const ConstructionInfo& info) +{ + return new btThreadSupportWin32(info); +} + +#endif //defined(_WIN32) && BT_THREADSAFE |