BLI: Switch threads implementation file to C++

Allows to use C++ primitives in the primitive implementation.
author: Sergey Sharybin <sergey.vfx@gmail.com> 2020-07-02 17:19:18 +0300
committer: Sergey Sharybin <sergey.vfx@gmail.com> 2020-07-03 12:14:06 +0300
commit: 0f4049db5f3c6be582d878e2ae1e6fd12dcad6bf (patch)
tree: d9c9bf68ce097bc23c409b9c7fa19164cd74394d /source/blender/blenlib/intern/threads.cc
parent: edb49d3dc2e3a2bd3048cfdb55562fb853f37a21 (diff)
1 files changed, 899 insertions, 0 deletions
diff --git a/source/blender/blenlib/intern/threads.cc b/source/blender/blenlib/intern/threads.cc
new file mode 100644
index 00000000000..c2adc432644
--- /dev/null
+++ b/source/blender/blenlib/intern/threads.cc
@@ -0,0 +1,899 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * The Original Code is Copyright (C) 2006 Blender Foundation
+ * All rights reserved.
+ */
+
+/** \file
+ * \ingroup bli
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "MEM_guardedalloc.h"
+
+#include "BLI_gsqueue.h"
+#include "BLI_listbase.h"
+#include "BLI_system.h"
+#include "BLI_task.h"
+#include "BLI_threads.h"
+
+#include "PIL_time.h"
+
+/* for checking system threads - BLI_system_thread_count */
+#ifdef WIN32
+#  include <sys/timeb.h>
+#  include <windows.h>
+#elif defined(__APPLE__)
+#  include <sys/sysctl.h>
+#  include <sys/types.h>
+#else
+#  include <sys/time.h>
+#  include <unistd.h>
+#endif
+
+#include "atomic_ops.h"
+#include "numaapi.h"
+
+#if defined(__APPLE__) && defined(_OPENMP) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 2) && \
+    !defined(__clang__)
+#  define USE_APPLE_OMP_FIX
+#endif
+
+#ifdef USE_APPLE_OMP_FIX
+/* ************** libgomp (Apple gcc 4.2.1) TLS bug workaround *************** */
+extern pthread_key_t gomp_tls_key;
+static void *thread_tls_data;
+#endif
+
+/* ********** basic thread control API ************
+ *
+ * Many thread cases have an X amount of jobs, and only an Y amount of
+ * threads are useful (typically amount of cpus)
+ *
+ * This code can be used to start a maximum amount of 'thread slots', which
+ * then can be filled in a loop with an idle timer.
+ *
+ * A sample loop can look like this (pseudo c);
+ *
+ *     ListBase lb;
+ *     int maxthreads = 2;
+ *     int cont = 1;
+ *
+ *     BLI_threadpool_init(&lb, do_something_func, maxthreads);
+ *
+ *     while (cont) {
+ *         if (BLI_available_threads(&lb) && !(escape loop event)) {
+ *             // get new job (data pointer)
+ *             // tag job 'processed
+ *             BLI_threadpool_insert(&lb, job);
+ *         }
+ *         else PIL_sleep_ms(50);
+ *
+ *         // find if a job is ready, this the do_something_func() should write in job somewhere
+ *         cont = 0;
+ *         for (go over all jobs)
+ *             if (job is ready) {
+ *                 if (job was not removed) {
+ *                     BLI_threadpool_remove(&lb, job); *                 }
+ *             }
+ *             else cont = 1; *         }
+ *         // conditions to exit loop
+ *         if (if escape loop event) {
+ *             if (BLI_available_threadslots(&lb) == maxthreads) {
+ *                 break;
+ *             }
+ *         }
+ *     }
+ *
+ *     BLI_threadpool_end(&lb);
+ *
+ ************************************************ */
+static pthread_mutex_t _image_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t _image_draw_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t _viewer_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t _custom1_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t _rcache_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t _opengl_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t _nodes_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t _movieclip_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t _colormanage_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t _fftw_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t _view3d_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_t mainid;
+static bool is_numa_available = false;
+static unsigned int thread_levels = 0; /* threads can be invoked inside threads */
+static int num_threads_override = 0;
+
+/* just a max for security reasons */
+#define RE_MAX_THREAD BLENDER_MAX_THREADS
+
+typedef struct ThreadSlot {
+  struct ThreadSlot *next, *prev;
+  void *(*do_thread)(void *);
+  void *callerdata;
+  pthread_t pthread;
+  int avail;
+} ThreadSlot;
+
+void BLI_threadapi_init(void)
+{
+  mainid = pthread_self();
+  if (numaAPI_Initialize() == NUMAAPI_SUCCESS) {
+    is_numa_available = true;
+  }
+}
+
+void BLI_threadapi_exit(void)
+{
+}
+
+/* tot = 0 only initializes malloc mutex in a safe way (see sequence.c)
+ * problem otherwise: scene render will kill of the mutex!
+ */
+
+void BLI_threadpool_init(ListBase *threadbase, void *(*do_thread)(void *), int tot)
+{
+  int a;
+
+  if (threadbase != nullptr && tot > 0) {
+    BLI_listbase_clear(threadbase);
+
+    if (tot > RE_MAX_THREAD) {
+      tot = RE_MAX_THREAD;
+    }
+    else if (tot < 1) {
+      tot = 1;
+    }
+
+    for (a = 0; a < tot; a++) {
+      ThreadSlot *tslot = static_cast<ThreadSlot *>(MEM_callocN(sizeof(ThreadSlot), "threadslot"));
+      BLI_addtail(threadbase, tslot);
+      tslot->do_thread = do_thread;
+      tslot->avail = 1;
+    }
+  }
+
+  unsigned int level = atomic_fetch_and_add_u(&thread_levels, 1);
+  if (level == 0) {
+#ifdef USE_APPLE_OMP_FIX
+    /* workaround for Apple gcc 4.2.1 omp vs background thread bug,
+     * we copy gomp thread local storage pointer to setting it again
+     * inside the thread that we start */
+    thread_tls_data = pthread_getspecific(gomp_tls_key);
+#endif
+  }
+}
+
+/* amount of available threads */
+int BLI_available_threads(ListBase *threadbase)
+{
+  int counter = 0;
+
+  LISTBASE_FOREACH (ThreadSlot *, tslot, threadbase) {
+    if (tslot->avail) {
+      counter++;
+    }
+  }
+
+  return counter;
+}
+
+/* returns thread number, for sample patterns or threadsafe tables */
+int BLI_threadpool_available_thread_index(ListBase *threadbase)
+{
+  int counter = 0;
+
+  LISTBASE_FOREACH (ThreadSlot *, tslot, threadbase) {
+    if (tslot->avail) {
+      return counter;
+    }
+    ++counter;
+  }
+
+  return 0;
+}
+
+static void *tslot_thread_start(void *tslot_p)
+{
+  ThreadSlot *tslot = (ThreadSlot *)tslot_p;
+
+#ifdef USE_APPLE_OMP_FIX
+  /* workaround for Apple gcc 4.2.1 omp vs background thread bug,
+   * set gomp thread local storage pointer which was copied beforehand */
+  pthread_setspecific(gomp_tls_key, thread_tls_data);
+#endif
+
+  return tslot->do_thread(tslot->callerdata);
+}
+
+int BLI_thread_is_main(void)
+{
+  return pthread_equal(pthread_self(), mainid);
+}
+
+void BLI_threadpool_insert(ListBase *threadbase, void *callerdata)
+{
+  LISTBASE_FOREACH (ThreadSlot *, tslot, threadbase) {
+    if (tslot->avail) {
+      tslot->avail = 0;
+      tslot->callerdata = callerdata;
+      pthread_create(&tslot->pthread, nullptr, tslot_thread_start, tslot);
+      return;
+    }
+  }
+  printf("ERROR: could not insert thread slot\n");
+}
+
+void BLI_threadpool_remove(ListBase *threadbase, void *callerdata)
+{
+  LISTBASE_FOREACH (ThreadSlot *, tslot, threadbase) {
+    if (tslot->callerdata == callerdata) {
+      pthread_join(tslot->pthread, nullptr);
+      tslot->callerdata = nullptr;
+      tslot->avail = 1;
+    }
+  }
+}
+
+void BLI_threadpool_remove_index(ListBase *threadbase, int index)
+{
+  int counter = 0;
+
+  LISTBASE_FOREACH (ThreadSlot *, tslot, threadbase) {
+    if (counter == index && tslot->avail == 0) {
+      pthread_join(tslot->pthread, nullptr);
+      tslot->callerdata = nullptr;
+      tslot->avail = 1;
+      break;
+    }
+    ++counter;
+  }
+}
+
+void BLI_threadpool_clear(ListBase *threadbase)
+{
+  LISTBASE_FOREACH (ThreadSlot *, tslot, threadbase) {
+    if (tslot->avail == 0) {
+      pthread_join(tslot->pthread, nullptr);
+      tslot->callerdata = nullptr;
+      tslot->avail = 1;
+    }
+  }
+}
+
+void BLI_threadpool_end(ListBase *threadbase)
+{
+
+  /* only needed if there's actually some stuff to end
+   * this way we don't end up decrementing thread_levels on an empty threadbase
+   * */
+  if (threadbase == nullptr || BLI_listbase_is_empty(threadbase)) {
+    return;
+  }
+
+  LISTBASE_FOREACH (ThreadSlot *, tslot, threadbase) {
+    if (tslot->avail == 0) {
+      pthread_join(tslot->pthread, nullptr);
+    }
+  }
+  BLI_freelistN(threadbase);
+}
+
+/* System Information */
+
+/* how many threads are native on this system? */
+int BLI_system_thread_count(void)
+{
+  static int t = -1;
+
+  if (num_threads_override != 0) {
+    return num_threads_override;
+  }
+  else if (LIKELY(t != -1)) {
+    return t;
+  }
+
+  {
+#ifdef WIN32
+    SYSTEM_INFO info;
+    GetSystemInfo(&info);
+    t = (int)info.dwNumberOfProcessors;
+#else
+#  ifdef __APPLE__
+    int mib[2];
+    size_t len;
+
+    mib[0] = CTL_HW;
+    mib[1] = HW_NCPU;
+    len = sizeof(t);
+    sysctl(mib, 2, &t, &len, nullptr, 0);
+#  else
+    t = (int)sysconf(_SC_NPROCESSORS_ONLN);
+#  endif
+#endif
+  }
+
+  CLAMP(t, 1, RE_MAX_THREAD);
+
+  return t;
+}
+
+void BLI_system_num_threads_override_set(int num)
+{
+  num_threads_override = num;
+}
+
+int BLI_system_num_threads_override_get(void)
+{
+  return num_threads_override;
+}
+
+/* Global Mutex Locks */
+
+static ThreadMutex *global_mutex_from_type(const int type)
+{
+  switch (type) {
+    case LOCK_IMAGE:
+      return &_image_lock;
+    case LOCK_DRAW_IMAGE:
+      return &_image_draw_lock;
+    case LOCK_VIEWER:
+      return &_viewer_lock;
+    case LOCK_CUSTOM1:
+      return &_custom1_lock;
+    case LOCK_RCACHE:
+      return &_rcache_lock;
+    case LOCK_OPENGL:
+      return &_opengl_lock;
+    case LOCK_NODES:
+      return &_nodes_lock;
+    case LOCK_MOVIECLIP:
+      return &_movieclip_lock;
+    case LOCK_COLORMANAGE:
+      return &_colormanage_lock;
+    case LOCK_FFTW:
+      return &_fftw_lock;
+    case LOCK_VIEW3D:
+      return &_view3d_lock;
+    default:
+      BLI_assert(0);
+      return nullptr;
+  }
+}
+
+void BLI_thread_lock(int type)
+{
+  pthread_mutex_lock(global_mutex_from_type(type));
+}
+
+void BLI_thread_unlock(int type)
+{
+  pthread_mutex_unlock(global_mutex_from_type(type));
+}
+
+/* Mutex Locks */
+
+void BLI_mutex_init(ThreadMutex *mutex)
+{
+  pthread_mutex_init(mutex, nullptr);
+}
+
+void BLI_mutex_lock(ThreadMutex *mutex)
+{
+  pthread_mutex_lock(mutex);
+}
+
+void BLI_mutex_unlock(ThreadMutex *mutex)
+{
+  pthread_mutex_unlock(mutex);
+}
+
+bool BLI_mutex_trylock(ThreadMutex *mutex)
+{
+  return (pthread_mutex_trylock(mutex) == 0);
+}
+
+void BLI_mutex_end(ThreadMutex *mutex)
+{
+  pthread_mutex_destroy(mutex);
+}
+
+ThreadMutex *BLI_mutex_alloc(void)
+{
+  ThreadMutex *mutex = static_cast<ThreadMutex *>(MEM_callocN(sizeof(ThreadMutex), "ThreadMutex"));
+  BLI_mutex_init(mutex);
+  return mutex;
+}
+
+void BLI_mutex_free(ThreadMutex *mutex)
+{
+  BLI_mutex_end(mutex);
+  MEM_freeN(mutex);
+}
+
+/* Spin Locks */
+
+void BLI_spin_init(SpinLock *spin)
+{
+#if defined(__APPLE__)
+  *spin = OS_SPINLOCK_INIT;
+#elif defined(_MSC_VER)
+  *spin = 0;
+#else
+  pthread_spin_init(spin, 0);
+#endif
+}
+
+void BLI_spin_lock(SpinLock *spin)
+{
+#if defined(__APPLE__)
+  OSSpinLockLock(spin);
+#elif defined(_MSC_VER)
+  while (InterlockedExchangeAcquire(spin, 1)) {
+    while (*spin) {
+      /* Spin-lock hint for processors with hyperthreading. */
+      YieldProcessor();
+    }
+  }
+#else
+  pthread_spin_lock(spin);
+#endif
+}
+
+void BLI_spin_unlock(SpinLock *spin)
+{
+#if defined(__APPLE__)
+  OSSpinLockUnlock(spin);
+#elif defined(_MSC_VER)
+  _ReadWriteBarrier();
+  *spin = 0;
+#else
+  pthread_spin_unlock(spin);
+#endif
+}
+
+#if defined(__APPLE__) || defined(_MSC_VER)
+void BLI_spin_end(SpinLock *UNUSED(spin))
+{
+}
+#else
+void BLI_spin_end(SpinLock *spin)
+{
+  pthread_spin_destroy(spin);
+}
+#endif
+
+/* Read/Write Mutex Lock */
+
+void BLI_rw_mutex_init(ThreadRWMutex *mutex)
+{
+  pthread_rwlock_init(mutex, nullptr);
+}
+
+void BLI_rw_mutex_lock(ThreadRWMutex *mutex, int mode)
+{
+  if (mode == THREAD_LOCK_READ) {
+    pthread_rwlock_rdlock(mutex);
+  }
+  else {
+    pthread_rwlock_wrlock(mutex);
+  }
+}
+
+void BLI_rw_mutex_unlock(ThreadRWMutex *mutex)
+{
+  pthread_rwlock_unlock(mutex);
+}
+
+void BLI_rw_mutex_end(ThreadRWMutex *mutex)
+{
+  pthread_rwlock_destroy(mutex);
+}
+
+ThreadRWMutex *BLI_rw_mutex_alloc(void)
+{
+  ThreadRWMutex *mutex = static_cast<ThreadRWMutex *>(
+      MEM_callocN(sizeof(ThreadRWMutex), "ThreadRWMutex"));
+  BLI_rw_mutex_init(mutex);
+  return mutex;
+}
+
+void BLI_rw_mutex_free(ThreadRWMutex *mutex)
+{
+  BLI_rw_mutex_end(mutex);
+  MEM_freeN(mutex);
+}
+
+/* Ticket Mutex Lock */
+
+struct TicketMutex {
+  pthread_cond_t cond;
+  pthread_mutex_t mutex;
+  unsigned int queue_head, queue_tail;
+};
+
+TicketMutex *BLI_ticket_mutex_alloc(void)
+{
+  TicketMutex *ticket = static_cast<TicketMutex *>(
+      MEM_callocN(sizeof(TicketMutex), "TicketMutex"));
+
+  pthread_cond_init(&ticket->cond, nullptr);
+  pthread_mutex_init(&ticket->mutex, nullptr);
+
+  return ticket;
+}
+
+void BLI_ticket_mutex_free(TicketMutex *ticket)
+{
+  pthread_mutex_destroy(&ticket->mutex);
+  pthread_cond_destroy(&ticket->cond);
+  MEM_freeN(ticket);
+}
+
+void BLI_ticket_mutex_lock(TicketMutex *ticket)
+{
+  unsigned int queue_me;
+
+  pthread_mutex_lock(&ticket->mutex);
+  queue_me = ticket->queue_tail++;
+
+  while (queue_me != ticket->queue_head) {
+    pthread_cond_wait(&ticket->cond, &ticket->mutex);
+  }
+
+  pthread_mutex_unlock(&ticket->mutex);
+}
+
+void BLI_ticket_mutex_unlock(TicketMutex *ticket)
+{
+  pthread_mutex_lock(&ticket->mutex);
+  ticket->queue_head++;
+  pthread_cond_broadcast(&ticket->cond);
+  pthread_mutex_unlock(&ticket->mutex);
+}
+
+/* ************************************************ */
+
+/* Condition */
+
+void BLI_condition_init(ThreadCondition *cond)
+{
+  pthread_cond_init(cond, nullptr);
+}
+
+void BLI_condition_wait(ThreadCondition *cond, ThreadMutex *mutex)
+{
+  pthread_cond_wait(cond, mutex);
+}
+
+void BLI_condition_wait_global_mutex(ThreadCondition *cond, const int type)
+{
+  pthread_cond_wait(cond, global_mutex_from_type(type));
+}
+
+void BLI_condition_notify_one(ThreadCondition *cond)
+{
+  pthread_cond_signal(cond);
+}
+
+void BLI_condition_notify_all(ThreadCondition *cond)
+{
+  pthread_cond_broadcast(cond);
+}
+
+void BLI_condition_end(ThreadCondition *cond)
+{
+  pthread_cond_destroy(cond);
+}
+
+/* ************************************************ */
+
+struct ThreadQueue {
+  GSQueue *queue;
+  pthread_mutex_t mutex;
+  pthread_cond_t push_cond;
+  pthread_cond_t finish_cond;
+  volatile int nowait;
+  volatile int canceled;
+};
+
+ThreadQueue *BLI_thread_queue_init(void)
+{
+  ThreadQueue *queue;
+
+  queue = static_cast<ThreadQueue *>(MEM_callocN(sizeof(ThreadQueue), "ThreadQueue"));
+  queue->queue = BLI_gsqueue_new(sizeof(void *));
+
+  pthread_mutex_init(&queue->mutex, nullptr);
+  pthread_cond_init(&queue->push_cond, nullptr);
+  pthread_cond_init(&queue->finish_cond, nullptr);
+
+  return queue;
+}
+
+void BLI_thread_queue_free(ThreadQueue *queue)
+{
+  /* destroy everything, assumes no one is using queue anymore */
+  pthread_cond_destroy(&queue->finish_cond);
+  pthread_cond_destroy(&queue->push_cond);
+  pthread_mutex_destroy(&queue->mutex);
+
+  BLI_gsqueue_free(queue->queue);
+
+  MEM_freeN(queue);
+}
+
+void BLI_thread_queue_push(ThreadQueue *queue, void *work)
+{
+  pthread_mutex_lock(&queue->mutex);
+
+  BLI_gsqueue_push(queue->queue, &work);
+
+  /* signal threads waiting to pop */
+  pthread_cond_signal(&queue->push_cond);
+  pthread_mutex_unlock(&queue->mutex);
+}
+
+void *BLI_thread_queue_pop(ThreadQueue *queue)
+{
+  void *work = nullptr;
+
+  /* wait until there is work */
+  pthread_mutex_lock(&queue->mutex);
+  while (BLI_gsqueue_is_empty(queue->queue) && !queue->nowait) {
+    pthread_cond_wait(&queue->push_cond, &queue->mutex);
+  }
+
+  /* if we have something, pop it */
+  if (!BLI_gsqueue_is_empty(queue->queue)) {
+    BLI_gsqueue_pop(queue->queue, &work);
+
+    if (BLI_gsqueue_is_empty(queue->queue)) {
+      pthread_cond_broadcast(&queue->finish_cond);
+    }
+  }
+
+  pthread_mutex_unlock(&queue->mutex);
+
+  return work;
+}
+
+static void wait_timeout(struct timespec *timeout, int ms)
+{
+  ldiv_t div_result;
+  long sec, usec, x;
+
+#ifdef WIN32
+  {
+    struct _timeb now;
+    _ftime(&now);
+    sec = now.time;
+    usec = now.millitm * 1000; /* microsecond precision would be better */
+  }
+#else
+  {
+    struct timeval now;
+    gettimeofday(&now, nullptr);
+    sec = now.tv_sec;
+    usec = now.tv_usec;
+  }
+#endif
+
+  /* add current time + millisecond offset */
+  div_result = ldiv(ms, 1000);
+  timeout->tv_sec = sec + div_result.quot;
+
+  x = usec + (div_result.rem * 1000);
+
+  if (x >= 1000000) {
+    timeout->tv_sec++;
+    x -= 1000000;
+  }
+
+  timeout->tv_nsec = x * 1000;
+}
+
+void *BLI_thread_queue_pop_timeout(ThreadQueue *queue, int ms)
+{
+  double t;
+  void *work = nullptr;
+  struct timespec timeout;
+
+  t = PIL_check_seconds_timer();
+  wait_timeout(&timeout, ms);
+
+  /* wait until there is work */
+  pthread_mutex_lock(&queue->mutex);
+  while (BLI_gsqueue_is_empty(queue->queue) && !queue->nowait) {
+    if (pthread_cond_timedwait(&queue->push_cond, &queue->mutex, &timeout) == ETIMEDOUT) {
+      break;
+    }
+    else if (PIL_check_seconds_timer() - t >= ms * 0.001) {
+      break;
+    }
+  }
+
+  /* if we have something, pop it */
+  if (!BLI_gsqueue_is_empty(queue->queue)) {
+    BLI_gsqueue_pop(queue->queue, &work);
+
+    if (BLI_gsqueue_is_empty(queue->queue)) {
+      pthread_cond_broadcast(&queue->finish_cond);
+    }
+  }
+
+  pthread_mutex_unlock(&queue->mutex);
+
+  return work;
+}
+
+int BLI_thread_queue_len(ThreadQueue *queue)
+{
+  int size;
+
+  pthread_mutex_lock(&queue->mutex);
+  size = BLI_gsqueue_len(queue->queue);
+  pthread_mutex_unlock(&queue->mutex);
+
+  return size;
+}
+
+bool BLI_thread_queue_is_empty(ThreadQueue *queue)
+{
+  bool is_empty;
+
+  pthread_mutex_lock(&queue->mutex);
+  is_empty = BLI_gsqueue_is_empty(queue->queue);
+  pthread_mutex_unlock(&queue->mutex);
+
+  return is_empty;
+}
+
+void BLI_thread_queue_nowait(ThreadQueue *queue)
+{
+  pthread_mutex_lock(&queue->mutex);
+
+  queue->nowait = 1;
+
+  /* signal threads waiting to pop */
+  pthread_cond_broadcast(&queue->push_cond);
+  pthread_mutex_unlock(&queue->mutex);
+}
+
+void BLI_thread_queue_wait_finish(ThreadQueue *queue)
+{
+  /* wait for finish condition */
+  pthread_mutex_lock(&queue->mutex);
+
+  while (!BLI_gsqueue_is_empty(queue->queue)) {
+    pthread_cond_wait(&queue->finish_cond, &queue->mutex);
+  }
+
+  pthread_mutex_unlock(&queue->mutex);
+}
+
+/* **** Special functions to help performance on crazy NUMA setups. **** */
+
+#if 0  /* UNUSED */
+static bool check_is_threadripper2_alike_topology(void)
+{
+  /* NOTE: We hope operating system does not support CPU hot-swap to
+   * a different brand. And that SMP of different types is also not
+   * encouraged by the system. */
+  static bool is_initialized = false;
+  static bool is_threadripper2 = false;
+  if (is_initialized) {
+    return is_threadripper2;
+  }
+  is_initialized = true;
+  char *cpu_brand = BLI_cpu_brand_string();
+  if (cpu_brand == nullptr) {
+    return false;
+  }
+  if (strstr(cpu_brand, "Threadripper")) {
+    /* NOTE: We consider all Thread-rippers having similar topology to
+     * the second one. This is because we are trying to utilize NUMA node
+     * 0 as much as possible. This node does exist on earlier versions of
+     * thread-ripper and setting affinity to it should not have negative
+     * effect.
+     * This allows us to avoid per-model check, making the code more
+     * reliable for the CPUs which are not yet released.
+     */
+    if (strstr(cpu_brand, "2990WX") || strstr(cpu_brand, "2950X")) {
+      is_threadripper2 = true;
+    }
+  }
+  /* NOTE: While all dies of EPYC has memory controller, only two f them
+   * has access to a lower-indexed DDR slots. Those dies are same as on
+   * Threadripper2 with the memory controller.
+   * Now, it is rather likely that reasonable amount of users don't max
+   * up their DR slots, making it only two dies connected to a DDR slot
+   * with actual memory in it. */
+  if (strstr(cpu_brand, "EPYC")) {
+    /* NOTE: Similarly to Thread-ripper we do not do model check. */
+    is_threadripper2 = true;
+  }
+  MEM_freeN(cpu_brand);
+  return is_threadripper2;
+}
+
+static void threadripper_put_process_on_fast_node(void)
+{
+  if (!is_numa_available) {
+    return;
+  }
+  /* NOTE: Technically, we can use NUMA nodes 0 and 2 and using both of
+   * them in the affinity mask will allow OS to schedule threads more
+   * flexible,possibly increasing overall performance when multiple apps
+   * are crunching numbers.
+   *
+   * However, if scene fits into memory adjacent to a single die we don't
+   * want OS to re-schedule the process to another die since that will make
+   * it further away from memory allocated for .blend file. */
+  /* NOTE: Even if NUMA is available in the API but is disabled in BIOS on
+   * this workstation we still process here. If NUMA is disabled it will be a
+   * single node, so our action is no-visible-changes, but allows to keep
+   * things simple and unified. */
+  numaAPI_RunProcessOnNode(0);
+}
+
+static void threadripper_put_thread_on_fast_node(void)
+{
+  if (!is_numa_available) {
+    return;
+  }
+  /* NOTE: This is where things becomes more interesting. On the one hand
+   * we can use nodes 0 and 2 and allow operating system to do balancing
+   * of processes/threads for the maximum performance when multiple apps
+   * are running.
+   * On another hand, however, we probably want to use same node as the
+   * main thread since that's where the memory of .blend file is likely
+   * to be allocated.
+   * Since the main thread is currently on node 0, we also put thread on
+   * same node. */
+  /* See additional note about NUMA disabled in BIOS above. */
+  numaAPI_RunThreadOnNode(0);
+}
+#endif /* UNUSED */
+
+void BLI_thread_put_process_on_fast_node(void)
+{
+  /* Disabled for now since this causes only 16 threads to be used on a
+   * thread-ripper for computations like sculpting and fluid sim. The problem
+   * is that all threads created as children from this thread will inherit
+   * the NUMA node and so will end up on the same node. This can be fixed
+   * case-by-case by assigning the NUMA node for every child thread, however
+   * this is difficult for external libraries and OpenMP, and out of our
+   * control for plugins like external renderers. */
+#if 0
+  if (check_is_threadripper2_alike_topology()) {
+    threadripper_put_process_on_fast_node();
+  }
+#endif
+}
+
+void BLI_thread_put_thread_on_fast_node(void)
+{
+  /* Disabled for now, see comment above. */
+#if 0
+  if (check_is_threadripper2_alike_topology()) {
+    threadripper_put_thread_on_fast_node();
+  }
+#endif
+}
author	Sergey Sharybin <sergey.vfx@gmail.com>	2020-07-02 17:19:18 +0300
committer	Sergey Sharybin <sergey.vfx@gmail.com>	2020-07-03 12:14:06 +0300
commit	0f4049db5f3c6be582d878e2ae1e6fd12dcad6bf (patch)
tree	d9c9bf68ce097bc23c409b9c7fa19164cd74394d /source/blender/blenlib/intern/threads.cc
parent	edb49d3dc2e3a2bd3048cfdb55562fb853f37a21 (diff)