diff options
-rw-r--r-- | source/blender/blenlib/BLI_threads.h | 6 | ||||
-rw-r--r-- | source/blender/blenlib/CMakeLists.txt | 1 | ||||
-rw-r--r-- | source/blender/blenlib/intern/threads.c | 101 | ||||
-rw-r--r-- | source/blender/windowmanager/intern/wm_jobs.c | 1 | ||||
-rw-r--r-- | source/creator/creator.c | 2 |
5 files changed, 111 insertions, 0 deletions
diff --git a/source/blender/blenlib/BLI_threads.h b/source/blender/blenlib/BLI_threads.h index 81f8445783b..631a65ccade 100644 --- a/source/blender/blenlib/BLI_threads.h +++ b/source/blender/blenlib/BLI_threads.h @@ -204,6 +204,12 @@ void BLI_thread_queue_nowait(ThreadQueue *queue); # define BLI_thread_local_set(name, value) name = value #endif /* defined(__APPLE__) */ +/* **** Special functions to help performance on crazy NUMA setups. **** */ + +/* Make sure process/thread is using NUMA node with fast memory access. */ +void BLI_thread_put_process_on_fast_node(void); +void BLI_thread_put_thread_on_fast_node(void); + #ifdef __cplusplus } #endif diff --git a/source/blender/blenlib/CMakeLists.txt b/source/blender/blenlib/CMakeLists.txt index e3f5773b1e4..16dfec77260 100644 --- a/source/blender/blenlib/CMakeLists.txt +++ b/source/blender/blenlib/CMakeLists.txt @@ -30,6 +30,7 @@ set(INC ../../../intern/guardedalloc ../../../intern/atomic ../../../intern/eigen + ../../../intern/numaapi/include ../../../extern/wcwidth ) diff --git a/source/blender/blenlib/intern/threads.c b/source/blender/blenlib/intern/threads.c index 862ce391109..f67d621f4a1 100644 --- a/source/blender/blenlib/intern/threads.c +++ b/source/blender/blenlib/intern/threads.c @@ -37,6 +37,7 @@ #include "BLI_listbase.h" #include "BLI_gsqueue.h" +#include "BLI_system.h" #include "BLI_task.h" #include "BLI_threads.h" @@ -55,6 +56,7 @@ #endif #include "atomic_ops.h" +#include "numaapi.h" #if defined(__APPLE__) && defined(_OPENMP) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 2) && !defined(__clang__) # define USE_APPLE_OMP_FIX @@ -126,6 +128,7 @@ static pthread_mutex_t _colormanage_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t _fftw_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t _view3d_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_t mainid; +static bool is_numa_available = false; static unsigned int thread_levels = 0; /* threads can be invoked inside threads */ static int 
num_threads_override = 0; @@ -155,6 +158,9 @@ void BLI_threadapi_init(void) mainid = pthread_self(); BLI_spin_init(&_malloc_lock); + if (numaAPI_Initialize() == NUMAAPI_SUCCESS) { + is_numa_available = true; + } } void BLI_threadapi_exit(void) @@ -840,3 +846,98 @@ void BLI_threaded_malloc_end(void) MEM_set_lock_callback(NULL, NULL); } } + +/* **** Special functions to help performance on crazy NUMA setups. **** */ + +static bool check_is_threadripper2_alike_topology(void) +{ + /* NOTE: We hope operating system does not support CPU hotswap to + * a different brand. And that SMP of different types is also not + * encouraged by the system. NOTE(review): cpu_brand obtained below is never released in this function; confirm who owns the string returned by BLI_cpu_brand_string(). */ + static bool is_initialized = false; + static bool is_threadripper2 = false; + if (is_initialized) { + return is_threadripper2; + } + is_initialized = true; + char *cpu_brand = BLI_cpu_brand_string(); + if (cpu_brand == NULL) { + return false; + } + if (strstr(cpu_brand, "Threadripper")) { + /* NOTE: We consider all Threadrippers having similar topology to + * the second one. This is because we are trying to utilize NUMA node + * 0 as much as possible. This node does exist on earlier versions of + * threadripper and setting affinity to it should not have negative + * effect. + * This allows us to avoid per-model check, making the code more + * reliable for the CPUs which are not yet released. NOTE(review): the condition below still matches specific models (2990WX, 2950X), which contradicts this note; confirm intent. + */ + if (strstr(cpu_brand, "2990WX") || strstr(cpu_brand, "2950X")) { + is_threadripper2 = true; + } + } + /* NOTE: While all dies of EPYC have a memory controller, only two of them + * have access to the lower-indexed DDR slots. Those dies are same as on + * Threadripper2 with the memory controller. + * Now, it is rather likely that reasonable amount of users don't max + * up their DDR slots, making it only two dies connected to a DDR slot + * with actual memory in it. */ + if (strstr(cpu_brand, "EPYC")) { + /* NOTE: Similarly to Threadripper we do not do model check. 
*/ + is_threadripper2 = true; + } + return is_threadripper2; +} + +static void threadripper_put_process_on_fast_node(void) +{ + if (!is_numa_available) { + return; + } + /* NOTE: Technically, we can use NUMA nodes 0 and 2 and using both of + * them in the affinity mask will allow OS to schedule threads more + * flexibly, possibly increasing overall performance when multiple apps + * are crunching numbers. + * + * However, if scene fits into memory adjacent to a single die we don't + * want OS to re-schedule the process to another die since that will make + * it further away from memory allocated for .blend file. */ + /* NOTE: Even if NUMA is available in the API but is disabled in BIOS on + * this workstation we still proceed here. If NUMA is disabled it will be a + * single node, so our action is no-visible-changes, but allows to keep + * things simple and unified. */ + numaAPI_RunProcessOnNode(0); +} + +static void threadripper_put_thread_on_fast_node(void) +{ + if (!is_numa_available) { + return; + } + /* NOTE: This is where things become more interesting. On the one hand + * we can use nodes 0 and 2 and allow operating system to do balancing + * of processes/threads for the maximum performance when multiple apps + * are running. + * On another hand, however, we probably want to use same node as the + * main thread since that's where the memory of .blend file is likely + * to be allocated. + * Since the main thread is currently on node 0, we also put thread on + * same node. */ + /* See additional note about NUMA disabled in BIOS above. 
*/ + numaAPI_RunThreadOnNode(0); +} + +void BLI_thread_put_process_on_fast_node(void) +{ + if (check_is_threadripper2_alike_topology()) { + threadripper_put_process_on_fast_node(); + } +} + +void BLI_thread_put_thread_on_fast_node(void) +{ + if (check_is_threadripper2_alike_topology()) { + threadripper_put_thread_on_fast_node(); + } +} diff --git a/source/blender/windowmanager/intern/wm_jobs.c b/source/blender/windowmanager/intern/wm_jobs.c index 92d51c9a400..cb627b465f4 100644 --- a/source/blender/windowmanager/intern/wm_jobs.c +++ b/source/blender/windowmanager/intern/wm_jobs.c @@ -334,6 +334,7 @@ static void *do_job_thread(void *job_v) { wmJob *wm_job = job_v; + BLI_thread_put_thread_on_fast_node(); wm_job->startjob(wm_job->run_customdata, &wm_job->stop, &wm_job->do_update, &wm_job->progress); wm_job->ready = true; diff --git a/source/creator/creator.c b/source/creator/creator.c index e375b65fd75..1d39fd6f05a 100644 --- a/source/creator/creator.c +++ b/source/creator/creator.c @@ -52,6 +52,7 @@ #include "BLI_callbacks.h" #include "BLI_string.h" #include "BLI_system.h" +#include "BLI_threads.h" /* mostly init functions */ #include "BKE_appdir.h" @@ -364,6 +365,7 @@ int main( BKE_appdir_program_path_init(argv[0]); BLI_threadapi_init(); + BLI_thread_put_process_on_fast_node(); DNA_sdna_current_init(); |