diff options
author | Sergey Sharybin <sergey.vfx@gmail.com> | 2018-11-28 16:42:38 +0300 |
---|---|---|
committer | Sergey Sharybin <sergey.vfx@gmail.com> | 2018-11-28 16:42:38 +0300 |
commit | 3ed0d5b4d4c06a0c88832525a7c55d21a04d86ff (patch) | |
tree | 44f3a8beab1bea3b712acbfc10edef1120d68f89 /source | |
parent | 2bd62b076fac80e073557a7d1dff0e989aaa97c1 (diff) | |
parent | ce927e15e0e3570a02834b6001519a024ca1e2b6 (diff) |
Merge branch 'master' into blender2.8
Diffstat (limited to 'source')
-rw-r--r-- | source/blender/blenlib/BLI_system.h | 4 | ||||
-rw-r--r-- | source/blender/blenlib/BLI_threads.h | 6 | ||||
-rw-r--r-- | source/blender/blenlib/CMakeLists.txt | 1 | ||||
-rw-r--r-- | source/blender/blenlib/intern/system.c | 38 | ||||
-rw-r--r-- | source/blender/blenlib/intern/threads.c | 101 | ||||
-rw-r--r-- | source/blender/windowmanager/intern/wm_jobs.c | 1 | ||||
-rw-r--r-- | source/creator/creator.c | 2 |
7 files changed, 153 insertions, 0 deletions
diff --git a/source/blender/blenlib/BLI_system.h b/source/blender/blenlib/BLI_system.h index f51b9623803..7f88f8a18b1 100644 --- a/source/blender/blenlib/BLI_system.h +++ b/source/blender/blenlib/BLI_system.h @@ -30,6 +30,10 @@ int BLI_cpu_support_sse2(void); void BLI_system_backtrace(FILE *fp); + +/* Get CPU brand, result is to be MEM_freeN()-ed. */ +char *BLI_cpu_brand_string(void); + /* getpid */ #ifdef WIN32 # define BLI_SYSTEM_PID_H <process.h> diff --git a/source/blender/blenlib/BLI_threads.h b/source/blender/blenlib/BLI_threads.h index 81f8445783b..631a65ccade 100644 --- a/source/blender/blenlib/BLI_threads.h +++ b/source/blender/blenlib/BLI_threads.h @@ -204,6 +204,12 @@ void BLI_thread_queue_nowait(ThreadQueue *queue); # define BLI_thread_local_set(name, value) name = value #endif /* defined(__APPLE__) */ +/* **** Special functions to help performance on crazy NUMA setups. **** */ + +/* Make sure process/thread is using NUMA node with fast memory access. */ +void BLI_thread_put_process_on_fast_node(void); +void BLI_thread_put_thread_on_fast_node(void); + #ifdef __cplusplus } #endif diff --git a/source/blender/blenlib/CMakeLists.txt b/source/blender/blenlib/CMakeLists.txt index 91887c1ef5e..921ecc29e18 100644 --- a/source/blender/blenlib/CMakeLists.txt +++ b/source/blender/blenlib/CMakeLists.txt @@ -30,6 +30,7 @@ set(INC ../../../intern/guardedalloc ../../../intern/atomic ../../../intern/eigen + ../../../intern/numaapi/include ../../../extern/wcwidth ) diff --git a/source/blender/blenlib/intern/system.c b/source/blender/blenlib/intern/system.c index ecb977c6e61..38fe2c7a9eb 100644 --- a/source/blender/blenlib/intern/system.c +++ b/source/blender/blenlib/intern/system.c @@ -27,6 +27,7 @@ #include "BLI_utildefines.h" #include "BLI_system.h" +#include "BLI_string.h" #include "MEM_guardedalloc.h" @@ -138,3 +139,40 @@ void BLI_system_backtrace(FILE *fp) } /* end BLI_system_backtrace */ + +/* NOTE: The code for CPU brand string is adapted from Cycles. 
*/ + +#if !defined(_WIN32) || defined(FREE_WINDOWS) +/* Run the CPUID instruction for leaf `selector`, storing EAX, EBX, ECX, EDX in + * data[0..3]. On non-x86 targets all four values are zeroed. */ +static void __cpuid(int data[4], int selector) +{ +#if defined(__x86_64__) + asm("cpuid" : "=a" (data[0]), "=b" (data[1]), "=c" (data[2]), "=d" (data[3]) : "a"(selector)); +#elif defined(__i386__) + asm("pushl %%ebx \n\t" + "cpuid \n\t" + "movl %%ebx, %1 \n\t" + "popl %%ebx \n\t" + : "=a" (data[0]), "=r" (data[1]), "=c" (data[2]), "=d" (data[3]) + : "a"(selector) + : "ebx"); +#else + data[0] = data[1] = data[2] = data[3] = 0; +#endif +} +#endif + +/* Return a newly allocated copy of the CPU brand string (to be MEM_freeN()-ed + * by the caller), or NULL when the CPU does not report extended CPUID leaves + * up to 0x80000004. */ +char *BLI_cpu_brand_string(void) +{ + char buf[48] = { 0 }; + int result[4] = { 0 }; + __cpuid(result, 0x80000000); + /* Compare as unsigned: extended leaf numbers have the top bit set, so a + * signed compare would also accept 0 (the non-x86 fallback) and any small + * positive value. */ + if ((unsigned int)result[0] >= 0x80000004u) { + __cpuid((int*)(buf + 0), 0x80000002); + __cpuid((int*)(buf + 16), 0x80000003); + __cpuid((int*)(buf + 32), 0x80000004); + char *brand = BLI_strdup(buf); + /* TODO(sergey): Make it a bit more presentable by removing trademark. */ + return brand; + } + return NULL; +} diff --git a/source/blender/blenlib/intern/threads.c b/source/blender/blenlib/intern/threads.c index 862ce391109..f67d621f4a1 100644 --- a/source/blender/blenlib/intern/threads.c +++ b/source/blender/blenlib/intern/threads.c @@ -37,6 +37,7 @@ #include "BLI_listbase.h" #include "BLI_gsqueue.h" +#include "BLI_system.h" #include "BLI_task.h" #include "BLI_threads.h" @@ -55,6 +56,7 @@ #endif #include "atomic_ops.h" +#include "numaapi.h" #if defined(__APPLE__) && defined(_OPENMP) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 2) && !defined(__clang__) # define USE_APPLE_OMP_FIX @@ -126,6 +128,7 @@ static pthread_mutex_t _colormanage_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t _fftw_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t _view3d_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_t mainid; +static bool is_numa_available = false; static unsigned int thread_levels = 0; /* threads can be invoked inside threads */ static int num_threads_override = 0; @@ -155,6 +158,9 @@ void BLI_threadapi_init(void) mainid = pthread_self(); 
BLI_spin_init(&_malloc_lock); + if (numaAPI_Initialize() == NUMAAPI_SUCCESS) { + is_numa_available = true; + } } void BLI_threadapi_exit(void) @@ -840,3 +846,98 @@ void BLI_threaded_malloc_end(void) MEM_set_lock_callback(NULL, NULL); } } + +/* **** Special functions to help performance on crazy NUMA setups. **** */ + +/* Return true when the CPU brand string names a Threadripper 2990WX/2950X or + * an EPYC. The answer is computed on the first call and cached in statics. */ +static bool check_is_threadripper2_alike_topology(void) +{ + /* NOTE: We hope operating system does not support CPU hotswap to + * a different brand. And that SMP of different types is also not + * encouraged by the system. */ + static bool is_initialized = false; + static bool is_threadripper2 = false; + if (is_initialized) { + return is_threadripper2; + } + is_initialized = true; + char *cpu_brand = BLI_cpu_brand_string(); + if (cpu_brand == NULL) { + return false; + } + if (strstr(cpu_brand, "Threadripper")) { + /* NOTE: We consider all Threadrippers having similar topology to + * the second one. This is because we are trying to utilize NUMA node + * 0 as much as possible. This node does exist on earlier versions of + * threadripper and setting affinity to it should not have negative + * effect. + * This allows us to avoid per-model check, making the code more + * reliable for the CPUs which are not yet released. + * NOTE(review): the model check below contradicts this statement; + * confirm which of the two is intended. + */ + if (strstr(cpu_brand, "2990WX") || strstr(cpu_brand, "2950X")) { + is_threadripper2 = true; + } + } + /* NOTE: While all dies of EPYC have a memory controller, only two of them + * have access to the lower-indexed DDR slots. Those dies are same as on + * Threadripper2 with the memory controller. + * Now, it is rather likely that reasonable amount of users don't max + * up their DDR slots, making it only two dies connected to a DDR slot + * with actual memory in it. */ + if (strstr(cpu_brand, "EPYC")) { + /* NOTE: Similarly to Threadripper we do not do model check. 
*/ + is_threadripper2 = true; + } + /* The brand string is owned by the caller of BLI_cpu_brand_string(). */ + MEM_freeN(cpu_brand); + return is_threadripper2; +} + +static void threadripper_put_process_on_fast_node(void) +{ + if (!is_numa_available) { + return; + } + /* NOTE: Technically, we can use NUMA nodes 0 and 2 and using both of + * them in the affinity mask will allow OS to schedule threads more + * flexibly, possibly increasing overall performance when multiple apps + * are crunching numbers. + * + * However, if scene fits into memory adjacent to a single die we don't + * want OS to re-schedule the process to another die since that will make + * it further away from memory allocated for .blend file. */ + /* NOTE: Even if NUMA is available in the API but is disabled in BIOS on + * this workstation we still proceed here. If NUMA is disabled it will be a + * single node, so our action is no-visible-changes, but allows to keep + * things simple and unified. */ + numaAPI_RunProcessOnNode(0); +} + +static void threadripper_put_thread_on_fast_node(void) +{ + if (!is_numa_available) { + return; + } + /* NOTE: This is where things become more interesting. On the one hand + * we can use nodes 0 and 2 and allow operating system to do balancing + * of processes/threads for the maximum performance when multiple apps + * are running. + * On the other hand, however, we probably want to use same node as the + * main thread since that's where the memory of .blend file is likely + * to be allocated. + * Since the main thread is currently on node 0, we also put thread on + * same node. */ + /* See additional note about NUMA disabled in BIOS above. 
*/ + numaAPI_RunThreadOnNode(0); +} + +void BLI_thread_put_process_on_fast_node(void) +{ + if (check_is_threadripper2_alike_topology()) { + threadripper_put_process_on_fast_node(); + } +} + +void BLI_thread_put_thread_on_fast_node(void) +{ + if (check_is_threadripper2_alike_topology()) { + threadripper_put_thread_on_fast_node(); + } +} diff --git a/source/blender/windowmanager/intern/wm_jobs.c b/source/blender/windowmanager/intern/wm_jobs.c index 92d51c9a400..cb627b465f4 100644 --- a/source/blender/windowmanager/intern/wm_jobs.c +++ b/source/blender/windowmanager/intern/wm_jobs.c @@ -334,6 +334,7 @@ static void *do_job_thread(void *job_v) { wmJob *wm_job = job_v; + BLI_thread_put_thread_on_fast_node(); wm_job->startjob(wm_job->run_customdata, &wm_job->stop, &wm_job->do_update, &wm_job->progress); wm_job->ready = true; diff --git a/source/creator/creator.c b/source/creator/creator.c index 5a61f077a84..ce25a71c6d8 100644 --- a/source/creator/creator.c +++ b/source/creator/creator.c @@ -52,6 +52,7 @@ #include "BLI_callbacks.h" #include "BLI_string.h" #include "BLI_system.h" +#include "BLI_threads.h" /* mostly init functions */ #include "BKE_appdir.h" @@ -364,6 +365,7 @@ int main( BKE_appdir_program_path_init(argv[0]); BLI_threadapi_init(); + BLI_thread_put_process_on_fast_node(); DNA_sdna_current_init(); |