git.blender.org/blender.git

Diffstat (limited to 'intern')
 intern/cycles/util/CMakeLists.txt     |  14
 intern/cycles/util/util_system.cpp    |  89
 intern/cycles/util/util_system.h      |  28
 intern/cycles/util/util_task.cpp      |  60
 intern/cycles/util/util_thread.cpp    |  19
 intern/cycles/util/util_thread.h      |   4
 intern/cycles/util/util_windows.cpp   | 103
 intern/cycles/util/util_windows.h     |  25
 intern/numaapi/include/numaapi.h      |   2
 intern/numaapi/source/numaapi_linux.c |  18
 10 files changed, 114 insertions(+), 248 deletions(-)
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 92dfc9fa85d..42626d05cf9 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -25,13 +25,17 @@ set(SRC
util_thread.cpp
util_time.cpp
util_transform.cpp
- util_windows.cpp
)
-if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI)
- list(APPEND SRC
- util_view.cpp
- )
+if(WITH_CYCLES_STANDALONE)
+ if (WITH_CYCLES_STANDALONE_GUI)
+ list(APPEND SRC
+ util_view.cpp
+ )
+ endif()
+ list(APPEND INC_SYS ../../third_party/numaapi/include)
+else()
+ list(APPEND INC_SYS ../../numaapi/include)
endif()
set(SRC_HEADERS
diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp
index 34f428f111c..cc2d7017fd8 100644
--- a/intern/cycles/util/util_system.cpp
+++ b/intern/cycles/util/util_system.cpp
@@ -20,6 +20,8 @@
#include "util/util_types.h"
#include "util/util_string.h"
+#include <numaapi.h>
+
#ifdef _WIN32
# if(!defined(FREE_WINDOWS))
# include <intrin.h>
@@ -34,74 +36,81 @@
CCL_NAMESPACE_BEGIN
-int system_cpu_group_count()
+bool system_cpu_ensure_initialized()
{
-#ifdef _WIN32
- util_windows_init_numa_groups();
- return GetActiveProcessorGroupCount();
-#else
- /* TODO(sergey): Need to adopt for other platforms. */
- return 1;
-#endif
+ static bool is_initialized = false;
+ static bool result = false;
+ if (is_initialized) {
+ return result;
+ }
+ is_initialized = true;
+ const NUMAAPI_Result numa_result = numaAPI_Initialize();
+ result = (numa_result == NUMAAPI_SUCCESS);
+ return result;
}
-int system_cpu_group_thread_count(int group)
+/* Fallback solution, which doesn't use NUMA/CPU groups. */
+static int system_cpu_thread_count_fallback()
{
- /* TODO(sergey): Need make other platforms aware of groups. */
#ifdef _WIN32
- util_windows_init_numa_groups();
- return GetActiveProcessorCount(group);
+ SYSTEM_INFO info;
+ GetSystemInfo(&info);
+ return info.dwNumberOfProcessors;
#elif defined(__APPLE__)
- (void) group;
int count;
size_t len = sizeof(count);
int mib[2] = { CTL_HW, HW_NCPU };
sysctl(mib, 2, &count, &len, NULL, 0);
return count;
#else
- (void) group;
return sysconf(_SC_NPROCESSORS_ONLN);
#endif
}
int system_cpu_thread_count()
{
- static uint count = 0;
-
- if(count > 0) {
- return count;
+ const int num_nodes = system_cpu_num_numa_nodes();
+ int num_threads = 0;
+ for (int node = 0; node < num_nodes; ++node) {
+ if (!system_cpu_is_numa_node_available(node)) {
+ continue;
+ }
+ num_threads += system_cpu_num_numa_node_processors(node);
}
+ return num_threads;
+}
- int max_group = system_cpu_group_count();
- VLOG(1) << "Detected " << max_group << " CPU groups.";
- for(int group = 0; group < max_group; ++group) {
- int num_threads = system_cpu_group_thread_count(group);
- VLOG(1) << "Group " << group
- << " has " << num_threads << " threads.";
- count += num_threads;
+int system_cpu_num_numa_nodes()
+{
+ if (!system_cpu_ensure_initialized()) {
+ /* Fallback to a single node with all the threads. */
+ return 1;
}
+ return numaAPI_GetNumNodes();
+}
- if(count < 1) {
- count = 1;
+bool system_cpu_is_numa_node_available(int node)
+{
+ if (!system_cpu_ensure_initialized()) {
+ return true;
}
+ return numaAPI_IsNodeAvailable(node);
+}
- return count;
+int system_cpu_num_numa_node_processors(int node)
+{
+ if (!system_cpu_ensure_initialized()) {
+ return system_cpu_thread_count_fallback();
+ }
+ return numaAPI_GetNumNodeProcessors(node);
}
-unsigned short system_cpu_process_groups(unsigned short max_groups,
- unsigned short *groups)
+bool system_cpu_run_thread_on_node(int node)
{
-#ifdef _WIN32
- unsigned short group_count = max_groups;
- if(!GetProcessGroupAffinity(GetCurrentProcess(), &group_count, groups)) {
- return 0;
+ if (!system_cpu_ensure_initialized()) {
+ return true;
}
- return group_count;
-#else
- (void) max_groups;
- (void) groups;
- return 0;
-#endif
+ return numaAPI_RunThreadOnNode(node);
}
#if !defined(_WIN32) || defined(FREE_WINDOWS)
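
The rewritten util_system.cpp reduces to one idea: initialize the NUMA API exactly once, and fall back to a flat processor count when it is not available. Below is a minimal standalone sketch of that pattern, assuming only the numaAPI_* calls that appear in this diff (intern/numaapi's <numaapi.h>); it is an illustration, not the Cycles code itself.

    #include <numaapi.h> /* intern/numaapi, as added to INC_SYS above */

    #include <cstdio>

    /* Initialize the NUMA API once and remember whether it is usable. */
    static bool numa_ready()
    {
      static bool initialized = false;
      static bool available = false;
      if (!initialized) {
        initialized = true;
        available = (numaAPI_Initialize() == NUMAAPI_SUCCESS);
      }
      return available;
    }

    int main()
    {
      if (!numa_ready()) {
        std::printf("NUMA API unavailable, treating the machine as one node\n");
        return 0;
      }
      int total = 0;
      const int num_nodes = numaAPI_GetNumNodes();
      for (int node = 0; node < num_nodes; ++node) {
        if (!numaAPI_IsNodeAvailable(node)) {
          continue; /* Node indices are not guaranteed to be contiguous. */
        }
        const int num_procs = numaAPI_GetNumNodeProcessors(node);
        std::printf("node %d: %d processors\n", node, num_procs);
        total += num_procs;
      }
      std::printf("total threads: %d\n", total);
      return 0;
    }
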
diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h
index 241ac897157..15f69bcf153 100644
--- a/intern/cycles/util/util_system.h
+++ b/intern/cycles/util/util_system.h
@@ -21,18 +21,28 @@
CCL_NAMESPACE_BEGIN
-/* Get number of available CPU groups. */
-int system_cpu_group_count();
+/* Make sure CPU groups / NUMA API is initialized. */
+bool system_cpu_ensure_initialized();
-/* Get number of threads/processors in the specified group. */
-int system_cpu_group_thread_count(int group);
-
-/* Get total number of threads in all groups. */
+/* Get total number of threads in all NUMA nodes / CPU groups. */
int system_cpu_thread_count();
-/* Get current process groups. */
-unsigned short system_cpu_process_groups(unsigned short max_groups,
- unsigned short *grpups);
+/* Get number of available nodes.
+ *
+ * This is in fact an index of last node plus one and it's not guaranteed
+ * that all nodes up to this one are available. */
+int system_cpu_num_numa_nodes();
+
+/* Returns truth if the given node is available for compute. */
+bool system_cpu_is_numa_node_available(int node);
+
+/* Get number of available processors on a given node. */
+int system_cpu_num_numa_node_processors(int node);
+
+/* Runs the current thread and its children on a specific node.
+ *
+ * Returns truth if affinity has successfully changed. */
+bool system_cpu_run_thread_on_node(int node);
string system_cpu_brand_string();
int system_cpu_bits();
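
For callers, the header above replaces the Windows processor-group queries with NUMA-node queries. A hedged usage sketch, assuming the declarations shown here and that CCL_NAMESPACE_BEGIN/END wrap namespace ccl:

    #include "util/util_system.h" /* the header shown above */

    #include <cstdio>

    using namespace ccl; /* assumption: CCL_NAMESPACE_* expands to namespace ccl */

    int main()
    {
      std::printf("total threads: %d\n", system_cpu_thread_count());
      const int num_nodes = system_cpu_num_numa_nodes();
      for (int node = 0; node < num_nodes; ++node) {
        if (!system_cpu_is_numa_node_available(node)) {
          continue; /* Not every node up to the last index needs to exist. */
        }
        std::printf("node %d: %d processors\n",
                    node, system_cpu_num_numa_node_processors(node));
        /* Pin the calling thread (and its children) to the first usable node. */
        if (system_cpu_run_thread_on_node(node)) {
          std::printf("pinned to node %d\n", node);
        }
        break;
      }
      return 0;
    }
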
diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp
index 2d21d6b5a18..50a2bb160ff 100644
--- a/intern/cycles/util/util_task.cpp
+++ b/intern/cycles/util/util_task.cpp
@@ -204,50 +204,26 @@ void TaskScheduler::init(int num_threads)
/* launch threads that will be waiting for work */
threads.resize(num_threads);
- const int num_groups = system_cpu_group_count();
- unsigned short num_process_groups = 0;
- vector<unsigned short> process_groups;
- int current_group_threads = 0;
- if(num_groups > 1) {
- process_groups.resize(num_groups);
- num_process_groups = system_cpu_process_groups(num_groups,
- &process_groups[0]);
- if(num_process_groups == 1) {
- current_group_threads = system_cpu_group_thread_count(process_groups[0]);
- }
- }
+ const int num_nodes = system_cpu_num_numa_nodes();
int thread_index = 0;
- for(int group = 0; group < num_groups; ++group) {
- /* NOTE: That's not really efficient from threading point of view,
- * but it is simple to read and it doesn't make sense to use more
- * user-specified threads than logical threads anyway.
- */
- int num_group_threads = (group == num_groups - 1)
- ? (threads.size() - thread_index)
- : system_cpu_group_thread_count(group);
- for(int group_thread = 0;
- group_thread < num_group_threads && thread_index < threads.size();
- ++group_thread, ++thread_index)
+ for (int node = 0;
+ node < num_nodes && thread_index < threads.size();
+ ++node)
+ {
+ if (!system_cpu_is_numa_node_available(node)) {
+ continue;
+ }
+ const int num_node_processors =
+ system_cpu_num_numa_node_processors(node);
+ for (int i = 0;
+ i < num_node_processors && thread_index < threads.size();
+ ++i)
{
- /* NOTE: Thread group of -1 means we would not force thread affinity. */
- int thread_group;
- if(num_groups == 1) {
- /* Use default affinity if there's only one CPU group in the system. */
- thread_group = -1;
- }
- else if(use_auto_threads &&
- num_process_groups == 1 &&
- num_threads <= current_group_threads)
- {
- /* If we fit into curent CPU group we also don't force any affinity. */
- thread_group = -1;
- }
- else {
- thread_group = group;
- }
- threads[thread_index] = new thread(function_bind(&TaskScheduler::thread_run,
- thread_index + 1),
- thread_group);
+ threads[thread_index] = new thread(
+ function_bind(&TaskScheduler::thread_run,
+ thread_index + 1),
+ node);
+ thread_index++;
}
}
}
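
The rewritten TaskScheduler::init simply walks the nodes and pins every worker to the node it was created for; the old special cases (no affinity when everything fits into a single processor group) are gone. A standalone sketch of that distribution with plain std::thread, using the same system_cpu_* calls (again assuming namespace ccl):

    #include "util/util_system.h"

    #include <thread>
    #include <vector>

    using namespace ccl; /* assumption, as in the previous sketch */

    /* Spread at most `pool_size` workers across the available NUMA nodes,
     * pinning each worker to its node before it starts working. */
    static void spawn_pool(std::vector<std::thread> &workers,
                           const int pool_size,
                           void (*work)())
    {
      const int num_nodes = system_cpu_num_numa_nodes();
      for (int node = 0;
           node < num_nodes && (int)workers.size() < pool_size;
           ++node)
      {
        if (!system_cpu_is_numa_node_available(node)) {
          continue;
        }
        const int num_procs = system_cpu_num_numa_node_processors(node);
        for (int i = 0;
             i < num_procs && (int)workers.size() < pool_size;
             ++i)
        {
          workers.emplace_back([node, work]() {
            system_cpu_run_thread_on_node(node); /* Best effort, may fail. */
            work();
          });
        }
      }
    }

Unlike the removed code, every worker now requests affinity to its node; whether that request is honored is left to system_cpu_run_thread_on_node().
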
diff --git a/intern/cycles/util/util_thread.cpp b/intern/cycles/util/util_thread.cpp
index 37d8bdbd4b0..4d30e3f564f 100644
--- a/intern/cycles/util/util_thread.cpp
+++ b/intern/cycles/util/util_thread.cpp
@@ -21,10 +21,10 @@
CCL_NAMESPACE_BEGIN
-thread::thread(function<void()> run_cb, int group)
+thread::thread(function<void()> run_cb, int node)
: run_cb_(run_cb),
joined_(false),
- group_(group)
+ node_(node)
{
thread_ = std::thread(&thread::run, this);
}
@@ -39,19 +39,8 @@ thread::~thread()
void *thread::run(void *arg)
{
thread *self = (thread*)(arg);
- if(self->group_ != -1) {
-#ifdef _WIN32
- HANDLE thread_handle = GetCurrentThread();
- GROUP_AFFINITY group_affinity = { 0 };
- int num_threads = system_cpu_group_thread_count(self->group_);
- group_affinity.Group = self->group_;
- group_affinity.Mask = (num_threads == 64)
- ? -1
- : (1ull << num_threads) - 1;
- if(SetThreadGroupAffinity(thread_handle, &group_affinity, NULL) == 0) {
- fprintf(stderr, "Error setting thread affinity.\n");
- }
-#endif
+ if (self->node_ != -1) {
+ system_cpu_run_thread_on_node(self->node_);
}
self->run_cb_();
return NULL;
diff --git a/intern/cycles/util/util_thread.h b/intern/cycles/util/util_thread.h
index 6250bb95dcf..d54199a37fc 100644
--- a/intern/cycles/util/util_thread.h
+++ b/intern/cycles/util/util_thread.h
@@ -46,7 +46,7 @@ typedef std::condition_variable thread_condition_variable;
class thread {
public:
- thread(function<void()> run_cb, int group = -1);
+ thread(function<void()> run_cb, int node = -1);
~thread();
static void *run(void *arg);
@@ -56,7 +56,7 @@ protected:
function<void()> run_cb_;
std::thread thread_;
bool joined_;
- int group_;
+ int node_;
};
/* Own wrapper around pthread's spin lock to make it's use easier. */
diff --git a/intern/cycles/util/util_windows.cpp b/intern/cycles/util/util_windows.cpp
deleted file mode 100644
index 073db2a27db..00000000000
--- a/intern/cycles/util/util_windows.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "util/util_windows.h"
-
-#ifdef _WIN32
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef _M_X64
-# include <VersionHelpers.h>
-#endif
-
-#if _WIN32_WINNT < 0x0601
-tGetActiveProcessorGroupCount *GetActiveProcessorGroupCount;
-tGetActiveProcessorCount *GetActiveProcessorCount;
-tSetThreadGroupAffinity *SetThreadGroupAffinity;
-tGetProcessGroupAffinity *GetProcessGroupAffinity;
-#endif
-
-static WORD GetActiveProcessorGroupCount_stub()
-{
- return 1;
-}
-
-static DWORD GetActiveProcessorCount_stub(WORD /*GroupNumber*/)
-{
- SYSTEM_INFO info;
- GetSystemInfo(&info);
- return info.dwNumberOfProcessors;
-}
-
-static BOOL SetThreadGroupAffinity_stub(
- HANDLE /*hThread*/,
- const GROUP_AFFINITY * /*GroupAffinity*/,
- PGROUP_AFFINITY /*PreviousGroupAffinity*/)
-{
- return TRUE;
-}
-
-static BOOL GetProcessGroupAffinity_stub(HANDLE hProcess,
- PUSHORT GroupCount,
- PUSHORT GroupArray)
-{
- if(*GroupCount < 1) {
- return FALSE;
- }
- *GroupCount = 1;
- GroupArray[0] = 0;
- return TRUE;
-}
-
-static bool supports_numa()
-{
-#ifndef _M_X64
- return false;
-#else
- return IsWindows7OrGreater();
-#endif
-}
-
-void util_windows_init_numa_groups()
-{
- static bool initialized = false;
- if(initialized) {
- return;
- }
- initialized = true;
-#if _WIN32_WINNT < 0x0601
- if(!supports_numa()) {
- /* Use stubs on platforms which doesn't have rean NUMA/Groups. */
- GetActiveProcessorGroupCount = GetActiveProcessorGroupCount_stub;
- GetActiveProcessorCount = GetActiveProcessorCount_stub;
- SetThreadGroupAffinity = SetThreadGroupAffinity_stub;
- GetProcessGroupAffinity = GetProcessGroupAffinity_stub;
- return;
- }
- HMODULE kernel = GetModuleHandleA("kernel32.dll");
-# define READ_SYMBOL(sym) sym = (t##sym*)GetProcAddress(kernel, #sym)
- READ_SYMBOL(GetActiveProcessorGroupCount);
- READ_SYMBOL(GetActiveProcessorCount);
- READ_SYMBOL(SetThreadGroupAffinity);
- READ_SYMBOL(GetProcessGroupAffinity);
-# undef READ_SUMBOL
-#endif
-}
-
-CCL_NAMESPACE_END
-
-#endif /* _WIN32 */
diff --git a/intern/cycles/util/util_windows.h b/intern/cycles/util/util_windows.h
index 9b9268fed7a..bd1bc85adff 100644
--- a/intern/cycles/util/util_windows.h
+++ b/intern/cycles/util/util_windows.h
@@ -31,29 +31,6 @@
#include <windows.h>
-CCL_NAMESPACE_BEGIN
-
-#if _WIN32_WINNT < 0x0601
-typedef WORD tGetActiveProcessorGroupCount();
-typedef DWORD tGetActiveProcessorCount(WORD GroupNumber);
-typedef BOOL tSetThreadGroupAffinity(HANDLE hThread,
- const GROUP_AFFINITY *GroupAffinity,
- PGROUP_AFFINITY PreviousGroupAffinity);
-typedef BOOL tGetProcessGroupAffinity(HANDLE hProcess,
- PUSHORT GroupCount,
- PUSHORT GroupArray);
-
-extern tGetActiveProcessorGroupCount *GetActiveProcessorGroupCount;
-extern tGetActiveProcessorCount *GetActiveProcessorCount;
-extern tSetThreadGroupAffinity *SetThreadGroupAffinity;
-extern tGetProcessGroupAffinity *GetProcessGroupAffinity;
-#endif
-
-/* Make sure NUMA and processor groups API is initialized. */
-void util_windows_init_numa_groups();
-
-CCL_NAMESPACE_END
-
-#endif /* WIN32 */
+#endif /* _WIN32 */
#endif /* __UTIL_WINDOWS_H__ */
diff --git a/intern/numaapi/include/numaapi.h b/intern/numaapi/include/numaapi.h
index a4f32d88458..7b5b50fdf39 100644
--- a/intern/numaapi/include/numaapi.h
+++ b/intern/numaapi/include/numaapi.h
@@ -67,7 +67,7 @@ int numaAPI_GetNumNodes(void);
// Returns truth if the given node is available for compute.
bool numaAPI_IsNodeAvailable(int node);
-// Getnumber of available processors on a given node.
+// Get number of available processors on a given node.
int numaAPI_GetNumNodeProcessors(int node);
////////////////////////////////////////////////////////////////////////////////
diff --git a/intern/numaapi/source/numaapi_linux.c b/intern/numaapi/source/numaapi_linux.c
index 559e97b67d3..62e9dcdfadf 100644
--- a/intern/numaapi/source/numaapi_linux.c
+++ b/intern/numaapi/source/numaapi_linux.c
@@ -34,6 +34,8 @@
# include <dlfcn.h>
#endif
+#include <stdio.h>
+
#ifdef WITH_DYNLOAD
// Descriptor numa library.
@@ -61,6 +63,7 @@ typedef struct bitmask* tnuma_allocate_nodemask(void);
typedef void tnuma_free_cpumask(struct bitmask* bitmask);
typedef void tnuma_free_nodemask(struct bitmask* bitmask);
typedef int tnuma_run_on_node_mask(struct bitmask *nodemask);
+typedef int tnuma_run_on_node_mask_all(struct bitmask *nodemask);
typedef void tnuma_set_interleave_mask(struct bitmask *nodemask);
typedef void tnuma_set_localalloc(void);
@@ -83,6 +86,7 @@ static tnuma_allocate_nodemask* numa_allocate_nodemask;
static tnuma_free_nodemask* numa_free_nodemask;
static tnuma_free_cpumask* numa_free_cpumask;
static tnuma_run_on_node_mask* numa_run_on_node_mask;
+static tnuma_run_on_node_mask_all* numa_run_on_node_mask_all;
static tnuma_set_interleave_mask* numa_set_interleave_mask;
static tnuma_set_localalloc* numa_set_localalloc;
@@ -157,6 +161,7 @@ static NUMAAPI_Result loadNumaSymbols(void) {
NUMA_LIBRARY_FIND(numa_free_cpumask);
NUMA_LIBRARY_FIND(numa_free_nodemask);
NUMA_LIBRARY_FIND(numa_run_on_node_mask);
+ NUMA_LIBRARY_FIND(numa_run_on_node_mask_all);
NUMA_LIBRARY_FIND(numa_set_interleave_mask);
NUMA_LIBRARY_FIND(numa_set_localalloc);
@@ -192,10 +197,7 @@ int numaAPI_GetNumNodes(void) {
}
bool numaAPI_IsNodeAvailable(int node) {
- if (numa_node_size(node, NULL) > 0) {
- return true;
- }
- return false;
+ return numaAPI_GetNumNodeProcessors(node) > 0;
}
int numaAPI_GetNumNodeProcessors(int node) {
@@ -235,13 +237,15 @@ bool numaAPI_RunThreadOnNode(int node) {
struct bitmask* node_mask = numa_allocate_nodemask();
numa_bitmask_clearall(node_mask);
numa_bitmask_setbit(node_mask, node);
- numa_run_on_node_mask(node_mask);
+ numa_run_on_node_mask_all(node_mask);
// TODO(sergey): The following commands are based on x265 code, we might want
// to make those optional, or require to call those explicitly.
//
// Current assumption is that this is similar to SetThreadGroupAffinity().
- numa_set_interleave_mask(node_mask);
- numa_set_localalloc();
+ if (numa_node_size(node, NULL) > 0) {
+ numa_set_interleave_mask(node_mask);
+ numa_set_localalloc();
+ }
#ifdef WITH_DYNLOAD
if (numa_free_nodemask != NULL) {
numa_free_nodemask(node_mask);
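
At the libnuma level, the change above switches the pinning call to numa_run_on_node_mask_all() and only touches the interleave/local-allocation policy when the node reports attached memory. A hedged sketch of that sequence against libnuma directly, assuming a numa.h recent enough to declare numa_run_on_node_mask_all (link with -lnuma); unlike the code above, it links the library statically instead of going through WITH_DYNLOAD:

    #include <numa.h> /* libnuma; link with -lnuma */

    #include <cstdio>

    /* Ask the kernel to run the calling thread (and its children) on `node`. */
    static bool run_on_node(const int node)
    {
      if (numa_available() < 0) {
        return false; /* No NUMA support on this system. */
      }
      struct bitmask *node_mask = numa_allocate_nodemask();
      numa_bitmask_clearall(node_mask);
      numa_bitmask_setbit(node_mask, node);
      numa_run_on_node_mask_all(node_mask);
      /* Only adjust the allocation policy for nodes which have memory. */
      if (numa_node_size(node, NULL) > 0) {
        numa_set_interleave_mask(node_mask);
        numa_set_localalloc();
      }
      numa_free_nodemask(node_mask);
      return true;
    }

    int main()
    {
      std::printf("pinned to node 0: %s\n", run_on_node(0) ? "yes" : "no");
      return 0;
    }
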