From 5100e4419eb1651886d38d3fa62dec80481a3628 Mon Sep 17 00:00:00 2001
From: Campbell Barton
Date: Thu, 10 Jan 2019 08:46:38 +1100
Subject: Cleanup: ensure header guards come first

Causes clang-format not to detect header guards,
indenting all preprocessor lines in the header.
---
 intern/clog/CLG_log.h                   | 6 +++---
 intern/ghost/intern/GHOST_EventNDOF.h   | 7 +++----
 intern/ghost/intern/GHOST_NDOFManager.h | 7 +++----
 3 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/intern/clog/CLG_log.h b/intern/clog/CLG_log.h
index 61fc93be9e7..390122289b1 100644
--- a/intern/clog/CLG_log.h
+++ b/intern/clog/CLG_log.h
@@ -18,8 +18,8 @@
  * ***** END GPL LICENSE BLOCK *****
  */
 
-#ifndef __CLOG_H__
-#define __CLOG_H__
+#ifndef __CLG_LOG_H__
+#define __CLG_LOG_H__
 
 /** \file clog/CLG_log.h
  *  \ingroup clog
@@ -211,4 +211,4 @@ void CLG_logref_init(CLG_LogRef *clg_ref);
 }
 #endif
 
-#endif /* __CLOG_H__ */
+#endif /* __CLG_LOG_H__ */
diff --git a/intern/ghost/intern/GHOST_EventNDOF.h b/intern/ghost/intern/GHOST_EventNDOF.h
index b81f74becee..a7a7639c98e 100644
--- a/intern/ghost/intern/GHOST_EventNDOF.h
+++ b/intern/ghost/intern/GHOST_EventNDOF.h
@@ -24,16 +24,15 @@
  * \ingroup GHOST
  */
 
+#ifndef __GHOST_EVENTNDOF_H__
+#define __GHOST_EVENTNDOF_H__
+
 #ifndef WITH_INPUT_NDOF
 #  error NDOF code included in non-NDOF-enabled build
 #endif
 
-#ifndef __GHOST_EVENTNDOF_H__
-#define __GHOST_EVENTNDOF_H__
-
 #include "GHOST_Event.h"
 
-
 class GHOST_EventNDOFMotion : public GHOST_Event
 {
 protected:
diff --git a/intern/ghost/intern/GHOST_NDOFManager.h b/intern/ghost/intern/GHOST_NDOFManager.h
index 78f24e07a6e..b23673940e6 100644
--- a/intern/ghost/intern/GHOST_NDOFManager.h
+++ b/intern/ghost/intern/GHOST_NDOFManager.h
@@ -21,16 +21,15 @@
  * ***** END GPL LICENSE BLOCK *****
  */
 
+#ifndef __GHOST_NDOFMANAGER_H__
+#define __GHOST_NDOFMANAGER_H__
+
 #ifndef WITH_INPUT_NDOF
 #  error NDOF code included in non-NDOF-enabled build
 #endif
 
-#ifndef __GHOST_NDOFMANAGER_H__
-#define __GHOST_NDOFMANAGER_H__
-
 #include "GHOST_System.h"
 
-
 // #define DEBUG_NDOF_MOTION
 // #define DEBUG_NDOF_BUTTONS
--
cgit v1.2.3

From b1e286bbfe9b8632a93ba40a7a40c77652479b7c Mon Sep 17 00:00:00 2001
From: Campbell Barton
Date: Fri, 11 Jan 2019 09:22:21 +1100
Subject: MSVC: remove compiler __func__ define

No longer needed and exposes a bug in clang-format
see: D4185
---
 intern/clog/CLG_log.h                       | 9 ---------
 intern/guardedalloc/intern/mallocn_intern.h | 4 ----
 2 files changed, 13 deletions(-)

diff --git a/intern/clog/CLG_log.h b/intern/clog/CLG_log.h
index 390122289b1..79d0d874d52 100644
--- a/intern/clog/CLG_log.h
+++ b/intern/clog/CLG_log.h
@@ -90,11 +90,6 @@ extern "C" {
 #  define _CLOG_ATTR_PRINTF_FORMAT(format_param, dots_param)
 #endif
 
-#if defined(_MSC_VER) && !defined(__func__)
-#  define __func__MSVC
-#  define __func__ __FUNCTION__
-#endif
-
 #define STRINGIFY_ARG(x) "" #x
 #define STRINGIFY_APPEND(a, b) "" a #b
 #define STRINGIFY(x) STRINGIFY_APPEND("", x)
@@ -203,10 +198,6 @@ void CLG_logref_init(CLG_LogRef *clg_ref);
 #define CLOG_STR_ERROR_N(clg_ref, ...) CLOG_STR_AT_SEVERITY_N(clg_ref, CLG_SEVERITY_ERROR, 0, __VA_ARGS__)
 #define CLOG_STR_FATAL_N(clg_ref, ...) CLOG_STR_AT_SEVERITY_N(clg_ref, CLG_SEVERITY_FATAL, 0, __VA_ARGS__)
 
-#ifdef __func__MSVC
-#  undef __func__MSVC
-#endif
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/intern/guardedalloc/intern/mallocn_intern.h b/intern/guardedalloc/intern/mallocn_intern.h
index 754a79f08b5..79c0271aff0 100644
--- a/intern/guardedalloc/intern/mallocn_intern.h
+++ b/intern/guardedalloc/intern/mallocn_intern.h
@@ -39,10 +39,6 @@
 #  include
 #endif
 
-#if defined(_MSC_VER)
-#  define __func__ __FUNCTION__
-#endif
-
 #ifdef __GNUC__
 #  define UNUSED(x) UNUSED_ ## x __attribute__((__unused__))
 #else
--
cgit v1.2.3
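Both commits above are groundwork for running clang-format over the tree: the include guard has to open the header for the tool to recognize it, and the MSVC __func__ define tripped a formatter bug (D4185). As an illustration of the layout the first commit enforces; every name below is made up, not from the Blender tree, and the clang-format behaviour is the one described in the commit message, as typically seen with an IndentPPDirectives style:

    /* example.h: when the include guard comes before any other preprocessor
     * logic, clang-format detects it as a guard and leaves the directives it
     * wraps at their original indentation. If the WITH_SOME_FEATURE check
     * came first, every directive below would gain one indent level. */
    #ifndef __EXAMPLE_H__
    #define __EXAMPLE_H__

    #ifndef WITH_SOME_FEATURE
    #  error this header requires WITH_SOME_FEATURE
    #endif

    void example_function(void);

    #endif /* __EXAMPLE_H__ */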
From e5a1a9288c66ce218a03abf7666336a39ba03b8f Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel
Date: Fri, 11 Jan 2019 11:12:38 +0100
Subject: Fix T60320: Cycles OpenCL denoising filter errors on some drivers.

---
 intern/cycles/kernel/filter/filter_nlm_gpu.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
index 4ca49ea6733..cffd61cb7d1 100644
--- a/intern/cycles/kernel/filter/filter_nlm_gpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -36,8 +36,6 @@ ccl_device_inline bool get_nlm_coords_window(int w, int h, int r, int stride,
     if(sy >= s) {
         return false;
     }
-    co->z = sx-r;
-    co->w = sy-r;
 
     /* Pixels still need to lie inside the denoising buffer after applying the offset,
      * so determine the area for which this is the case. */
@@ -59,8 +57,8 @@ ccl_device_inline bool get_nlm_coords_window(int w, int h, int r, int stride,
     if(!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) {
         return false;
     }
-    co->x = x;
-    co->y = y;
+
+    *co = make_int4(x, y, sx - r, sy - r);
 
     *ofs = (sy*s + sx) * stride;
--
cgit v1.2.3
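The fix itself is small but deliberate: rather than filling co's four lanes with two partial writes near the top of the function and two more at the end, the coordinates are now composed once and written with a single make_int4() store, sidestepping the partial-write pattern that misbehaved on some drivers. A self-contained sketch of the same transformation, in plain C++ with a stand-in int4 rather than Cycles' actual vector types:

    #include <cstdio>

    struct int4 { int x, y, z, w; };

    static int4 make_int4(int x, int y, int z, int w)
    {
        int4 a = {x, y, z, w};
        return a;
    }

    /* Previously the equivalent of co->z = sx - r; co->w = sy - r; was done
     * early, and co->x = x; co->y = y; much later: four partial stores that
     * some OpenCL drivers miscompiled. One full-width store avoids that. */
    static void get_coords(int4 *co, int x, int y, int sx, int sy, int r)
    {
        *co = make_int4(x, y, sx - r, sy - r);
    }

    int main()
    {
        int4 co;
        get_coords(&co, 3, 4, 10, 12, 8);
        printf("co = (%d, %d, %d, %d)\n", co.x, co.y, co.z, co.w);
        return 0;
    }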
From c1dd74580ed8352b9f6c96d816a604ebb4f3c39d Mon Sep 17 00:00:00 2001
From: Sergey Sharybin
Date: Fri, 11 Jan 2019 15:01:54 +0100
Subject: Fix T60227: Crash when Cycles uses more than system threads

Tweaked scheduling so it survives this situation by scattering
"extra" threads uniformly over all the NUMA nodes.

There are still tweaks possible to make some specific hardware
configurations work better.
---
 intern/cycles/util/util_task.cpp   | 165 ++++++++++++++++++++++++++++---------
 intern/cycles/util/util_thread.cpp |   5 ++
 intern/cycles/util/util_thread.h   |   6 ++
 3 files changed, 138 insertions(+), 38 deletions(-)

diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp
index 50a2bb160ff..7e9f7313fba 100644
--- a/intern/cycles/util/util_task.cpp
+++ b/intern/cycles/util/util_task.cpp
@@ -185,59 +185,149 @@ list<TaskScheduler::Entry> TaskScheduler::queue;
 thread_mutex TaskScheduler::queue_mutex;
 thread_condition_variable TaskScheduler::queue_cond;
 
-void TaskScheduler::init(int num_threads)
+namespace {
+
+/* Get number of processors on each of the available nodes. The result is sized
+ * by the highest node index, and each element corresponds to the number of
+ * processors on that node.
+ * If a node is not available, then the corresponding number of processors is
+ * zero. */
+void get_per_node_num_processors(vector<int>* num_per_node_processors)
 {
-    thread_scoped_lock lock(mutex);
-
-    /* multiple cycles instances can use this task scheduler, sharing the same
-     * threads, so we keep track of the number of users.
-     */
-    if(users == 0) {
-        do_exit = false;
-
-        const bool use_auto_threads = (num_threads == 0);
-        if(use_auto_threads) {
-            /* automatic number of threads */
-            num_threads = system_cpu_thread_count();
+    const int num_nodes = system_cpu_num_numa_nodes();
+    if(num_nodes == 0) {
+        LOG(ERROR) << "Zero available NUMA nodes, which is not supposed to happen.";
+        return;
+    }
+    num_per_node_processors->resize(num_nodes);
+    for(int node = 0; node < num_nodes; ++node) {
+        if(!system_cpu_is_numa_node_available(node)) {
+            (*num_per_node_processors)[node] = 0;
+            continue;
         }
-        VLOG(1) << "Creating pool of " << num_threads << " threads.";
+        (*num_per_node_processors)[node] =
+                system_cpu_num_numa_node_processors(node);
+    }
+}
 
-        /* launch threads that will be waiting for work */
-        threads.resize(num_threads);
+/* Calculate total number of processors on all available nodes.
+ * This is similar to system_cpu_thread_count(), but uses pre-calculated number
+ * of processors on each of the nodes, avoiding extra system calls and checks
+ * for the node availability. */
+int get_num_total_processors(const vector<int>& num_per_node_processors)
+{
+    int num_total_processors = 0;
+    foreach(int num_node_processors, num_per_node_processors) {
+        num_total_processors += num_node_processors;
+    }
+    return num_total_processors;
+}
 
-        const int num_nodes = system_cpu_num_numa_nodes();
-        int thread_index = 0;
-        for (int node = 0;
-             node < num_nodes && thread_index < threads.size();
-             ++node)
+/* Assign every thread the node on which it should be running, for the best
+ * performance. */
+void distribute_threads_on_nodes(const vector<thread*>& threads)
+{
+    const int num_threads = threads.size();
+    /* TODO(sergey): Skip overriding affinity if threads fit into the current
+     * nodes/CPU group. This will allow the user to tweak affinity for weird
+     * and wonderful reasons. */
+    vector<int> num_per_node_processors;
+    get_per_node_num_processors(&num_per_node_processors);
+    if(num_per_node_processors.size() == 0) {
+        /* Error was already reported, here we can't do anything, so we simply
+         * leave default affinity to all the worker threads. */
+        return;
+    }
+    const int num_nodes = num_per_node_processors.size();
+    int thread_index = 0;
+    /* First pass: fill in all the nodes to their maximum.
+     *
+     * If there are fewer threads than the overall nodes capacity, some of the
+     * nodes or parts of them will idle.
+     *
+     * TODO(sergey): Consider picking the fastest nodes if the number of threads
+     * fits on them. For example, on Threadripper2 we might consider using nodes
+     * 0 and 2 if the user requested 32 render threads.
+     */
+    const int num_total_node_processors =
+            get_num_total_processors(num_per_node_processors);
+    int current_node_index = 0;
+    while(thread_index < num_total_node_processors &&
+          thread_index < num_threads) {
+        const int num_node_processors =
+                num_per_node_processors[current_node_index];
+        for(int processor_index = 0;
+            processor_index < num_node_processors;
+            ++processor_index)
         {
-            if (!system_cpu_is_numa_node_available(node)) {
-                continue;
-            }
-            const int num_node_processors =
-                    system_cpu_num_numa_node_processors(node);
-            for (int i = 0;
-                 i < num_node_processors && thread_index < threads.size();
-                 ++i)
-            {
-                threads[thread_index] = new thread(
-                        function_bind(&TaskScheduler::thread_run,
-                                      thread_index + 1),
-                        node);
-                thread_index++;
+            VLOG(1) << "Scheduling thread " << thread_index << " to node "
+                    << current_node_index << ".";
+            threads[thread_index]->schedule_to_node(current_node_index);
+            ++thread_index;
+            if(thread_index == num_threads) {
+                /* All threads are scheduled on their nodes. */
+                return;
             }
         }
+        ++current_node_index;
     }
+    /* Second pass: keep scheduling threads to each node one by one, uniformly
+     * filling them in.
+     * This is where things become tricky to predict for the maximum
+     * performance: on the one hand this avoids too much threading overhead on
+     * few nodes, but for the final performance having all the overhead on one
+     * node might be a better idea (since other nodes will have a better chance
+     * of rendering faster).
+     * Trickier still is that nodes might have different capacity, so we might
+     * want to do some weighted scheduling. For example, if node 0 has 16
+     * processors and node 1 has 32 processors, we'd better schedule 1 extra
+     * thread on node 0 and 2 extra threads on node 1. */
+    current_node_index = 0;
+    while(thread_index < num_threads) {
+        /* Skip unavailable nodes. */
+        /* TODO(sergey): Add sanity check against deadlock. */
+        while(num_per_node_processors[current_node_index] == 0) {
+            current_node_index = (current_node_index + 1) % num_nodes;
+        }
+        VLOG(1) << "Scheduling thread " << thread_index << " to node "
+                << current_node_index << ".";
+        threads[thread_index]->schedule_to_node(current_node_index);
+        ++thread_index;
+        current_node_index = (current_node_index + 1) % num_nodes;
+    }
+}
+
+}  // namespace
 
-    users++;
+void TaskScheduler::init(int num_threads)
+{
+    thread_scoped_lock lock(mutex);
+    /* Multiple cycles instances can use this task scheduler, sharing the same
+     * threads, so we keep track of the number of users. */
+    ++users;
+    if(users != 1) {
+        return;
+    }
+    do_exit = false;
+    const bool use_auto_threads = (num_threads == 0);
+    if(use_auto_threads) {
+        /* Automatic number of threads. */
+        num_threads = system_cpu_thread_count();
+    }
+    VLOG(1) << "Creating pool of " << num_threads << " threads.";
+    /* Launch threads that will be waiting for work. */
+    threads.resize(num_threads);
+    for(int thread_index = 0; thread_index < num_threads; ++thread_index) {
+        threads[thread_index] = new thread(
+                function_bind(&TaskScheduler::thread_run, thread_index + 1));
+    }
+    distribute_threads_on_nodes(threads);
 }
 
 void TaskScheduler::exit()
 {
     thread_scoped_lock lock(mutex);
-
     users--;
-
     if(users == 0) {
+        VLOG(1) << "De-initializing thread pool of task scheduler.";
         /* stop all waiting threads */
         TaskScheduler::queue_mutex.lock();
         do_exit = true;
@@ -249,7 +339,6 @@ void TaskScheduler::exit()
             t->join();
             delete t;
         }
-
         threads.clear();
     }
 }
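To make the two-pass behaviour above concrete, here is a small self-contained model of distribute_threads_on_nodes(); the node capacities are hard-coded stand-ins for the system_cpu_*() queries, so treat this as a sketch of the scheduling policy rather than the Cycles code itself:

    #include <cstdio>
    #include <vector>

    /* Returns, for each thread index, the node it would be scheduled to. */
    static std::vector<int> model_distribution(int num_threads,
                                               const std::vector<int> &capacity)
    {
        std::vector<int> node_of_thread(num_threads, -1);
        const int num_nodes = (int)capacity.size();
        int thread_index = 0;
        /* First pass: fill each node up to its processor count. */
        for (int node = 0; node < num_nodes && thread_index < num_threads; ++node) {
            for (int i = 0; i < capacity[node] && thread_index < num_threads; ++i) {
                node_of_thread[thread_index++] = node;
            }
        }
        /* Second pass: scatter the remaining "extra" threads round-robin,
         * skipping empty nodes. Like the patch, this assumes at least one
         * node has a non-zero capacity. */
        int node = 0;
        while (thread_index < num_threads) {
            while (capacity[node] == 0) {
                node = (node + 1) % num_nodes;
            }
            node_of_thread[thread_index++] = node;
            node = (node + 1) % num_nodes;
        }
        return node_of_thread;
    }

    int main()
    {
        /* Two nodes with four processors each and ten requested threads:
         * threads 0-7 fill the nodes and the two extras land on nodes 0 and
         * 1, printing "0 0 0 0 1 1 1 1 0 1". Before this fix, requesting
         * more threads than processors could run past the per-node slots. */
        for (int node : model_distribution(10, {4, 4})) {
            printf("%d ", node);
        }
        printf("\n");
        return 0;
    }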
diff --git a/intern/cycles/util/util_thread.cpp b/intern/cycles/util/util_thread.cpp
index 4d30e3f564f..1880eefcb9c 100644
--- a/intern/cycles/util/util_thread.cpp
+++ b/intern/cycles/util/util_thread.cpp
@@ -58,4 +58,9 @@ bool thread::join()
     }
 }
 
+void thread::schedule_to_node(int node)
+{
+    node_ = node;
+}
+
 CCL_NAMESPACE_END
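Note that schedule_to_node() only records the target node; per the contract spelled out in the header change that follows, it must be called while the thread is not yet running, and the stored node is then consumed when the thread actually starts. A hypothetical stripped-down descriptor with the same shape (not the Cycles class; the affinity pinning is left as a comment because the mechanism is platform-specific):

    #include <functional>
    #include <thread>

    class worker {
    public:
        explicit worker(std::function<void()> run_cb)
                : run_cb_(run_cb), node_(-1) {}

        /* Only records the target node; call before start(). */
        void schedule_to_node(int node) { node_ = node; }

        void start()
        {
            thread_ = std::thread([this] {
                /* A real implementation would pin itself to node_ here
                 * (e.g. via pthread_setaffinity_np or a NUMA library) when
                 * node_ != -1, and only then invoke the callback. */
                run_cb_();
            });
        }

        void join() { thread_.join(); }

    private:
        std::function<void()> run_cb_;
        std::thread thread_;
        int node_;
    };

    int main()
    {
        worker w([] { /* do work */ });
        w.schedule_to_node(0);
        w.start();
        w.join();
        return 0;
    }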
diff --git a/intern/cycles/util/util_thread.h b/intern/cycles/util/util_thread.h
index d54199a37fc..d21a7a8c773 100644
--- a/intern/cycles/util/util_thread.h
+++ b/intern/cycles/util/util_thread.h
@@ -46,12 +46,18 @@
 class thread {
 public:
+    /* NOTE: Node index of -1 means that affinity will be inherited from the
+     * parent thread and no override on top of that will happen. */
     thread(function<void(void)> run_cb, int node = -1);
     ~thread();
 
     static void *run(void *arg);
     bool join();
 
+    /* For an existing thread descriptor which is NOT running yet, assign the
+     * node on which it should be running. */
+    void schedule_to_node(int node);
+
 protected:
     function<void(void)> run_cb_;
     std::thread thread_;
--
cgit v1.2.3
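Taken together, the oversubscribed case from T60227 now degrades gracefully instead of crashing. A usage sketch, assuming the interface shown in the patches (static, reference-counted TaskScheduler::init()/exit(); the include path follows the Cycles tree layout):

    /* Sketch only: requesting far more threads than the machine has
     * processors is now fine; the surplus is scattered round-robin over
     * the NUMA nodes by distribute_threads_on_nodes(). */
    #include "util/util_task.h"

    CCL_NAMESPACE_BEGIN

    void oversubscribed_render_setup()
    {
        TaskScheduler::init(64);  /* e.g. on a 16-processor machine */
        /* ... create TaskPool instances and push work here ... */
        TaskScheduler::exit();
    }

    CCL_NAMESPACE_END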