author    | Sergey Sharybin <sergey.vfx@gmail.com> | 2019-01-11 17:09:46 +0300
committer | Sergey Sharybin <sergey.vfx@gmail.com> | 2019-01-11 17:09:46 +0300
commit    | cca35c10135290f1a7e298d51964eef2c6c67ce0 (patch)
tree      | c2c96c0e58b46ba38b53142f8d4330ea829a9c01 /intern
parent    | 5793a84f1262ce78398320f8518816ff7e62cdd8 (diff)
parent    | c1dd74580ed8352b9f6c96d816a604ebb4f3c39d (diff)
Merge branch 'blender2.7'
Diffstat (limited to 'intern')
-rw-r--r-- | intern/cycles/kernel/filter/filter_nlm_gpu.h |   6
-rw-r--r-- | intern/cycles/util/util_task.cpp             | 165
-rw-r--r-- | intern/cycles/util/util_thread.cpp           |   5
-rw-r--r-- | intern/cycles/util/util_thread.h             |   6
4 files changed, 140 insertions, 42 deletions
```diff
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
index 4ca49ea6733..cffd61cb7d1 100644
--- a/intern/cycles/kernel/filter/filter_nlm_gpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -36,8 +36,6 @@ ccl_device_inline bool get_nlm_coords_window(int w, int h, int r, int stride,
 	if(sy >= s) {
 		return false;
 	}
-	co->z = sx-r;
-	co->w = sy-r;
 
 	/* Pixels still need to lie inside the denoising buffer after applying the offset,
 	 * so determine the area for which this is the case. */
@@ -59,8 +57,8 @@ ccl_device_inline bool get_nlm_coords_window(int w, int h, int r, int stride,
 	if(!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) {
 		return false;
 	}
-	co->x = x;
-	co->y = y;
+
+	*co = make_int4(x, y, sx - r, sy - r);
 
 	*ofs = (sy*s + sx) * stride;
diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp
index 50a2bb160ff..7e9f7313fba 100644
--- a/intern/cycles/util/util_task.cpp
+++ b/intern/cycles/util/util_task.cpp
@@ -185,59 +185,149 @@ list<TaskScheduler::Entry> TaskScheduler::queue;
 thread_mutex TaskScheduler::queue_mutex;
 thread_condition_variable TaskScheduler::queue_cond;
 
-void TaskScheduler::init(int num_threads)
+namespace {
+
+/* Get the number of processors on each of the available nodes. The result is
+ * sized by the highest node index, and each element holds the number of
+ * processors on that node.
+ * If a node is not available, its number of processors is zero. */
+void get_per_node_num_processors(vector<int>* num_per_node_processors)
 {
-	thread_scoped_lock lock(mutex);
-
-	/* multiple cycles instances can use this task scheduler, sharing the same
-	 * threads, so we keep track of the number of users. */
-	if(users == 0) {
-		do_exit = false;
-
-		const bool use_auto_threads = (num_threads == 0);
-		if(use_auto_threads) {
-			/* automatic number of threads */
-			num_threads = system_cpu_thread_count();
+	const int num_nodes = system_cpu_num_numa_nodes();
+	if(num_nodes == 0) {
+		LOG(ERROR) << "Zero available NUMA nodes, which is not supposed to happen.";
+		return;
+	}
+	num_per_node_processors->resize(num_nodes);
+	for(int node = 0; node < num_nodes; ++node) {
+		if(!system_cpu_is_numa_node_available(node)) {
+			(*num_per_node_processors)[node] = 0;
+			continue;
 		}
-		VLOG(1) << "Creating pool of " << num_threads << " threads.";
+		(*num_per_node_processors)[node] =
+		        system_cpu_num_numa_node_processors(node);
+	}
+}
 
-		/* launch threads that will be waiting for work */
-		threads.resize(num_threads);
+/* Calculate the total number of processors on all available nodes.
+ * This is similar to system_cpu_thread_count(), but uses the pre-calculated
+ * number of processors on each node, avoiding extra system calls and node
+ * availability checks. */
+int get_num_total_processors(const vector<int>& num_per_node_processors)
+{
+	int num_total_processors = 0;
+	foreach(int num_node_processors, num_per_node_processors) {
+		num_total_processors += num_node_processors;
+	}
+	return num_total_processors;
+}
 
-		const int num_nodes = system_cpu_num_numa_nodes();
-		int thread_index = 0;
-		for (int node = 0;
-		     node < num_nodes && thread_index < threads.size();
-		     ++node)
+/* Assign every thread a node on which it should be running, for the best
+ * performance. */
+void distribute_threads_on_nodes(const vector<thread*>& threads)
+{
+	const int num_threads = threads.size();
+	/* TODO(sergey): Skip overriding affinity if the threads fit into the
+	 * current nodes/CPU group. This will allow users to tweak affinity for
+	 * weird and wonderful reasons. */
+	vector<int> num_per_node_processors;
+	get_per_node_num_processors(&num_per_node_processors);
+	if(num_per_node_processors.size() == 0) {
+		/* The error was already reported; we can't do anything here, so we
+		 * simply leave the default affinity on all the worker threads. */
+		return;
+	}
+	const int num_nodes = num_per_node_processors.size();
+	int thread_index = 0;
+	/* First pass: fill in all the nodes to their maximum.
+	 *
+	 * If there are fewer threads than the overall node capacity, some of the
+	 * nodes or parts of them will idle.
+	 *
+	 * TODO(sergey): Consider picking the fastest nodes if the number of
+	 * threads fits on them. For example, on Threadripper2 we might consider
+	 * using nodes 0 and 2 if the user requested 32 render threads. */
+	const int num_total_node_processors =
+	        get_num_total_processors(num_per_node_processors);
+	int current_node_index = 0;
+	while(thread_index < num_total_node_processors &&
+	      thread_index < num_threads) {
+		const int num_node_processors =
+		        num_per_node_processors[current_node_index];
+		for(int processor_index = 0;
+		    processor_index < num_node_processors;
+		    ++processor_index)
 		{
-			if (!system_cpu_is_numa_node_available(node)) {
-				continue;
-			}
-			const int num_node_processors =
-			        system_cpu_num_numa_node_processors(node);
-			for (int i = 0;
-			     i < num_node_processors && thread_index < threads.size();
-			     ++i)
-			{
-				threads[thread_index] = new thread(
-				        function_bind(&TaskScheduler::thread_run,
-				                      thread_index + 1),
-				        node);
-				thread_index++;
+			VLOG(1) << "Scheduling thread " << thread_index << " to node "
+			        << current_node_index << ".";
+			threads[thread_index]->schedule_to_node(current_node_index);
+			++thread_index;
+			if(thread_index == num_threads) {
+				/* All threads are scheduled on their nodes. */
+				return;
 			}
 		}
+		++current_node_index;
 	}
+	/* Second pass: keep scheduling threads to each node one by one,
+	 * uniformly filling them in.
+	 * This is where things become tricky to predict for maximum performance:
+	 * on the one hand this avoids too much threading overhead on a few nodes,
+	 * but for the final performance, having all the overhead on one node
+	 * might be a better idea (since the other nodes will have a better chance
+	 * of rendering faster).
+	 * Trickier still, nodes might have different capacity, so we might want
+	 * to do some weighted scheduling. For example, if node 0 has 16
+	 * processors and node 1 has 32 processors, we'd better schedule 1 extra
+	 * thread on node 0 and 2 extra threads on node 1. */
+	current_node_index = 0;
+	while(thread_index < num_threads) {
+		/* Skip unavailable nodes. */
+		/* TODO(sergey): Add sanity check against deadlock. */
+		while(num_per_node_processors[current_node_index] == 0) {
+			current_node_index = (current_node_index + 1) % num_nodes;
+		}
+		VLOG(1) << "Scheduling thread " << thread_index << " to node "
+		        << current_node_index << ".";
+		threads[thread_index]->schedule_to_node(current_node_index);
+		++thread_index;
+		current_node_index = (current_node_index + 1) % num_nodes;
+	}
+}
+
+}  // namespace
 
-	users++;
+void TaskScheduler::init(int num_threads)
+{
+	thread_scoped_lock lock(mutex);
+	/* Multiple cycles instances can use this task scheduler, sharing the same
+	 * threads, so we keep track of the number of users. */
+	++users;
+	if(users != 1) {
+		return;
+	}
+	do_exit = false;
+	const bool use_auto_threads = (num_threads == 0);
+	if(use_auto_threads) {
+		/* Automatic number of threads. */
+		num_threads = system_cpu_thread_count();
+	}
+	VLOG(1) << "Creating pool of " << num_threads << " threads.";
+	/* Launch threads that will be waiting for work. */
+	threads.resize(num_threads);
+	for(int thread_index = 0; thread_index < num_threads; ++thread_index) {
+		threads[thread_index] = new thread(
+		        function_bind(&TaskScheduler::thread_run, thread_index + 1));
+	}
+	distribute_threads_on_nodes(threads);
 }
 
 void TaskScheduler::exit()
 {
 	thread_scoped_lock lock(mutex);
 	users--;
 	if(users == 0) {
+		VLOG(1) << "De-initializing thread pool of task scheduler.";
 		/* stop all waiting threads */
 		TaskScheduler::queue_mutex.lock();
 		do_exit = true;
@@ -249,7 +339,6 @@ void TaskScheduler::exit()
 		t->join();
 		delete t;
 	}
-	threads.clear();
 	}
 }
diff --git a/intern/cycles/util/util_thread.cpp b/intern/cycles/util/util_thread.cpp
index 4d30e3f564f..1880eefcb9c 100644
--- a/intern/cycles/util/util_thread.cpp
+++ b/intern/cycles/util/util_thread.cpp
@@ -58,4 +58,9 @@ bool thread::join()
 	}
 }
 
+void thread::schedule_to_node(int node)
+{
+	node_ = node;
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_thread.h b/intern/cycles/util/util_thread.h
index d54199a37fc..d21a7a8c773 100644
--- a/intern/cycles/util/util_thread.h
+++ b/intern/cycles/util/util_thread.h
@@ -46,12 +46,18 @@ typedef std::condition_variable thread_condition_variable;
 class thread {
 public:
+	/* NOTE: A node index of -1 means that affinity will be inherited from
+	 * the parent thread, with no override on top of that. */
 	thread(function<void()> run_cb, int node = -1);
 	~thread();
 
 	static void *run(void *arg);
 	bool join();
 
+	/* For an existing thread descriptor which is NOT running yet, assign
+	 * the node on which it should be running. */
+	void schedule_to_node(int node);
+
 protected:
 	function<void()> run_cb_;
 	std::thread thread_;
```
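The interesting part of this merge is the two-pass placement in `distribute_threads_on_nodes()`: the first pass fills each NUMA node up to its processor count, and the second pass hands the oversubscribed remainder out round-robin, skipping unavailable nodes. Below is a minimal standalone sketch of that policy using only the standard library; the node capacities and thread count are hypothetical inputs, and `assign()` is a made-up stand-in for `thread::schedule_to_node()`.

```cpp
#include <cstdio>
#include <vector>

/* Stand-in for thread::schedule_to_node(): just record the decision. */
static void assign(std::vector<int>& node_of_thread, int thread_index, int node)
{
	node_of_thread[thread_index] = node;
}

int main()
{
	/* Hypothetical capacities: a 4-node machine with node 1 unavailable. */
	const std::vector<int> num_per_node_processors = {16, 0, 16, 16};
	const int num_nodes = (int)num_per_node_processors.size();
	const int num_threads = 64;  /* More threads than processors overall. */
	std::vector<int> node_of_thread(num_threads, -1);

	/* First pass: fill each node up to its processor count. */
	int thread_index = 0;
	for(int node = 0; node < num_nodes && thread_index < num_threads; ++node) {
		for(int i = 0;
		    i < num_per_node_processors[node] && thread_index < num_threads;
		    ++i) {
			assign(node_of_thread, thread_index++, node);
		}
	}

	/* Second pass: spread the oversubscribed remainder round-robin,
	 * skipping unavailable (zero-processor) nodes. */
	int node = 0;
	while(thread_index < num_threads) {
		while(num_per_node_processors[node] == 0) {
			node = (node + 1) % num_nodes;
		}
		assign(node_of_thread, thread_index++, node);
		node = (node + 1) % num_nodes;
	}

	for(int t = 0; t < num_threads; ++t) {
		std::printf("thread %d -> node %d\n", t, node_of_thread[t]);
	}
	return 0;
}
```

With these inputs, threads 0-47 saturate nodes 0, 2, and 3 in the first pass, and the remaining 16 threads are dealt over the same three nodes in turn; the weighted-scheduling TODO in the diff is about making that second pass proportional to node capacity instead of uniform.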
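On the `util_thread.h` side, `schedule_to_node()` only records the node index on a descriptor that is not running yet; the index is presumably honored when the thread body actually starts, via the NUMA helpers seen above (that launch path is outside this diff). On Linux, node pinning ultimately reduces to a CPU-affinity call. The sketch below illustrates that underlying mechanism with plain `std::thread` and `pthread_setaffinity_np()`; it is not the Cycles implementation, and the CPU range 0-3 is a made-up stand-in for one node's CPU mask.

```cpp
/* Compile on Linux with: g++ -pthread affinity_sketch.cpp */
#include <pthread.h>
#include <sched.h>

#include <cstdio>
#include <thread>

int main()
{
	std::thread worker([] { std::printf("worker running\n"); });

	/* Build a mask covering CPUs 0-3, as if they made up one NUMA node.
	 * Real code would query the node's actual CPU mask instead. */
	cpu_set_t mask;
	CPU_ZERO(&mask);
	for(int cpu = 0; cpu < 4; ++cpu) {
		CPU_SET(cpu, &mask);
	}
	/* Pin the already-created worker thread to those CPUs. */
	if(pthread_setaffinity_np(worker.native_handle(), sizeof(mask), &mask) != 0) {
		std::fprintf(stderr, "Failed to set thread affinity.\n");
	}
	worker.join();
	return 0;
}
```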