
git.blender.org/blender.git
author    Sergey Sharybin <sergey.vfx@gmail.com>  2019-01-11 17:09:46 +0300
committer Sergey Sharybin <sergey.vfx@gmail.com>  2019-01-11 17:09:46 +0300
commit    cca35c10135290f1a7e298d51964eef2c6c67ce0 (patch)
tree      c2c96c0e58b46ba38b53142f8d4330ea829a9c01 /intern
parent    5793a84f1262ce78398320f8518816ff7e62cdd8 (diff)
parent    c1dd74580ed8352b9f6c96d816a604ebb4f3c39d (diff)
Merge branch 'blender2.7'
Diffstat (limited to 'intern')
-rw-r--r--  intern/cycles/kernel/filter/filter_nlm_gpu.h |   6
-rw-r--r--  intern/cycles/util/util_task.cpp             | 165
-rw-r--r--  intern/cycles/util/util_thread.cpp           |   5
-rw-r--r--  intern/cycles/util/util_thread.h             |   6
4 files changed, 140 insertions(+), 42 deletions(-)
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
index 4ca49ea6733..cffd61cb7d1 100644
--- a/intern/cycles/kernel/filter/filter_nlm_gpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -36,8 +36,6 @@ ccl_device_inline bool get_nlm_coords_window(int w, int h, int r, int stride,
if(sy >= s) {
return false;
}
- co->z = sx-r;
- co->w = sy-r;
/* Pixels still need to lie inside the denoising buffer after applying the offset,
* so determine the area for which this is the case. */
@@ -59,8 +57,8 @@ ccl_device_inline bool get_nlm_coords_window(int w, int h, int r, int stride,
if(!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) {
return false;
}
- co->x = x;
- co->y = y;
+
+ *co = make_int4(x, y, sx - r, sy - r);
*ofs = (sy*s + sx) * stride;
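
The hunk above folds four per-component writes into a single vector write. A
minimal sketch of the same idea, using hypothetical stand-ins for Cycles'
int4 type and make_int4() helper from util_types:

/* Hypothetical stand-ins for Cycles' int4/make_int4 from util_types. */
struct int4 {
	int x, y, z, w;
};

static inline int4 make_int4(int x, int y, int z, int w)
{
	int4 v = {x, y, z, w};
	return v;
}

/* Before: co->x = x; co->y = y; co->z = sx - r; co->w = sy - r;
 * After: one expression writes the whole vector, as in the patch. */
static void write_coords(int4 *co, int x, int y, int sx, int sy, int r)
{
	*co = make_int4(x, y, sx - r, sy - r);
}

Writing the whole int4 at once also makes it harder to leave a component
stale when the surrounding control flow changes.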
diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp
index 50a2bb160ff..7e9f7313fba 100644
--- a/intern/cycles/util/util_task.cpp
+++ b/intern/cycles/util/util_task.cpp
@@ -185,59 +185,149 @@ list<TaskScheduler::Entry> TaskScheduler::queue;
thread_mutex TaskScheduler::queue_mutex;
thread_condition_variable TaskScheduler::queue_cond;
-void TaskScheduler::init(int num_threads)
+namespace {
+
+/* Get the number of processors on each of the available nodes. The result
+ * is sized by the highest node index, and each element holds the number of
+ * processors on that node.
+ * If a node is not available, its processor count is zero. */
+void get_per_node_num_processors(vector<int>* num_per_node_processors)
{
- thread_scoped_lock lock(mutex);
-
- /* multiple cycles instances can use this task scheduler, sharing the same
- * threads, so we keep track of the number of users. */
- if(users == 0) {
- do_exit = false;
-
- const bool use_auto_threads = (num_threads == 0);
- if(use_auto_threads) {
- /* automatic number of threads */
- num_threads = system_cpu_thread_count();
+ const int num_nodes = system_cpu_num_numa_nodes();
+ if(num_nodes == 0) {
+ LOG(ERROR) << "Zero available NUMA nodes, this is not supposed to happen.";
+ return;
+ }
+ num_per_node_processors->resize(num_nodes);
+ for(int node = 0; node < num_nodes; ++node) {
+ if(!system_cpu_is_numa_node_available(node)) {
+ (*num_per_node_processors)[node] = 0;
+ continue;
}
- VLOG(1) << "Creating pool of " << num_threads << " threads.";
+ (*num_per_node_processors)[node] =
+ system_cpu_num_numa_node_processors(node);
+ }
+}
- /* launch threads that will be waiting for work */
- threads.resize(num_threads);
+/* Calculate the total number of processors on all the available nodes.
+ * This is similar to system_cpu_thread_count(), but uses the pre-calculated
+ * number of processors on each node, avoiding extra system calls and
+ * node-availability checks. */
+int get_num_total_processors(const vector<int>& num_per_node_processors)
+{
+ int num_total_processors = 0;
+ foreach(int num_node_processors, num_per_node_processors) {
+ num_total_processors += num_node_processors;
+ }
+ return num_total_processors;
+}
- const int num_nodes = system_cpu_num_numa_nodes();
- int thread_index = 0;
- for (int node = 0;
- node < num_nodes && thread_index < threads.size();
- ++node)
+/* Assign every thread a node on which it should be running, for the best
+ * performance. */
+void distribute_threads_on_nodes(const vector<thread*>& threads)
+{
+ const int num_threads = threads.size();
+ /* TODO(sergey): Skip overriding affinity if the threads fit into the
+ * current node/CPU group. This will allow the user to tweak affinity for
+ * weird and wonderful reasons. */
+ vector<int> num_per_node_processors;
+ get_per_node_num_processors(&num_per_node_processors);
+ if(num_per_node_processors.size() == 0) {
+ /* The error was already reported; we can't do anything here, so we simply
+ * leave the default affinity for all the worker threads. */
+ return;
+ }
+ const int num_nodes = num_per_node_processors.size();
+ int thread_index = 0;
+ /* First pass: fill in all the nodes to their maximum.
+ *
+ * If there are fewer threads than the overall node capacity, some of the
+ * nodes, or parts of them, will idle.
+ *
+ * TODO(sergey): Consider picking the fastest nodes if the number of threads
+ * fits on them. For example, on a Threadripper 2 we might consider using
+ * nodes 0 and 2 if the user requested 32 render threads. */
+ const int num_total_node_processors =
+ get_num_total_processors(num_per_node_processors);
+ int current_node_index = 0;
+ while(thread_index < num_total_node_processors &&
+ thread_index < num_threads) {
+ const int num_node_processors =
+ num_per_node_processors[current_node_index];
+ for(int processor_index = 0;
+ processor_index < num_node_processors;
+ ++processor_index)
{
- if (!system_cpu_is_numa_node_available(node)) {
- continue;
- }
- const int num_node_processors =
- system_cpu_num_numa_node_processors(node);
- for (int i = 0;
- i < num_node_processors && thread_index < threads.size();
- ++i)
- {
- threads[thread_index] = new thread(
- function_bind(&TaskScheduler::thread_run,
- thread_index + 1),
- node);
- thread_index++;
+ VLOG(1) << "Scheduling thread " << thread_index << " to node "
+ << current_node_index << ".";
+ threads[thread_index]->schedule_to_node(current_node_index);
+ ++thread_index;
+ if(thread_index == num_threads) {
+ /* All threads are scheduled on their nodes. */
+ return;
}
}
+ ++current_node_index;
}
+ /* Second pass: keep scheduling threads to each node one by one, uniformly
+ * filling them in.
+ * This is where things become tricky to predict for maximum performance:
+ * on the one hand this avoids too much threading overhead on a few nodes,
+ * but for the final performance, having all the overhead on one node might
+ * be a better idea (since the other nodes will have a better chance of
+ * rendering faster).
+ * Even trickier, the nodes might have different capacities, so we might
+ * want to do some weighted scheduling. For example, if node 0 has 16
+ * processors and node 1 has 32 processors, we'd better schedule 1 extra
+ * thread on node 0 and 2 extra threads on node 1. */
+ current_node_index = 0;
+ while(thread_index < num_threads) {
+ /* Skip unavailable nodes. */
+ /* TODO(sergey): Add sanity check against deadlock. */
+ while(num_per_node_processors[current_node_index] == 0) {
+ current_node_index = (current_node_index + 1) % num_nodes;
+ }
+ VLOG(1) << "Scheduling thread " << thread_index << " to node "
+ << current_node_index << ".";
+ threads[thread_index]->schedule_to_node(current_node_index);
+ ++thread_index;
+ current_node_index = (current_node_index + 1) % num_nodes;
+ }
+}
+
+} // namespace
- users++;
+void TaskScheduler::init(int num_threads)
+{
+ thread_scoped_lock lock(mutex);
+ /* Multiple Cycles instances can use this task scheduler, sharing the same
+ * threads, so we keep track of the number of users. */
+ ++users;
+ if(users != 1) {
+ return;
+ }
+ do_exit = false;
+ const bool use_auto_threads = (num_threads == 0);
+ if(use_auto_threads) {
+ /* Automatic number of threads. */
+ num_threads = system_cpu_thread_count();
+ }
+ VLOG(1) << "Creating pool of " << num_threads << " threads.";
+ /* Launch threads that will be waiting for work. */
+ threads.resize(num_threads);
+ for(int thread_index = 0; thread_index < num_threads; ++thread_index) {
+ threads[thread_index] = new thread(
+ function_bind(&TaskScheduler::thread_run, thread_index + 1));
+ }
+ distribute_threads_on_nodes(threads);
}
void TaskScheduler::exit()
{
thread_scoped_lock lock(mutex);
-
users--;
-
if(users == 0) {
+ VLOG(1) << "De-initializing thread pool of task scheduler.";
/* stop all waiting threads */
TaskScheduler::queue_mutex.lock();
do_exit = true;
@@ -249,7 +339,6 @@ void TaskScheduler::exit()
t->join();
delete t;
}
-
threads.clear();
}
}
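
Read in isolation, the scheduling change above boils down to a two-pass
distribution. A self-contained sketch under stated assumptions: a plain
vector of per-node processor counts replaces the system queries, and a
printing stub stands in for thread::schedule_to_node():

#include <cstdio>
#include <vector>

/* Hypothetical stub standing in for thread::schedule_to_node(); it only
 * prints the decision. */
static void schedule_to_node(int thread_index, int node)
{
	std::printf("thread %d -> node %d\n", thread_index, node);
}

/* Mirrors the two-pass logic of distribute_threads_on_nodes(): the first
 * pass fills each node up to its processor count, the second pass assigns
 * any remaining threads round-robin over nodes that have processors.
 * Assumes at least one node has processors; the patch carries the same
 * TODO about a deadlock sanity check. */
static void distribute(const std::vector<int>& num_per_node_processors,
                       int num_threads)
{
	const int num_nodes = (int)num_per_node_processors.size();
	if(num_nodes == 0) {
		return;
	}
	int thread_index = 0;
	/* First pass: fill all the nodes to their maximum. */
	for(int node = 0; node < num_nodes; ++node) {
		for(int p = 0; p < num_per_node_processors[node]; ++p) {
			if(thread_index == num_threads) {
				return;
			}
			schedule_to_node(thread_index++, node);
		}
	}
	/* Second pass: oversubscribe one thread per node at a time,
	 * skipping nodes with no available processors. */
	int node = 0;
	while(thread_index < num_threads) {
		while(num_per_node_processors[node] == 0) {
			node = (node + 1) % num_nodes;
		}
		schedule_to_node(thread_index++, node);
		node = (node + 1) % num_nodes;
	}
}

int main()
{
	/* Example: two NUMA nodes with 4 processors each, 10 threads. */
	distribute(std::vector<int>{4, 4}, 10);
	return 0;
}

With {4, 4} processors and 10 threads, the first pass pins threads 0 through
7 up to the nodes' capacity, and the second pass spreads threads 8 and 9
round-robin, matching the behavior described in the patch's comments.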
diff --git a/intern/cycles/util/util_thread.cpp b/intern/cycles/util/util_thread.cpp
index 4d30e3f564f..1880eefcb9c 100644
--- a/intern/cycles/util/util_thread.cpp
+++ b/intern/cycles/util/util_thread.cpp
@@ -58,4 +58,9 @@ bool thread::join()
}
}
+void thread::schedule_to_node(int node)
+{
+ node_ = node;
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_thread.h b/intern/cycles/util/util_thread.h
index d54199a37fc..d21a7a8c773 100644
--- a/intern/cycles/util/util_thread.h
+++ b/intern/cycles/util/util_thread.h
@@ -46,12 +46,18 @@ typedef std::condition_variable thread_condition_variable;
class thread {
public:
+ /* NOTE: A node index of -1 means that the affinity will be inherited from
+ * the parent thread, with no override on top of that. */
thread(function<void()> run_cb, int node = -1);
~thread();
static void *run(void *arg);
bool join();
+ /* For an existing thread descriptor which is NOT running yet, assign the
+ * node on which it should run. */
+ void schedule_to_node(int node);
+
protected:
function<void()> run_cb_;
std::thread thread_;
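
The header only promises that schedule_to_node() is valid on a descriptor
that is not running yet. A record-then-apply sketch of that contract, where
apply_node_affinity() and the explicit start() step are hypothetical (the
actual platform call and thread startup live elsewhere in Cycles):

#include <functional>
#include <thread>

/* Hypothetical placeholder for the platform call that pins the calling
 * thread to a NUMA node (e.g. via libnuma on Linux). */
static void apply_node_affinity(int node)
{
	(void)node;
}

/* schedule_to_node() only stores the node index; the stored value is
 * consumed when the thread actually starts running. */
class worker {
public:
	explicit worker(std::function<void()> run_cb)
	        : run_cb_(run_cb), node_(-1) {}

	/* Only valid before start(): records the target node. */
	void schedule_to_node(int node) { node_ = node; }

	void start()
	{
		thread_ = std::thread([this]() {
			/* -1 keeps the affinity inherited from the parent. */
			if(node_ != -1) {
				apply_node_affinity(node_);
			}
			run_cb_();
		});
	}

	void join() { thread_.join(); }

protected:
	std::function<void()> run_cb_;
	std::thread thread_;
	int node_;
};

int main()
{
	worker w([]() { /* work */ });
	w.schedule_to_node(0);  /* assign a node before the thread runs */
	w.start();
	w.join();
	return 0;
}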