/* SPDX-License-Identifier: GPL-2.0-or-later */

#include "BLI_map.hh"
#include "BLI_set.hh"
#include "BLI_stack.hh"
#include "BLI_vector.hh"
#include "BLI_vector_set.hh"

#include "NOD_derived_node_tree.hh"

#include "BKE_node.h"
#include "BKE_node_runtime.hh"

#include "COM_scheduler.hh"
#include "COM_utilities.hh"

namespace blender::realtime_compositor {

using namespace nodes::derived_node_tree_types;

/* Find the active context from the given context and its descendant contexts. The active context
 * is the one whose node instance key matches the active_viewer_key stored in the root node tree.
 * The instance key of each context is computed by calling BKE_node_instance_key given the key of
 * the parent as well as the group node making the context. */
static const DTreeContext *find_active_context_recursive(const DTreeContext *context,
                                                         bNodeInstanceKey key)
{
  /* The instance key of the given context matches the active viewer instance key, so this is the
   * active context, return it. */
  if (key.value == context->derived_tree().root_context().btree().active_viewer_key.value) {
    return context;
  }

  /* For each of the group nodes, compute their instance key and contexts and call this function
   * recursively. */
  for (const bNode *group_node : context->btree().group_nodes()) {
    const bNodeInstanceKey child_key = BKE_node_instance_key(key, &context->btree(), group_node);
    const DTreeContext *child_context = context->child_context(*group_node);
    const DTreeContext *found_context = find_active_context_recursive(child_context, child_key);

    /* If the found context is null, that means neither the child context nor any of its
     * descendant contexts is active. */
    if (!found_context) {
      continue;
    }

    /* Otherwise, we have found our active context, return it. */
    return found_context;
  }

  /* Neither the given context nor any of its descendant contexts is active, so return null. */
  return nullptr;
}

/* Find the active context for the given node tree. The active context represents the node tree
 * currently being edited. In most cases, that would be the top level node tree itself, but in the
 * case where the user is editing the node tree of a node group, the active context would be a
 * representation of the node tree of that node group. Note that the context also stores the group
 * node that the user selected to edit the node tree, so the context fully represents a particular
 * instance of the node group. */
static const DTreeContext *find_active_context(const DerivedNodeTree &tree)
{
  /* The root context has an instance key of NODE_INSTANCE_KEY_BASE by definition. */
  return find_active_context_recursive(&tree.root_context(), NODE_INSTANCE_KEY_BASE);
}
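
/* For example, if the user has entered a group node of the root tree to edit its node group, the
 * active context will be the child context created for that group node, whose instance key is
 * BKE_node_instance_key(NODE_INSTANCE_KEY_BASE, &root_tree, group_node), which the recursive
 * search above matches against the active_viewer_key stored in the root node tree. */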

/* Return the output node which is marked as NODE_DO_OUTPUT. If multiple types of output nodes are
 * marked, then the preference will be CMP_NODE_COMPOSITE > CMP_NODE_VIEWER > CMP_NODE_SPLITVIEWER.
 * If no output node exists, a null node will be returned. */
static DNode find_output_in_context(const DTreeContext *context)
{
  const bNodeTree &tree = context->btree();

  for (const bNode *node : tree.nodes_by_type("CompositorNodeComposite")) {
    if (node->flag & NODE_DO_OUTPUT) {
      return DNode(context, node);
    }
  }

  for (const bNode *node : tree.nodes_by_type("CompositorNodeViewer")) {
    if (node->flag & NODE_DO_OUTPUT) {
      return DNode(context, node);
    }
  }

  for (const bNode *node : tree.nodes_by_type("CompositorNodeSplitViewer")) {
    if (node->flag & NODE_DO_OUTPUT) {
      return DNode(context, node);
    }
  }

  return DNode();
}

/* Compute the output node whose result should be computed. This node is the output node that
 * satisfies the requirements in the find_output_in_context function. First, the active context is
 * searched for an output node; if none is found, the root context is searched. For more
 * information on what contexts mean here, see the find_active_context function. */
static DNode compute_output_node(const DerivedNodeTree &tree)
{
  const DTreeContext *active_context = find_active_context(tree);

  const DNode node = find_output_in_context(active_context);
  if (node) {
    return node;
  }

  /* If the active context is the root one and no output node was found, we consider this node tree
   * to have no output node, even if one of the non-active descendants has an output node. */
  if (active_context->is_root()) {
    return DNode();
  }

  /* The active context doesn't have an output node, search in the root context as a fallback. */
  return find_output_in_context(&tree.root_context());
}

/* A type representing a mapping that associates each node with a heuristic estimation of the
 * number of intermediate buffers needed to compute it and all of its dependencies. See the
 * compute_number_of_needed_buffers function for more information. */
using NeededBuffers = Map<DNode, int>;

/* Compute a heuristic estimation of the number of intermediate buffers needed to compute each node
 * and all of its dependencies, for the given node and all nodes it depends on. The output is a map
 * that maps each node to the number of intermediate buffers needed to compute it and all of its
 * dependencies.
 *
 * Consider a node that takes n buffers as inputs from a number of node dependencies, which we
 * shall call the input nodes. The node also computes and outputs m buffers. In order for the node
 * to compute its output, a number of intermediate buffers will be needed. Since the node takes n
 * buffers and outputs m buffers, the number of buffers directly needed by the node is (n + m). But
 * each of the input buffers is computed by a node that, in turn, needs a number of buffers to
 * compute its output. So the total number of buffers needed to compute the output of the node is
 * max(n + m, d), where d is the number of buffers needed by the input node that needs the largest
 * number of buffers. We only consider the input node that needs the largest number of buffers,
 * because those buffers can be reused by any input node that needs a lesser number of buffers.
 *
 * Shader nodes, however, are a special case because links between two shader nodes inside the same
 * shader operation don't pass a buffer, but a single value in the compiled shader. So for shader
 * nodes, only inputs and outputs linked to nodes that are not shader nodes should be considered.
 * Note that this might not actually be true, because the compiler may decide to split a shader
 * operation into multiple ones that will pass buffers, but this is not something that can be
 * known at scheduling-time. See the discussion in COM_compile_state.hh, COM_evaluator.hh, and
 * COM_shader_operation.hh for more information. In the node tree shown below, node 4 will need
 * exactly the same number of buffers as node 3, because its inputs and outputs are all
 * internally linked in the shader operation.
 *
 *                                      Shader Operation
 *                   +------------------------------------------------------+
 * .------------.    |  .------------.  .------------.      .------------.  |  .------------.
 * |   Node 1   |    |  |   Node 3   |  |   Node 4   |      |   Node 5   |  |  |   Node 6   |
 * |            |----|--|            |--|            |------|            |--|--|            |
 * |            |  .-|--|            |  |            |  .---|            |  |  |            |
 * '------------'  | |  '------------'  '------------'  |   '------------'  |  '------------'
 *                 | +----------------------------------|-------------------+
 * .------------.  |                                    |
 * |   Node 2   |  |                                    |
 * |            |--'------------------------------------'
 * |            |
 * '------------'
 *
 * Note that the computed output is not guaranteed to be accurate, and will not be in most cases.
 * The computation is merely a heuristic estimation that works well in most cases. This is due to a
 * number of reasons:
 * - The node tree is actually a graph that allows output sharing, which is not taken into
 *   consideration in this implementation because it is difficult to consider correctly.
 * - Each node may allocate any number of internal buffers, which is not taken into account in this
 *   implementation because it rarely affects the output and is done by very few nodes.
 * - The compiler may decide to compile the schedule differently depending on runtime information
 *   which we can merely speculate about at scheduling-time, as described above. */
static NeededBuffers compute_number_of_needed_buffers(DNode output_node)
{
  NeededBuffers needed_buffers;

  /* A stack of nodes used to traverse the node tree starting from the output node. */
  Stack<DNode> node_stack = {output_node};

  /* Traverse the node tree in a post-order depth-first manner and compute the number of needed
   * buffers for each node. Post-order traversal guarantees that all the node dependencies of each
   * node are computed before it. This is done by pushing all the uncomputed node dependencies to
   * the node stack first and only popping and computing the node when all its node dependencies
   * have been computed. */
  while (!node_stack.is_empty()) {
    /* Do not pop the node immediately, as it may turn out that we can't compute its number of
     * needed buffers just yet because its dependencies weren't computed. It will be popped later
     * when needed. */
    DNode &node = node_stack.peek();

    /* Go over the node dependencies connected to the inputs of the node and push them to the node
     * stack if they were not computed already. */
    Set<DNode> pushed_nodes;
    for (const bNodeSocket *input : node->input_sockets()) {
      const DInputSocket dinput{node.context(), input};

      /* Get the output linked to the input. If it is null, that means the input is unlinked and
       * has no dependency node. */
      const DOutputSocket doutput = get_output_linked_to_input(dinput);
      if (!doutput) {
        continue;
      }

      /* The node dependency was already computed or pushed before, so skip it. */
      if (needed_buffers.contains(doutput.node()) || pushed_nodes.contains(doutput.node())) {
        continue;
      }

      /* The dependency node was not computed yet, so push it to the node stack and indicate that
       * it was pushed. */
      node_stack.push(doutput.node());
      pushed_nodes.add_new(doutput.node());
    }

    /* If any of the node dependencies were pushed, that means that not all of them were computed
     * and consequently we can't compute the number of needed buffers for this node just yet. */
    if (!pushed_nodes.is_empty()) {
      continue;
    }

    /* We don't need to store the result of the pop because we already peeked at it before. */
    node_stack.pop();

    /* Compute the number of buffers that the node takes as an input as well as the number of
     * buffers needed to compute the most demanding of the node dependencies. */
    int number_of_input_buffers = 0;
    int buffers_needed_by_dependencies = 0;
    for (const bNodeSocket *input : node->input_sockets()) {
      const DInputSocket dinput{node.context(), input};

      /* Get the output linked to the input. If it is null, that means the input is unlinked.
       * Unlinked inputs do not take a buffer, so skip those inputs. */
      const DOutputSocket doutput = get_output_linked_to_input(dinput);
      if (!doutput) {
        continue;
      }

      /* Since this input is linked, if the link is not between two shader nodes, it means that the
       * node takes a buffer through this input and so we increment the number of input buffers. */
      if (!is_shader_node(node) || !is_shader_node(doutput.node())) {
        number_of_input_buffers++;
      }

      /* If the number of buffers needed by the node dependency is more than the total number of
       * buffers needed by the dependencies, then update the latter to be the former. This is
       * computing the "d" in the aforementioned equation "max(n + m, d)". */
      const int buffers_needed_by_dependency = needed_buffers.lookup(doutput.node());
      if (buffers_needed_by_dependency > buffers_needed_by_dependencies) {
        buffers_needed_by_dependencies = buffers_needed_by_dependency;
      }
    }

    /* Compute the number of buffers that will be computed/output by this node. */
    int number_of_output_buffers = 0;
    for (const bNodeSocket *output : node->output_sockets()) {
      const DOutputSocket doutput{node.context(), output};

      /* The output is not linked, so it outputs no buffer. */
      if (!output->is_logically_linked()) {
        continue;
      }

      /* If any of the links is not between two shader nodes, it means that the node outputs
       * a buffer through this output and so we increment the number of output buffers. */
      if (!is_output_linked_to_node_conditioned(doutput, is_shader_node) ||
          !is_shader_node(node)) {
        number_of_output_buffers++;
      }
    }

    /* Compute the heuristic estimation of the number of needed intermediate buffers to compute
     * this node and all of its dependencies. This is computing the aforementioned equation
     * "max(n + m, d)". */
    const int total_buffers = MAX2(number_of_input_buffers + number_of_output_buffers,
                                   buffers_needed_by_dependencies);
    needed_buffers.add(node, total_buffers);
  }

  return needed_buffers;
}
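
/* A small worked example of the max(n + m, d) heuristic above (illustrative only, not part of the
 * implementation): consider a simple chain of non-shader nodes A -> B -> C, where each link passes
 * one buffer and C is the final output node with no linked outputs.
 * - A has no linked inputs and one linked output, so it needs max(0 + 1, 0) = 1 buffer.
 * - B takes one buffer and outputs one, and its only dependency A needs 1 buffer, so it needs
 *   max(1 + 1, 1) = 2 buffers.
 * - C takes one buffer and outputs none, and its only dependency B needs 2 buffers, so it needs
 *   max(1 + 0, 2) = 2 buffers, because the buffers used while computing B can be reused. */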

/* There are multiple possible orders of evaluating a node graph, each of which needs to allocate
 * a number of intermediate buffers to store its intermediate results. It follows that we need to
 * find the evaluation order which uses the least amount of intermediate buffers. For instance,
 * consider a node that takes two input buffers A and B. Each of those buffers is computed through
 * a number of nodes constituting a sub-graph whose root is the node that outputs that buffer.
 * Suppose the numbers of intermediate buffers needed to compute A and B are N(A) and N(B)
 * respectively, with N(A) > N(B). Then evaluating the sub-graph computing A first would be a
 * better option than computing B first, because had B been computed first, its outputs would need
 * to be stored in extra buffers in addition to the buffers needed by A. The number of buffers
 * needed by each node is estimated as described in the compute_number_of_needed_buffers function.
 *
 * This is a heuristic generalization of the Sethi–Ullman algorithm, a generalization that doesn't
 * always guarantee an optimal evaluation order, as the optimal evaluation order is very difficult
 * to compute; however, this method works well in most cases. Moreover, it assumes that all
 * buffers will have roughly the same size, which may not always be the case. */
Schedule compute_schedule(const DerivedNodeTree &tree)
{
  Schedule schedule;

  /* Compute the output node whose result should be computed. */
  const DNode output_node = compute_output_node(tree);

  /* No output node, the node tree has no effect, return an empty schedule. */
  if (!output_node) {
    return schedule;
  }

  /* Compute the number of buffers needed by each node connected to the output. */
  const NeededBuffers needed_buffers = compute_number_of_needed_buffers(output_node);

  /* A stack of nodes used to traverse the node tree starting from the output node. */
  Stack<DNode> node_stack = {output_node};

  /* Traverse the node tree in a post-order depth-first manner, scheduling the nodes in an order
   * informed by the number of buffers needed by each node. Post-order traversal guarantees that
   * all the node dependencies of each node are scheduled before it. This is done by pushing all
   * the unscheduled node dependencies to the node stack first and only popping and scheduling the
   * node when all its node dependencies have been scheduled. */
  while (!node_stack.is_empty()) {
    /* Do not pop the node immediately, as it may turn out that we can't schedule it just yet
     * because its dependencies weren't scheduled. It will be popped later when needed. */
    DNode &node = node_stack.peek();

    /* Compute the nodes directly connected to the node inputs sorted by their needed buffers such
     * that the node with the lowest number of needed buffers comes first. Note that we actually
     * want the node with the highest number of needed buffers to be scheduled first, but since
     * those are pushed to the traversal stack, we need to push them in reverse order. */
    Vector<DNode> sorted_dependency_nodes;
    for (const bNodeSocket *input : node->input_sockets()) {
      const DInputSocket dinput{node.context(), input};

      /* Get the output linked to the input. If it is null, that means the input is unlinked and
       * has no dependency node, so skip it. */
      const DOutputSocket doutput = get_output_linked_to_input(dinput);
      if (!doutput) {
        continue;
      }

      /* The dependency node was added before, so skip it. The number of dependency nodes is very
       * small, typically less than 3, so a linear search is okay. */
      if (sorted_dependency_nodes.contains(doutput.node())) {
        continue;
      }

      /* The dependency node was already scheduled, so skip it. */
      if (schedule.contains(doutput.node())) {
        continue;
      }

      /* Sort in ascending order on insertion, the number of dependency nodes is very small,
       * typically less than 3, so insertion sort is okay. */
      int insertion_position = 0;
      for (int i = 0; i < sorted_dependency_nodes.size(); i++) {
        if (needed_buffers.lookup(doutput.node()) >
            needed_buffers.lookup(sorted_dependency_nodes[i])) {
          insertion_position++;
        }
        else {
          break;
        }
      }
      sorted_dependency_nodes.insert(insertion_position, doutput.node());
    }

    /* Push the sorted dependency nodes to the node stack in order. */
    for (const DNode &dependency_node : sorted_dependency_nodes) {
      node_stack.push(dependency_node);
    }

    /* If there are no sorted dependency nodes, that means they were all already scheduled or that
     * none exists in the first place, so we can pop and schedule the node now. */
    if (sorted_dependency_nodes.is_empty()) {
      /* The node might have already been scheduled, so we don't use add_new here and simply don't
       * add it if it was already scheduled. */
      schedule.add(node_stack.pop());
    }
  }

  return schedule;
}
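
/* Example of the intended use of this scheduler (a hypothetical sketch; the real call site lives
 * in the compositor evaluator, see COM_evaluator.hh, and may differ in its details). Given a
 * DerivedNodeTree derived_tree built from the compositing node tree:
 *
 *   const Schedule schedule = compute_schedule(derived_tree);
 *   for (const DNode &node : schedule) {
 *     compile_and_evaluate_node(node);  // Hypothetical per-node compilation step.
 *   }
 *
 * Since the dependencies of each node always precede it in the returned schedule, the nodes can
 * be compiled and evaluated in this order directly. */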

}  // namespace blender::realtime_compositor