1 files changed, 173 insertions, 0 deletions
diff --git a/intern/cycles/integrator/shader_eval.cpp b/intern/cycles/integrator/shader_eval.cpp
new file mode 100644
index 00000000000..465b4a8d4da
--- /dev/null
+++ b/intern/cycles/integrator/shader_eval.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/shader_eval.h"
+
+#include <atomic>
+
+#include "device/device.h"
+#include "device/device_queue.h"
+
+#include "device/cpu/kernel.h"
+#include "device/cpu/kernel_thread_globals.h"
+
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_tbb.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Bind the evaluator to the device it runs on and to the progress object that the evaluation
+ * routines poll for cancellation. `device` must not be null. */
+ShaderEval::ShaderEval(Device *device, Progress &progress)
+    : device_(device), progress_(progress)
+{
+  DCHECK_NE(device_, nullptr);
+}
+
+/* Evaluate shader `type` for a set of points on the first device visited by
+ * `device_->foreach_device()`.
+ *
+ * - `max_num_points`: number of input elements allocated before `fill_input` is invoked.
+ * - `fill_input`: fills the input buffer and returns the number of points to evaluate. When it
+ *   returns 0 nothing is evaluated and `read_output` is not called.
+ * - `read_output`: invoked with the output buffer after it has been copied back from the
+ *   device. Skipped when the evaluation did not succeed.
+ *
+ * Returns the result of the device evaluation (false when it was cancelled), or true when there
+ * was nothing to evaluate. */
+bool ShaderEval::eval(const ShaderEvalType type,
+                      const int max_num_points,
+                      const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input,
+                      const function<void(device_vector<float4> &)> &read_output)
+{
+  bool first_device = true;
+  bool success = true;
+
+  device_->foreach_device([&](Device *device) {
+    /* Only the first device visited does the work, every further one is reported and skipped. */
+    if (!first_device) {
+      LOG(ERROR) << "Multi-devices are not yet fully implemented, will evaluate shader on a "
+                    "single device.";
+      return;
+    }
+    first_device = false;
+
+    device_vector<KernelShaderEvalInput> input(device, "ShaderEval input", MEM_READ_ONLY);
+    device_vector<float4> output(device, "ShaderEval output", MEM_READ_WRITE);
+    DCHECK_EQ(input.device, device);
+    DCHECK_EQ(output.device, device);
+
+    /* Let the caller fill the input, it reports how many points actually need evaluating. */
+    input.alloc(max_num_points);
+    const int num_points = fill_input(input);
+    if (num_points == 0) {
+      return;
+    }
+
+    /* Allocate and copy device buffers. */
+    input.copy_to_device();
+    output.alloc(num_points);
+    output.zero_to_device();
+
+    /* eval_cpu() and eval_gpu() launch one work item per output element, so the output must not
+     * be larger than the input (presumably each item reads the input at its own index).
+     * Checked here, once both buffers have their final size: before allocation both are empty
+     * and the check could never fail. */
+    DCHECK_LE(output.size(), input.size());
+
+    /* Evaluate on CPU or GPU. */
+    success = (device->info.type == DEVICE_CPU) ? eval_cpu(device, type, input, output) :
+                                                  eval_gpu(device, type, input, output);
+
+    /* Copy data back from device if not cancelled. */
+    if (success) {
+      output.copy_from_device(0, 1, output.size());
+      read_output(output);
+    }
+
+    input.free();
+    output.free();
+  });
+
+  return success;
+}
+
+/* Evaluate shader `type` on the CPU `device`, one work item per element of `output`, inside a
+ * TBB arena created with `device->info.cpu_threads` as its concurrency limit.
+ *
+ * Returns false if any work item observed a cancellation request (such items do no work, so
+ * `output` may be incomplete), true otherwise. */
+bool ShaderEval::eval_cpu(Device *device,
+                          const ShaderEvalType type,
+                          device_vector<KernelShaderEvalInput> &input,
+                          device_vector<float4> &output)
+{
+  vector<CPUKernelThreadGlobals> kernel_thread_globals;
+  device->get_cpu_kernel_thread_globals(kernel_thread_globals);
+
+  /* Find required kernel function. */
+  const CPUKernels &kernels = *(device->get_cpu_kernels());
+
+  /* Simple parallel_for over all work items. */
+  const int64_t work_size = output.size();
+  KernelShaderEvalInput *input_data = input.data();
+  float4 *output_data = output.data();
+
+  /* Set concurrently by worker threads, so it has to be atomic: unsynchronized writes to a plain
+   * bool from several threads are a data race. It is only read after the loop has finished. */
+  std::atomic<bool> cancelled(false);
+
+  tbb::task_arena local_arena(device->info.cpu_threads);
+  local_arena.execute([&]() {
+    tbb::parallel_for(int64_t(0), work_size, [&](int64_t work_index) {
+      /* TODO: is this fast enough? */
+      if (progress_.get_cancel()) {
+        cancelled = true;
+        return;
+      }
+
+      /* The index of the calling thread within the arena selects its own kernel globals.
+       * NOTE(review): assumes the device provides at least one entry per arena thread;
+       * confirm against get_cpu_kernel_thread_globals(). */
+      const int thread_index = tbb::this_task_arena::current_thread_index();
+      KernelGlobals *kg = &kernel_thread_globals[thread_index];
+
+      switch (type) {
+        case SHADER_EVAL_DISPLACE:
+          kernels.shader_eval_displace(kg, input_data, output_data, work_index);
+          break;
+        case SHADER_EVAL_BACKGROUND:
+          kernels.shader_eval_background(kg, input_data, output_data, work_index);
+          break;
+      }
+    });
+  });
+
+  return !cancelled;
+}
+
+/* Evaluate shader `type` on the GPU `device`, one work item per element of `output`.
+ *
+ * Work is submitted in chunks with a synchronization after each one, so that cancellation can be
+ * checked between chunks.
+ *
+ * Returns false when `type` has no matching kernel, when no queue could be created, or when
+ * cancellation was requested; true once all chunks were executed. */
+bool ShaderEval::eval_gpu(Device *device,
+                          const ShaderEvalType type,
+                          device_vector<KernelShaderEvalInput> &input,
+                          device_vector<float4> &output)
+{
+  /* Find required kernel function. */
+  DeviceKernel kernel;
+  switch (type) {
+    case SHADER_EVAL_DISPLACE:
+      kernel = DEVICE_KERNEL_SHADER_EVAL_DISPLACE;
+      break;
+    case SHADER_EVAL_BACKGROUND:
+      kernel = DEVICE_KERNEL_SHADER_EVAL_BACKGROUND;
+      break;
+    default:
+      /* Never launch with an uninitialized kernel identifier. */
+      LOG(ERROR) << "Unsupported shader evaluation type " << static_cast<int>(type) << ".";
+      return false;
+  }
+
+  /* Create device queue.
+   * NOTE(review): guards against a device that hands back no queue; confirm which devices can
+   * do so. */
+  unique_ptr<DeviceQueue> queue = device->gpu_queue_create();
+  if (!queue) {
+    LOG(ERROR) << "Failed to create a device queue for shader evaluation.";
+    return false;
+  }
+  queue->init_execution();
+
+  /* Execute work on GPU in chunks, so we can cancel.
+   * TODO: query appropriate size from device. */
+  const int chunk_size = 65536;
+
+  /* The offset and size are handed to the kernel as `int` arguments. */
+  const int work_size = static_cast<int>(output.size());
+  void *d_input = (void *)input.device_pointer;
+  void *d_output = (void *)output.device_pointer;
+
+  int d_offset = 0;
+  while (d_offset < work_size) {
+    int d_work_size = min(chunk_size, work_size - d_offset);
+    void *args[] = {&d_input, &d_output, &d_offset, &d_work_size};
+
+    queue->enqueue(kernel, d_work_size, args);
+    queue->synchronize();
+
+    if (progress_.get_cancel()) {
+      return false;
+    }
+
+    /* Advance by what was actually submitted: the offset then never exceeds `work_size`, so it
+     * cannot overflow the way an unconditional `+= chunk_size` could for very large sizes. */
+    d_offset += d_work_size;
+  }
+
+  return true;
+}
+
+CCL_NAMESPACE_END