1 files changed, 281 insertions, 0 deletions
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
new file mode 100644
index 00000000000..b9a33b64051
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -0,0 +1,281 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace_work_cpu.h"
+
+#include "device/cpu/kernel.h"
+#include "device/device.h"
+
+#include "integrator/pass_accessor_cpu.h"
+
+#include "render/buffers.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+
+#include "util/util_atomic.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Create TBB arena for execution of path tracing and rendering tasks. */
+static inline tbb::task_arena local_tbb_arena_create(const Device *device)
+{
+  /* TODO: limit this to number of threads of CPU device, it may be smaller than
+   * the system number of threads when we reduce the number of CPU threads in
+   * CPU + GPU rendering to dedicate some cores to handling the GPU device. */
+  return tbb::task_arena(device->info.cpu_threads);
+}
+
+/* Get CPUKernelThreadGlobals for the current thread. */
+static inline CPUKernelThreadGlobals *kernel_thread_globals_get(
+    vector<CPUKernelThreadGlobals> &kernel_thread_globals)
+{
+  const int thread_index = tbb::this_task_arena::current_thread_index();
+  DCHECK_GE(thread_index, 0);
+  DCHECK_LE(thread_index, kernel_thread_globals.size());
+
+  return &kernel_thread_globals[thread_index];
+}
+
+PathTraceWorkCPU::PathTraceWorkCPU(Device *device,
+                                   Film *film,
+                                   DeviceScene *device_scene,
+                                   bool *cancel_requested_flag)
+    : PathTraceWork(device, film, device_scene, cancel_requested_flag),
+      kernels_(*(device->get_cpu_kernels()))
+{
+  DCHECK_EQ(device->info.type, DEVICE_CPU);
+}
+
+void PathTraceWorkCPU::init_execution()
+{
+  /* Cache per-thread kernel globals. */
+  device_->get_cpu_kernel_thread_globals(kernel_thread_globals_);
+}
+
+void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
+                                      int start_sample,
+                                      int samples_num)
+{
+  const int64_t image_width = effective_buffer_params_.width;
+  const int64_t image_height = effective_buffer_params_.height;
+  const int64_t total_pixels_num = image_width * image_height;
+
+  for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
+    kernel_globals.start_profiling();
+  }
+
+  tbb::task_arena local_arena = local_tbb_arena_create(device_);
+  local_arena.execute([&]() {
+    tbb::parallel_for(int64_t(0), total_pixels_num, [&](int64_t work_index) {
+      if (is_cancel_requested()) {
+        return;
+      }
+
+      const int y = work_index / image_width;
+      const int x = work_index - y * image_width;
+
+      KernelWorkTile work_tile;
+      work_tile.x = effective_buffer_params_.full_x + x;
+      work_tile.y = effective_buffer_params_.full_y + y;
+      work_tile.w = 1;
+      work_tile.h = 1;
+      work_tile.start_sample = start_sample;
+      work_tile.num_samples = 1;
+      work_tile.offset = effective_buffer_params_.offset;
+      work_tile.stride = effective_buffer_params_.stride;
+
+      CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_get(kernel_thread_globals_);
+
+      render_samples_full_pipeline(kernel_globals, work_tile, samples_num);
+    });
+  });
+
+  for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
+    kernel_globals.stop_profiling();
+  }
+
+  statistics.occupancy = 1.0f;
+}
+
+void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_globals,
+                                                    const KernelWorkTile &work_tile,
+                                                    const int samples_num)
+{
+  const bool has_shadow_catcher = device_scene_->data.integrator.has_shadow_catcher;
+  const bool has_bake = device_scene_->data.bake.use;
+
+  IntegratorStateCPU integrator_states[2] = {};
+
+  IntegratorStateCPU *state = &integrator_states[0];
+  IntegratorStateCPU *shadow_catcher_state = &integrator_states[1];
+
+  KernelWorkTile sample_work_tile = work_tile;
+  float *render_buffer = buffers_->buffer.data();
+
+  for (int sample = 0; sample < samples_num; ++sample) {
+    if (is_cancel_requested()) {
+      break;
+    }
+
+    if (has_bake) {
+      if (!kernels_.integrator_init_from_bake(
+              kernel_globals, state, &sample_work_tile, render_buffer)) {
+        break;
+      }
+    }
+    else {
+      if (!kernels_.integrator_init_from_camera(
+              kernel_globals, state, &sample_work_tile, render_buffer)) {
+        break;
+      }
+    }
+
+    kernels_.integrator_megakernel(kernel_globals, state, render_buffer);
+
+    if (has_shadow_catcher) {
+      kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer);
+    }
+
+    ++sample_work_tile.start_sample;
+  }
+}
+
+void PathTraceWorkCPU::copy_to_gpu_display(GPUDisplay *gpu_display,
+                                           PassMode pass_mode,
+                                           int num_samples)
+{
+  half4 *rgba_half = gpu_display->map_texture_buffer();
+  if (!rgba_half) {
+    /* TODO(sergey): Look into using copy_to_gpu_display() if mapping failed. Might be needed for
+     * some implementations of GPUDisplay which can not map memory? */
+    return;
+  }
+
+  const KernelFilm &kfilm = device_scene_->data.film;
+
+  const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
+
+  const PassAccessorCPU pass_accessor(pass_access_info, kfilm.exposure, num_samples);
+
+  PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display);
+  destination.pixels_half_rgba = rgba_half;
+
+  tbb::task_arena local_arena = local_tbb_arena_create(device_);
+  local_arena.execute([&]() {
+    pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
+  });
+
+  gpu_display->unmap_texture_buffer();
+}
+
+void PathTraceWorkCPU::destroy_gpu_resources(GPUDisplay * /*gpu_display*/)
+{
+}
+
+bool PathTraceWorkCPU::copy_render_buffers_from_device()
+{
+  return buffers_->copy_from_device();
+}
+
+bool PathTraceWorkCPU::copy_render_buffers_to_device()
+{
+  buffers_->buffer.copy_to_device();
+  return true;
+}
+
+bool PathTraceWorkCPU::zero_render_buffers()
+{
+  buffers_->zero();
+  return true;
+}
+
+int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
+{
+  const int full_x = effective_buffer_params_.full_x;
+  const int full_y = effective_buffer_params_.full_y;
+  const int width = effective_buffer_params_.width;
+  const int height = effective_buffer_params_.height;
+  const int offset = effective_buffer_params_.offset;
+  const int stride = effective_buffer_params_.stride;
+
+  float *render_buffer = buffers_->buffer.data();
+
+  uint num_active_pixels = 0;
+
+  tbb::task_arena local_arena = local_tbb_arena_create(device_);
+
+  /* Check convergency and do x-filter in a single `parallel_for`, to reduce threading overhead. */
+  local_arena.execute([&]() {
+    tbb::parallel_for(full_y, full_y + height, [&](int y) {
+      CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+
+      bool row_converged = true;
+      uint num_row_pixels_active = 0;
+      for (int x = 0; x < width; ++x) {
+        if (!kernels_.adaptive_sampling_convergence_check(
+                kernel_globals, render_buffer, full_x + x, y, threshold, reset, offset, stride)) {
+          ++num_row_pixels_active;
+          row_converged = false;
+        }
+      }
+
+      atomic_fetch_and_add_uint32(&num_active_pixels, num_row_pixels_active);
+
+      if (!row_converged) {
+        kernels_.adaptive_sampling_filter_x(
+            kernel_globals, render_buffer, y, full_x, width, offset, stride);
+      }
+    });
+  });
+
+  if (num_active_pixels) {
+    local_arena.execute([&]() {
+      tbb::parallel_for(full_x, full_x + width, [&](int x) {
+        CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+        kernels_.adaptive_sampling_filter_y(
+            kernel_globals, render_buffer, x, full_y, height, offset, stride);
+      });
+    });
+  }
+
+  return num_active_pixels;
+}
+
+void PathTraceWorkCPU::cryptomatte_postproces()
+{
+  const int width = effective_buffer_params_.width;
+  const int height = effective_buffer_params_.height;
+
+  float *render_buffer = buffers_->buffer.data();
+
+  tbb::task_arena local_arena = local_tbb_arena_create(device_);
+
+  /* Check convergency and do x-filter in a single `parallel_for`, to reduce threading overhead. */
+  local_arena.execute([&]() {
+    tbb::parallel_for(0, height, [&](int y) {
+      CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+      int pixel_index = y * width;
+
+      for (int x = 0; x < width; ++x, ++pixel_index) {
+        kernels_.cryptomatte_postprocess(kernel_globals, render_buffer, pixel_index);
+      }
+    });
+  });
+}
+
+CCL_NAMESPACE_END