/* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

#include "integrator/path_trace_work_cpu.h"

#include "device/cpu/kernel.h"
#include "device/device.h"

#include "kernel/film/write.h"
#include "kernel/integrator/path_state.h"

#include "integrator/pass_accessor_cpu.h"
#include "integrator/path_trace_display.h"

#include "scene/scene.h"
#include "session/buffers.h"

#include "util/atomic.h"
#include "util/log.h"
#include "util/tbb.h"

CCL_NAMESPACE_BEGIN

/* Create TBB arena for execution of path tracing and rendering tasks. */
static inline tbb::task_arena local_tbb_arena_create(const Device *device)
{
  /* TODO: limit this to number of threads of CPU device, it may be smaller than
   * the system number of threads when we reduce the number of CPU threads in
   * CPU + GPU rendering to dedicate some cores to handling the GPU device. */
  return tbb::task_arena(device->info.cpu_threads);
}

/* Get CPUKernelThreadGlobals for the current thread. */
static inline CPUKernelThreadGlobals *kernel_thread_globals_get(
    vector<CPUKernelThreadGlobals> &kernel_thread_globals)
{
  const int thread_index = tbb::this_task_arena::current_thread_index();
  DCHECK_GE(thread_index, 0);
  DCHECK_LT(thread_index, kernel_thread_globals.size());

  return &kernel_thread_globals[thread_index];
}

PathTraceWorkCPU::PathTraceWorkCPU(Device *device,
                                   Film *film,
                                   DeviceScene *device_scene,
                                   bool *cancel_requested_flag)
    : PathTraceWork(device, film, device_scene, cancel_requested_flag),
      kernels_(Device::get_cpu_kernels())
{
  DCHECK_EQ(device->info.type, DEVICE_CPU);
}

void PathTraceWorkCPU::init_execution()
{
  /* Cache per-thread kernel globals. */
  device_->get_cpu_kernel_thread_globals(kernel_thread_globals_);
}

void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
                                      int start_sample,
                                      int samples_num,
                                      int sample_offset)
{
  const int64_t image_width = effective_buffer_params_.width;
  const int64_t image_height = effective_buffer_params_.height;
  const int64_t total_pixels_num = image_width * image_height;

  if (device_->profiler.active()) {
    for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
      kernel_globals.start_profiling();
    }
  }

  tbb::task_arena local_arena = local_tbb_arena_create(device_);
  local_arena.execute([&]() {
    parallel_for(int64_t(0), total_pixels_num, [&](int64_t work_index) {
      if (is_cancel_requested()) {
        return;
      }

      const int y = work_index / image_width;
      const int x = work_index - y * image_width;

      KernelWorkTile work_tile;
      work_tile.x = effective_buffer_params_.full_x + x;
      work_tile.y = effective_buffer_params_.full_y + y;
      work_tile.w = 1;
      work_tile.h = 1;
      work_tile.start_sample = start_sample;
      work_tile.sample_offset = sample_offset;
      work_tile.num_samples = 1;
      work_tile.offset = effective_buffer_params_.offset;
      work_tile.stride = effective_buffer_params_.stride;

      CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_get(kernel_thread_globals_);

      render_samples_full_pipeline(kernel_globals, work_tile, samples_num);
    });
  });

  if (device_->profiler.active()) {
    for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
      kernel_globals.stop_profiling();
    }
  }

  statistics.occupancy = 1.0f;
}

void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobalsCPU *kernel_globals,
                                                    const KernelWorkTile &work_tile,
                                                    const int samples_num)
{
  const bool has_bake = device_scene_->data.bake.use;

  IntegratorStateCPU integrator_states[2];

  IntegratorStateCPU *state = &integrator_states[0];
  IntegratorStateCPU *shadow_catcher_state = nullptr;

  if (device_scene_->data.integrator.has_shadow_catcher) {
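    /* The shadow-catcher split of a path is traced with its own integrator state. The two
     * states live in a single array because the CPU megakernel expects to find the split
     * state directly after the main one in memory. */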
    shadow_catcher_state = &integrator_states[1];
    path_state_init_queues(shadow_catcher_state);
  }

  KernelWorkTile sample_work_tile = work_tile;
  float *render_buffer = buffers_->buffer.data();

  for (int sample = 0; sample < samples_num; ++sample) {
    if (is_cancel_requested()) {
      break;
    }

    if (has_bake) {
      if (!kernels_.integrator_init_from_bake(
              kernel_globals, state, &sample_work_tile, render_buffer)) {
        break;
      }
    }
    else {
      if (!kernels_.integrator_init_from_camera(
              kernel_globals, state, &sample_work_tile, render_buffer)) {
        break;
      }
    }

    kernels_.integrator_megakernel(kernel_globals, state, render_buffer);

#ifdef WITH_PATH_GUIDING
    if (kernel_globals->data.integrator.train_guiding) {
      /* Push the generated sample data to the global sample data storage. */
      guiding_push_sample_data_to_global_storage(kernel_globals, state, render_buffer);
    }
#endif

    if (shadow_catcher_state) {
      kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer);
    }

    ++sample_work_tile.start_sample;
  }
}

void PathTraceWorkCPU::copy_to_display(PathTraceDisplay *display,
                                       PassMode pass_mode,
                                       int num_samples)
{
  half4 *rgba_half = display->map_texture_buffer();
  if (!rgba_half) {
    /* TODO(sergey): Look into using copy_to_display() if mapping failed. Might be needed for
     * some implementations of PathTraceDisplay which cannot map memory? */
    return;
  }

  const KernelFilm &kfilm = device_scene_->data.film;

  const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);

  const PassAccessorCPU pass_accessor(pass_access_info, kfilm.exposure, num_samples);

  PassAccessor::Destination destination = get_display_destination_template(display);
  destination.pixels_half_rgba = rgba_half;

  tbb::task_arena local_arena = local_tbb_arena_create(device_);
  local_arena.execute([&]() {
    pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
  });

  display->unmap_texture_buffer();
}

void PathTraceWorkCPU::destroy_gpu_resources(PathTraceDisplay * /*display*/)
{
}

bool PathTraceWorkCPU::copy_render_buffers_from_device()
{
  return buffers_->copy_from_device();
}

bool PathTraceWorkCPU::copy_render_buffers_to_device()
{
  buffers_->buffer.copy_to_device();
  return true;
}

bool PathTraceWorkCPU::zero_render_buffers()
{
  buffers_->zero();
  return true;
}

int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
{
  const int full_x = effective_buffer_params_.full_x;
  const int full_y = effective_buffer_params_.full_y;
  const int width = effective_buffer_params_.width;
  const int height = effective_buffer_params_.height;
  const int offset = effective_buffer_params_.offset;
  const int stride = effective_buffer_params_.stride;

  float *render_buffer = buffers_->buffer.data();

  uint num_active_pixels = 0;

  tbb::task_arena local_arena = local_tbb_arena_create(device_);

  /* Check convergence and do x-filter in a single `parallel_for`, to reduce threading
   * overhead. */
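  /* The filter is separable: the pass below checks each row for convergence and filters along X,
   * and a second pass filters along Y only when some pixels are still active. A single entry of
   * `kernel_thread_globals_` is shared by all threads here, which is presumably safe because the
   * convergence and filter kernels do not mutate per-thread state. */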
  local_arena.execute([&]() {
    parallel_for(full_y, full_y + height, [&](int y) {
      CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];

      bool row_converged = true;
      uint num_row_pixels_active = 0;
      for (int x = 0; x < width; ++x) {
        if (!kernels_.adaptive_sampling_convergence_check(
                kernel_globals, render_buffer, full_x + x, y, threshold, reset, offset, stride)) {
          ++num_row_pixels_active;
          row_converged = false;
        }
      }

      atomic_fetch_and_add_uint32(&num_active_pixels, num_row_pixels_active);

      if (!row_converged) {
        kernels_.adaptive_sampling_filter_x(
            kernel_globals, render_buffer, y, full_x, width, offset, stride);
      }
    });
  });

  if (num_active_pixels) {
    local_arena.execute([&]() {
      parallel_for(full_x, full_x + width, [&](int x) {
        CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
        kernels_.adaptive_sampling_filter_y(
            kernel_globals, render_buffer, x, full_y, height, offset, stride);
      });
    });
  }

  return num_active_pixels;
}

void PathTraceWorkCPU::cryptomatte_postproces()
{
  const int width = effective_buffer_params_.width;
  const int height = effective_buffer_params_.height;

  float *render_buffer = buffers_->buffer.data();

  tbb::task_arena local_arena = local_tbb_arena_create(device_);

  /* Post-process the cryptomatte passes for each pixel, one row per task. */
  local_arena.execute([&]() {
    parallel_for(0, height, [&](int y) {
      CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
      int pixel_index = y * width;

      for (int x = 0; x < width; ++x, ++pixel_index) {
        kernels_.cryptomatte_postprocess(kernel_globals, render_buffer, pixel_index);
      }
    });
  });
}

#ifdef WITH_PATH_GUIDING
/* NOTE: It seems that this is called before every rendering iteration/progression and not once
 * per render. Maybe we can find a way to call it only once per render. */
void PathTraceWorkCPU::guiding_init_kernel_globals(void *guiding_field,
                                                   void *sample_data_storage,
                                                   const bool train)
{
  /* Link the global guiding structures (e.g., Field and SampleStorage) to the per-thread
   * kernel globals. */
  for (int thread_index = 0; thread_index < kernel_thread_globals_.size(); thread_index++) {
    CPUKernelThreadGlobals &kg = kernel_thread_globals_[thread_index];
    openpgl::cpp::Field *field = (openpgl::cpp::Field *)guiding_field;

    /* Allocate sampling distributions. */
    kg.opgl_guiding_field = field;

#  if PATH_GUIDING_LEVEL >= 4
    if (kg.opgl_surface_sampling_distribution) {
      delete kg.opgl_surface_sampling_distribution;
      kg.opgl_surface_sampling_distribution = nullptr;
    }
    if (kg.opgl_volume_sampling_distribution) {
      delete kg.opgl_volume_sampling_distribution;
      kg.opgl_volume_sampling_distribution = nullptr;
    }

    if (field) {
      kg.opgl_surface_sampling_distribution = new openpgl::cpp::SurfaceSamplingDistribution(field);
      kg.opgl_volume_sampling_distribution = new openpgl::cpp::VolumeSamplingDistribution(field);
    }
#  endif

    /* Reserve storage for training. */
    kg.data.integrator.train_guiding = train;
    kg.opgl_sample_data_storage = (openpgl::cpp::SampleStorage *)sample_data_storage;

    if (train) {
      kg.opgl_path_segment_storage->Reserve(kg.data.integrator.transparent_max_bounce +
                                            kg.data.integrator.max_bounce + 3);
      kg.opgl_path_segment_storage->Clear();
    }
  }
}

void PathTraceWorkCPU::guiding_push_sample_data_to_global_storage(
    KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *ccl_restrict render_buffer)
{
#  ifdef WITH_CYCLES_DEBUG
  if (VLOG_WORK_IS_ON) {
    /* Check if the generated path segments contain valid values. */
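    /* (This validation is debug-only since it scans every stored segment; the assumption here
     * is that openpgl flags non-finite values such as NaNs, which would otherwise corrupt the
     * guiding field during training.) */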
    const bool validSegments = kg->opgl_path_segment_storage->ValidateSegments();
    if (!validSegments) {
      VLOG_WORK << "Guiding: invalid path segments!";
    }
  }

  /* Write debug render pass to validate that it matches the combined pass. */
  pgl_vec3f pgl_final_color = kg->opgl_path_segment_storage->CalculatePixelEstimate(false);
  const uint32_t render_pixel_index = INTEGRATOR_STATE(state, path, render_pixel_index);
  const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
                                        kernel_data.film.pass_stride;
  ccl_global float *buffer = render_buffer + render_buffer_offset;
  float3 final_color = make_float3(pgl_final_color.x, pgl_final_color.y, pgl_final_color.z);
  if (kernel_data.film.pass_guiding_color != PASS_UNUSED) {
    film_write_pass_float3(buffer + kernel_data.film.pass_guiding_color, final_color);
  }
#  else
  (void)state;
  (void)render_buffer;
#  endif

  /* Convert the path segment representation of the random walk into radiance samples. */
#  if PATH_GUIDING_LEVEL >= 2
  const bool use_direct_light = kernel_data.integrator.use_guiding_direct_light;
  const bool use_mis_weights = kernel_data.integrator.use_guiding_mis_weights;
  kg->opgl_path_segment_storage->PrepareSamples(
      false, nullptr, use_mis_weights, use_direct_light, false);
#  endif

#  ifdef WITH_CYCLES_DEBUG
  /* Check if the training/radiance samples generated by the path segment storage are valid. */
  if (VLOG_WORK_IS_ON) {
    const bool validSamples = kg->opgl_path_segment_storage->ValidateSamples();
    if (!validSamples) {
      VLOG_WORK << "Guiding: path segment storage contains invalid radiance/training samples!";
    }
  }
#  endif

#  if PATH_GUIDING_LEVEL >= 3
  /* Push radiance samples from the current random walk/path to the global sample storage. */
  size_t num_samples = 0;
  const openpgl::cpp::SampleData *samples = kg->opgl_path_segment_storage->GetSamples(num_samples);
  kg->opgl_sample_data_storage->AddSamples(samples, num_samples);
#  endif

  /* Clear storage for the current path, to be ready for the next path. */
  kg->opgl_path_segment_storage->Clear();
}
#endif

CCL_NAMESPACE_END