/* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

#pragma once

#include "kernel/types.h"

#include "util/atomic.h"

CCL_NAMESPACE_BEGIN

/* Control Flow
 *
 * Utilities for control flow between kernels. The implementation is different between CPU and
 * GPU devices. For the latter, part of the logic is handled on the host side with wavefronts.
 *
 * There is a main path for regular path tracing of camera rays. Shadow rays for next event
 * estimation branch off from this into their own path, which may be computed in parallel
 * while the main path continues. Additionally, shading kernels are sorted using a key for
 * coherence.
 *
 * Each kernel on the main path must call exactly one of these functions; they may not be
 * called multiple times from the same kernel:
 *
 * integrator_path_init(kg, state, next_kernel)
 * integrator_path_next(kg, state, current_kernel, next_kernel)
 * integrator_path_terminate(kg, state, current_kernel)
 *
 * For the shadow path similar functions are used, and again each shadow kernel must call
 * one of them, and only once.
 */

ccl_device_forceinline bool integrator_path_is_terminated(ConstIntegratorState state)
{
  return INTEGRATOR_STATE(state, path, queued_kernel) == 0;
}

ccl_device_forceinline bool integrator_shadow_path_is_terminated(ConstIntegratorShadowState state)
{
  return INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0;
}

#ifdef __KERNEL_GPU__

ccl_device_forceinline void integrator_path_init(KernelGlobals kg,
                                                 IntegratorState state,
                                                 const DeviceKernel next_kernel)
{
  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
}

ccl_device_forceinline void integrator_path_next(KernelGlobals kg,
                                                 IntegratorState state,
                                                 const DeviceKernel current_kernel,
                                                 const DeviceKernel next_kernel)
{
  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
                              1);
  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
}

ccl_device_forceinline void integrator_path_terminate(KernelGlobals kg,
                                                      IntegratorState state,
                                                      const DeviceKernel current_kernel)
{
  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
                              1);
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
}

ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init(
    KernelGlobals kg, IntegratorState state, const DeviceKernel next_kernel, const bool is_ao)
{
  IntegratorShadowState shadow_state = atomic_fetch_and_add_uint32(
      &kernel_integrator_state.next_shadow_path_index[0], 1);
  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
  return shadow_state;
}

ccl_device_forceinline void integrator_shadow_path_next(KernelGlobals kg,
                                                        IntegratorShadowState state,
                                                        const DeviceKernel current_kernel,
                                                        const DeviceKernel next_kernel)
{
  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
                              1);
  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel;
}

ccl_device_forceinline void integrator_shadow_path_terminate(KernelGlobals kg,
                                                             IntegratorShadowState state,
                                                             const DeviceKernel current_kernel)
{
  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
                              1);
  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
}
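
/* Illustrative call pattern (sketch only; `example_shade_kernel`, `need_shadow_ray` and
 * `continue_path` are placeholders, and the real kernels choose their successors from the
 * sampled closures and path state). A GPU shading kernel does its work, optionally branches
 * off a shadow path, and then calls exactly one of the functions above, exactly once:
 *
 *   ccl_device void example_shade_kernel(KernelGlobals kg, IntegratorState state)
 *   {
 *     // ... evaluate the shader, sample next event estimation and the BSDF ...
 *
 *     if (need_shadow_ray) {
 *       // Branch off a shadow path; it is queued separately and may run in parallel
 *       // with the main path.
 *       IntegratorShadowState shadow_state = integrator_shadow_path_init(
 *           kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, false);
 *       // ... write the shadow ray into shadow_state ...
 *     }
 *
 *     if (continue_path) {
 *       integrator_path_next(kg, state,
 *                            DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
 *                            DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
 *     }
 *     else {
 *       integrator_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE);
 *     }
 *   }
 */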

/* Sort first by truncated state index (for good locality), then by key (for good coherence). */
#  define INTEGRATOR_SORT_KEY(key, state) \
    (key + kernel_data.max_shaders * (state / kernel_integrator_state.sort_partition_divisor))
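
/* Worked example of the sort key above (the numbers are illustrative assumptions, not values
 * used by any particular device): with kernel_data.max_shaders == 64 and
 * kernel_integrator_state.sort_partition_divisor == 1024, a path in state slot 2500 that hits
 * shader 5 gets
 *
 *   INTEGRATOR_SORT_KEY(5, 2500) == 5 + 64 * (2500 / 1024) == 5 + 64 * 2 == 133
 *
 * so paths are grouped first into partitions of 1024 consecutive state indices (locality),
 * then by shader key within each partition (coherence). */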

ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
                                                        IntegratorState state,
                                                        const DeviceKernel next_kernel,
                                                        const uint32_t key)
{
  const int key_ = INTEGRATOR_SORT_KEY(key, state);
  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
  INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
  atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
}

ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg,
                                                        IntegratorState state,
                                                        const DeviceKernel current_kernel,
                                                        const DeviceKernel next_kernel,
                                                        const uint32_t key)
{
  const int key_ = INTEGRATOR_SORT_KEY(key, state);
  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
                              1);
  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
  INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
  atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
}

#else

ccl_device_forceinline void integrator_path_init(KernelGlobals kg,
                                                 IntegratorState state,
                                                 const DeviceKernel next_kernel)
{
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
}

ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
                                                        IntegratorState state,
                                                        const DeviceKernel next_kernel,
                                                        const uint32_t key)
{
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
  (void)key;
}

ccl_device_forceinline void integrator_path_next(KernelGlobals kg,
                                                 IntegratorState state,
                                                 const DeviceKernel current_kernel,
                                                 const DeviceKernel next_kernel)
{
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
  (void)current_kernel;
}

ccl_device_forceinline void integrator_path_terminate(KernelGlobals kg,
                                                      IntegratorState state,
                                                      const DeviceKernel current_kernel)
{
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
  (void)current_kernel;
}

ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg,
                                                        IntegratorState state,
                                                        const DeviceKernel current_kernel,
                                                        const DeviceKernel next_kernel,
                                                        const uint32_t key)
{
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
  (void)key;
  (void)current_kernel;
}

ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init(
    KernelGlobals kg, IntegratorState state, const DeviceKernel next_kernel, const bool is_ao)
{
  IntegratorShadowState shadow_state = (is_ao) ? &state->ao : &state->shadow;
  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
  return shadow_state;
}

ccl_device_forceinline void integrator_shadow_path_next(KernelGlobals kg,
                                                        IntegratorShadowState state,
                                                        const DeviceKernel current_kernel,
                                                        const DeviceKernel next_kernel)
{
  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel;
  (void)current_kernel;
}

ccl_device_forceinline void integrator_shadow_path_terminate(KernelGlobals kg,
                                                             IntegratorShadowState state,
                                                             const DeviceKernel current_kernel)
{
  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
  (void)current_kernel;
}

#endif

CCL_NAMESPACE_END