/*
 * Copyright 2011-2017 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

CCL_NAMESPACE_BEGIN

/* Add the contribution of one neighbor pixel to the weighted least-squares system.
 *
 * The center pixel p lies at (x, y) and the neighbor pixel q at (x + dx, y + dy); the
 * address of q is additionally shifted by frame_offset. A design row is built for the
 * pair by filter_get_design_row_transform() and then accumulated into XtWX with the
 * given weight, while XtWY receives the same row scaled by the weighted color of q.
 *
 * The call is a no-op when weight is below 1e-3 or when feature 0 of q is negative.
 * On GPU builds the accumulators are addressed with storage_stride, otherwise they are
 * treated as densely packed. */
ccl_device_inline void kernel_filter_construct_gramian(int x,
                                                       int y,
                                                       int storage_stride,
                                                       int dx,
                                                       int dy,
                                                       int t,
                                                       int buffer_stride,
                                                       int pass_stride,
                                                       int frame_offset,
                                                       bool use_time,
                                                       const ccl_global float *ccl_restrict buffer,
                                                       const ccl_global float *ccl_restrict
                                                           transform,
                                                       ccl_global int *rank,
                                                       float weight,
                                                       ccl_global float *XtWX,
                                                       ccl_global float3 *XtWY,
                                                       int localIdx)
{
  /* A negligible weight would not change the system in any meaningful way. */
  if (weight < 1e-3f) {
    return;
  }

#ifndef __KERNEL_GPU__
  /* Without __KERNEL_GPU__ the storage is not strided, so the argument is unused. */
  (void)storage_stride;
  const int stride = 1;
#else
  const int stride = storage_stride;
#endif

  /* Start of the data of the center pixel p and of the neighbor pixel q. */
  const ccl_global float *center_pixel = buffer + (y * buffer_stride + x);
  const ccl_global float *neighbor_pixel = buffer + ((y + dy) * buffer_stride + (x + dx) +
                                                     frame_offset);

#ifdef __KERNEL_CUDA__
  /* Each localIdx works on its own slice of a single ccl_local array. */
  ccl_local float design_row_storage[(DENOISE_FEATURES + 1) * CCL_MAX_LOCAL_SIZE];
  ccl_local_param float *design_row = design_row_storage + localIdx * (DENOISE_FEATURES + 1);
#else
  float design_row[DENOISE_FEATURES + 1];
#endif

  const float3 neighbor_color = filter_get_color(neighbor_pixel, pass_stride);

  /* Prefiltering marks outlier pixels with a negative first feature; leave those out. */
  if (ccl_get_feature(neighbor_pixel, 0) < 0.0f) {
    return;
  }

  const int transform_rank = *rank;
  filter_get_design_row_transform(make_int3(x, y, t),
                                  center_pixel,
                                  make_int3(x + dx, y + dy, t),
                                  neighbor_pixel,
                                  pass_stride,
                                  transform_rank,
                                  design_row,
                                  transform,
                                  stride,
                                  use_time);

  /* The system has one more row/column than the rank of the transform. */
  const int system_size = transform_rank + 1;
  const float3 weighted_color = weight * neighbor_color;

#ifndef __KERNEL_GPU__
  math_trimatrix_add_gramian(XtWX, system_size, design_row, weight);
  math_vec3_add(XtWY, system_size, design_row, weighted_color);
#else
  math_trimatrix_add_gramian_strided(XtWX, system_size, design_row, weight, stride);
  math_vec3_add_strided(XtWY, system_size, design_row, weighted_color, stride);
#endif
}

/* Solve the accumulated linear system of pixel (x, y) and store the resulting color.
 *
 * XtWX and XtWY are solved in place with a system size of *rank + 1, and the first
 * entry of the solution is used as the pixel color. If that color is not finite or has
 * a component below -0.01, the weighted mean XtWY[0] / XtWX[0] (taken before solving)
 * is used instead. The color is then clamped to non-negative values, passed through
 * color_highlight_uncompress() and written to the first three floats of the pixel.
 *
 * The pixel is located at (y * buffer_params.y + x + buffer_params.x) * buffer_params.z
 * floats into buffer. If buffer_params.w is non-negative the color is multiplied by
 * sample, and if it is positive the three floats at that offset inside the pixel are
 * added as well.
 *
 * Nothing is written when XtWX[0] is below 1e-3. */
ccl_device_inline void kernel_filter_finalize(int x,
                                              int y,
                                              ccl_global float *buffer,
                                              ccl_global int *rank,
                                              int storage_stride,
                                              ccl_global float *XtWX,
                                              ccl_global float3 *XtWY,
                                              int4 buffer_params,
                                              int sample)
{
#ifndef __KERNEL_GPU__
  /* Without __KERNEL_GPU__ the storage is not strided, so the argument is unused. */
  (void)storage_stride;
  const int stride = 1;
#else
  const int stride = storage_stride;
#endif

  const float weight_sum = XtWX[0];
  if (weight_sum < 1e-3f) {
    /* Too little information was gathered to compute a denoised value,
     * so the pixel is left untouched. */
    return;
  }

  /* Weighted average of the pixel colors (in essence the NLM-filtered image). It must be
   * computed before solving, and serves as the fallback when the linear model breaks
   * down numerically or yields nonsensical negative values. */
  const float3 averaged_color = XtWY[0] / weight_sum;

  math_trimatrix_vec3_solve(XtWX, XtWY, (*rank) + 1, stride);

  float3 denoised = XtWY[0];
  const bool has_negative_channel = (denoised.x < -0.01f) || (denoised.y < -0.01f) ||
                                    (denoised.z < -0.01f);
  if (!isfinite3_safe(denoised) || has_negative_channel) {
    denoised = averaged_color;
  }

  /* Remove remaining negative values, then undo the highlight compression transform. */
  denoised = max(denoised, make_float3(0.0f, 0.0f, 0.0f));
  denoised = color_highlight_uncompress(denoised);

  const int pixel_index = y * buffer_params.y + x + buffer_params.x;
  ccl_global float *out_pixel = buffer + pixel_index * buffer_params.z;

  const int extra_offset = buffer_params.w;
  if (extra_offset >= 0) {
    denoised *= sample;
    if (extra_offset > 0) {
      /* Add the three floats stored at extra_offset inside the same pixel. */
      denoised.x += out_pixel[extra_offset + 0];
      denoised.y += out_pixel[extra_offset + 1];
      denoised.z += out_pixel[extra_offset + 2];
    }
  }

  out_pixel[0] = denoised.x;
  out_pixel[1] = denoised.y;
  out_pixel[2] = denoised.z;
}

CCL_NAMESPACE_END