/*
 * Copyright 2011-2017 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* CUDA kernel entry points */

#ifdef __CUDA_ARCH__

#include "util_cuda_capabilities.h"
#include "../../filter_compat_cuda.h"
#include "../../filter_kernel.h"

/* Kernels.
 *
 * Each entry point maps one CUDA thread to one pixel: it derives the pixel
 * coordinate from the launch grid, discards threads that fall outside the
 * target rect and forwards the rest to the matching device-side
 * kernel_filter_* function. */

/* Split the shadow pass into two half buffers and estimate their variance. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_divide_shadow(int sample, TilesInfo *tiles,
                                 float *unfilteredA, float *unfilteredB,
                                 float *sampleVariance, float *sampleVarianceV,
                                 float *bufferVariance,
                                 int4 prefilter_rect,
                                 int buffer_pass_stride,
                                 int buffer_denoising_offset,
                                 bool use_gradients)
{
    int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
    int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
    if(x < prefilter_rect.z && y < prefilter_rect.w) {
        kernel_filter_divide_shadow(sample, tiles, x, y,
                                    unfilteredA, unfilteredB,
                                    sampleVariance, sampleVarianceV,
                                    bufferVariance, prefilter_rect,
                                    buffer_pass_stride,
                                    buffer_denoising_offset,
                                    use_gradients);
    }
}

/* Fetch one denoising feature pass (mean and variance) from the render
 * buffers. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_get_feature(int sample, TilesInfo *tiles,
                               int m_offset, int v_offset,
                               float *mean, float *variance,
                               int4 prefilter_rect,
                               int buffer_pass_stride,
                               int buffer_denoising_offset,
                               bool use_cross_denoising)
{
    int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
    int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
    if(x < prefilter_rect.z && y < prefilter_rect.w) {
        kernel_filter_get_feature(sample, tiles,
                                  m_offset, v_offset, x, y,
                                  mean, variance, prefilter_rect,
                                  buffer_pass_stride,
                                  buffer_denoising_offset,
                                  use_cross_denoising);
    }
}

/* Combine the two half buffers into a mean and a variance estimate. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_combine_halves(float *mean, float *variance,
                                  float *a, float *b,
                                  int4 prefilter_rect, int r)
{
    int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
    int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
    if(x < prefilter_rect.z && y < prefilter_rect.w) {
        kernel_filter_combine_halves(x, y, mean, variance, a, b,
                                     prefilter_rect, r);
    }
}

/* Build the per-pixel feature space transform and its rank from the
 * feature window around each pixel. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_construct_transform(int sample,
                                       float const* __restrict__ buffer,
                                       float *transform, int *rank,
                                       int4 filter_area, int4 rect,
                                       int half_window, float pca_threshold)
{
    int x = blockDim.x*blockIdx.x + threadIdx.x;
    int y = blockDim.y*blockIdx.y + threadIdx.y;
    if(x < filter_area.z && y < filter_area.w) {
        /* Offsets into the per-pixel transform and rank storage. */
        int *l_rank = rank + y*filter_area.z + x;
        float *l_transform = transform + y*filter_area.z + x;
        kernel_filter_construct_transform(sample, buffer,
                                          x + filter_area.x, y + filter_area.y,
                                          rect, l_transform, l_rank,
                                          half_window, pca_threshold,
                                          filter_area.z*filter_area.w,
                                          threadIdx.y*blockDim.x + threadIdx.x);
    }
}

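/* Divide the combined pass by the sample count. */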
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_divide_combined(float *buffers, int sample,
                                   int offset, int stride,
                                   int4 filter_area, int pass_stride,
                                   int no_denoising_offset)
{
    int x = blockDim.x*blockIdx.x + threadIdx.x;
    int y = blockDim.y*blockIdx.y + threadIdx.y;
    if(x < filter_area.z && y < filter_area.w) {
        kernel_filter_divide_combined(x + filter_area.x, y + filter_area.y,
                                      sample, buffers, offset, stride,
                                      pass_stride, no_denoising_offset);
    }
}

/* Non-local means (NLM): compute the per-pixel difference between the
 * image and its copy shifted by (dx, dy). */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
                                       float ccl_readonly_ptr weightImage,
                                       float ccl_readonly_ptr varianceImage,
                                       float *differenceImage,
                                       int4 rect, int w,
                                       int channel_offset,
                                       float a, float k_2)
{
    int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
    int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
    if(x < rect.z && y < rect.w) {
        kernel_filter_nlm_calc_difference(x, y, dx, dy,
                                          weightImage, varianceImage,
                                          differenceImage, rect, w,
                                          channel_offset, a, k_2);
    }
}

/* NLM: blur the difference image with patch radius f. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_blur(float ccl_readonly_ptr differenceImage,
                            float *outImage, int4 rect, int w, int f)
{
    int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
    int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
    if(x < rect.z && y < rect.w) {
        kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
    }
}

/* NLM: derive filter weights from the blurred difference image. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_calc_weight(float ccl_readonly_ptr differenceImage,
                                   float *outImage, int4 rect, int w, int f)
{
    int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
    int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
    if(x < rect.z && y < rect.w) {
        kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage,
                                      rect, w, f);
    }
}

/* NLM: accumulate the weighted contribution of the (dx, dy)-shifted image
 * into the output and weight accumulation buffers. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_update_output(int dx, int dy,
                                     float ccl_readonly_ptr differenceImage,
                                     float ccl_readonly_ptr image,
                                     float *outImage, float *accumImage,
                                     int4 rect, int w, int f)
{
    int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
    int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
    if(x < rect.z && y < rect.w) {
        kernel_filter_nlm_update_output(x, y, dx, dy,
                                        differenceImage, image,
                                        outImage, accumImage,
                                        rect, w, f);
    }
}

/* NLM: normalize the accumulated output by the accumulated weights. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_normalize(float *outImage,
                                 float ccl_readonly_ptr accumImage,
                                 int4 rect, int w)
{
    int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
    int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
    if(x < rect.z && y < rect.w) {
        kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
    }
}

/* NLM: accumulate the weighted normal equations (XtWX, XtWY) of the
 * per-pixel least-squares fit. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
                                         float ccl_readonly_ptr differenceImage,
                                         float ccl_readonly_ptr buffer,
                                         float *color_pass,
                                         float *variance_pass,
                                         float const* __restrict__ transform,
                                         int *rank,
                                         float *XtWX, float3 *XtWY,
                                         int4 rect, int4 filter_rect,
                                         int w, int h, int f)
{
    /* Clip the processed region of the filter rect against the frame rect. */
    int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);
    int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);
    if(x < min(filter_rect.z, rect.z-filter_rect.x) &&
       y < min(filter_rect.w, rect.w-filter_rect.y)) {
        kernel_filter_nlm_construct_gramian(x, y, dx, dy,
                                            differenceImage, buffer,
                                            color_pass, variance_pass,
                                            transform, rank,
                                            XtWX, XtWY,
                                            rect, filter_rect,
                                            w, h, f);
    }
}

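/* Solve the per-pixel linear system accumulated in XtWX and XtWY and write
 * the filtered result into the render buffer. */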
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_finalize(int w, int h,
                            float *buffer, int *rank,
                            float *XtWX, float3 *XtWY,
                            int4 filter_area, int4 buffer_params,
                            int sample)
{
    int x = blockDim.x*blockIdx.x + threadIdx.x;
    int y = blockDim.y*blockIdx.y + threadIdx.y;
    if(x < filter_area.z && y < filter_area.w) {
        /* Advance the storage pointers to this pixel's entry. */
        int storage_ofs = y*filter_area.z + x;
        rank += storage_ofs;
        XtWX += storage_ofs;
        XtWY += storage_ofs;
        kernel_filter_finalize(x, y, w, h, buffer, rank,
                               filter_area.z*filter_area.w,
                               XtWX, XtWY, buffer_params, sample);
    }
}

#endif  /* __CUDA_ARCH__ */