/*
 * Copyright 2011-2017 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* CUDA kernel entry points */

#ifdef __CUDA_ARCH__

#include "kernel_config.h"

#include "kernel/kernel_compat_cuda.h"
#include "kernel/filter/filter_kernel.h"

/* kernels */

/* Copy the needed region of each input pass from the 3x3 grid of neighboring
 * tiles into one contiguous buffer. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_copy_input(float *buffer,
                              CCL_FILTER_TILE_INFO,
                              int4 prefilter_rect,
                              int buffer_pass_stride)
{
  int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
  int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
  if(x < prefilter_rect.z && y < prefilter_rect.w) {
    /* Find which of the nine tiles contains the pixel. */
    int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
    int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
    int itile = ytile * 3 + xtile;
    float *const in = ((float *)ccl_get_tile_buffer(itile)) +
                      (tile_info->offsets[itile] + y * tile_info->strides[itile] + x) * buffer_pass_stride;
    buffer += ((y - prefilter_rect.y) * (prefilter_rect.z - prefilter_rect.x) +
               (x - prefilter_rect.x)) * buffer_pass_stride;
    for(int i = 0; i < buffer_pass_stride; ++i) {
      buffer[i] = in[i];
    }
  }
}

/* Unpack up to three 3-channel passes from the render buffer into a packed
 * RGB array; the second and third passes are divided by the sample count. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_convert_to_rgb(float *rgb, float *buf,
                                  int sw, int sh,
                                  int stride, int pass_stride,
                                  int3 pass_offset,
                                  int num_inputs,
                                  int num_samples)
{
  int x = blockDim.x*blockIdx.x + threadIdx.x;
  int y = blockDim.y*blockIdx.y + threadIdx.y;
  if(x < sw && y < sh) {
    if(num_inputs > 0) {
      float *in = buf + x * pass_stride + (y * stride + pass_offset.x) / sizeof(float);
      float *out = rgb + (x + y * sw) * 3;
      out[0] = in[0];
      out[1] = in[1];
      out[2] = in[2];
    }
    if(num_inputs > 1) {
      float *in = buf + x * pass_stride + (y * stride + pass_offset.y) / sizeof(float);
      float *out = rgb + (x + y * sw) * 3 + (sw * sh) * 3;
      out[0] = in[0] / num_samples;
      out[1] = in[1] / num_samples;
      out[2] = in[2] / num_samples;
    }
    if(num_inputs > 2) {
      float *in = buf + x * pass_stride + (y * stride + pass_offset.z) / sizeof(float);
      float *out = rgb + (x + y * sw) * 3 + (sw * sh * 2) * 3;
      out[0] = in[0] / num_samples;
      out[1] = in[1] / num_samples;
      out[2] = in[2] / num_samples;
    }
  }
}

/* Scatter a packed RGB image back into the strided render buffer. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_convert_from_rgb(float *rgb, float *buf,
                                    int ix, int iy, int iw, int ih,
                                    int sx, int sy, int sw, int sh,
                                    int offset, int stride,
                                    int pass_stride)
{
  int x = blockDim.x*blockIdx.x + threadIdx.x;
  int y = blockDim.y*blockIdx.y + threadIdx.y;
  if(x < sw && y < sh) {
    float *in = rgb + ((ix + x) + (iy + y) * iw) * 3;
    float *out = buf + (offset + (sx + x) + (sy + y) * stride) * pass_stride;
    out[0] = in[0];
    out[1] = in[1];
    out[2] = in[2];
  }
}
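/* Note on launch geometry: the prefilter kernels derive their pixel coordinate
 * from prefilter_rect, whose x/y components hold the lower bound of the
 * rectangle and whose z/w components hold the (exclusive) upper bound, so the
 * launch grid has to cover (prefilter_rect.z - prefilter_rect.x) by
 * (prefilter_rect.w - prefilter_rect.y) threads; threads outside the rectangle
 * exit at the bounds test. A minimal host-side sketch, assuming a 16x16 block
 * shape and hypothetical variables rect and stream:
 *
 *   dim3 threads(16, 16);
 *   dim3 blocks((rect.z - rect.x + threads.x - 1) / threads.x,
 *               (rect.w - rect.y + threads.y - 1) / threads.y);
 *   kernel_cuda_filter_divide_shadow<<<blocks, threads, 0, stream>>>(...);
 */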
/* The denoising prefilter kernels below are thin per-pixel wrappers: they
 * derive the pixel coordinate from the launch grid, clip it against the
 * target rectangle and forward to the matching kernel_filter_* implementation
 * from kernel/filter/filter_kernel.h. */

extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_divide_shadow(int sample, CCL_FILTER_TILE_INFO,
                                 float *unfilteredA, float *unfilteredB,
                                 float *sampleVariance, float *sampleVarianceV,
                                 float *bufferVariance,
                                 int4 prefilter_rect,
                                 int buffer_pass_stride,
                                 int buffer_denoising_offset)
{
  int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
  int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
  if(x < prefilter_rect.z && y < prefilter_rect.w) {
    kernel_filter_divide_shadow(sample, tile_info,
                                x, y,
                                unfilteredA, unfilteredB,
                                sampleVariance, sampleVarianceV, bufferVariance,
                                prefilter_rect,
                                buffer_pass_stride,
                                buffer_denoising_offset);
  }
}

extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_get_feature(int sample, CCL_FILTER_TILE_INFO,
                               int m_offset, int v_offset,
                               float *mean, float *variance, float scale,
                               int4 prefilter_rect,
                               int buffer_pass_stride,
                               int buffer_denoising_offset)
{
  int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
  int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
  if(x < prefilter_rect.z && y < prefilter_rect.w) {
    kernel_filter_get_feature(sample, tile_info,
                              m_offset, v_offset,
                              x, y,
                              mean, variance, scale,
                              prefilter_rect,
                              buffer_pass_stride,
                              buffer_denoising_offset);
  }
}

extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_write_feature(int sample,
                                 int4 buffer_params, int4 filter_area,
                                 float *from, float *buffer,
                                 int out_offset, int4 prefilter_rect)
{
  int x = blockDim.x*blockIdx.x + threadIdx.x;
  int y = blockDim.y*blockIdx.y + threadIdx.y;
  if(x < filter_area.z && y < filter_area.w) {
    kernel_filter_write_feature(sample,
                                x + filter_area.x, y + filter_area.y,
                                buffer_params,
                                from, buffer,
                                out_offset, prefilter_rect);
  }
}

extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_detect_outliers(float *image, float *variance,
                                   float *depth, float *output,
                                   int4 prefilter_rect, int pass_stride)
{
  int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
  int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
  if(x < prefilter_rect.z && y < prefilter_rect.w) {
    kernel_filter_detect_outliers(x, y, image, variance, depth, output,
                                  prefilter_rect, pass_stride);
  }
}

extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_combine_halves(float *mean, float *variance,
                                  float *a, float *b,
                                  int4 prefilter_rect, int r)
{
  int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
  int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
  if(x < prefilter_rect.z && y < prefilter_rect.w) {
    kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
  }
}

/* Build the per-pixel feature-space transform via PCA. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
                                       CCL_FILTER_TILE_INFO,
                                       float *transform, int *rank,
                                       int4 filter_area, int4 rect,
                                       int radius, float pca_threshold,
                                       int pass_stride, int frame_stride,
                                       bool use_time)
{
  int x = blockDim.x*blockIdx.x + threadIdx.x;
  int y = blockDim.y*blockIdx.y + threadIdx.y;
  if(x < filter_area.z && y < filter_area.w) {
    int *l_rank = rank + y*filter_area.z + x;
    float *l_transform = transform + y*filter_area.z + x;
    kernel_filter_construct_transform(buffer, tile_info,
                                      x + filter_area.x, y + filter_area.y,
                                      rect,
                                      pass_stride, frame_stride, use_time,
                                      l_transform, l_rank,
                                      radius, pca_threshold,
                                      filter_area.z*filter_area.w,
                                      threadIdx.y*blockDim.x + threadIdx.x);
  }
}
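/* Storage layout note: kernel_cuda_filter_construct_transform addresses the
 * per-pixel results as transform + y*filter_area.z + x and passes
 * filter_area.z*filter_area.w as the element stride, which implies a
 * structure-of-arrays layout across the filter area. Under that assumption,
 * element i of the transform of pixel (x, y) would be read back as:
 *
 *   int pixel = y*filter_area.z + x;
 *   float t_i = transform[pixel + i*filter_area.z*filter_area.w];
 */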
/* Non-local means kernels: get_nlm_coords() and get_nlm_coords_window() map
 * the launch grid to the pixel coordinate and pixel offset each thread works
 * on, after which the work is forwarded to the corresponding
 * kernel_filter_nlm_* implementation. */

extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
                                       const float *ccl_restrict variance_image,
                                       const float *ccl_restrict scale_image,
                                       float *difference_image,
                                       int w, int h,
                                       int stride, int pass_stride,
                                       int r, int channel_offset,
                                       int frame_offset,
                                       float a, float k_2)
{
  int4 co, rect;
  int ofs;
  if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
    kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
                                      weight_image, variance_image, scale_image,
                                      difference_image + ofs,
                                      rect, stride,
                                      channel_offset, frame_offset,
                                      a, k_2);
  }
}

extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image,
                            float *out_image,
                            int w, int h,
                            int stride, int pass_stride,
                            int r, int f)
{
  int4 co, rect;
  int ofs;
  if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
    kernel_filter_nlm_blur(co.x, co.y,
                           difference_image + ofs, out_image + ofs,
                           rect, stride, f);
  }
}

extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
                                   float *out_image,
                                   int w, int h,
                                   int stride, int pass_stride,
                                   int r, int f)
{
  int4 co, rect;
  int ofs;
  if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
    kernel_filter_nlm_calc_weight(co.x, co.y,
                                  difference_image + ofs, out_image + ofs,
                                  rect, stride, f);
  }
}

extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
                                     const float *ccl_restrict image,
                                     float *out_image, float *accum_image,
                                     int w, int h,
                                     int stride, int pass_stride,
                                     int channel_offset,
                                     int r, int f)
{
  int4 co, rect;
  int ofs;
  if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
    kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
                                    difference_image + ofs,
                                    image, out_image, accum_image,
                                    rect, channel_offset,
                                    stride, f);
  }
}

extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_normalize(float *out_image,
                                 const float *ccl_restrict accum_image,
                                 int w, int h, int stride)
{
  int x = blockDim.x*blockIdx.x + threadIdx.x;
  int y = blockDim.y*blockIdx.y + threadIdx.y;
  if(x < w && y < h) {
    kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride);
  }
}

extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_construct_gramian(int t,
                                         const float *ccl_restrict difference_image,
                                         const float *ccl_restrict buffer,
                                         float const* __restrict__ transform,
                                         int *rank,
                                         float *XtWX, float3 *XtWY,
                                         int4 filter_window,
                                         int w, int h,
                                         int stride, int pass_stride,
                                         int r, int f,
                                         int frame_offset,
                                         bool use_time)
{
  int4 co, rect;
  int ofs;
  if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
    kernel_filter_nlm_construct_gramian(co.x, co.y, co.z, co.w,
                                        t,
                                        difference_image + ofs,
                                        buffer,
                                        transform, rank,
                                        XtWX, XtWY,
                                        rect, filter_window,
                                        stride, f,
                                        pass_stride,
                                        frame_offset,
                                        use_time,
                                        threadIdx.y*blockDim.x + threadIdx.x);
  }
}
/* Solve the per-pixel linear systems accumulated in XtWX/XtWY and write the
 * result into the render buffer. Like the transform, the per-pixel data is
 * stored with a stride of filter_area.z*filter_area.w. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_finalize(float *buffer,
                            int *rank, float *XtWX, float3 *XtWY,
                            int4 filter_area, int4 buffer_params,
                            int sample)
{
  int x = blockDim.x*blockIdx.x + threadIdx.x;
  int y = blockDim.y*blockIdx.y + threadIdx.y;
  if(x < filter_area.z && y < filter_area.w) {
    int storage_ofs = y*filter_area.z + x;
    rank += storage_ofs;
    XtWX += storage_ofs;
    XtWY += storage_ofs;
    kernel_filter_finalize(x, y,
                           buffer, rank,
                           filter_area.z*filter_area.w,
                           XtWX, XtWY,
                           buffer_params, sample);
  }
}

#endif
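/* Illustrative host-side sketch (not part of this file): the entry points are
 * plain extern "C" __global__ kernels, so under runtime-API compilation a host
 * could launch e.g. the normalization pass directly. The buffer names and the
 * 16x16 block shape below are hypothetical; Cycles itself drives these kernels
 * from its CUDA device backend instead.
 *
 *   dim3 threads(16, 16);
 *   dim3 blocks((w + threads.x - 1) / threads.x,
 *               (h + threads.y - 1) / threads.y);
 *   kernel_cuda_filter_nlm_normalize<<<blocks, threads>>>(d_out, d_accum, w, h, stride);
 *   cudaDeviceSynchronize();
 */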