diff options
Diffstat (limited to 'intern/cycles/kernel/kernels/cuda/filter.cu')
-rw-r--r-- | intern/cycles/kernel/kernels/cuda/filter.cu | 413 |
1 files changed, 0 insertions, 413 deletions
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu deleted file mode 100644 index 6c9642d1f03..00000000000 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ /dev/null @@ -1,413 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CUDA kernel entry points */ - -#ifdef __CUDA_ARCH__ - -#include "kernel_config.h" - -#include "kernel/kernel_compat_cuda.h" - -#include "kernel/filter/filter_kernel.h" - -/* kernels */ - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_copy_input(float *buffer, - CCL_FILTER_TILE_INFO, - int4 prefilter_rect, - int buffer_pass_stride) -{ - int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; - int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; - if(x < prefilter_rect.z && y < prefilter_rect.w) { - int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2); - int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2); - int itile = ytile * 3 + xtile; - float *const in = ((float *)ccl_get_tile_buffer(itile)) + - (tile_info->offsets[itile] + y * tile_info->strides[itile] + x) * buffer_pass_stride; - buffer += ((y - prefilter_rect.y) * (prefilter_rect.z - prefilter_rect.x) + (x - prefilter_rect.x)) * buffer_pass_stride; - for (int i = 0; i < buffer_pass_stride; ++i) - buffer[i] = in[i]; - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_convert_to_rgb(float *rgb, float *buf, int sw, int sh, int stride, int pass_stride, int3 pass_offset, int num_inputs, int num_samples) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < sw && y < sh) { - if (num_inputs > 0) { - float *in = buf + x * pass_stride + (y * stride + pass_offset.x) / sizeof(float); - float *out = rgb + (x + y * sw) * 3; - out[0] = clamp(in[0] / num_samples, 0.0f, 10000.0f); - out[1] = clamp(in[1] / num_samples, 0.0f, 10000.0f); - out[2] = clamp(in[2] / num_samples, 0.0f, 10000.0f); - } - if (num_inputs > 1) { - float *in = buf + x * pass_stride + (y * stride + pass_offset.y) / sizeof(float); - float *out = rgb + (x + y * sw) * 3 + (sw * sh) * 3; - out[0] = in[0] / num_samples; - out[1] = in[1] / num_samples; - out[2] = in[2] / num_samples; - } - if (num_inputs > 2) { - float *in = buf + x * pass_stride + (y * stride + pass_offset.z) / sizeof(float); - float *out = rgb + (x + y * sw) * 3 + (sw * sh * 2) * 3; - out[0] = in[0] / num_samples; - out[1] = in[1] / num_samples; - out[2] = in[2] / num_samples; - } - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_convert_from_rgb(float *rgb, float *buf, int ix, int iy, int iw, int ih, int sx, int sy, int sw, int sh, int offset, int stride, int pass_stride, int num_samples) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < sw && y < sh) { - float *in = rgb + ((ix + x) + (iy + y) * iw) * 3; - float *out = buf + (offset + (sx + x) + (sy + y) * stride) * pass_stride; - out[0] = in[0] * num_samples; - out[1] = in[1] * num_samples; - out[2] = in[2] * num_samples; - } -} - - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_divide_shadow(int sample, - CCL_FILTER_TILE_INFO, - float *unfilteredA, - float *unfilteredB, - float *sampleVariance, - float *sampleVarianceV, - float *bufferVariance, - int4 prefilter_rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ - int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; - int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_divide_shadow(sample, - tile_info, - x, y, - unfilteredA, - unfilteredB, - sampleVariance, - sampleVarianceV, - bufferVariance, - prefilter_rect, - buffer_pass_stride, - buffer_denoising_offset); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_get_feature(int sample, - CCL_FILTER_TILE_INFO, - int m_offset, - int v_offset, - float *mean, - float *variance, - float scale, - int4 prefilter_rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ - int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; - int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_get_feature(sample, - tile_info, - m_offset, v_offset, - x, y, - mean, variance, - scale, - prefilter_rect, - buffer_pass_stride, - buffer_denoising_offset); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_write_feature(int sample, - int4 buffer_params, - int4 filter_area, - float *from, - float *buffer, - int out_offset, - int4 prefilter_rect) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < filter_area.z && y < filter_area.w) { - kernel_filter_write_feature(sample, - x + filter_area.x, - y + filter_area.y, - buffer_params, - from, - buffer, - out_offset, - prefilter_rect); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_detect_outliers(float *image, - float *variance, - float *depth, - float *output, - int4 prefilter_rect, - int pass_stride) -{ - int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; - int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r) -{ - int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; - int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, - CCL_FILTER_TILE_INFO, - float *transform, int *rank, - int4 filter_area, int4 rect, - int radius, float pca_threshold, - int pass_stride, int frame_stride, - bool use_time) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < filter_area.z && y < filter_area.w) { - int *l_rank = rank + y*filter_area.z + x; - float *l_transform = transform + y*filter_area.z + x; - kernel_filter_construct_transform(buffer, - tile_info, - x + filter_area.x, y + filter_area.y, - rect, - pass_stride, frame_stride, - use_time, - l_transform, l_rank, - radius, pca_threshold, - filter_area.z*filter_area.w, - threadIdx.y*blockDim.x + threadIdx.x); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, - const float *ccl_restrict variance_image, - const float *ccl_restrict scale_image, - float *difference_image, - int w, - int h, - int stride, - int pass_stride, - int r, - int channel_offset, - int frame_offset, - float a, - float k_2) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, - weight_image, - variance_image, - scale_image, - difference_image + ofs, - rect, stride, - channel_offset, - frame_offset, - a, k_2); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, - float *out_image, - int w, - int h, - int stride, - int pass_stride, - int r, - int f) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_blur(co.x, co.y, - difference_image + ofs, - out_image + ofs, - rect, stride, f); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, - float *out_image, - int w, - int h, - int stride, - int pass_stride, - int r, - int f) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_calc_weight(co.x, co.y, - difference_image + ofs, - out_image + ofs, - rect, stride, f); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image, - const float *ccl_restrict image, - float *out_image, - float *accum_image, - int w, - int h, - int stride, - int pass_stride, - int channel_offset, - int r, - int f) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, - difference_image + ofs, - image, - out_image, - accum_image, - rect, - channel_offset, - stride, f); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_normalize(float *out_image, - const float *ccl_restrict accum_image, - int w, - int h, - int stride) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < w && y < h) { - kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_construct_gramian(int t, - const float *ccl_restrict difference_image, - const float *ccl_restrict buffer, - float const* __restrict__ transform, - int *rank, - float *XtWX, - float3 *XtWY, - int4 filter_window, - int w, - int h, - int stride, - int pass_stride, - int r, - int f, - int frame_offset, - bool use_time) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) { - kernel_filter_nlm_construct_gramian(co.x, co.y, - co.z, co.w, - t, - difference_image + ofs, - buffer, - transform, rank, - XtWX, XtWY, - rect, filter_window, - stride, f, - pass_stride, - frame_offset, - use_time, - threadIdx.y*blockDim.x + threadIdx.x); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_finalize(float *buffer, - int *rank, - float *XtWX, - float3 *XtWY, - int4 filter_area, - int4 buffer_params, - int sample) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < filter_area.z && y < filter_area.w) { - int storage_ofs = y*filter_area.z+x; - rank += storage_ofs; - XtWX += storage_ofs; - XtWY += storage_ofs; - kernel_filter_finalize(x, y, buffer, rank, - filter_area.z*filter_area.w, - XtWX, XtWY, - buffer_params, sample); - } -} - -#endif - |