git.blender.org/blender.git

author     Brecht Van Lommel <brecht@blender.org>  2021-09-20 18:59:20 +0300
committer  Brecht Van Lommel <brecht@blender.org>  2021-09-21 15:55:54 +0300
commit     08031197250aeecbaca3803254e6f25b8c7b7b37 (patch)
tree       6fe7ab045f0dc0a423d6557c4073f34309ef4740 /intern/cycles/kernel/kernels/cuda/filter.cu
parent     fa6b1007bad065440950cd67deb16a04f368856f (diff)
Cycles: merge of cycles-x branch, a major update to the renderer
This includes much improved GPU rendering performance, viewport interactivity, new shadow catcher, revamped sampling settings, subsurface scattering anisotropy, new GPU volume sampling, improved PMJ sampling pattern, and more.

Some features have also been removed or changed, breaking backwards compatibility, including the removal of the OpenCL backend, for which alternatives are under development.

Release notes and code docs:
https://wiki.blender.org/wiki/Reference/Release_Notes/3.0/Cycles
https://wiki.blender.org/wiki/Source/Render/Cycles

Credits:
* Sergey Sharybin
* Brecht Van Lommel
* Patrick Mours (OptiX backend)
* Christophe Hery (subsurface scattering anisotropy)
* William Leeson (PMJ sampling pattern)
* Alaska (various fixes and tweaks)
* Thomas Dinges (various fixes)

For the full commit history, see the cycles-x branch. This squashes all the changes together, since intermediate changes would often fail to build or pass tests.

Ref T87839, T87837, T87836
Fixes T90734, T89353, T80267, T77185, T69800
Diffstat (limited to 'intern/cycles/kernel/kernels/cuda/filter.cu')
-rw-r--r--  intern/cycles/kernel/kernels/cuda/filter.cu  413
1 file changed, 0 insertions(+), 413 deletions(-)
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
deleted file mode 100644
index 6c9642d1f03..00000000000
--- a/intern/cycles/kernel/kernels/cuda/filter.cu
+++ /dev/null
@@ -1,413 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CUDA kernel entry points */
-
-#ifdef __CUDA_ARCH__
-
-#include "kernel_config.h"
-
-#include "kernel/kernel_compat_cuda.h"
-
-#include "kernel/filter/filter_kernel.h"
-
-/* kernels */
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_copy_input(float *buffer,
- CCL_FILTER_TILE_INFO,
- int4 prefilter_rect,
- int buffer_pass_stride)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
- int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
- int itile = ytile * 3 + xtile;
- float *const in = ((float *)ccl_get_tile_buffer(itile)) +
- (tile_info->offsets[itile] + y * tile_info->strides[itile] + x) * buffer_pass_stride;
- buffer += ((y - prefilter_rect.y) * (prefilter_rect.z - prefilter_rect.x) + (x - prefilter_rect.x)) * buffer_pass_stride;
- for (int i = 0; i < buffer_pass_stride; ++i)
- buffer[i] = in[i];
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_convert_to_rgb(float *rgb, float *buf, int sw, int sh, int stride, int pass_stride, int3 pass_offset, int num_inputs, int num_samples)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < sw && y < sh) {
- if (num_inputs > 0) {
- float *in = buf + x * pass_stride + (y * stride + pass_offset.x) / sizeof(float);
- float *out = rgb + (x + y * sw) * 3;
- out[0] = clamp(in[0] / num_samples, 0.0f, 10000.0f);
- out[1] = clamp(in[1] / num_samples, 0.0f, 10000.0f);
- out[2] = clamp(in[2] / num_samples, 0.0f, 10000.0f);
- }
- if (num_inputs > 1) {
- float *in = buf + x * pass_stride + (y * stride + pass_offset.y) / sizeof(float);
- float *out = rgb + (x + y * sw) * 3 + (sw * sh) * 3;
- out[0] = in[0] / num_samples;
- out[1] = in[1] / num_samples;
- out[2] = in[2] / num_samples;
- }
- if (num_inputs > 2) {
- float *in = buf + x * pass_stride + (y * stride + pass_offset.z) / sizeof(float);
- float *out = rgb + (x + y * sw) * 3 + (sw * sh * 2) * 3;
- out[0] = in[0] / num_samples;
- out[1] = in[1] / num_samples;
- out[2] = in[2] / num_samples;
- }
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_convert_from_rgb(float *rgb, float *buf, int ix, int iy, int iw, int ih, int sx, int sy, int sw, int sh, int offset, int stride, int pass_stride, int num_samples)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < sw && y < sh) {
- float *in = rgb + ((ix + x) + (iy + y) * iw) * 3;
- float *out = buf + (offset + (sx + x) + (sy + y) * stride) * pass_stride;
- out[0] = in[0] * num_samples;
- out[1] = in[1] * num_samples;
- out[2] = in[2] * num_samples;
- }
-}
-
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_divide_shadow(int sample,
- CCL_FILTER_TILE_INFO,
- float *unfilteredA,
- float *unfilteredB,
- float *sampleVariance,
- float *sampleVarianceV,
- float *bufferVariance,
- int4 prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_divide_shadow(sample,
- tile_info,
- x, y,
- unfilteredA,
- unfilteredB,
- sampleVariance,
- sampleVarianceV,
- bufferVariance,
- prefilter_rect,
- buffer_pass_stride,
- buffer_denoising_offset);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_get_feature(int sample,
- CCL_FILTER_TILE_INFO,
- int m_offset,
- int v_offset,
- float *mean,
- float *variance,
- float scale,
- int4 prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_get_feature(sample,
- tile_info,
- m_offset, v_offset,
- x, y,
- mean, variance,
- scale,
- prefilter_rect,
- buffer_pass_stride,
- buffer_denoising_offset);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_write_feature(int sample,
- int4 buffer_params,
- int4 filter_area,
- float *from,
- float *buffer,
- int out_offset,
- int4 prefilter_rect)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < filter_area.z && y < filter_area.w) {
- kernel_filter_write_feature(sample,
- x + filter_area.x,
- y + filter_area.y,
- buffer_params,
- from,
- buffer,
- out_offset,
- prefilter_rect);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_detect_outliers(float *image,
- float *variance,
- float *depth,
- float *output,
- int4 prefilter_rect,
- int pass_stride)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
- CCL_FILTER_TILE_INFO,
- float *transform, int *rank,
- int4 filter_area, int4 rect,
- int radius, float pca_threshold,
- int pass_stride, int frame_stride,
- bool use_time)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < filter_area.z && y < filter_area.w) {
- int *l_rank = rank + y*filter_area.z + x;
- float *l_transform = transform + y*filter_area.z + x;
- kernel_filter_construct_transform(buffer,
- tile_info,
- x + filter_area.x, y + filter_area.y,
- rect,
- pass_stride, frame_stride,
- use_time,
- l_transform, l_rank,
- radius, pca_threshold,
- filter_area.z*filter_area.w,
- threadIdx.y*blockDim.x + threadIdx.x);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
- const float *ccl_restrict variance_image,
- const float *ccl_restrict scale_image,
- float *difference_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
- weight_image,
- variance_image,
- scale_image,
- difference_image + ofs,
- rect, stride,
- channel_offset,
- frame_offset,
- a, k_2);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image,
- float *out_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_blur(co.x, co.y,
- difference_image + ofs,
- out_image + ofs,
- rect, stride, f);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
- float *out_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_calc_weight(co.x, co.y,
- difference_image + ofs,
- out_image + ofs,
- rect, stride, f);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
- const float *ccl_restrict image,
- float *out_image,
- float *accum_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int channel_offset,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
- difference_image + ofs,
- image,
- out_image,
- accum_image,
- rect,
- channel_offset,
- stride, f);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_normalize(float *out_image,
- const float *ccl_restrict accum_image,
- int w,
- int h,
- int stride)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < w && y < h) {
- kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_construct_gramian(int t,
- const float *ccl_restrict difference_image,
- const float *ccl_restrict buffer,
- float const* __restrict__ transform,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int4 filter_window,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f,
- int frame_offset,
- bool use_time)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
- kernel_filter_nlm_construct_gramian(co.x, co.y,
- co.z, co.w,
- t,
- difference_image + ofs,
- buffer,
- transform, rank,
- XtWX, XtWY,
- rect, filter_window,
- stride, f,
- pass_stride,
- frame_offset,
- use_time,
- threadIdx.y*blockDim.x + threadIdx.x);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_finalize(float *buffer,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int4 filter_area,
- int4 buffer_params,
- int sample)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < filter_area.z && y < filter_area.w) {
- int storage_ofs = y*filter_area.z+x;
- rank += storage_ofs;
- XtWX += storage_ofs;
- XtWY += storage_ofs;
- kernel_filter_finalize(x, y, buffer, rank,
- filter_area.z*filter_area.w,
- XtWX, XtWY,
- buffer_params, sample);
- }
-}
-
-#endif
-
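Every kernel deleted above follows the same per-pixel dispatch pattern: map the 2D block/thread indices to an (x, y) coordinate inside a rectangle, return early if the coordinate falls outside it, and only then call the shared filter routine. The standalone sketch below illustrates that pattern in isolation; the kernel name, the buffer layout, and the trivial copy it performs are hypothetical stand-ins for the real Cycles filter code, not part of the removed file.

#include <cstdio>
#include <cuda_runtime.h>

/* Hypothetical stand-in for the removed filter kernels: each thread handles
 * one pixel of the rectangle (rect.x, rect.y)-(rect.z, rect.w) and copies it
 * from `in` to `out`. The coordinate mapping and bounds check mirror the
 * pattern used by the kernels above. */
extern "C" __global__ void demo_filter_copy(const float *in,
                                            float *out,
                                            int4 rect,
                                            int stride)
{
  int x = rect.x + blockDim.x * blockIdx.x + threadIdx.x;
  int y = rect.y + blockDim.y * blockIdx.y + threadIdx.y;
  if (x < rect.z && y < rect.w) {
    out[y * stride + x] = in[y * stride + x];
  }
}

int main()
{
  const int w = 64, h = 64;
  float *in, *out;
  cudaMallocManaged(&in, w * h * sizeof(float));
  cudaMallocManaged(&out, w * h * sizeof(float));
  for (int i = 0; i < w * h; i++) {
    in[i] = float(i);
  }

  /* Launch with 16x16 thread blocks and round the grid up so it covers the
   * whole rectangle; the bounds check in the kernel discards the overhang. */
  int4 rect = {0, 0, w, h};
  dim3 threads(16, 16);
  dim3 blocks((w + threads.x - 1) / threads.x, (h + threads.y - 1) / threads.y);
  demo_filter_copy<<<blocks, threads>>>(in, out, rect, w);
  cudaDeviceSynchronize();

  printf("out[100] = %f\n", out[100]); /* expected: 100.000000 */
  cudaFree(in);
  cudaFree(out);
  return 0;
}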