diff options
Diffstat (limited to 'intern/cycles/kernel/kernel_adaptive_sampling.h')
-rw-r--r-- | intern/cycles/kernel/kernel_adaptive_sampling.h | 274 |
1 files changed, 97 insertions, 177 deletions
diff --git a/intern/cycles/kernel/kernel_adaptive_sampling.h b/intern/cycles/kernel/kernel_adaptive_sampling.h index 98b7bf7e7dc..2bee12f0473 100644 --- a/intern/cycles/kernel/kernel_adaptive_sampling.h +++ b/intern/cycles/kernel/kernel_adaptive_sampling.h @@ -14,226 +14,146 @@ * limitations under the License. */ -#ifndef __KERNEL_ADAPTIVE_SAMPLING_H__ -#define __KERNEL_ADAPTIVE_SAMPLING_H__ +#pragma once + +#include "kernel/kernel_write_passes.h" CCL_NAMESPACE_BEGIN -/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */ +/* Check whether the pixel has converged and should not be sampled anymore. */ -ccl_device void kernel_do_adaptive_stopping(KernelGlobals *kg, - ccl_global float *buffer, - int sample) +ccl_device_forceinline bool kernel_need_sample_pixel(INTEGRATOR_STATE_CONST_ARGS, + ccl_global float *render_buffer) { - /* TODO Stefan: Is this better in linear, sRGB or something else? */ - float4 I = *((ccl_global float4 *)buffer); - float4 A = *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer); - /* The per pixel error as seen in section 2.1 of - * "A hierarchical automatic stopping condition for Monte Carlo global illumination" - * A small epsilon is added to the divisor to prevent division by zero. */ - float error = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) / - (sample * 0.0001f + sqrtf(I.x + I.y + I.z)); - if (error < kernel_data.integrator.adaptive_threshold * (float)sample) { - /* Set the fourth component to non-zero value to indicate that this pixel has converged. */ - buffer[kernel_data.film.pass_adaptive_aux_buffer + 3] += 1.0f; + if (kernel_data.film.pass_adaptive_aux_buffer == PASS_UNUSED) { + return true; } -} - -/* Adjust the values of an adaptively sampled pixel. */ - -ccl_device void kernel_adaptive_post_adjust(KernelGlobals *kg, - ccl_global float *buffer, - float sample_multiplier) -{ - *(ccl_global float4 *)(buffer) *= sample_multiplier; - /* Scale the aux pass too, this is necessary for progressive rendering to work properly. */ - kernel_assert(kernel_data.film.pass_adaptive_aux_buffer); - *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer) *= sample_multiplier; + const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index); + const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * + kernel_data.film.pass_stride; + ccl_global float *buffer = render_buffer + render_buffer_offset; -#ifdef __PASSES__ - int flag = kernel_data.film.pass_flag; - - if (flag & PASSMASK(NORMAL)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_normal) *= sample_multiplier; + const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3; + return buffer[aux_w_offset] == 0.0f; +} - if (flag & PASSMASK(UV)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_uv) *= sample_multiplier; +/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */ - if (flag & PASSMASK(MOTION)) { - *(ccl_global float4 *)(buffer + kernel_data.film.pass_motion) *= sample_multiplier; - *(ccl_global float *)(buffer + kernel_data.film.pass_motion_weight) *= sample_multiplier; +ccl_device bool kernel_adaptive_sampling_convergence_check(const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int y, + float threshold, + bool reset, + int offset, + int stride) +{ + kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED); + kernel_assert(kernel_data.film.pass_sample_count != PASS_UNUSED); + + const int render_pixel_index = offset + x + y * stride; + ccl_global float *buffer = render_buffer + + (uint64_t)render_pixel_index * kernel_data.film.pass_stride; + + /* TODO(Stefan): Is this better in linear, sRGB or something else? */ + + const float4 A = kernel_read_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer); + if (!reset && A.w != 0.0f) { + /* If the pixel was considered converged, its state will not change in this kernmel. Early + * output before doing any math. + * + * TODO(sergey): On a GPU it might be better to keep thread alive for better coherency? */ + return true; } - if (kernel_data.film.use_light_pass) { - int light_flag = kernel_data.film.light_pass_flag; - - if (light_flag & PASSMASK(MIST)) - *(ccl_global float *)(buffer + kernel_data.film.pass_mist) *= sample_multiplier; - - /* Shadow pass omitted on purpose. It has its own scale parameter. */ - - if (light_flag & PASSMASK(DIFFUSE_INDIRECT)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_indirect) *= sample_multiplier; - if (light_flag & PASSMASK(GLOSSY_INDIRECT)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_indirect) *= sample_multiplier; - if (light_flag & PASSMASK(TRANSMISSION_INDIRECT)) - *(ccl_global float3 *)(buffer + - kernel_data.film.pass_transmission_indirect) *= sample_multiplier; - if (light_flag & PASSMASK(VOLUME_INDIRECT)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_indirect) *= sample_multiplier; - if (light_flag & PASSMASK(DIFFUSE_DIRECT)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_direct) *= sample_multiplier; - if (light_flag & PASSMASK(GLOSSY_DIRECT)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_direct) *= sample_multiplier; - if (light_flag & PASSMASK(TRANSMISSION_DIRECT)) - *(ccl_global float3 *)(buffer + - kernel_data.film.pass_transmission_direct) *= sample_multiplier; - if (light_flag & PASSMASK(VOLUME_DIRECT)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_direct) *= sample_multiplier; - - if (light_flag & PASSMASK(EMISSION)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_emission) *= sample_multiplier; - if (light_flag & PASSMASK(BACKGROUND)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_background) *= sample_multiplier; - if (light_flag & PASSMASK(AO)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_ao) *= sample_multiplier; - - if (light_flag & PASSMASK(DIFFUSE_COLOR)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_color) *= sample_multiplier; - if (light_flag & PASSMASK(GLOSSY_COLOR)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_color) *= sample_multiplier; - if (light_flag & PASSMASK(TRANSMISSION_COLOR)) - *(ccl_global float3 *)(buffer + - kernel_data.film.pass_transmission_color) *= sample_multiplier; - } -#endif - -#ifdef __DENOISING_FEATURES__ - -# define scale_float3_variance(buffer, offset, scale) \ - *(buffer + offset) *= scale; \ - *(buffer + offset + 1) *= scale; \ - *(buffer + offset + 2) *= scale; \ - *(buffer + offset + 3) *= scale * scale; \ - *(buffer + offset + 4) *= scale * scale; \ - *(buffer + offset + 5) *= scale * scale; - -# define scale_shadow_variance(buffer, offset, scale) \ - *(buffer + offset) *= scale; \ - *(buffer + offset + 1) *= scale; \ - *(buffer + offset + 2) *= scale * scale; - - if (kernel_data.film.pass_denoising_data) { - scale_shadow_variance( - buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_A, sample_multiplier); - scale_shadow_variance( - buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_B, sample_multiplier); - if (kernel_data.film.pass_denoising_clean) { - scale_float3_variance( - buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier); - *(buffer + kernel_data.film.pass_denoising_clean) *= sample_multiplier; - *(buffer + kernel_data.film.pass_denoising_clean + 1) *= sample_multiplier; - *(buffer + kernel_data.film.pass_denoising_clean + 2) *= sample_multiplier; - } - else { - scale_float3_variance( - buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier); - } - scale_float3_variance( - buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, sample_multiplier); - scale_float3_variance( - buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, sample_multiplier); - *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH) *= sample_multiplier; - *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH + - 1) *= sample_multiplier * sample_multiplier; - } -#endif /* __DENOISING_FEATURES__ */ - - /* Cryptomatte. */ - if (kernel_data.film.cryptomatte_passes) { - int num_slots = 0; - num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) ? 1 : 0; - num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) ? 1 : 0; - num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) ? 1 : 0; - num_slots = num_slots * 2 * kernel_data.film.cryptomatte_depth; - ccl_global float2 *id_buffer = (ccl_global float2 *)(buffer + - kernel_data.film.pass_cryptomatte); - for (int slot = 0; slot < num_slots; slot++) { - id_buffer[slot].y *= sample_multiplier; - } - } + const float4 I = kernel_read_pass_float4(buffer + kernel_data.film.pass_combined); - /* AOVs. */ - for (int i = 0; i < kernel_data.film.pass_aov_value_num; i++) { - *(buffer + kernel_data.film.pass_aov_value + i) *= sample_multiplier; - } - for (int i = 0; i < kernel_data.film.pass_aov_color_num; i++) { - *((ccl_global float4 *)(buffer + kernel_data.film.pass_aov_color) + i) *= sample_multiplier; - } + const float sample = __float_as_uint(buffer[kernel_data.film.pass_sample_count]); + const float inv_sample = 1.0f / sample; + + /* The per pixel error as seen in section 2.1 of + * "A hierarchical automatic stopping condition for Monte Carlo global illumination" */ + const float error_difference = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) * + inv_sample; + const float error_normalize = sqrtf((I.x + I.y + I.z) * inv_sample); + /* A small epsilon is added to the divisor to prevent division by zero. */ + const float error = error_difference / (0.0001f + error_normalize); + const bool did_converge = (error < threshold); + + const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3; + buffer[aux_w_offset] = did_converge; + + return did_converge; } /* This is a simple box filter in two passes. * When a pixel demands more adaptive samples, let its neighboring pixels draw more samples too. */ -ccl_device bool kernel_do_adaptive_filter_x(KernelGlobals *kg, int y, ccl_global WorkTile *tile) +ccl_device void kernel_adaptive_sampling_filter_x(const KernelGlobals *kg, + ccl_global float *render_buffer, + int y, + int start_x, + int width, + int offset, + int stride) { - bool any = false; + kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED); + bool prev = false; - for (int x = tile->x; x < tile->x + tile->w; ++x) { - int index = tile->offset + x + y * tile->stride; - ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; - ccl_global float4 *aux = (ccl_global float4 *)(buffer + - kernel_data.film.pass_adaptive_aux_buffer); - if ((*aux).w == 0.0f) { - any = true; - if (x > tile->x && !prev) { + for (int x = start_x; x < start_x + width; ++x) { + int index = offset + x + y * stride; + ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride; + const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3; + + if (buffer[aux_w_offset] == 0.0f) { + if (x > start_x && !prev) { index = index - 1; - buffer = tile->buffer + index * kernel_data.film.pass_stride; - aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer); - (*aux).w = 0.0f; + buffer = render_buffer + index * kernel_data.film.pass_stride; + buffer[aux_w_offset] = 0.0f; } prev = true; } else { if (prev) { - (*aux).w = 0.0f; + buffer[aux_w_offset] = 0.0f; } prev = false; } } - return any; } -ccl_device bool kernel_do_adaptive_filter_y(KernelGlobals *kg, int x, ccl_global WorkTile *tile) +ccl_device void kernel_adaptive_sampling_filter_y(const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int start_y, + int height, + int offset, + int stride) { + kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED); + bool prev = false; - bool any = false; - for (int y = tile->y; y < tile->y + tile->h; ++y) { - int index = tile->offset + x + y * tile->stride; - ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; - ccl_global float4 *aux = (ccl_global float4 *)(buffer + - kernel_data.film.pass_adaptive_aux_buffer); - if ((*aux).w == 0.0f) { - any = true; - if (y > tile->y && !prev) { - index = index - tile->stride; - buffer = tile->buffer + index * kernel_data.film.pass_stride; - aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer); - (*aux).w = 0.0f; + for (int y = start_y; y < start_y + height; ++y) { + int index = offset + x + y * stride; + ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride; + const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3; + + if (buffer[aux_w_offset] == 0.0f) { + if (y > start_y && !prev) { + index = index - stride; + buffer = render_buffer + index * kernel_data.film.pass_stride; + buffer[aux_w_offset] = 0.0f; } prev = true; } else { if (prev) { - (*aux).w = 0.0f; + buffer[aux_w_offset] = 0.0f; } prev = false; } } - return any; } CCL_NAMESPACE_END - -#endif /* __KERNEL_ADAPTIVE_SAMPLING_H__ */ |