Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/intern
diff options
context:
space:
mode:
authorBrecht Van Lommel <brechtvanlommel@gmail.com>2017-09-27 02:03:50 +0300
committerBrecht Van Lommel <brechtvanlommel@gmail.com>2017-10-04 22:11:14 +0300
commit12f453820514e9478afdda0acf4c4fb1eac11e1c (patch)
treee8f9293c814457361febf1908e7131b0dc9ddfbd /intern
parente3e16cecc4f080edbbd14e4bf1cfc580c5957d62 (diff)
Code refactor: use split variance calculation for mega kernels too.
There is no significant difference in denoised benchmark scenes and denoising ctests, so might as well make it all consistent.
Diffstat (limited to 'intern')
-rw-r--r--intern/cycles/device/device_cpu.cpp14
-rw-r--r--intern/cycles/device/device_cuda.cpp8
-rw-r--r--intern/cycles/device/opencl/opencl_base.cpp8
-rw-r--r--intern/cycles/kernel/filter/filter_prefilter.h25
-rw-r--r--intern/cycles/kernel/kernel_passes.h34
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu.h6
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h12
-rw-r--r--intern/cycles/kernel/kernels/cuda/filter.cu12
-rw-r--r--intern/cycles/kernel/kernels/opencl/filter.cl12
9 files changed, 35 insertions, 96 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 72330b02a28..ff34f4f9ce4 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -176,10 +176,10 @@ public:
KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel;
KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel;
- KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel;
- KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int, bool)> filter_get_feature_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
+ KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel;
+ KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int)> filter_get_feature_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
@@ -563,8 +563,7 @@ public:
(float*) buffer_variance_ptr,
&task->rect.x,
task->render_buffer.pass_stride,
- task->render_buffer.denoising_data_offset,
- use_split_kernel);
+ task->render_buffer.denoising_data_offset);
}
}
return true;
@@ -587,8 +586,7 @@ public:
(float*) variance_ptr,
&task->rect.x,
task->render_buffer.pass_stride,
- task->render_buffer.denoising_data_offset,
- use_split_kernel);
+ task->render_buffer.denoising_data_offset);
}
}
return true;
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index e5464dcf34e..54e012191ae 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1173,7 +1173,6 @@ public:
task->rect.z-task->rect.x,
task->rect.w-task->rect.y);
- bool use_split_variance = use_split_kernel();
void *args[] = {&task->render_buffer.samples,
&task->tiles_mem.device_pointer,
&a_ptr,
@@ -1183,8 +1182,7 @@ public:
&buffer_variance_ptr,
&task->rect,
&task->render_buffer.pass_stride,
- &task->render_buffer.denoising_data_offset,
- &use_split_variance};
+ &task->render_buffer.denoising_data_offset};
CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
cuda_assert(cuCtxSynchronize());
@@ -1209,7 +1207,6 @@ public:
task->rect.z-task->rect.x,
task->rect.w-task->rect.y);
- bool use_split_variance = use_split_kernel();
void *args[] = {&task->render_buffer.samples,
&task->tiles_mem.device_pointer,
&mean_offset,
@@ -1218,8 +1215,7 @@ public:
&variance_ptr,
&task->rect,
&task->render_buffer.pass_stride,
- &task->render_buffer.denoising_data_offset,
- &use_split_variance};
+ &task->render_buffer.denoising_data_offset};
CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
cuda_assert(cuCtxSynchronize());
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index 7bdf81462b8..8095611f099 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -982,7 +982,6 @@ bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
- char split_kernel = is_split_kernel()? 1 : 0;
kernel_set_args(ckFilterDivideShadow, 0,
task->render_buffer.samples,
tiles_mem,
@@ -993,8 +992,7 @@ bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
buffer_variance_mem,
task->rect,
task->render_buffer.pass_stride,
- task->render_buffer.denoising_data_offset,
- split_kernel);
+ task->render_buffer.denoising_data_offset);
enqueue_kernel(ckFilterDivideShadow,
task->rect.z-task->rect.x,
task->rect.w-task->rect.y);
@@ -1015,7 +1013,6 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
- char split_kernel = is_split_kernel()? 1 : 0;
kernel_set_args(ckFilterGetFeature, 0,
task->render_buffer.samples,
tiles_mem,
@@ -1025,8 +1022,7 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
variance_mem,
task->rect,
task->render_buffer.pass_stride,
- task->render_buffer.denoising_data_offset,
- split_kernel);
+ task->render_buffer.denoising_data_offset);
enqueue_kernel(ckFilterGetFeature,
task->rect.z-task->rect.x,
task->rect.w-task->rect.y);
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
index 2aeb54a62be..eefcbfea230 100644
--- a/intern/cycles/kernel/filter/filter_prefilter.h
+++ b/intern/cycles/kernel/filter/filter_prefilter.h
@@ -35,8 +35,7 @@ ccl_device void kernel_filter_divide_shadow(int sample,
ccl_global float *bufferVariance,
int4 rect,
int buffer_pass_stride,
- int buffer_denoising_offset,
- bool use_split_variance)
+ int buffer_denoising_offset)
{
int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
@@ -57,10 +56,12 @@ ccl_device void kernel_filter_divide_shadow(int sample,
float varB = center_buffer[5];
int odd_sample = (sample+1)/2;
int even_sample = sample/2;
- if(use_split_variance) {
- varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
- varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
- }
+
+ /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
+ * update does not work efficiently with atomics in the kernel. */
+ varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
+ varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
+
varA /= max(odd_sample - 1, 1);
varB /= max(even_sample - 1, 1);
@@ -84,8 +85,7 @@ ccl_device void kernel_filter_get_feature(int sample,
ccl_global float *mean,
ccl_global float *variance,
int4 rect, int buffer_pass_stride,
- int buffer_denoising_offset,
- bool use_split_variance)
+ int buffer_denoising_offset)
{
int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
@@ -97,12 +97,9 @@ ccl_device void kernel_filter_get_feature(int sample,
mean[idx] = center_buffer[m_offset] / sample;
if(sample > 1) {
- if(use_split_variance) {
- variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
- }
- else {
- variance[idx] = center_buffer[v_offset] / (sample * (sample-1));
- }
+ /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
+ * update does not work efficiently with atomics in the kernel. */
+ variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
}
else {
/* Can't compute variance with single sample, just set it very high. */
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index fff7f4cfdb7..bd756185e78 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -67,18 +67,7 @@ ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer
/* The online one-pass variance update that's used for the megakernel can't easily be implemented
* with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */
-# ifdef __SPLIT_KERNEL__
kernel_write_pass_float(buffer+1, sample, value*value);
-# else
- if(sample == 0) {
- kernel_write_pass_float(buffer+1, sample, 0.0f);
- }
- else {
- float new_mean = buffer[0] * (1.0f / (sample + 1));
- float old_mean = (buffer[0] - value) * (1.0f / sample);
- kernel_write_pass_float(buffer+1, sample, (value - new_mean) * (value - old_mean));
- }
-# endif
}
# if defined(__SPLIT_KERNEL__)
@@ -95,19 +84,7 @@ ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buff
ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, int sample, float3 value)
{
kernel_write_pass_float3_unaligned(buffer, sample, value);
-# ifdef __SPLIT_KERNEL__
kernel_write_pass_float3_unaligned(buffer+3, sample, value*value);
-# else
- if(sample == 0) {
- kernel_write_pass_float3_unaligned(buffer+3, sample, make_float3(0.0f, 0.0f, 0.0f));
- }
- else {
- float3 sum = make_float3(buffer[0], buffer[1], buffer[2]);
- float3 new_mean = sum * (1.0f / (sample + 1));
- float3 old_mean = (sum - value) * (1.0f / sample);
- kernel_write_pass_float3_unaligned(buffer+3, sample, (value - new_mean) * (value - old_mean));
- }
-# endif
}
ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer,
@@ -125,18 +102,7 @@ ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_glob
kernel_write_pass_float(buffer+1, sample/2, path_total_shaded);
float value = path_total_shaded / max(path_total, 1e-7f);
-# ifdef __SPLIT_KERNEL__
kernel_write_pass_float(buffer+2, sample/2, value*value);
-# else
- if(sample < 2) {
- kernel_write_pass_float(buffer+2, sample/2, 0.0f);
- }
- else {
- float old_value = (buffer[1] - path_total_shaded) / max(buffer[0] - path_total, 1e-7f);
- float new_value = buffer[1] / max(buffer[0], 1e-7f);
- kernel_write_pass_float(buffer+2, sample, (value - new_value) * (value - old_value));
- }
-# endif
}
#endif /* __DENOISING_FEATURES__ */
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
index 2ed713299fd..bf13ba62806 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -27,8 +27,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
float *bufferV,
int* prefilter_rect,
int buffer_pass_stride,
- int buffer_denoising_offset,
- bool use_split_variance);
+ int buffer_denoising_offset);
void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
TilesInfo *tiles,
@@ -40,8 +39,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
float *variance,
int* prefilter_rect,
int buffer_pass_stride,
- int buffer_denoising_offset,
- bool use_split_variance);
+ int buffer_denoising_offset);
void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
ccl_global float *image,
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
index 8dc1a8d583c..2fbb0ea2bdb 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -45,8 +45,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
float *bufferVariance,
int* prefilter_rect,
int buffer_pass_stride,
- int buffer_denoising_offset,
- bool use_split_variance)
+ int buffer_denoising_offset)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
@@ -60,8 +59,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
bufferVariance,
load_int4(prefilter_rect),
buffer_pass_stride,
- buffer_denoising_offset,
- use_split_variance);
+ buffer_denoising_offset);
#endif
}
@@ -74,8 +72,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
float *mean, float *variance,
int* prefilter_rect,
int buffer_pass_stride,
- int buffer_denoising_offset,
- bool use_split_variance)
+ int buffer_denoising_offset)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
@@ -86,8 +83,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
mean, variance,
load_int4(prefilter_rect),
buffer_pass_stride,
- buffer_denoising_offset,
- use_split_variance);
+ buffer_denoising_offset);
#endif
}
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
index 009c3fde9d5..c8172355a7f 100644
--- a/intern/cycles/kernel/kernels/cuda/filter.cu
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -37,8 +37,7 @@ kernel_cuda_filter_divide_shadow(int sample,
float *bufferVariance,
int4 prefilter_rect,
int buffer_pass_stride,
- int buffer_denoising_offset,
- bool use_split_variance)
+ int buffer_denoising_offset)
{
int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
@@ -53,8 +52,7 @@ kernel_cuda_filter_divide_shadow(int sample,
bufferVariance,
prefilter_rect,
buffer_pass_stride,
- buffer_denoising_offset,
- use_split_variance);
+ buffer_denoising_offset);
}
}
@@ -68,8 +66,7 @@ kernel_cuda_filter_get_feature(int sample,
float *variance,
int4 prefilter_rect,
int buffer_pass_stride,
- int buffer_denoising_offset,
- bool use_split_variance)
+ int buffer_denoising_offset)
{
int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
@@ -81,8 +78,7 @@ kernel_cuda_filter_get_feature(int sample,
mean, variance,
prefilter_rect,
buffer_pass_stride,
- buffer_denoising_offset,
- use_split_variance);
+ buffer_denoising_offset);
}
}
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
index f015ac47d8a..7a7b596a350 100644
--- a/intern/cycles/kernel/kernels/opencl/filter.cl
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -31,8 +31,7 @@ __kernel void kernel_ocl_filter_divide_shadow(int sample,
ccl_global float *bufferVariance,
int4 prefilter_rect,
int buffer_pass_stride,
- int buffer_denoising_offset,
- char use_split_variance)
+ int buffer_denoising_offset)
{
int x = prefilter_rect.x + get_global_id(0);
int y = prefilter_rect.y + get_global_id(1);
@@ -47,8 +46,7 @@ __kernel void kernel_ocl_filter_divide_shadow(int sample,
bufferVariance,
prefilter_rect,
buffer_pass_stride,
- buffer_denoising_offset,
- use_split_variance);
+ buffer_denoising_offset);
}
}
@@ -60,8 +58,7 @@ __kernel void kernel_ocl_filter_get_feature(int sample,
ccl_global float *variance,
int4 prefilter_rect,
int buffer_pass_stride,
- int buffer_denoising_offset,
- char use_split_variance)
+ int buffer_denoising_offset)
{
int x = prefilter_rect.x + get_global_id(0);
int y = prefilter_rect.y + get_global_id(1);
@@ -73,8 +70,7 @@ __kernel void kernel_ocl_filter_get_feature(int sample,
mean, variance,
prefilter_rect,
buffer_pass_stride,
- buffer_denoising_offset,
- use_split_variance);
+ buffer_denoising_offset);
}
}