diff options
author | Lukas Stockner <lukas.stockner@freenet.de> | 2017-02-09 00:53:06 +0300 |
---|---|---|
committer | Lukas Stockner <lukas.stockner@freenet.de> | 2017-02-09 00:53:06 +0300 |
commit | 04abe01b6c65de2681af57caed43ef1aa4d1eb9f (patch) | |
tree | 0467832ef68517062d757fc2a63d1934268b114f | |
parent | 2f6db0e227d8835bc4b2ec5d0e181c5cf29da7dc (diff) |
Cycles Denoising: Use device-independent denoising code for CUDA as well
As a result, cross-denoising on CUDA works now.
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 30 | ||||
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 615 | ||||
-rw-r--r-- | intern/cycles/device/device_denoising.cpp | 80 | ||||
-rw-r--r-- | intern/cycles/device/device_denoising.h | 16 | ||||
-rw-r--r-- | intern/cycles/filter/filter_defines.h | 9 | ||||
-rw-r--r-- | intern/cycles/filter/filter_prefilter.h | 24 | ||||
-rw-r--r-- | intern/cycles/filter/filter_transform.h | 5 | ||||
-rw-r--r-- | intern/cycles/filter/filter_transform_cuda.h | 3 | ||||
-rw-r--r-- | intern/cycles/filter/filter_transform_sse.h | 5 | ||||
-rw-r--r-- | intern/cycles/filter/kernels/cpu/filter_cpu.h | 16 | ||||
-rw-r--r-- | intern/cycles/filter/kernels/cpu/filter_cpu_impl.h | 29 | ||||
-rw-r--r-- | intern/cycles/filter/kernels/cuda/filter.cu | 59 |
12 files changed, 419 insertions, 472 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index bd5630ae958..4c12556bf28 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -137,10 +137,10 @@ public: KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel; KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel; - KernelFunctions<void(*)(int, float**, int, int, int*, int*, int*, int*, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel; - KernelFunctions<void(*)(int, float**, int, int, int, int, int*, int*, int*, int*, float*, float*, int*, int, int, bool)> filter_get_feature_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel; - KernelFunctions<void(*)(int, int, int, float*, int, int, int, int)> filter_divide_combined_kernel; + KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel; + KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int, bool)> filter_get_feature_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel; + KernelFunctions<void(*)(int, int, int, float*, int, int, int, int)> filter_divide_combined_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel; KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; @@ -148,7 +148,7 @@ public: KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel; KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; - KernelFunctions<void(*)(int, float*, int, int, int, float*, int*, int*, int, float, int, int)> filter_construct_transform_kernel; + KernelFunctions<void(*)(int, float*, int, int, int, float*, int*, int*, int, float)> filter_construct_transform_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel; KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; @@ -363,9 +363,7 @@ public: (int*) task->storage.rank.device_pointer, &task->rect.x, task->half_window, - task->pca_threshold, - 1, - 0); + task->pca_threshold); } } return true; @@ -463,12 +461,8 @@ public: for(int y = task->rect.y; y < task->rect.w; y++) { for(int x = task->rect.x; x < task->rect.z; x++) { filter_divide_shadow_kernel()(task->render_buffer.samples, - (float**) task->neighbors.buffers, + task->tiles, x, y, - task->neighbors.tile_x, - task->neighbors.tile_y, - task->neighbors.offsets, - task->neighbors.strides, (float*) a_ptr, (float*) b_ptr, (float*) sample_variance_ptr, @@ -492,14 +486,10 @@ public: for(int y = task->rect.y; y < task->rect.w; y++) { for(int x = task->rect.x; x < task->rect.z; x++) { filter_get_feature_kernel()(task->render_buffer.samples, - (float**) task->neighbors.buffers, + task->tiles, mean_offset, variance_offset, x, y, - task->neighbors.tile_x, - task->neighbors.tile_y, - task->neighbors.offsets, - task->neighbors.strides, (float*) mean_ptr, (float*) variance_ptr, &task->rect.x, @@ -560,7 +550,7 @@ public: denoising.filter_area = make_int4(tile.x + overscan, tile.y + overscan, tile.w - 2*overscan, tile.h - 2*overscan); denoising.render_buffer.samples = end_sample; - denoising.neighbors.init_from_single_tile(tile); + denoising.tiles_from_single_tile(tile); denoising.init_from_kerneldata(&kg.__data); denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising); @@ -583,7 +573,7 @@ public: RenderTile rtiles[9]; rtiles[4] = tile; task.get_neighbor_tiles(rtiles); - denoising.neighbors.init_from_rendertiles(rtiles); + denoising.tiles_from_rendertiles(rtiles); denoising.init_from_kerneldata(&kg.__data); diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index ba4424f844b..1bbe98113ec 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -21,6 +21,7 @@ #include "device.h" #include "device_intern.h" +#include "device_denoising.h" #include "buffers.h" @@ -143,7 +144,7 @@ public: CUresult result = stmt; \ \ if(result != CUDA_SUCCESS) { \ - string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ + string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \ if(error_msg == "") \ error_msg = message; \ fprintf(stderr, "%s\n", message.c_str()); \ @@ -520,7 +521,9 @@ public: void mem_zero(device_memory& mem) { - memset((void*)mem.data_pointer, 0, mem.memory_size()); + if(mem.data_pointer) { + memset((void*)mem.data_pointer, 0, mem.memory_size()); + } cuda_push_context(); if(mem.device_pointer) @@ -542,6 +545,11 @@ public: } } + virtual device_ptr mem_get_offset_ptr(device_memory& mem, int offset) + { + return (device_ptr) (((char*) mem.device_pointer) + mem.memory_offset(offset)); + } + void const_copy_to(const char *name, void *host, size_t size) { CUdeviceptr mem; @@ -845,368 +853,343 @@ public: } } - void non_local_means(int4 rect, CUdeviceptr image, CUdeviceptr weight, CUdeviceptr out, CUdeviceptr variance, CUdeviceptr difference, CUdeviceptr blurDifference, CUdeviceptr weightAccum, int r, int f, float a, float k_2) { +#define CUDA_GET_BLOCKSIZE(func, w, h) \ + int threads_per_block; \ + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int threads = (int)sqrt((float)threads_per_block); \ + int xblocks = ((w) + threads - 1)/threads; \ + int yblocks = ((h) + threads - 1)/threads; + +#define CUDA_LAUNCH_KERNEL(func, args) \ + cuda_assert(cuLaunchKernel(func, \ + xblocks, yblocks, 1, \ + threads, threads, 1, \ + 0, 0, args, 0)); + + bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + int4 rect = task->rect; int w = align_up(rect.z-rect.x, 4); int h = rect.w-rect.y; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + CUdeviceptr difference = task->nlm_state.temporary_1_ptr; + CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr; + CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr; cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h)); - cuda_assert(cuMemsetD8(out, 0, sizeof(float)*w*h)); + cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h)); CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize; cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); - cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); + cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); - int threads_per_block; - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuNLMCalcDifference)); - - int xthreads = (int)sqrt((float)threads_per_block); - int ythreads = (int)sqrt((float)threads_per_block); - int xblocks = ((rect.z-rect.x) + xthreads - 1)/xthreads; - int yblocks = ((rect.w-rect.y) + ythreads - 1)/ythreads; + CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y); int dx, dy; int4 local_rect; - void *calc_difference_args[] = {&dx, &dy, &weight, &variance, &difference, &local_rect, &w, &a, &k_2}; - void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f}; - void *update_output_args[] = {&dx, &dy, &blurDifference, &image, &out, &weightAccum, &local_rect, &w, &f}; + int channel_offset = 0; + void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2}; + void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f}; + void *update_output_args[] = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f}; for(int i = 0; i < (2*r+1)*(2*r+1); i++) { dy = i / (2*r+1) - r; dx = i % (2*r+1) - r; local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)); - cuda_assert(cuLaunchKernel(cuNLMCalcDifference, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, calc_difference_args, 0)); - cuda_assert(cuLaunchKernel(cuNLMBlur, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, blur_args, 0)); - cuda_assert(cuLaunchKernel(cuNLMCalcWeight, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, calc_weight_args, 0)); - cuda_assert(cuLaunchKernel(cuNLMBlur, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, blur_args, 0)); - cuda_assert(cuLaunchKernel(cuNLMUpdateOutput, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, update_output_args, 0)); + CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args); } local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y); - void *normalize_args[] = {&out, &weightAccum, &local_rect, &w}; - cuda_assert(cuLaunchKernel(cuNLMNormalize, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, normalize_args, 0)); + void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w}; + CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); } - void denoise(RenderTile &rtile, int sample) + bool denoising_construct_transform(DenoisingTask *task) { if(have_error()) - return; + return false; cuda_push_context(); - CUfunction cuFilterDivideShadow, cuFilterGetFeature, cuFilterCombineHalves; - CUfunction cuFilterConstructTransform, cuFilterDivideCombined; - CUdeviceptr d_buffers = cuda_device_ptr(rtile.buffer); + CUfunction cuFilterConstructTransform; + cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); + cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); + CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, + task->storage.w, + task->storage.h); + + void *args[] = {&task->render_buffer.samples, + &task->buffer.mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->filter_area, + &task->rect, + &task->half_window, + &task->pca_threshold}; + CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); + cuda_assert(cuCtxSynchronize()); - cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); - cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); - cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves")); + cuda_pop_context(); + return !have_error(); + } - cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); - cuda_assert(cuModuleGetFunction(&cuFilterDivideCombined, cuFilterModule, "kernel_cuda_filter_divide_combined")); + bool denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr guide_ptr, + device_ptr guide_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; - cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); - bool l1 = false; - if(getenv("CYCLES_DENOISE_PREFER_L1")) l1 = true; - cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, l1? CU_FUNC_CACHE_PREFER_L1: CU_FUNC_CACHE_PREFER_SHARED)); - cuda_assert(cuFuncSetCacheConfig(cuFilterDivideCombined, l1? CU_FUNC_CACHE_PREFER_L1: CU_FUNC_CACHE_PREFER_SHARED)); + cuda_push_context(); - if(have_error()) - return; + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize; + cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); + cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); - int overscan = rtile.buffers->params.overscan; - bool use_cross_denoising = kernel_globals.film.denoise_cross; - bool use_gradients = kernel_globals.integrator.use_gradients; - int half_window = kernel_globals.integrator.half_window; - int buffer_pass_stride = kernel_globals.film.pass_stride; - int buffer_denoising_offset = kernel_globals.film.pass_denoising; - float pca_threshold = kernel_globals.integrator.filter_strength; - int num_frames = 1; - int prev_frames = 0; - - int4 filter_area = make_int4(rtile.x + overscan, rtile.y + overscan, rtile.w - 2*overscan, rtile.h - 2*overscan); - int4 buffer_area = make_int4(rtile.buffers->params.full_x, rtile.buffers->params.full_y, rtile.buffers->params.width, rtile.buffers->params.height); - int4 rect = make_int4(max(filter_area.x - half_window, buffer_area.x), - max(filter_area.y - half_window, buffer_area.y), - min(filter_area.x + filter_area.z + half_window, buffer_area.x + buffer_area.z), - min(filter_area.y + filter_area.w + half_window, buffer_area.y + buffer_area.w)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); - int threads_per_block; - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilterConstructTransform)); - - int xthreads = (int)sqrt((float)threads_per_block); - int ythreads = (int)sqrt((float)threads_per_block); - int xblocks = (buffer_area.z + xthreads - 1)/xthreads; - int yblocks = (buffer_area.w + ythreads - 1)/ythreads; - - CUdeviceptr d_denoise_buffers; - int w = align_up(rect.z - rect.x, 4); - int h = (rect.w - rect.y); - int frame_stride = w*(rect.w - rect.y); - int pass_stride = frame_stride*rtile.buffers->params.frames; - int passes = use_cross_denoising? 20 : 14; - cuda_assert(cuMemAlloc(&d_denoise_buffers, passes*pass_stride*sizeof(float))); -#define CUDA_PTR_ADD(ptr, x) ((CUdeviceptr) (((float*) (ptr)) + (x))) - - for(int frame = 0; frame < rtile.buffers->params.frames; frame++) { - CUdeviceptr d_denoise_buffer = CUDA_PTR_ADD(d_denoise_buffers, frame_stride*frame); - CUdeviceptr d_buffer = CUDA_PTR_ADD(d_buffers, frame*rtile.buffers->params.width*rtile.buffers->params.height*rtile.buffers->params.get_passes_size()); - - /* ==== Step 1: Prefilter shadow feature. ==== */ - { - CUdeviceptr d_mean = CUDA_PTR_ADD(d_denoise_buffer, 4*pass_stride); - /* Reuse some passes of the filter_buffer for temporary storage. */ - CUdeviceptr d_sampleV = CUDA_PTR_ADD(d_denoise_buffer, 0*pass_stride); - CUdeviceptr d_sampleVV = CUDA_PTR_ADD(d_denoise_buffer, 1*pass_stride); - CUdeviceptr d_bufferV = CUDA_PTR_ADD(d_denoise_buffer, 2*pass_stride); - CUdeviceptr d_cleanV = CUDA_PTR_ADD(d_denoise_buffer, 3*pass_stride); - CUdeviceptr d_unfilteredA = CUDA_PTR_ADD(d_denoise_buffer, 5*pass_stride); - CUdeviceptr d_unfilteredB = CUDA_PTR_ADD(d_denoise_buffer, 6*pass_stride); - - CUdeviceptr d_temp1 = CUDA_PTR_ADD(d_denoise_buffer, 7*pass_stride); - CUdeviceptr d_temp2 = CUDA_PTR_ADD(d_denoise_buffer, 8*pass_stride); - CUdeviceptr d_temp3 = CUDA_PTR_ADD(d_denoise_buffer, 9*pass_stride); - - CUdeviceptr d_null = (CUdeviceptr) 0; - /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */ - void *divide_args[] = {&sample, &d_buffer, - &buffer_area, - &rtile.offset, &rtile.stride, - &d_unfilteredA, &d_unfilteredB, - &d_sampleV, &d_sampleVV, &d_bufferV, - &rect, &buffer_pass_stride, &buffer_denoising_offset, - &use_gradients}; - cuda_assert(cuLaunchKernel(cuFilterDivideShadow, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, divide_args, 0)); - - /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */ - non_local_means(rect, d_bufferV, d_sampleV, d_cleanV, d_sampleVV, d_temp1, d_temp2, d_temp3, 6, 3, 2.0f, 2.0f); - - /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */ - non_local_means(rect, d_unfilteredA, d_unfilteredB, d_sampleV, d_cleanV, d_temp1, d_temp2, d_temp3, 5, 3, 1.0f, 0.25f); - non_local_means(rect, d_unfilteredB, d_unfilteredA, d_bufferV, d_cleanV, d_temp1, d_temp2, d_temp3, 5, 3, 1.0f, 0.25f); - - /* Estimate the residual variance between the two filtered halves. */ - int var_r = 2; - void *residual_variance_args[] = {&d_null, &d_cleanV, &d_sampleV, &d_bufferV, - &rect, &var_r}; - cuda_assert(cuLaunchKernel(cuFilterCombineHalves, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, residual_variance_args, 0)); - - /* Use the residual variance for a second filter pass. */ - non_local_means(rect, d_sampleV, d_bufferV, d_unfilteredA, d_cleanV, d_temp1, d_temp2, d_temp3, 4, 2, 1.0f, 1.0f); - non_local_means(rect, d_bufferV, d_sampleV, d_unfilteredB, d_cleanV, d_temp1, d_temp2, d_temp3, 4, 2, 1.0f, 1.0f); - - /* Combine the two double-filtered halves to a final shadow feature image and associated variance. */ - var_r = 0; - void *final_prefiltered_args[] = {&d_mean, &d_null, - &d_unfilteredA, &d_unfilteredB, - &rect, &var_r}; - cuda_assert(cuLaunchKernel(cuFilterCombineHalves, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, final_prefiltered_args, 0)); - cuda_assert(cuCtxSynchronize()); - } + CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); - /* ==== Step 2: Prefilter general features. ==== */ - { - CUdeviceptr d_unfiltered = CUDA_PTR_ADD(d_denoise_buffer, 8*pass_stride); - CUdeviceptr d_variance = CUDA_PTR_ADD(d_denoise_buffer, 9*pass_stride); - CUdeviceptr d_temp1 = CUDA_PTR_ADD(d_denoise_buffer, 10*pass_stride); - CUdeviceptr d_temp2 = CUDA_PTR_ADD(d_denoise_buffer, 11*pass_stride); - CUdeviceptr d_temp3 = CUDA_PTR_ADD(d_denoise_buffer, 12*pass_stride); - - int mean_from[] = { 0, 1, 2, 6, 7, 8, 12 }; - int variance_from[] = { 3, 4, 5, 9, 10, 11, 13 }; - int mean_to[] = { 1, 2, 3, 0, 5, 6, 7 }; - for(int i = 0; i < 7; i++) { - CUdeviceptr d_mean = CUDA_PTR_ADD(d_denoise_buffer, mean_to[i]*pass_stride); - - void *get_feature_args[] = {&sample, &d_buffer, &mean_from[i], &variance_from[i], - &buffer_area, - &rtile.offset, &rtile.stride, - &d_unfiltered, &d_variance, - &rect, &buffer_pass_stride, - &buffer_denoising_offset, - &use_cross_denoising}; - cuda_assert(cuLaunchKernel(cuFilterGetFeature, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, get_feature_args, 0)); - - /* Smooth the feature using non-local means. */ - non_local_means(rect, d_unfiltered, d_unfiltered, d_mean, d_variance, d_temp1, d_temp2, d_temp3, 4, 2, 1.0f, 0.25f); - } - } + CUdeviceptr difference = task->reconstruction_state.temporary_1_ptr; + CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr; - /* ==== Step 3: Copy combined color pass. ==== */ - { - int mean_from[] = {20, 21, 22, 26, 27, 28}; - int variance_from[] = {23, 24, 25, 29, 30, 31}; - int mean_to[] = { 8, 9, 10, 14, 15, 16}; - int variance_to[] = {11, 12, 13, 17, 18, 19}; - for(int i = 0; i < (use_cross_denoising? 6 : 3); i++) { - CUdeviceptr d_mean = CUDA_PTR_ADD(d_denoise_buffer, mean_to[i]*pass_stride); - CUdeviceptr d_variance = CUDA_PTR_ADD(d_denoise_buffer, variance_to[i]*pass_stride); - - void *get_feature_args[] = {&sample, &d_buffer, &mean_from[i], &variance_from[i], - &buffer_area, - &rtile.offset, &rtile.stride, - &d_mean, &d_variance, - &rect, &buffer_pass_stride, - &buffer_denoising_offset, - &use_cross_denoising}; - cuda_assert(cuLaunchKernel(cuFilterGetFeature, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, get_feature_args, 0)); - } - } + int r = task->half_window; + int f = 4; + float a = 1.0f; + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + + void *calc_difference_args[] = {&dx, &dy, + &guide_ptr, + &guide_variance_ptr, + &difference, + &local_rect, + &task->buffer.w, + &task->buffer.pass_stride, + &a, + &task->nlm_k_2}; + CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args); + + void *blur_args[] = {&difference, + &blurDifference, + &local_rect, + &task->buffer.w, + &f}; + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + + void *calc_weight_args[] = {&blurDifference, + &difference, + &local_rect, + &task->buffer.w, + &f}; + CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args); + + /* Reuse previous arguments. */ + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + + void *construct_gramian_args[] = {&dx, &dy, + &blurDifference, + &task->buffer.mem.device_pointer, + &color_ptr, + &color_variance_ptr, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &local_rect, + &task->reconstruction_state.filter_rect, + &task->buffer.w, + &task->buffer.h, + &f}; + CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args); } - /* Use the prefiltered feature to denoise the image. */ - int storage_num = filter_area.z*filter_area.w; - CUdeviceptr d_rank, d_transform; - cuda_assert(cuMemAlloc(&d_rank, storage_num*sizeof(int))); - cuda_assert(cuMemAlloc(&d_transform, storage_num*sizeof(float)*TRANSFORM_SIZE)); - - xthreads = (int)sqrt((float)threads_per_block); - ythreads = (int)sqrt((float)threads_per_block); - xblocks = (filter_area.z + xthreads - 1)/xthreads; - yblocks = (filter_area.w + ythreads - 1)/ythreads; - - void *transform_args[] = {&sample, - &d_denoise_buffers, - &d_transform, - &d_rank, - &filter_area, - &rect, - &half_window, - &pca_threshold, - &num_frames, - &prev_frames}; - cuda_assert(cuLaunchKernel(cuFilterConstructTransform, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, transform_args, 0)); + void *finalize_args[] = {&task->buffer.w, + &task->buffer.h, + &output_ptr, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->filter_area, + &task->reconstruction_state.buffer_params.x, + &task->render_buffer.samples}; + CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); + cuda_assert(cuCtxSynchronize()); + cuda_pop_context(); + return !have_error(); + } - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize; - cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); - cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); + bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, + device_ptr mean_ptr, device_ptr variance_ptr, + int r, int4 rect, DenoisingTask *task) + { + (void) task; - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); + if(have_error()) + return false; - xblocks = ((rect.z-rect.x) + xthreads - 1)/xthreads; - yblocks = ((rect.w-rect.y) + ythreads - 1)/ythreads; + cuda_push_context(); - int dx, dy; - int4 local_rect, local_filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, filter_area.z, filter_area.w); - int f = 4; - float a = 1.0f; - float k_2 = kernel_globals.integrator.weighting_adjust; - - CUdeviceptr color_buffer = CUDA_PTR_ADD(d_denoise_buffers, 8*pass_stride); - CUdeviceptr variance_buffer = CUDA_PTR_ADD(d_denoise_buffers, 11*pass_stride); - CUdeviceptr d_difference, d_blurDifference, d_XtWX, d_XtWY; - cuda_assert(cuMemAlloc(&d_difference, pass_stride*sizeof(float))); - cuda_assert(cuMemAlloc(&d_blurDifference, pass_stride*sizeof(float))); - cuda_assert(cuMemAlloc(&d_XtWX, storage_num*sizeof(float)*(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1))); - cuda_assert(cuMemAlloc(&d_XtWY, storage_num*sizeof(float3)*(DENOISE_FEATURES+1))); - cuda_assert(cuMemsetD8(d_XtWX, 0, storage_num*sizeof(float)*(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1))); - cuda_assert(cuMemsetD8(d_XtWY, 0, storage_num*sizeof(float3)*(DENOISE_FEATURES+1))); -#undef CUDA_PTR_ADD - - void *calc_difference_args[] = {&dx, &dy, &color_buffer, &variance_buffer, &d_difference, &local_rect, &w, &a, &k_2}; - void *blur_args[] = {&d_difference, &d_blurDifference, &local_rect, &w, &f}; - void *calc_weight_args[] = {&d_blurDifference, &d_difference, &local_rect, &w, &f}; - void *construct_gramian_args[] = {&dx, &dy, &d_blurDifference, &d_denoise_buffers, &color_buffer, &variance_buffer, &d_transform, &d_rank, &d_XtWX, &d_XtWY, &local_rect, &local_filter_rect, &w, &h, &f}; - - for(int i = 0; i < (2*half_window+1)*(2*half_window+1); i++) { - dy = i / (2*half_window+1) - half_window; - dx = i % (2*half_window+1) - half_window; - local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)); + CUfunction cuFilterCombineHalves; + cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves")); + cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterCombineHalves, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + void *args[] = {&mean_ptr, + &variance_ptr, + &a_ptr, + &b_ptr, + &rect, + &r}; + CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); + cuda_assert(cuCtxSynchronize()); - cuda_assert(cuLaunchKernel(cuNLMCalcDifference, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, calc_difference_args, 0)); - cuda_assert(cuLaunchKernel(cuNLMBlur, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, blur_args, 0)); - cuda_assert(cuLaunchKernel(cuNLMCalcWeight, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, calc_weight_args, 0)); - cuda_assert(cuLaunchKernel(cuNLMBlur, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, blur_args, 0)); - cuda_assert(cuLaunchKernel(cuNLMConstructGramian, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, construct_gramian_args, 0)); - } - cuda_assert(cuMemFree(d_difference)); - cuda_assert(cuMemFree(d_blurDifference)); - cuda_assert(cuMemFree(d_transform)); - cuda_assert(cuMemFree(d_denoise_buffers)); - //int w, int h, float *buffer, void *storage, float *XtWX, float3 *XtWY, int4 filter_area, int4 buffer_params, int sample) { - int4 buffer_params = make_int4(rtile.offset, rtile.stride, kernel_globals.film.pass_stride, kernel_globals.film.pass_no_denoising); - void *finalize_args[] = {&w, &h, &d_buffers, &d_rank, &d_XtWX, &d_XtWY, &filter_area, &buffer_params, &sample}; - cuda_assert(cuLaunchKernel(cuFinalize, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, finalize_args, 0)); - cuda_assert(cuMemFree(d_XtWX)); - cuda_assert(cuMemFree(d_XtWY)); - cuda_assert(cuMemFree(d_rank)); + cuda_pop_context(); + return !have_error(); + } + + bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr, + device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, DenoisingTask *task) + { + (void) task; + + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterDivideShadow; + cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); + cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterDivideShadow, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + void *args[] = {&task->render_buffer.samples, + &task->tiles_mem.device_pointer, + &a_ptr, + &b_ptr, + &sample_variance_ptr, + &sv_variance_ptr, + &buffer_variance_ptr, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.denoising_offset, + &task->use_gradients}; + CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); + cuda_assert(cuCtxSynchronize()); cuda_pop_context(); + return !have_error(); + } + + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterGetFeature; + cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); + cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterGetFeature, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + void *args[] = {&task->render_buffer.samples, + &task->tiles_mem.device_pointer, + &mean_offset, + &variance_offset, + &mean_ptr, + &variance_ptr, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.denoising_offset, + &task->use_cross_denoising}; + CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + void denoise(RenderTile &rtile, int sample) + { + DenoisingTask denoising(this); + + int overscan = rtile.buffers->params.overscan; + denoising.filter_area = make_int4(rtile.x + overscan, rtile.y + overscan, rtile.w - 2*overscan, rtile.h - 2*overscan); + denoising.render_buffer.samples = sample; + + denoising.tiles_from_single_tile(rtile); + denoising.init_from_kerneldata(&kernel_globals); + + denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + + denoising.run_denoising(); } void path_trace(RenderTile& rtile, int sample, bool branched) diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp index f5b295f2220..f77be83f556 100644 --- a/intern/cycles/device/device_denoising.cpp +++ b/intern/cycles/device/device_denoising.cpp @@ -31,50 +31,54 @@ void DenoisingTask::init_from_kerneldata(KernelData *data) render_buffer.pass_stride = data->film.pass_stride; render_buffer.denoising_offset = data->film.pass_denoising; render_buffer.no_denoising_offset = data->film.pass_no_denoising; - render_buffer.offset = neighbors.offsets[4]; - render_buffer.stride = neighbors.strides[4]; - render_buffer.ptr = neighbors.buffers[4]; - - /* Expand filter_area by half_window pixels and clamp the result to the extent of the neighboring tiles. */ - rect = make_int4(max(neighbors.tile_x[0], filter_area.x - half_window), - max(neighbors.tile_y[0], filter_area.y - half_window), - min(neighbors.tile_x[3], filter_area.x + filter_area.z + half_window), - min(neighbors.tile_y[3], filter_area.y + filter_area.w + half_window)); + render_buffer.offset = tiles->offsets[4]; + render_buffer.stride = tiles->strides[4]; + render_buffer.ptr = tiles->buffers[4]; + + /* Expand filter_area by half_window pixels and clamp the result to the extent of the neighboring tiles */ + rect = make_int4(max(tiles->x[0], filter_area.x - half_window), + max(tiles->y[0], filter_area.y - half_window), + min(tiles->x[3], filter_area.x + filter_area.z + half_window), + min(tiles->y[3], filter_area.y + filter_area.w + half_window)); } -void DenoisingTask::NeighborBuffers::init_from_single_tile(const RenderTile &tile) +void DenoisingTask::tiles_from_single_tile(const RenderTile &tile) { - tile_x[0] = tile.x; - tile_x[1] = tile.x; - tile_x[2] = tile.x+tile.w; - tile_x[3] = tile.x+tile.w; - tile_y[0] = tile.y; - tile_y[1] = tile.y; - tile_y[2] = tile.y+tile.h; - tile_y[3] = tile.y+tile.h; - std::fill(buffers, buffers+9, (device_ptr) 0); - std::fill(offsets, offsets+9, 0); - std::fill(strides, strides+9, 0); - buffers[4] = tile.buffer; - offsets[4] = tile.offset; - strides[4] = tile.stride; + tiles = (TilesInfo*) tiles_mem.resize(sizeof(TilesInfo)/sizeof(int)); + + tiles->x[0] = tile.x; + tiles->x[1] = tile.x; + tiles->x[2] = tile.x+tile.w; + tiles->x[3] = tile.x+tile.w; + tiles->y[0] = tile.y; + tiles->y[1] = tile.y; + tiles->y[2] = tile.y+tile.h; + tiles->y[3] = tile.y+tile.h; + std::fill(tiles->buffers, tiles->buffers+9, (device_ptr) 0); + std::fill(tiles->offsets, tiles->offsets+9, 0); + std::fill(tiles->strides, tiles->strides+9, 0); + tiles->buffers[4] = tile.buffer; + tiles->offsets[4] = tile.offset; + tiles->strides[4] = tile.stride; } -void DenoisingTask::NeighborBuffers::init_from_rendertiles(RenderTile *rtiles) +void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles) { + tiles = (TilesInfo*) tiles_mem.resize(sizeof(TilesInfo)/sizeof(int)); + for(int i = 0; i < 9; i++) { - buffers[i] = rtiles[i].buffer; - offsets[i] = rtiles[i].offset; - strides[i] = rtiles[i].stride; + tiles->buffers[i] = rtiles[i].buffer; + tiles->offsets[i] = rtiles[i].offset; + tiles->strides[i] = rtiles[i].stride; } - tile_x[0] = rtiles[3].x; - tile_x[1] = rtiles[4].x; - tile_x[2] = rtiles[5].x; - tile_x[3] = rtiles[5].x + rtiles[5].w; - tile_y[0] = rtiles[1].y; - tile_y[1] = rtiles[4].y; - tile_y[2] = rtiles[7].y; - tile_y[3] = rtiles[7].y + rtiles[7].h; + tiles->x[0] = rtiles[3].x; + tiles->x[1] = rtiles[4].x; + tiles->x[2] = rtiles[5].x; + tiles->x[3] = rtiles[5].x + rtiles[5].w; + tiles->y[0] = rtiles[1].y; + tiles->y[1] = rtiles[4].y; + tiles->y[2] = rtiles[7].y; + tiles->y[3] = rtiles[7].y + rtiles[7].h; } bool DenoisingTask::run_denoising() @@ -87,6 +91,9 @@ bool DenoisingTask::run_denoising() buffer.mem.resize(buffer.pass_stride * buffer.passes); device->mem_alloc(buffer.mem, MEM_READ_WRITE); + device->mem_alloc(tiles_mem, MEM_READ_ONLY); + device->mem_copy_to(tiles_mem); + device_ptr null_ptr = (device_ptr) 0; /* Prefilter shadow feature. */ @@ -262,6 +269,7 @@ bool DenoisingTask::run_denoising() device->mem_free(temporary_1); device->mem_free(temporary_2); device->mem_free(buffer.mem); + device->mem_free(tiles_mem); return true; } diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h index 23430706733..9b7bedb83db 100644 --- a/intern/cycles/device/device_denoising.h +++ b/intern/cycles/device/device_denoising.h @@ -20,6 +20,8 @@ #include "device.h" #include "buffers.h" +#include "filter_defines.h" + CCL_NAMESPACE_BEGIN class DenoisingTask { @@ -42,16 +44,10 @@ public: int samples; } render_buffer; - struct NeighborBuffers { - int tile_x[4]; - int tile_y[4]; - device_ptr buffers[9]; - int offsets[9]; - int strides[9]; - - void init_from_single_tile(const RenderTile &tile); - void init_from_rendertiles(RenderTile *rtiles); - } neighbors; + TilesInfo *tiles; + device_vector<int> tiles_mem; + void tiles_from_single_tile(const RenderTile &tile); + void tiles_from_rendertiles(RenderTile *rtiles); int4 rect; int4 filter_area; diff --git a/intern/cycles/filter/filter_defines.h b/intern/cycles/filter/filter_defines.h index c6dfe96283a..35b0b079e19 100644 --- a/intern/cycles/filter/filter_defines.h +++ b/intern/cycles/filter/filter_defines.h @@ -23,4 +23,13 @@ #define XTWX_SIZE ((DENOISE_FEATURES+1)*(DENOISE_FEATURES+1)) #define XTWY_SIZE (DENOISE_FEATURES+1) +typedef struct TilesInfo { + int offsets[9]; + int strides[9]; + /* TODO(lukas): CUDA doesn't have uint64_t... */ + long long int buffers[9]; + int x[4]; + int y[4]; +} TilesInfo; + #endif /* __FILTER_DEFINES_H__*/ diff --git a/intern/cycles/filter/filter_prefilter.h b/intern/cycles/filter/filter_prefilter.h index b2eeea28fd8..16c11b0f44d 100644 --- a/intern/cycles/filter/filter_prefilter.h +++ b/intern/cycles/filter/filter_prefilter.h @@ -26,10 +26,8 @@ CCL_NAMESPACE_BEGIN * bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy. */ ccl_device void kernel_filter_divide_shadow(int sample, - float **buffers, + TilesInfo *tiles, int x, int y, - int *tile_x, int *tile_y, - int *offset, int *stride, float *unfilteredA, float *unfilteredB, float *sampleVariance, @@ -40,10 +38,10 @@ ccl_device void kernel_filter_divide_shadow(int sample, int buffer_denoising_offset, bool use_gradients) { - int xtile = (x < tile_x[1])? 0: ((x < tile_x[2])? 1: 2); - int ytile = (y < tile_y[1])? 0: ((y < tile_y[2])? 1: 2); + int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); + int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2); int tile = ytile*3+xtile; - float *center_buffer = buffers[tile] + (offset[tile] + y*stride[tile] + x)*buffer_pass_stride; + float *center_buffer = ((float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride; if(use_gradients && tile == 4) { center_buffer[0] = center_buffer[1] = center_buffer[2] = center_buffer[3] = 0.0f; @@ -63,27 +61,23 @@ ccl_device void kernel_filter_divide_shadow(int sample, /* Load a regular feature from the render buffers into the denoise buffer. * Parameters: * - sample: The sample amount in the buffer, used to normalize the buffer. - * - buffers: 9-Element Array containing pointers to the buffers of the 3x3 tiles around the current one. * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature. * - x, y: Current pixel - * - tile_x, tile_y: 4-Element Arrays containing the x/y coordinates of the start of the lower, current and upper tile as well as the end of the upper tile plus one. - * - offset, stride: 9-Element Arrays containing offset and stride of the RenderBuffers. * - mean, variance: Target denoise buffers. * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive). */ -ccl_device void kernel_filter_get_feature(int sample, float **buffers, +ccl_device void kernel_filter_get_feature(int sample, + TilesInfo *tiles, int m_offset, int v_offset, int x, int y, - int *tile_x, int *tile_y, - int *offset, int *stride, float *mean, float *variance, int4 rect, int buffer_pass_stride, int buffer_denoising_offset, bool use_cross_denoising) { - int xtile = (x < tile_x[1])? 0: ((x < tile_x[2])? 1: 2); - int ytile = (y < tile_y[1])? 0: ((y < tile_y[2])? 1: 2); + int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); + int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2); int tile = ytile*3+xtile; - float *center_buffer = buffers[tile] + (offset[tile] + y*stride[tile] + x)*buffer_pass_stride + buffer_denoising_offset; + float *center_buffer = ((float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset; int buffer_w = align_up(rect.z - rect.x, 4); int idx = (y-rect.y)*buffer_w + (x - rect.x); diff --git a/intern/cycles/filter/filter_transform.h b/intern/cycles/filter/filter_transform.h index 28c9224dccc..6ddc3f203b5 100644 --- a/intern/cycles/filter/filter_transform.h +++ b/intern/cycles/filter/filter_transform.h @@ -19,14 +19,13 @@ CCL_NAMESPACE_BEGIN ccl_device void kernel_filter_construct_transform(int sample, float ccl_readonly_ptr buffer, int x, int y, int4 rect, float *transform, int *rank, - int half_window, float pca_threshold, - int num_frames, int prev_frames) + int half_window, float pca_threshold) { float features[DENOISE_FEATURES]; int buffer_w = align_up(rect.z - rect.x, 4); int buffer_h = (rect.w - rect.y); - int pass_stride = buffer_h * buffer_w * num_frames; + int pass_stride = buffer_h * buffer_w; /* Temporary storage, used in different steps of the algorithm. */ float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES]; diff --git a/intern/cycles/filter/filter_transform_cuda.h b/intern/cycles/filter/filter_transform_cuda.h index b2a94acc3d0..da9dc683ebd 100644 --- a/intern/cycles/filter/filter_transform_cuda.h +++ b/intern/cycles/filter/filter_transform_cuda.h @@ -20,7 +20,6 @@ ccl_device void kernel_filter_construct_transform(int sample, float ccl_readonly int x, int y, int4 rect, float *transform, int *rank, int half_window, float pca_threshold, - int num_frames, int prev_frames, int transform_stride, int localIdx) { __shared__ float shared_features[DENOISE_FEATURES*CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH]; @@ -28,7 +27,7 @@ ccl_device void kernel_filter_construct_transform(int sample, float ccl_readonly int buffer_w = align_up(rect.z - rect.x, 4); int buffer_h = (rect.w - rect.y); - int pass_stride = buffer_h * buffer_w * num_frames; + int pass_stride = buffer_h * buffer_w; /* === Calculate denoising window. === */ int2 low = make_int2(max(rect.x, x - half_window), max(rect.y, y - half_window)); diff --git a/intern/cycles/filter/filter_transform_sse.h b/intern/cycles/filter/filter_transform_sse.h index 57b3f10998a..63b71d226fe 100644 --- a/intern/cycles/filter/filter_transform_sse.h +++ b/intern/cycles/filter/filter_transform_sse.h @@ -19,12 +19,11 @@ CCL_NAMESPACE_BEGIN ccl_device void kernel_filter_construct_transform(int sample, float ccl_readonly_ptr buffer, int x, int y, int4 rect, float *transform, int *rank, - int half_window, float pca_threshold, - int num_frames, int prev_frames) + int half_window, float pca_threshold) { int buffer_w = align_up(rect.z - rect.x, 4); int buffer_h = (rect.w - rect.y); - int pass_stride = buffer_h * buffer_w * num_frames; + int pass_stride = buffer_h * buffer_w; __m128 features[DENOISE_FEATURES]; float ccl_readonly_ptr pixel_buffer; diff --git a/intern/cycles/filter/kernels/cpu/filter_cpu.h b/intern/cycles/filter/kernels/cpu/filter_cpu.h index 6a0b58b214c..349437a22f5 100644 --- a/intern/cycles/filter/kernels/cpu/filter_cpu.h +++ b/intern/cycles/filter/kernels/cpu/filter_cpu.h @@ -17,13 +17,9 @@ /* Templated common declaration part of all CPU kernels. */ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, - float** buffers, + TilesInfo *tiles, int x, int y, - int *tile_x, - int *tile_y, - int *offset, - int *stride, float *unfilteredA, float *unfilteredB, float *sampleV, @@ -35,15 +31,11 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, bool use_gradients); void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, - float** buffers, + TilesInfo *tiles, int m_offset, int v_offset, int x, int y, - int *tile_x, - int *tile_y, - int *offset, - int *stride, float *mean, float *variance, int* prefilter_rect, @@ -68,9 +60,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(int sample, int *rank, int* rect, int half_window, - float pca_threshold, - int num_frames, - int prev_frames); + float pca_threshold); void KERNEL_FUNCTION_FULL_NAME(filter_divide_combined)(int x, int y, int sample, diff --git a/intern/cycles/filter/kernels/cpu/filter_cpu_impl.h b/intern/cycles/filter/kernels/cpu/filter_cpu_impl.h index 586c30cfa69..716937e8e78 100644 --- a/intern/cycles/filter/kernels/cpu/filter_cpu_impl.h +++ b/intern/cycles/filter/kernels/cpu/filter_cpu_impl.h @@ -35,13 +35,9 @@ CCL_NAMESPACE_BEGIN /* Denoise filter */ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, - float** buffers, + TilesInfo *tiles, int x, int y, - int *tile_x, - int *tile_y, - int *offset, - int *stride, float *unfilteredA, float *unfilteredB, float *sampleVariance, @@ -55,9 +51,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow); #else - kernel_filter_divide_shadow(sample, buffers, - x, y, tile_x, tile_y, - offset, stride, + kernel_filter_divide_shadow(sample, tiles, + x, y, unfilteredA, unfilteredB, sampleVariance, @@ -71,15 +66,11 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, } void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, - float** buffers, + TilesInfo *tiles, int m_offset, int v_offset, int x, int y, - int *tile_x, - int *tile_y, - int *offset, - int *stride, float *mean, float *variance, int* prefilter_rect, int buffer_pass_stride, @@ -89,10 +80,9 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_get_feature); #else - kernel_filter_get_feature(sample, buffers, + kernel_filter_get_feature(sample, tiles, m_offset, v_offset, - x, y, tile_x, tile_y, - offset, stride, + x, y, mean, variance, load_int4(prefilter_rect), buffer_pass_stride, buffer_denoising_offset, use_cross_denoising); @@ -123,9 +113,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(int sample, int *rank, int* prefilter_rect, int half_window, - float pca_threshold, - int num_frames, - int prev_frames) + float pca_threshold) { #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_construct_transform); @@ -135,8 +123,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(int sample, kernel_filter_construct_transform(sample, buffer, x, y, load_int4(prefilter_rect), transform, rank, - half_window, pca_threshold, - num_frames, prev_frames); + half_window, pca_threshold); #endif } diff --git a/intern/cycles/filter/kernels/cuda/filter.cu b/intern/cycles/filter/kernels/cuda/filter.cu index c62953c1fcb..0b67543b6b6 100644 --- a/intern/cycles/filter/kernels/cuda/filter.cu +++ b/intern/cycles/filter/kernels/cuda/filter.cu @@ -28,9 +28,8 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_divide_shadow(int sample, float* buffers, - int4 buffer_rect, - int offset, int stride, +kernel_cuda_filter_divide_shadow(int sample, + TilesInfo *tiles, float *unfilteredA, float *unfilteredB, float *sampleVariance, @@ -44,14 +43,9 @@ kernel_cuda_filter_divide_shadow(int sample, float* buffers, int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; if(x < prefilter_rect.z && y < prefilter_rect.w) { - int tile_x[4] = {buffer_rect.x, buffer_rect.x, buffer_rect.x+buffer_rect.z, buffer_rect.x+buffer_rect.z}; - int tile_y[4] = {buffer_rect.y, buffer_rect.y, buffer_rect.y+buffer_rect.w, buffer_rect.y+buffer_rect.w}; - float *tile_buffers[9] = {NULL, NULL, NULL, NULL, buffers, NULL, NULL, NULL, NULL}; - int tile_offset[9] = {0, 0, 0, 0, offset, 0, 0, 0, 0}; - int tile_stride[9] = {0, 0, 0, 0, stride, 0, 0, 0, 0}; - kernel_filter_divide_shadow(sample, tile_buffers, - x, y, tile_x, tile_y, - tile_offset, tile_stride, + kernel_filter_divide_shadow(sample, + tiles, + x, y, unfilteredA, unfilteredB, sampleVariance, @@ -66,29 +60,29 @@ kernel_cuda_filter_divide_shadow(int sample, float* buffers, extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_get_feature(int sample, float* buffers, - int m_offset, int v_offset, - int4 buffer_rect, - int offset, int stride, - float *mean, float *variance, - int4 prefilter_rect, int buffer_pass_stride, - int buffer_denoising_offset, bool use_cross_denoising) +kernel_cuda_filter_get_feature(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + float *mean, + float *variance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_cross_denoising) { int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; if(x < prefilter_rect.z && y < prefilter_rect.w) { - int tile_x[4] = {buffer_rect.x, buffer_rect.x, buffer_rect.x+buffer_rect.z, buffer_rect.x+buffer_rect.z}; - int tile_y[4] = {buffer_rect.y, buffer_rect.y, buffer_rect.y+buffer_rect.w, buffer_rect.y+buffer_rect.w}; - float *tile_buffers[9] = {NULL, NULL, NULL, NULL, buffers, NULL, NULL, NULL, NULL}; - int tile_offset[9] = {0, 0, 0, 0, offset, 0, 0, 0, 0}; - int tile_stride[9] = {0, 0, 0, 0, stride, 0, 0, 0, 0}; - kernel_filter_get_feature(sample, tile_buffers, + kernel_filter_get_feature(sample, + tiles, m_offset, v_offset, - x, y, tile_x, tile_y, - tile_offset, tile_stride, + x, y, mean, variance, - prefilter_rect, buffer_pass_stride, - buffer_denoising_offset, use_cross_denoising); + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_cross_denoising); } } @@ -108,8 +102,7 @@ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_construct_transform(int sample, float const* __restrict__ buffer, float *transform, int *rank, int4 filter_area, int4 rect, - int half_window, float pca_threshold, - int num_frames, int prev_frames) + int half_window, float pca_threshold) { int x = blockDim.x*blockIdx.x + threadIdx.x; int y = blockDim.y*blockIdx.y + threadIdx.y; @@ -120,7 +113,6 @@ kernel_cuda_filter_construct_transform(int sample, float const* __restrict__ buf x + filter_area.x, y + filter_area.y, rect, l_transform, l_rank, half_window, pca_threshold, - num_frames, prev_frames, filter_area.z*filter_area.w, threadIdx.y*blockDim.x + threadIdx.x); } @@ -148,11 +140,12 @@ kernel_cuda_filter_nlm_calc_difference(int dx, int dy, float ccl_readonly_ptr varianceImage, float *differenceImage, int4 rect, int w, + int channel_offset, float a, float k_2) { int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; if(x < rect.z && y < rect.w) { - kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, 0, a, k_2); + kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2); } } @@ -236,7 +229,7 @@ kernel_cuda_filter_finalize(int w, int h, rank += storage_ofs; XtWX += storage_ofs; XtWY += storage_ofs; - kernel_filter_finalize(x+filter_area.x, y+filter_area.y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample); + kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample); } } |