From d5ca72191c36f3022db8fa5a17d933ee82c82d30 Mon Sep 17 00:00:00 2001 From: Patrick Mours Date: Wed, 11 Dec 2019 18:11:46 +0100 Subject: Cycles: Add OptiX AI denoiser support This patch adds support for the OptiX denoiser as an alternative to the existing NLM denoiser in Cycles. It's re-using the same denoising architecture based on tiles and therefore implicitly also works with multiple GPUs. Reviewed By: sergey Differential Revision: https://developer.blender.org/D6395 --- intern/cycles/blender/addon/properties.py | 20 ++ intern/cycles/blender/addon/ui.py | 20 +- intern/cycles/blender/blender_session.cpp | 19 +- intern/cycles/blender/blender_sync.cpp | 32 ++- intern/cycles/device/device_optix.cpp | 393 +++++++++++++++++++++++----- intern/cycles/device/device_task.h | 4 + intern/cycles/kernel/kernel_passes.h | 4 + intern/cycles/kernel/kernels/cuda/filter.cu | 84 +++++- intern/cycles/render/buffers.cpp | 30 ++- intern/cycles/render/session.cpp | 5 +- intern/cycles/render/session.h | 2 + 11 files changed, 506 insertions(+), 107 deletions(-) (limited to 'intern') diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index e09f15b46e8..5f163c2510b 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -197,6 +197,12 @@ enum_aov_types = ( ('COLOR', "Color", "Write a Color pass", 1), ) +enum_denoising_optix_input_passes= ( + ('RGB', "Color", "Use only color as input", 1), + ('RGB_ALBEDO', "Color + Albedo", "Use color and albedo data as input", 2), + ('RGB_ALBEDO_NORMAL', "Color + Albedo + Normal", "Use color, albedo and normal data as input", 3), +) + class CyclesRenderSettings(bpy.types.PropertyGroup): device: EnumProperty( @@ -1279,6 +1285,7 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup): default=False, update=update_render_passes, ) + use_pass_volume_direct: BoolProperty( name="Volume Direct", description="Deliver direct volumetric scattering pass", @@ -1298,6 +1305,12 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup): default=False, update=update_render_passes, ) + use_optix_denoising: BoolProperty( + name="Use OptiX AI Denoising", + description="Denoise the rendered image with the OptiX AI denoiser", + default=False, + update=update_render_passes, + ) denoising_diffuse_direct: BoolProperty( name="Diffuse Direct", description="Denoise the direct diffuse lighting", @@ -1374,6 +1387,13 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup): min=0, max=7, default=0, ) + denoising_optix_input_passes: EnumProperty( + name="Input Passes", + description="Controls which passes the OptiX AI denoiser should use as input, which can have different effects on the denoised image", + items=enum_denoising_optix_input_passes, + default='RGB', + ) + use_pass_crypto_object: BoolProperty( name="Cryptomatte Object", description="Render cryptomatte object pass, for isolating objects in compositing", diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index c4182ba564a..35d5d3801d2 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -979,11 +979,21 @@ class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel): split = layout.split() split.active = cycles_view_layer.use_denoising - layout = layout.column(align=True) - layout.prop(cycles_view_layer, "denoising_radius", text="Radius") - layout.prop(cycles_view_layer, "denoising_strength", slider=True, text="Strength") - layout.prop(cycles_view_layer, "denoising_feature_strength", slider=True, text="Feature Strength") - layout.prop(cycles_view_layer, "denoising_relative_pca") + col = split.column(align=True) + + if use_optix(context): + col.prop(cycles_view_layer, "use_optix_denoising", text="OptiX AI Denoising") + + if cycles_view_layer.use_optix_denoising: + col.prop(cycles_view_layer, "denoising_optix_input_passes") + return + + col.separator(factor=2.0) + + col.prop(cycles_view_layer, "denoising_radius", text="Radius") + col.prop(cycles_view_layer, "denoising_strength", slider=True, text="Strength") + col.prop(cycles_view_layer, "denoising_feature_strength", slider=True, text="Feature Strength") + col.prop(cycles_view_layer, "denoising_relative_pca") layout.separator() diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index 26b04babce2..924807350f9 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -478,23 +478,24 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_) buffer_params.passes = passes; PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles"); - bool full_denoising = get_boolean(crl, "use_denoising"); + bool use_denoising = get_boolean(crl, "use_denoising"); + bool use_optix_denoising = get_boolean(crl, "use_optix_denoising"); bool write_denoising_passes = get_boolean(crl, "denoising_store_passes"); - bool run_denoising = full_denoising || write_denoising_passes; - - session->tile_manager.schedule_denoising = run_denoising; - buffer_params.denoising_data_pass = run_denoising; + buffer_params.denoising_data_pass = use_denoising || write_denoising_passes; buffer_params.denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES); - buffer_params.denoising_prefiltered_pass = write_denoising_passes; + buffer_params.denoising_prefiltered_pass = write_denoising_passes && !use_optix_denoising; - session->params.run_denoising = run_denoising; - session->params.full_denoising = full_denoising; - session->params.write_denoising_passes = write_denoising_passes; + session->params.run_denoising = use_denoising || write_denoising_passes; + session->params.full_denoising = use_denoising && !use_optix_denoising; + session->params.optix_denoising = use_denoising && use_optix_denoising; + session->params.write_denoising_passes = write_denoising_passes && !use_optix_denoising; session->params.denoising.radius = get_int(crl, "denoising_radius"); session->params.denoising.strength = get_float(crl, "denoising_strength"); session->params.denoising.feature_strength = get_float(crl, "denoising_feature_strength"); session->params.denoising.relative_pca = get_boolean(crl, "denoising_relative_pca"); + session->params.denoising.optix_input_passes = get_enum(crl, "denoising_optix_input_passes"); + session->tile_manager.schedule_denoising = session->params.run_denoising; scene->film->denoising_data_pass = buffer_params.denoising_data_pass; scene->film->denoising_clean_pass = buffer_params.denoising_clean_pass; diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 332ee3575c0..20dbe23cdb7 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -535,23 +535,26 @@ vector BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa } PointerRNA crp = RNA_pointer_get(&b_view_layer.ptr, "cycles"); - bool full_denoising = get_boolean(crp, "use_denoising"); + bool use_denoising = get_boolean(crp, "use_denoising"); + bool use_optix_denoising = get_boolean(crp, "use_optix_denoising"); bool write_denoising_passes = get_boolean(crp, "denoising_store_passes"); scene->film->denoising_flags = 0; - if (full_denoising || write_denoising_passes) { + if (use_denoising || write_denoising_passes) { + if (!use_optix_denoising) { #define MAP_OPTION(name, flag) \ if (!get_boolean(crp, name)) \ scene->film->denoising_flags |= flag; - MAP_OPTION("denoising_diffuse_direct", DENOISING_CLEAN_DIFFUSE_DIR); - MAP_OPTION("denoising_diffuse_indirect", DENOISING_CLEAN_DIFFUSE_IND); - MAP_OPTION("denoising_glossy_direct", DENOISING_CLEAN_GLOSSY_DIR); - MAP_OPTION("denoising_glossy_indirect", DENOISING_CLEAN_GLOSSY_IND); - MAP_OPTION("denoising_transmission_direct", DENOISING_CLEAN_TRANSMISSION_DIR); - MAP_OPTION("denoising_transmission_indirect", DENOISING_CLEAN_TRANSMISSION_IND); - MAP_OPTION("denoising_subsurface_direct", DENOISING_CLEAN_SUBSURFACE_DIR); - MAP_OPTION("denoising_subsurface_indirect", DENOISING_CLEAN_SUBSURFACE_IND); + MAP_OPTION("denoising_diffuse_direct", DENOISING_CLEAN_DIFFUSE_DIR); + MAP_OPTION("denoising_diffuse_indirect", DENOISING_CLEAN_DIFFUSE_IND); + MAP_OPTION("denoising_glossy_direct", DENOISING_CLEAN_GLOSSY_DIR); + MAP_OPTION("denoising_glossy_indirect", DENOISING_CLEAN_GLOSSY_IND); + MAP_OPTION("denoising_transmission_direct", DENOISING_CLEAN_TRANSMISSION_DIR); + MAP_OPTION("denoising_transmission_indirect", DENOISING_CLEAN_TRANSMISSION_IND); + MAP_OPTION("denoising_subsurface_direct", DENOISING_CLEAN_SUBSURFACE_DIR); + MAP_OPTION("denoising_subsurface_indirect", DENOISING_CLEAN_SUBSURFACE_IND); #undef MAP_OPTION + } b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str()); } @@ -559,14 +562,17 @@ vector BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa b_engine.add_pass("Denoising Normal", 3, "XYZ", b_view_layer.name().c_str()); b_engine.add_pass("Denoising Albedo", 3, "RGB", b_view_layer.name().c_str()); b_engine.add_pass("Denoising Depth", 1, "Z", b_view_layer.name().c_str()); - b_engine.add_pass("Denoising Shadowing", 1, "X", b_view_layer.name().c_str()); - b_engine.add_pass("Denoising Variance", 3, "RGB", b_view_layer.name().c_str()); - b_engine.add_pass("Denoising Intensity", 1, "X", b_view_layer.name().c_str()); + if (!use_optix_denoising) { + b_engine.add_pass("Denoising Shadowing", 1, "X", b_view_layer.name().c_str()); + b_engine.add_pass("Denoising Variance", 3, "RGB", b_view_layer.name().c_str()); + b_engine.add_pass("Denoising Intensity", 1, "X", b_view_layer.name().c_str()); + } if (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES) { b_engine.add_pass("Denoising Clean", 3, "RGB", b_view_layer.name().c_str()); } } + #ifdef __KERNEL_DEBUG__ if (get_boolean(crp, "pass_debug_bvh_traversed_nodes")) { b_engine.add_pass("Debug BVH Traversed Nodes", 1, "X", b_view_layer.name().c_str()); diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp index 7335e0bc64d..979ea7dba23 100644 --- a/intern/cycles/device/device_optix.cpp +++ b/intern/cycles/device/device_optix.cpp @@ -42,6 +42,9 @@ # include # include +// TODO(pmours): Disable this once drivers have native support +# define OPTIX_DENOISER_NO_PIXEL_STRIDE 1 + CCL_NAMESPACE_BEGIN /* Make sure this stays in sync with kernel_globals.h */ @@ -107,6 +110,30 @@ struct KernelParams { } \ (void)0 +# define CUDA_GET_BLOCKSIZE(func, w, h) \ + int threads; \ + check_result_cuda_ret( \ + cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + threads = (int)sqrt((float)threads); \ + int xblocks = ((w) + threads - 1) / threads; \ + int yblocks = ((h) + threads - 1) / threads; + +# define CUDA_LAUNCH_KERNEL(func, args) \ + check_result_cuda_ret(cuLaunchKernel( \ + func, xblocks, yblocks, 1, threads, threads, 1, 0, cuda_stream[thread_index], args, 0)); + +/* Similar as above, but for 1-dimensional blocks. */ +# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \ + int threads; \ + check_result_cuda_ret( \ + cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int xblocks = ((w) + threads - 1) / threads; \ + int yblocks = h; + +# define CUDA_LAUNCH_KERNEL_1D(func, args) \ + check_result_cuda_ret(cuLaunchKernel( \ + func, xblocks, yblocks, 1, threads, 1, 1, 0, cuda_stream[thread_index], args, 0)); + class OptiXDevice : public Device { // List of OptiX program groups @@ -186,6 +213,9 @@ class OptiXDevice : public Device { map cuda_mem_map; bool move_texture_to_host = false; + OptixDenoiser denoiser = NULL; + vector> denoiser_state; + public: OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) : Device(info_, stats_, profiler_, background_), @@ -262,6 +292,9 @@ class OptiXDevice : public Device { launch_params.data_elements = sizeof(KernelParams); // Allocate launch parameter buffer memory on device launch_params.alloc_to_device(info.cpu_threads); + + // Create denoiser state entries for all threads (but do not allocate yet) + denoiser_state.resize(info.cpu_threads); } ~OptiXDevice() { @@ -272,7 +305,11 @@ class OptiXDevice : public Device { for (CUdeviceptr mem : as_mem) { cuMemFree(mem); } - as_mem.clear(); + + // Free denoiser state for all threads + for (const pair &state : denoiser_state) { + cuMemFree(state.second); + } sbt_data.free(); texture_info.free(); @@ -296,6 +333,9 @@ class OptiXDevice : public Device { for (CUstream stream : cuda_stream) cuStreamDestroy(stream); + if (denoiser != NULL) + optixDenoiserDestroy(denoiser); + // Destroy OptiX and CUDA context optixDeviceContextDestroy(context); cuDevicePrimaryCtxRelease(cuda_device); @@ -686,46 +726,298 @@ class OptiXDevice : public Device { } } - void launch_denoise(DeviceTask &task, RenderTile &rtile, int thread_index) + bool launch_denoise(DeviceTask &task, RenderTile &rtile, int thread_index) { + int total_samples = rtile.start_sample + rtile.num_samples; + const CUDAContextScope scope(cuda_context); - // Run CUDA denoising kernels - DenoisingTask denoising(this, task); - denoising.functions.construct_transform = function_bind( - &OptiXDevice::denoising_construct_transform, this, &denoising, thread_index); - denoising.functions.accumulate = function_bind( - &OptiXDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising, thread_index); - denoising.functions.solve = function_bind( - &OptiXDevice::denoising_solve, this, _1, &denoising, thread_index); - denoising.functions.divide_shadow = function_bind( - &OptiXDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising, thread_index); - denoising.functions.non_local_means = function_bind( - &OptiXDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising, thread_index); - denoising.functions.combine_halves = function_bind(&OptiXDevice::denoising_combine_halves, - this, - _1, - _2, - _3, - _4, - _5, - _6, - &denoising, - thread_index); - denoising.functions.get_feature = function_bind( - &OptiXDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising, thread_index); - denoising.functions.write_feature = function_bind( - &OptiXDevice::denoising_write_feature, this, _1, _2, _3, &denoising, thread_index); - denoising.functions.detect_outliers = function_bind( - &OptiXDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising, thread_index); - - denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); - denoising.render_buffer.samples = rtile.sample = rtile.start_sample + rtile.num_samples; - denoising.buffer.gpu_temporary_mem = true; - - denoising.run_denoising(&rtile); + // Choose between OptiX and NLM denoising + if (task.denoising_use_optix) { + // Map neighboring tiles onto this device, indices are as following: + // Where index 4 is the center tile and index 9 is the target for the result. + // 0 1 2 + // 3 4 5 + // 6 7 8 9 + RenderTile rtiles[10]; + rtiles[4] = rtile; + task.map_neighbor_tiles(rtiles, this); + + // Calculate size of the tile to denoise (including overlap) + int4 rect = make_int4( + rtiles[4].x, rtiles[4].y, rtiles[4].x + rtiles[4].w, rtiles[4].y + rtiles[4].h); + // Overlap between tiles has to be at least 64 pixels + // TODO(pmours): Query this value from OptiX + rect = rect_expand(rect, 64); + int4 clip_rect = make_int4( + rtiles[3].x, rtiles[1].y, rtiles[5].x + rtiles[5].w, rtiles[7].y + rtiles[7].h); + rect = rect_clip(rect, clip_rect); + int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y); + int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y); + + // Calculate byte offsets and strides + int pixel_stride = task.pass_stride * (int)sizeof(float); + int pixel_offset = (rtile.offset + rtile.x + rtile.y * rtile.stride) * pixel_stride; + const int pass_offset[3] = { + (task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float), + (task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float), + (task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)}; + + // Start with the current tile pointer offset + int input_stride = pixel_stride; + device_ptr input_ptr = rtile.buffer + pixel_offset; + + // Copy tile data into a common buffer if necessary + device_only_memory input(this, "denoiser input"); + device_vector tile_info_mem(this, "denoiser tile info", MEM_READ_WRITE); + + if ((!rtiles[0].buffer || rtiles[0].buffer == rtile.buffer) && + (!rtiles[1].buffer || rtiles[1].buffer == rtile.buffer) && + (!rtiles[2].buffer || rtiles[2].buffer == rtile.buffer) && + (!rtiles[3].buffer || rtiles[3].buffer == rtile.buffer) && + (!rtiles[5].buffer || rtiles[5].buffer == rtile.buffer) && + (!rtiles[6].buffer || rtiles[6].buffer == rtile.buffer) && + (!rtiles[7].buffer || rtiles[7].buffer == rtile.buffer) && + (!rtiles[8].buffer || rtiles[8].buffer == rtile.buffer)) { + // Tiles are in continous memory, so can just subtract overlap offset + input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride; + // Stride covers the whole width of the image and not just a single tile + input_stride *= rtile.stride; + } + else { + // Adjacent tiles are in separate memory regions, so need to copy them into a single one + input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride); + // Start with the new input buffer + input_ptr = input.device_pointer; + // Stride covers the width of the new input buffer, which includes tile width and overlap + input_stride *= rect_size.x; + + TileInfo *tile_info = tile_info_mem.alloc(1); + for (int i = 0; i < 9; i++) { + tile_info->offsets[i] = rtiles[i].offset; + tile_info->strides[i] = rtiles[i].stride; + tile_info->buffers[i] = rtiles[i].buffer; + } + tile_info->x[0] = rtiles[3].x; + tile_info->x[1] = rtiles[4].x; + tile_info->x[2] = rtiles[5].x; + tile_info->x[3] = rtiles[5].x + rtiles[5].w; + tile_info->y[0] = rtiles[1].y; + tile_info->y[1] = rtiles[4].y; + tile_info->y[2] = rtiles[7].y; + tile_info->y[3] = rtiles[7].y + rtiles[7].h; + tile_info_mem.copy_to_device(); + + CUfunction filter_copy_func; + check_result_cuda_ret(cuModuleGetFunction( + &filter_copy_func, cuda_filter_module, "kernel_cuda_filter_copy_input")); + check_result_cuda_ret(cuFuncSetCacheConfig(filter_copy_func, CU_FUNC_CACHE_PREFER_L1)); + + void *args[] = { + &input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride}; + CUDA_GET_BLOCKSIZE(filter_copy_func, rect_size.x, rect_size.y); + CUDA_LAUNCH_KERNEL(filter_copy_func, args); + } + +# if OPTIX_DENOISER_NO_PIXEL_STRIDE + device_only_memory input_rgb(this, "denoiser input rgb"); + { + input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * + task.denoising.optix_input_passes); + + CUfunction convert_to_rgb_func; + check_result_cuda_ret(cuModuleGetFunction( + &convert_to_rgb_func, cuda_filter_module, "kernel_cuda_filter_convert_to_rgb")); + check_result_cuda_ret(cuFuncSetCacheConfig(convert_to_rgb_func, CU_FUNC_CACHE_PREFER_L1)); + + void *args[] = {&input_rgb.device_pointer, + &input_ptr, + &rect_size.x, + &rect_size.y, + &input_stride, + &task.pass_stride, + const_cast(pass_offset), + &task.denoising.optix_input_passes, + &total_samples}; + CUDA_GET_BLOCKSIZE(convert_to_rgb_func, rect_size.x, rect_size.y); + CUDA_LAUNCH_KERNEL(convert_to_rgb_func, args); + + input_ptr = input_rgb.device_pointer; + pixel_stride = 3 * sizeof(float); + input_stride = rect_size.x * pixel_stride; + } +# endif + + if (denoiser == NULL) { + // Create OptiX denoiser handle on demand when it is first used + OptixDenoiserOptions denoiser_options; + assert(task.denoising.optix_input_passes >= 1 && task.denoising.optix_input_passes <= 3); + denoiser_options.inputKind = static_cast( + OPTIX_DENOISER_INPUT_RGB + (task.denoising.optix_input_passes - 1)); + denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3; + check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser)); + check_result_optix_ret( + optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0)); + } + + OptixDenoiserSizes sizes = {}; + check_result_optix_ret( + optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes)); + + auto &state = denoiser_state[thread_index].second; + auto &state_size = denoiser_state[thread_index].first; + const size_t scratch_size = sizes.recommendedScratchSizeInBytes; + const size_t scratch_offset = sizes.stateSizeInBytes; + + // Allocate denoiser state if tile size has changed since last setup + if (state_size.x != rect_size.x || state_size.y != rect_size.y) { + if (state) { + cuMemFree(state); + state = 0; + } + check_result_cuda_ret(cuMemAlloc(&state, scratch_offset + scratch_size)); + + check_result_optix_ret(optixDenoiserSetup(denoiser, + cuda_stream[thread_index], + rect_size.x, + rect_size.y, + state, + scratch_offset, + state + scratch_offset, + scratch_size)); + + state_size = rect_size; + } + + // Set up input and output layer information + OptixImage2D input_layers[3] = {}; + OptixImage2D output_layers[1] = {}; + + for (int i = 0; i < 3; ++i) { +# if OPTIX_DENOISER_NO_PIXEL_STRIDE + input_layers[i].data = input_ptr + (rect_size.x * rect_size.y * pixel_stride * i); +# else + input_layers[i].data = input_ptr + pass_offset[i]; +# endif + input_layers[i].width = rect_size.x; + input_layers[i].height = rect_size.y; + input_layers[i].rowStrideInBytes = input_stride; + input_layers[i].pixelStrideInBytes = pixel_stride; + input_layers[i].format = OPTIX_PIXEL_FORMAT_FLOAT3; + } + +# if OPTIX_DENOISER_NO_PIXEL_STRIDE + output_layers[0].data = input_ptr; + output_layers[0].width = rect_size.x; + output_layers[0].height = rect_size.y; + output_layers[0].rowStrideInBytes = input_stride; + output_layers[0].pixelStrideInBytes = pixel_stride; + int2 output_offset = overlap_offset; + overlap_offset = make_int2(0, 0); // Not supported by denoiser API, so apply manually +# else + output_layers[0].data = rtiles[9].buffer + pixel_offset; + output_layers[0].width = rtiles[9].w; + output_layers[0].height = rtiles[9].h; + output_layers[0].rowStrideInBytes = rtiles[9].stride * pixel_stride; + output_layers[0].pixelStrideInBytes = pixel_stride; +# endif + output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3; + + // Finally run denonising + OptixDenoiserParams params = {}; // All parameters are disabled/zero + check_result_optix_ret(optixDenoiserInvoke(denoiser, + cuda_stream[thread_index], + ¶ms, + state, + scratch_offset, + input_layers, + task.denoising.optix_input_passes, + overlap_offset.x, + overlap_offset.y, + output_layers, + state + scratch_offset, + scratch_size)); + +# if OPTIX_DENOISER_NO_PIXEL_STRIDE + { + CUfunction convert_from_rgb_func; + check_result_cuda_ret(cuModuleGetFunction( + &convert_from_rgb_func, cuda_filter_module, "kernel_cuda_filter_convert_from_rgb")); + check_result_cuda_ret( + cuFuncSetCacheConfig(convert_from_rgb_func, CU_FUNC_CACHE_PREFER_L1)); + + void *args[] = {&input_ptr, + &rtiles[9].buffer, + &output_offset.x, + &output_offset.y, + &rect_size.x, + &rect_size.y, + &rtiles[9].x, + &rtiles[9].y, + &rtiles[9].w, + &rtiles[9].h, + &rtiles[9].offset, + &rtiles[9].stride, + &task.pass_stride}; + CUDA_GET_BLOCKSIZE(convert_from_rgb_func, rtiles[9].w, rtiles[9].h); + CUDA_LAUNCH_KERNEL(convert_from_rgb_func, args); + } +# endif + + check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index])); + task.unmap_neighbor_tiles(rtiles, this); + } + else { + // Run CUDA denoising kernels + DenoisingTask denoising(this, task); + denoising.functions.construct_transform = function_bind( + &OptiXDevice::denoising_construct_transform, this, &denoising, thread_index); + denoising.functions.accumulate = function_bind( + &OptiXDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising, thread_index); + denoising.functions.solve = function_bind( + &OptiXDevice::denoising_solve, this, _1, &denoising, thread_index); + denoising.functions.divide_shadow = function_bind(&OptiXDevice::denoising_divide_shadow, + this, + _1, + _2, + _3, + _4, + _5, + &denoising, + thread_index); + denoising.functions.non_local_means = function_bind( + &OptiXDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising, thread_index); + denoising.functions.combine_halves = function_bind(&OptiXDevice::denoising_combine_halves, + this, + _1, + _2, + _3, + _4, + _5, + _6, + &denoising, + thread_index); + denoising.functions.get_feature = function_bind( + &OptiXDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising, thread_index); + denoising.functions.write_feature = function_bind( + &OptiXDevice::denoising_write_feature, this, _1, _2, _3, &denoising, thread_index); + denoising.functions.detect_outliers = function_bind( + &OptiXDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising, thread_index); + + denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = total_samples; + denoising.buffer.gpu_temporary_mem = true; + + denoising.run_denoising(&rtile); + } + + // Update current sample, so it is displayed correctly + rtile.sample = total_samples; + // Update task progress after the denoiser completed processing task.update_progress(&rtile, rtile.w * rtile.h); + + return true; } void launch_shader_eval(DeviceTask &task, int thread_index) @@ -1899,30 +2191,6 @@ class OptiXDevice : public Device { task_pool.cancel(); } -# define CUDA_GET_BLOCKSIZE(func, w, h) \ - int threads; \ - check_result_cuda_ret( \ - cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - threads = (int)sqrt((float)threads); \ - int xblocks = ((w) + threads - 1) / threads; \ - int yblocks = ((h) + threads - 1) / threads; - -# define CUDA_LAUNCH_KERNEL(func, args) \ - check_result_cuda_ret(cuLaunchKernel( \ - func, xblocks, yblocks, 1, threads, threads, 1, 0, cuda_stream[thread_index], args, 0)); - - /* Similar as above, but for 1-dimensional blocks. */ -# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \ - int threads; \ - check_result_cuda_ret( \ - cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - int xblocks = ((w) + threads - 1) / threads; \ - int yblocks = h; - -# define CUDA_LAUNCH_KERNEL_1D(func, args) \ - check_result_cuda_ret(cuLaunchKernel( \ - func, xblocks, yblocks, 1, threads, 1, 1, 0, cuda_stream[thread_index], args, 0)); - bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, @@ -2341,9 +2609,8 @@ bool device_optix_init() const OptixResult result = optixInit(); if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) { - VLOG(1) - << "OptiX initialization failed because the installed driver does not support ABI version " - << OPTIX_ABI_VERSION; + VLOG(1) << "OptiX initialization failed because driver does not support ABI version " + << OPTIX_ABI_VERSION; return false; } else if (result != OPTIX_SUCCESS) { diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index f45de556492..1b1e97cdb10 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -47,6 +47,8 @@ class DenoiseParams { int neighbor_frames; /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */ bool clamp_input; + /* Controls which passes the OptiX AI denoiser should use as input. */ + int optix_input_passes; DenoiseParams() { @@ -56,6 +58,7 @@ class DenoiseParams { relative_pca = false; neighbor_frames = 2; clamp_input = true; + optix_input_passes = 1; } }; @@ -100,6 +103,7 @@ class DeviceTask : public Task { vector denoising_frames; bool denoising_do_filter; + bool denoising_use_optix; bool denoising_write_passes; int pass_stride; diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index 828add9dc13..7841d3a5e09 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -91,6 +91,10 @@ ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg, if (sum_weight != 0.0f) { normal /= sum_weight; } + + /* Transform normal into camera space. */ + normal = transform_direction(&kernel_data.cam.worldtocamera, normal); + L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal); L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo); diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu index 5b552b01413..fbb773533ce 100644 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -26,6 +26,74 @@ /* kernels */ +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_copy_input(float *buffer, + CCL_FILTER_TILE_INFO, + int4 prefilter_rect, + int buffer_pass_stride) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2); + int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2); + int itile = ytile * 3 + xtile; + float *const in = ((float *)ccl_get_tile_buffer(itile)) + + (tile_info->offsets[itile] + y * tile_info->strides[itile] + x) * buffer_pass_stride; + buffer += ((y - prefilter_rect.y) * (prefilter_rect.z - prefilter_rect.x) + (x - prefilter_rect.x)) * buffer_pass_stride; + for (int i = 0; i < buffer_pass_stride; ++i) + buffer[i] = in[i]; + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_convert_to_rgb(float *rgb, float *buf, int sw, int sh, int stride, int pass_stride, int3 pass_offset, int num_inputs, int num_samples) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < sw && y < sh) { + if (num_inputs > 0) { + float *in = buf + x * pass_stride + (y * stride + pass_offset.x) / sizeof(float); + float *out = rgb + (x + y * sw) * 3; + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + } + if (num_inputs > 1) { + float *in = buf + x * pass_stride + (y * stride + pass_offset.y) / sizeof(float); + float *out = rgb + (x + y * sw) * 3 + (sw * sh) * 3; + out[0] = in[0] / num_samples; + out[1] = in[1] / num_samples; + out[2] = in[2] / num_samples; + } + if (num_inputs > 2) { + float *in = buf + x * pass_stride + (y * stride + pass_offset.z) / sizeof(float); + float *out = rgb + (x + y * sw) * 3 + (sw * sh * 2) * 3; + out[0] = in[0] / num_samples; + out[1] = in[1] / num_samples; + out[2] = in[2] / num_samples; + } + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_convert_from_rgb(float *rgb, float *buf, int ix, int iy, int iw, int ih, int sx, int sy, int sw, int sh, int offset, int stride, int pass_stride) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < sw && y < sh) { + float *in = rgb + ((ix + x) + (iy + y) * iw) * 3; + float *out = buf + (offset + (sx + x) + (sy + y) * stride) * pass_stride; + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + } +} + + extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_divide_shadow(int sample, @@ -97,14 +165,14 @@ kernel_cuda_filter_write_feature(int sample, int x = blockDim.x*blockIdx.x + threadIdx.x; int y = blockDim.y*blockIdx.y + threadIdx.y; if(x < filter_area.z && y < filter_area.w) { - kernel_filter_write_feature(sample, - x + filter_area.x, - y + filter_area.y, - buffer_params, - from, - buffer, - out_offset, - prefilter_rect); + kernel_filter_write_feature(sample, + x + filter_area.x, + y + filter_area.y, + buffer_params, + from, + buffer, + out_offset, + prefilter_rect); } } diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp index fe8606e1939..50308d0d377 100644 --- a/intern/cycles/render/buffers.cpp +++ b/intern/cycles/render/buffers.cpp @@ -55,7 +55,10 @@ bool BufferParams::modified(const BufferParams ¶ms) { return !(full_x == params.full_x && full_y == params.full_y && width == params.width && height == params.height && full_width == params.full_width && - full_height == params.full_height && Pass::equals(passes, params.passes)); + full_height == params.full_height && Pass::equals(passes, params.passes) && + denoising_data_pass == params.denoising_data_pass && + denoising_clean_pass == params.denoising_clean_pass && + denoising_prefiltered_pass == params.denoising_prefiltered_pass); } int BufferParams::get_passes_size() @@ -183,13 +186,28 @@ bool RenderBuffers::get_denoising_pass_rect( offset = type + params.get_denoising_offset(); scale /= sample; } - else if (type == DENOISING_PASS_PREFILTERED_COLOR && !params.denoising_prefiltered_pass) { - /* If we're not saving the prefiltering result, return the original noisy pass. */ - offset = params.get_denoising_offset() + DENOISING_PASS_COLOR; - scale /= sample; + else if (params.denoising_prefiltered_pass) { + offset = type + params.get_denoising_prefiltered_offset(); } else { - offset = type + params.get_denoising_prefiltered_offset(); + switch (type) { + case DENOISING_PASS_PREFILTERED_DEPTH: + offset = params.get_denoising_offset() + DENOISING_PASS_DEPTH; + break; + case DENOISING_PASS_PREFILTERED_NORMAL: + offset = params.get_denoising_offset() + DENOISING_PASS_NORMAL; + break; + case DENOISING_PASS_PREFILTERED_ALBEDO: + offset = params.get_denoising_offset() + DENOISING_PASS_ALBEDO; + break; + case DENOISING_PASS_PREFILTERED_COLOR: + /* If we're not saving the prefiltering result, return the original noisy pass. */ + offset = params.get_denoising_offset() + DENOISING_PASS_COLOR; + break; + default: + return false; + } + scale /= sample; } int pass_stride = params.get_passes_size(); diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index 7a894c1e98a..c77a20787f5 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -285,9 +285,7 @@ void Session::run_gpu() if (progress.get_cancel()) break; - } - if (!no_tiles) { /* buffers mutex is locked entirely while rendering each * sample, and released/reacquired on each iteration to allow * reset and draw in between */ @@ -978,7 +976,7 @@ void Session::update_status_time(bool show_pause, bool show_done) */ substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples); } - if (params.full_denoising) { + if (params.full_denoising || params.optix_denoising) { substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles()); } else if (params.run_denoising) { @@ -1038,6 +1036,7 @@ void Session::render() task.denoising_from_render = true; task.denoising_do_filter = params.full_denoising; + task.denoising_use_optix = params.optix_denoising; task.denoising_write_passes = params.write_denoising_passes; } diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h index 9fffc13dd41..ec465601541 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -63,6 +63,7 @@ class SessionParams { bool run_denoising; bool write_denoising_passes; bool full_denoising; + bool optix_denoising; DenoiseParams denoising; double cancel_timeout; @@ -92,6 +93,7 @@ class SessionParams { run_denoising = false; write_denoising_passes = false; full_denoising = false; + optix_denoising = false; display_buffer_linear = false; -- cgit v1.2.3