diff options
Diffstat (limited to 'intern/cycles/device/optix/device_impl.cpp')
-rw-r--r-- | intern/cycles/device/optix/device_impl.cpp | 526 |
1 files changed, 374 insertions, 152 deletions
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp index 5f5eff53063..cb6c36d5ea6 100644 --- a/intern/cycles/device/optix/device_impl.cpp +++ b/intern/cycles/device/optix/device_impl.cpp @@ -20,45 +20,41 @@ # include "device/optix/device_impl.h" # include "bvh/bvh.h" -# include "bvh/bvh_optix.h" +# include "bvh/optix.h" + # include "integrator/pass_accessor_gpu.h" -# include "render/buffers.h" -# include "render/hair.h" -# include "render/mesh.h" -# include "render/object.h" -# include "render/pass.h" -# include "render/scene.h" - -# include "util/util_debug.h" -# include "util/util_logging.h" -# include "util/util_md5.h" -# include "util/util_path.h" -# include "util/util_progress.h" -# include "util/util_time.h" + +# include "scene/hair.h" +# include "scene/mesh.h" +# include "scene/object.h" +# include "scene/pass.h" +# include "scene/pointcloud.h" +# include "scene/scene.h" + +# include "util/debug.h" +# include "util/log.h" +# include "util/md5.h" +# include "util/path.h" +# include "util/progress.h" +# include "util/time.h" # undef __KERNEL_CPU__ # define __KERNEL_OPTIX__ # include "kernel/device/optix/globals.h" +# include <optix_denoiser_tiling.h> + CCL_NAMESPACE_BEGIN OptiXDevice::Denoiser::Denoiser(OptiXDevice *device) - : device(device), queue(device), state(device, "__denoiser_state") -{ -} - -OptiXDevice::Denoiser::~Denoiser() + : device(device), queue(device), state(device, "__denoiser_state", true) { - const CUDAContextScope scope(device); - if (optix_denoiser != nullptr) { - optixDenoiserDestroy(optix_denoiser); - } } OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) : CUDADevice(info, stats, profiler), sbt_data(this, "__sbt", MEM_READ_ONLY), - launch_params(this, "__params"), + launch_params(this, "__params", false), denoiser_(this) { /* Make the CUDA context current. */ @@ -90,6 +86,7 @@ OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profile }; # endif if (DebugFlags().optix.use_debug) { + VLOG(1) << "Using OptiX debug mode."; options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL; } optix_assert(optixDeviceContextCreate(cuContext, &options, &context)); @@ -131,6 +128,11 @@ OptiXDevice::~OptiXDevice() } } + /* Make sure denoiser is destroyed before device context! */ + if (denoiser_.optix_denoiser != nullptr) { + optixDenoiserDestroy(denoiser_.optix_denoiser); + } + optixDeviceContextDestroy(context); } @@ -209,18 +211,22 @@ bool OptiXDevice::load_kernels(const uint kernel_features) } else { module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3; - module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; + module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE; } module_options.boundValues = nullptr; module_options.numBoundValues = 0; +# if OPTIX_ABI_VERSION >= 55 + module_options.payloadTypes = nullptr; + module_options.numPayloadTypes = 0; +# endif OptixPipelineCompileOptions pipeline_options = {}; /* Default to no motion blur and two-level graph, since it is the fastest option. */ pipeline_options.usesMotionBlur = false; pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING; - pipeline_options.numPayloadValues = 6; + pipeline_options.numPayloadValues = 8; pipeline_options.numAttributeValues = 2; /* u, v */ pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE; pipeline_options.pipelineLaunchParamsVariableName = "__params"; /* See globals.h */ @@ -228,11 +234,18 @@ bool OptiXDevice::load_kernels(const uint kernel_features) pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE; if (kernel_features & KERNEL_FEATURE_HAIR) { if (kernel_features & KERNEL_FEATURE_HAIR_THICK) { +# if OPTIX_ABI_VERSION >= 55 + pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CATMULLROM; +# else pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE; +# endif } else pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM; } + if (kernel_features & KERNEL_FEATURE_POINTCLOUD) { + pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM; + } /* Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds * This is necessary since objects may be reported to have motion if the Vector pass is @@ -325,7 +338,13 @@ bool OptiXDevice::load_kernels(const uint kernel_features) if (kernel_features & KERNEL_FEATURE_HAIR_THICK) { /* Built-in thick curve intersection. */ OptixBuiltinISOptions builtin_options = {}; +# if OPTIX_ABI_VERSION >= 55 + builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM; + builtin_options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE; + builtin_options.curveEndcapFlags = OPTIX_CURVE_ENDCAP_DEFAULT; /* Disable end-caps. */ +# else builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; +# endif builtin_options.usesMotionBlur = false; optix_assert(optixBuiltinISModuleGet( @@ -357,6 +376,18 @@ bool OptiXDevice::load_kernels(const uint kernel_features) } } + /* Pointclouds */ + if (kernel_features & KERNEL_FEATURE_POINTCLOUD) { + group_descs[PG_HITD_POINTCLOUD] = group_descs[PG_HITD]; + group_descs[PG_HITD_POINTCLOUD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITD_POINTCLOUD].hitgroup.moduleIS = optix_module; + group_descs[PG_HITD_POINTCLOUD].hitgroup.entryFunctionNameIS = "__intersection__point"; + group_descs[PG_HITS_POINTCLOUD] = group_descs[PG_HITS]; + group_descs[PG_HITS_POINTCLOUD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITS_POINTCLOUD].hitgroup.moduleIS = optix_module; + group_descs[PG_HITS_POINTCLOUD].hitgroup.entryFunctionNameIS = "__intersection__point"; + } + if (kernel_features & (KERNEL_FEATURE_SUBSURFACE | KERNEL_FEATURE_NODE_RAYTRACE)) { /* Add hit group for local intersections. */ group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; @@ -377,9 +408,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features) group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module; group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC = "__direct_callable__svm_node_bevel"; - group_descs[PG_CALL_AO_PASS].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; - group_descs[PG_CALL_AO_PASS].callables.moduleDC = optix_module; - group_descs[PG_CALL_AO_PASS].callables.entryFunctionNameDC = "__direct_callable__ao_pass"; } optix_assert(optixProgramGroupCreate( @@ -407,6 +435,10 @@ bool OptiXDevice::load_kernels(const uint kernel_features) stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH); trace_css = std::max(trace_css, stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH); + trace_css = std::max( + trace_css, stack_size[PG_HITD_POINTCLOUD].cssIS + stack_size[PG_HITD_POINTCLOUD].cssAH); + trace_css = std::max( + trace_css, stack_size[PG_HITS_POINTCLOUD].cssIS + stack_size[PG_HITS_POINTCLOUD].cssAH); OptixPipelineLinkOptions link_options = {}; link_options.maxTraceDepth = 1; @@ -415,7 +447,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features) link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL; } else { - link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; + link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE; } if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { @@ -432,6 +464,10 @@ bool OptiXDevice::load_kernels(const uint kernel_features) pipeline_groups.push_back(groups[PG_HITD_MOTION]); pipeline_groups.push_back(groups[PG_HITS_MOTION]); } + if (kernel_features & KERNEL_FEATURE_POINTCLOUD) { + pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]); + pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]); + } pipeline_groups.push_back(groups[PG_CALL_SVM_AO]); pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]); @@ -471,6 +507,10 @@ bool OptiXDevice::load_kernels(const uint kernel_features) pipeline_groups.push_back(groups[PG_HITD_MOTION]); pipeline_groups.push_back(groups[PG_HITS_MOTION]); } + if (kernel_features & KERNEL_FEATURE_POINTCLOUD) { + pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]); + pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]); + } optix_assert(optixPipelineCreate(context, &pipeline_options, @@ -511,7 +551,7 @@ class OptiXDevice::DenoiseContext { : denoise_params(task.params), render_buffers(task.render_buffers), buffer_params(task.buffer_params), - guiding_buffer(device, "denoiser guiding passes buffer"), + guiding_buffer(device, "denoiser guiding passes buffer", true), num_samples(task.num_samples) { num_input_passes = 1; @@ -526,14 +566,28 @@ class OptiXDevice::DenoiseContext { } } - const int num_guiding_passes = num_input_passes - 1; + if (denoise_params.temporally_stable) { + prev_output.device_pointer = render_buffers->buffer.device_pointer; + + prev_output.offset = buffer_params.get_pass_offset(PASS_DENOISING_PREVIOUS); + + prev_output.stride = buffer_params.stride; + prev_output.pass_stride = buffer_params.pass_stride; + + num_input_passes += 1; + use_pass_flow = true; + pass_motion = buffer_params.get_pass_offset(PASS_MOTION); + } + + use_guiding_passes = (num_input_passes - 1) > 0; - if (num_guiding_passes) { + if (use_guiding_passes) { if (task.allow_inplace_modification) { guiding_params.device_pointer = render_buffers->buffer.device_pointer; guiding_params.pass_albedo = pass_denoising_albedo; guiding_params.pass_normal = pass_denoising_normal; + guiding_params.pass_flow = pass_motion; guiding_params.stride = buffer_params.stride; guiding_params.pass_stride = buffer_params.pass_stride; @@ -548,6 +602,10 @@ class OptiXDevice::DenoiseContext { guiding_params.pass_normal = guiding_params.pass_stride; guiding_params.pass_stride += 3; } + if (use_pass_flow) { + guiding_params.pass_flow = guiding_params.pass_stride; + guiding_params.pass_stride += 2; + } guiding_params.stride = buffer_params.width; @@ -565,6 +623,16 @@ class OptiXDevice::DenoiseContext { RenderBuffers *render_buffers = nullptr; const BufferParams &buffer_params; + /* Previous output. */ + struct { + device_ptr device_pointer = 0; + + int offset = PASS_UNUSED; + + int stride = -1; + int pass_stride = -1; + } prev_output; + /* Device-side storage of the guiding passes. */ device_only_memory<float> guiding_buffer; @@ -574,6 +642,7 @@ class OptiXDevice::DenoiseContext { /* NOTE: Are only initialized when the corresponding guiding pass is enabled. */ int pass_albedo = PASS_UNUSED; int pass_normal = PASS_UNUSED; + int pass_flow = PASS_UNUSED; int stride = -1; int pass_stride = -1; @@ -581,8 +650,10 @@ class OptiXDevice::DenoiseContext { /* Number of input passes. Including the color and extra auxiliary passes. */ int num_input_passes = 0; + bool use_guiding_passes = false; bool use_pass_albedo = false; bool use_pass_normal = false; + bool use_pass_flow = false; int num_samples = 0; @@ -591,6 +662,7 @@ class OptiXDevice::DenoiseContext { /* NOTE: Are only initialized when the corresponding guiding pass is enabled. */ int pass_denoising_albedo = PASS_UNUSED; int pass_denoising_normal = PASS_UNUSED; + int pass_motion = PASS_UNUSED; /* For passes which don't need albedo channel for denoising we replace the actual albedo with * the (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with @@ -657,22 +729,24 @@ bool OptiXDevice::denoise_filter_guiding_preprocess(DenoiseContext &context) const int work_size = buffer_params.width * buffer_params.height; - void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer), - const_cast<int *>(&context.guiding_params.pass_stride), - const_cast<int *>(&context.guiding_params.pass_albedo), - const_cast<int *>(&context.guiding_params.pass_normal), - &context.render_buffers->buffer.device_pointer, - const_cast<int *>(&buffer_params.offset), - const_cast<int *>(&buffer_params.stride), - const_cast<int *>(&buffer_params.pass_stride), - const_cast<int *>(&context.pass_sample_count), - const_cast<int *>(&context.pass_denoising_albedo), - const_cast<int *>(&context.pass_denoising_normal), - const_cast<int *>(&buffer_params.full_x), - const_cast<int *>(&buffer_params.full_y), - const_cast<int *>(&buffer_params.width), - const_cast<int *>(&buffer_params.height), - const_cast<int *>(&context.num_samples)}; + DeviceKernelArguments args(&context.guiding_params.device_pointer, + &context.guiding_params.pass_stride, + &context.guiding_params.pass_albedo, + &context.guiding_params.pass_normal, + &context.guiding_params.pass_flow, + &context.render_buffers->buffer.device_pointer, + &buffer_params.offset, + &buffer_params.stride, + &buffer_params.pass_stride, + &context.pass_sample_count, + &context.pass_denoising_albedo, + &context.pass_denoising_normal, + &context.pass_motion, + &buffer_params.full_x, + &buffer_params.full_y, + &buffer_params.width, + &buffer_params.height, + &context.num_samples); return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS, work_size, args); } @@ -683,11 +757,11 @@ bool OptiXDevice::denoise_filter_guiding_set_fake_albedo(DenoiseContext &context const int work_size = buffer_params.width * buffer_params.height; - void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer), - const_cast<int *>(&context.guiding_params.pass_stride), - const_cast<int *>(&context.guiding_params.pass_albedo), - const_cast<int *>(&buffer_params.width), - const_cast<int *>(&buffer_params.height)}; + DeviceKernelArguments args(&context.guiding_params.device_pointer, + &context.guiding_params.pass_stride, + &context.guiding_params.pass_albedo, + &buffer_params.width, + &buffer_params.height); return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO, work_size, args); } @@ -712,7 +786,7 @@ void OptiXDevice::denoise_pass(DenoiseContext &context, PassType pass_type) return; } } - else if (!context.albedo_replaced_with_fake) { + else if (context.use_guiding_passes && !context.albedo_replaced_with_fake) { context.albedo_replaced_with_fake = true; if (!denoise_filter_guiding_set_fake_albedo(context)) { LOG(ERROR) << "Error replacing real albedo with the fake one."; @@ -768,7 +842,13 @@ void OptiXDevice::denoise_color_read(DenoiseContext &context, const DenoisePass destination.num_components = 3; destination.pixel_stride = context.buffer_params.pass_stride; - pass_accessor.get_render_tile_pixels(context.render_buffers, context.buffer_params, destination); + BufferParams buffer_params = context.buffer_params; + buffer_params.window_x = 0; + buffer_params.window_y = 0; + buffer_params.window_width = buffer_params.width; + buffer_params.window_height = buffer_params.height; + + pass_accessor.get_render_tile_pixels(context.render_buffers, buffer_params, destination); } bool OptiXDevice::denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass) @@ -777,15 +857,15 @@ bool OptiXDevice::denoise_filter_color_preprocess(DenoiseContext &context, const const int work_size = buffer_params.width * buffer_params.height; - void *args[] = {&context.render_buffers->buffer.device_pointer, - const_cast<int *>(&buffer_params.full_x), - const_cast<int *>(&buffer_params.full_y), - const_cast<int *>(&buffer_params.width), - const_cast<int *>(&buffer_params.height), - const_cast<int *>(&buffer_params.offset), - const_cast<int *>(&buffer_params.stride), - const_cast<int *>(&buffer_params.pass_stride), - const_cast<int *>(&pass.denoised_offset)}; + DeviceKernelArguments args(&context.render_buffers->buffer.device_pointer, + &buffer_params.full_x, + &buffer_params.full_y, + &buffer_params.width, + &buffer_params.height, + &buffer_params.offset, + &buffer_params.stride, + &buffer_params.pass_stride, + &pass.denoised_offset); return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_PREPROCESS, work_size, args); } @@ -797,20 +877,20 @@ bool OptiXDevice::denoise_filter_color_postprocess(DenoiseContext &context, const int work_size = buffer_params.width * buffer_params.height; - void *args[] = {&context.render_buffers->buffer.device_pointer, - const_cast<int *>(&buffer_params.full_x), - const_cast<int *>(&buffer_params.full_y), - const_cast<int *>(&buffer_params.width), - const_cast<int *>(&buffer_params.height), - const_cast<int *>(&buffer_params.offset), - const_cast<int *>(&buffer_params.stride), - const_cast<int *>(&buffer_params.pass_stride), - const_cast<int *>(&context.num_samples), - const_cast<int *>(&pass.noisy_offset), - const_cast<int *>(&pass.denoised_offset), - const_cast<int *>(&context.pass_sample_count), - const_cast<int *>(&pass.num_components), - const_cast<bool *>(&pass.use_compositing)}; + DeviceKernelArguments args(&context.render_buffers->buffer.device_pointer, + &buffer_params.full_x, + &buffer_params.full_y, + &buffer_params.width, + &buffer_params.height, + &buffer_params.offset, + &buffer_params.stride, + &buffer_params.pass_stride, + &context.num_samples, + &pass.noisy_offset, + &pass.denoised_offset, + &context.pass_sample_count, + &pass.num_components, + &pass.use_compositing); return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS, work_size, args); } @@ -834,7 +914,8 @@ bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context) { const bool recreate_denoiser = (denoiser_.optix_denoiser == nullptr) || (denoiser_.use_pass_albedo != context.use_pass_albedo) || - (denoiser_.use_pass_normal != context.use_pass_normal); + (denoiser_.use_pass_normal != context.use_pass_normal) || + (denoiser_.use_pass_flow != context.use_pass_flow); if (!recreate_denoiser) { return true; } @@ -848,8 +929,14 @@ bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context) OptixDenoiserOptions denoiser_options = {}; denoiser_options.guideAlbedo = context.use_pass_albedo; denoiser_options.guideNormal = context.use_pass_normal; + + OptixDenoiserModelKind model = OPTIX_DENOISER_MODEL_KIND_HDR; + if (context.use_pass_flow) { + model = OPTIX_DENOISER_MODEL_KIND_TEMPORAL; + } + const OptixResult result = optixDenoiserCreate( - this->context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser_.optix_denoiser); + this->context, model, &denoiser_options, &denoiser_.optix_denoiser); if (result != OPTIX_SUCCESS) { set_error("Failed to create OptiX denoiser"); @@ -859,6 +946,7 @@ bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context) /* OptiX denoiser handle was created with the requested number of input passes. */ denoiser_.use_pass_albedo = context.use_pass_albedo; denoiser_.use_pass_normal = context.use_pass_normal; + denoiser_.use_pass_flow = context.use_pass_flow; /* OptiX denoiser has been created, but it needs configuration. */ denoiser_.is_configured = false; @@ -868,41 +956,42 @@ bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context) bool OptiXDevice::denoise_configure_if_needed(DenoiseContext &context) { - if (denoiser_.is_configured && (denoiser_.configured_size.x == context.buffer_params.width && - denoiser_.configured_size.y == context.buffer_params.height)) { + /* Limit maximum tile size denoiser can be invoked with. */ + const int2 tile_size = make_int2(min(context.buffer_params.width, 4096), + min(context.buffer_params.height, 4096)); + + if (denoiser_.is_configured && + (denoiser_.configured_size.x == tile_size.x && denoiser_.configured_size.y == tile_size.y)) { return true; } - const BufferParams &buffer_params = context.buffer_params; - - OptixDenoiserSizes sizes = {}; optix_assert(optixDenoiserComputeMemoryResources( - denoiser_.optix_denoiser, buffer_params.width, buffer_params.height, &sizes)); - - denoiser_.scratch_size = sizes.withOverlapScratchSizeInBytes; - denoiser_.scratch_offset = sizes.stateSizeInBytes; + denoiser_.optix_denoiser, tile_size.x, tile_size.y, &denoiser_.sizes)); /* Allocate denoiser state if tile size has changed since last setup. */ - denoiser_.state.alloc_to_device(denoiser_.scratch_offset + denoiser_.scratch_size); + denoiser_.state.alloc_to_device(denoiser_.sizes.stateSizeInBytes + + denoiser_.sizes.withOverlapScratchSizeInBytes); /* Initialize denoiser state for the current tile size. */ - const OptixResult result = optixDenoiserSetup(denoiser_.optix_denoiser, - denoiser_.queue.stream(), - buffer_params.width, - buffer_params.height, - denoiser_.state.device_pointer, - denoiser_.scratch_offset, - denoiser_.state.device_pointer + - denoiser_.scratch_offset, - denoiser_.scratch_size); + const OptixResult result = optixDenoiserSetup( + denoiser_.optix_denoiser, + 0, /* Work around bug in r495 drivers that causes artifacts when denoiser setup is called + on a stream that is not the default stream */ + tile_size.x + denoiser_.sizes.overlapWindowSizeInPixels * 2, + tile_size.y + denoiser_.sizes.overlapWindowSizeInPixels * 2, + denoiser_.state.device_pointer, + denoiser_.sizes.stateSizeInBytes, + denoiser_.state.device_pointer + denoiser_.sizes.stateSizeInBytes, + denoiser_.sizes.withOverlapScratchSizeInBytes); if (result != OPTIX_SUCCESS) { set_error("Failed to set up OptiX denoiser"); return false; } + cuda_assert(cuCtxSynchronize()); + denoiser_.is_configured = true; - denoiser_.configured_size.x = buffer_params.width; - denoiser_.configured_size.y = buffer_params.height; + denoiser_.configured_size = tile_size; return true; } @@ -917,8 +1006,10 @@ bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass) OptixImage2D color_layer = {0}; OptixImage2D albedo_layer = {0}; OptixImage2D normal_layer = {0}; + OptixImage2D flow_layer = {0}; OptixImage2D output_layer = {0}; + OptixImage2D prev_output_layer = {0}; /* Color pass. */ { @@ -934,7 +1025,18 @@ bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass) color_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3; } - device_vector<float> fake_albedo(this, "fake_albedo", MEM_READ_WRITE); + /* Previous output. */ + if (context.prev_output.offset != PASS_UNUSED) { + const int64_t pass_stride_in_bytes = context.prev_output.pass_stride * sizeof(float); + + prev_output_layer.data = context.prev_output.device_pointer + + context.prev_output.offset * sizeof(float); + prev_output_layer.width = width; + prev_output_layer.height = height; + prev_output_layer.rowStrideInBytes = pass_stride_in_bytes * context.prev_output.stride; + prev_output_layer.pixelStrideInBytes = pass_stride_in_bytes; + prev_output_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3; + } /* Optional albedo and color passes. */ if (context.num_input_passes > 1) { @@ -959,33 +1061,47 @@ bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass) normal_layer.pixelStrideInBytes = pixel_stride_in_bytes; normal_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3; } + + if (context.use_pass_flow) { + flow_layer.data = d_guiding_buffer + context.guiding_params.pass_flow * sizeof(float); + flow_layer.width = width; + flow_layer.height = height; + flow_layer.rowStrideInBytes = row_stride_in_bytes; + flow_layer.pixelStrideInBytes = pixel_stride_in_bytes; + flow_layer.format = OPTIX_PIXEL_FORMAT_FLOAT2; + } } /* Denoise in-place of the noisy input in the render buffers. */ output_layer = color_layer; - /* Finally run denoising. */ - OptixDenoiserParams params = {}; /* All parameters are disabled/zero. */ + OptixDenoiserGuideLayer guide_layers = {}; + guide_layers.albedo = albedo_layer; + guide_layers.normal = normal_layer; + guide_layers.flow = flow_layer; + OptixDenoiserLayer image_layers = {}; image_layers.input = color_layer; + image_layers.previousOutput = prev_output_layer; image_layers.output = output_layer; - OptixDenoiserGuideLayer guide_layers = {}; - guide_layers.albedo = albedo_layer; - guide_layers.normal = normal_layer; + /* Finally run denoising. */ + OptixDenoiserParams params = {}; /* All parameters are disabled/zero. */ - optix_assert(optixDenoiserInvoke(denoiser_.optix_denoiser, - denoiser_.queue.stream(), - ¶ms, - denoiser_.state.device_pointer, - denoiser_.scratch_offset, - &guide_layers, - &image_layers, - 1, - 0, - 0, - denoiser_.state.device_pointer + denoiser_.scratch_offset, - denoiser_.scratch_size)); + optix_assert(optixUtilDenoiserInvokeTiled(denoiser_.optix_denoiser, + denoiser_.queue.stream(), + ¶ms, + denoiser_.state.device_pointer, + denoiser_.sizes.stateSizeInBytes, + &guide_layers, + &image_layers, + 1, + denoiser_.state.device_pointer + + denoiser_.sizes.stateSizeInBytes, + denoiser_.sizes.withOverlapScratchSizeInBytes, + denoiser_.sizes.overlapWindowSizeInPixels, + denoiser_.configured_size.x, + denoiser_.configured_size.y)); return true; } @@ -995,6 +1111,13 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh, const OptixBuildInput &build_input, uint16_t num_motion_steps) { + /* Allocate and build acceleration structures only one at a time, to prevent parallel builds + * from running out of memory (since both original and compacted acceleration structure memory + * may be allocated at the same time for the duration of this function). The builds would + * otherwise happen on the same CUDA stream anyway. */ + static thread_mutex mutex; + thread_scoped_lock lock(mutex); + const CUDAContextScope scope(this); const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC); @@ -1020,14 +1143,15 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh, optix_assert(optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes)); /* Allocate required output buffers. */ - device_only_memory<char> temp_mem(this, "optix temp as build mem"); + device_only_memory<char> temp_mem(this, "optix temp as build mem", true); temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8); if (!temp_mem.device_pointer) { /* Make sure temporary memory allocation succeeded. */ return false; } - device_only_memory<char> &out_data = bvh->as_data; + /* Acceleration structure memory has to be allocated on the device (not allowed on the host). */ + device_only_memory<char> &out_data = *bvh->as_data; if (operation == OPTIX_BUILD_OPERATION_BUILD) { assert(out_data.device == this); out_data.alloc_to_device(sizes.outputSizeInBytes); @@ -1075,12 +1199,13 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh, /* There is no point compacting if the size does not change. */ if (compacted_size < sizes.outputSizeInBytes) { - device_only_memory<char> compacted_data(this, "optix compacted as"); + device_only_memory<char> compacted_data(this, "optix compacted as", false); compacted_data.alloc_to_device(compacted_size); - if (!compacted_data.device_pointer) + if (!compacted_data.device_pointer) { /* Do not compact if memory allocation for compacted acceleration structure fails. * Can just use the uncompacted one then, so succeed here regardless. */ return !have_error(); + } optix_assert(optixAccelCompact( context, NULL, out_handle, compacted_data.device_pointer, compacted_size, &out_handle)); @@ -1091,6 +1216,8 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh, std::swap(out_data.device_size, compacted_data.device_size); std::swap(out_data.device_pointer, compacted_data.device_pointer); + /* Original acceleration structure memory is freed when 'compacted_data' goes out of scope. + */ } } @@ -1118,7 +1245,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) operation = OPTIX_BUILD_OPERATION_UPDATE; } else { - bvh_optix->as_data.free(); + bvh_optix->as_data->free(); bvh_optix->traversable_handle = 0; } @@ -1173,20 +1300,27 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) int ka = max(k0 - 1, curve.first_key); int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1); + index_data[i] = i * 4; + float4 *const v = vertex_data.data() + step * num_vertices + index_data[i]; + +# if OPTIX_ABI_VERSION >= 55 + v[0] = make_float4(keys[ka].x, keys[ka].y, keys[ka].z, curve_radius[ka]); + v[1] = make_float4(keys[k0].x, keys[k0].y, keys[k0].z, curve_radius[k0]); + v[2] = make_float4(keys[k1].x, keys[k1].y, keys[k1].z, curve_radius[k1]); + v[3] = make_float4(keys[kb].x, keys[kb].y, keys[kb].z, curve_radius[kb]); +# else const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x); const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y); const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z); const float4 pw = make_float4( curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]); - /* Convert Catmull-Rom data to Bezier spline. */ + /* Convert Catmull-Rom data to B-spline. */ static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f; static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f; static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f; static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f; - index_data[i] = i * 4; - float4 *const v = vertex_data.data() + step * num_vertices + index_data[i]; v[0] = make_float4( dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw)); v[1] = make_float4( @@ -1195,6 +1329,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw)); v[3] = make_float4( dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw)); +# endif } else { BoundBox bounds = BoundBox::empty; @@ -1236,7 +1371,11 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) OptixBuildInput build_input = {}; if (hair->curve_shape == CURVE_THICK) { build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES; +# if OPTIX_ABI_VERSION >= 55 + build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM; +# else build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; +# endif build_input.curveArray.numPrimitives = num_segments; build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); build_input.curveArray.numVertices = num_vertices; @@ -1246,11 +1385,11 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer; build_input.curveArray.indexStrideInBytes = sizeof(int); build_input.curveArray.flag = build_flags; - build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset; + build_input.curveArray.primitiveIndexOffset = hair->curve_segment_offset; } else { /* Disable visibility test any-hit program, since it is already checked during - * intersection. Those trace calls that require anyhit can force it with a ray flag. */ + * intersection. Those trace calls that require any-hit can force it with a ray flag. */ build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT; build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES; @@ -1259,7 +1398,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb); build_input.customPrimitiveArray.flags = &build_flags; build_input.customPrimitiveArray.numSbtRecords = 1; - build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset; + build_input.customPrimitiveArray.primitiveIndexOffset = hair->curve_segment_offset; } if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { @@ -1328,7 +1467,87 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) * buffers for that purpose. OptiX does not allow this to be zero though, so just pass in * one and rely on that having the same meaning in this case. */ build_input.triangleArray.numSbtRecords = 1; - build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset; + build_input.triangleArray.primitiveIndexOffset = mesh->prim_offset; + + if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { + progress.set_error("Failed to build OptiX acceleration structure"); + } + } + else if (geom->geometry_type == Geometry::POINTCLOUD) { + /* Build BLAS for points primitives. */ + PointCloud *const pointcloud = static_cast<PointCloud *const>(geom); + const size_t num_points = pointcloud->num_points(); + if (num_points == 0) { + return; + } + + size_t num_motion_steps = 1; + Attribute *motion_points = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if (motion_blur && pointcloud->get_use_motion_blur() && motion_points) { + num_motion_steps = pointcloud->get_motion_steps(); + } + + device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY); + aabb_data.alloc(num_points * num_motion_steps); + + /* Get AABBs for each motion step. */ + for (size_t step = 0; step < num_motion_steps; ++step) { + /* The center step for motion vertices is not stored in the attribute. */ + const float3 *points = pointcloud->get_points().data(); + const float *radius = pointcloud->get_radius().data(); + size_t center_step = (num_motion_steps - 1) / 2; + if (step != center_step) { + size_t attr_offset = (step > center_step) ? step - 1 : step; + /* Technically this is a float4 array, but sizeof(float3) == sizeof(float4). */ + points = motion_points->data_float3() + attr_offset * num_points; + } + + for (size_t i = 0; i < num_points; ++i) { + const PointCloud::Point point = pointcloud->get_point(i); + BoundBox bounds = BoundBox::empty; + point.bounds_grow(points, radius, bounds); + + const size_t index = step * num_points + i; + aabb_data[index].minX = bounds.min.x; + aabb_data[index].minY = bounds.min.y; + aabb_data[index].minZ = bounds.min.z; + aabb_data[index].maxX = bounds.max.x; + aabb_data[index].maxY = bounds.max.y; + aabb_data[index].maxZ = bounds.max.z; + } + } + + /* Upload AABB data to GPU. */ + aabb_data.copy_to_device(); + + vector<device_ptr> aabb_ptrs; + aabb_ptrs.reserve(num_motion_steps); + for (size_t step = 0; step < num_motion_steps; ++step) { + aabb_ptrs.push_back(aabb_data.device_pointer + step * num_points * sizeof(OptixAabb)); + } + + /* Disable visibility test any-hit program, since it is already checked during + * intersection. Those trace calls that require anyhit can force it with a ray flag. + * For those, force a single any-hit call, so shadow record-all behavior works correctly. */ + unsigned int build_flags = OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT | + OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; + OptixBuildInput build_input = {}; + build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES; +# if OPTIX_ABI_VERSION < 23 + build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); + build_input.aabbArray.numPrimitives = num_points; + build_input.aabbArray.strideInBytes = sizeof(OptixAabb); + build_input.aabbArray.flags = &build_flags; + build_input.aabbArray.numSbtRecords = 1; + build_input.aabbArray.primitiveIndexOffset = pointcloud->prim_offset; +# else + build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); + build_input.customPrimitiveArray.numPrimitives = num_points; + build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb); + build_input.customPrimitiveArray.flags = &build_flags; + build_input.customPrimitiveArray.numSbtRecords = 1; + build_input.customPrimitiveArray.primitiveIndexOffset = pointcloud->prim_offset; +# endif if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { progress.set_error("Failed to build OptiX acceleration structure"); @@ -1339,9 +1558,9 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) unsigned int num_instances = 0; unsigned int max_num_instances = 0xFFFFFFFF; - bvh_optix->as_data.free(); + bvh_optix->as_data->free(); bvh_optix->traversable_handle = 0; - bvh_optix->motion_transform_data.free(); + bvh_optix->motion_transform_data->free(); optixDeviceContextGetProperty(context, OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID, @@ -1374,8 +1593,8 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) } } - assert(bvh_optix->motion_transform_data.device == this); - bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size); + assert(bvh_optix->motion_transform_data->device == this); + bvh_optix->motion_transform_data->alloc_to_device(total_motion_transform_size); } for (Object *ob : bvh->objects) { @@ -1395,8 +1614,8 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) instance.transform[5] = 1.0f; instance.transform[10] = 1.0f; - /* Set user instance ID to object index (but leave low bit blank). */ - instance.instanceId = ob->get_device_index() << 1; + /* Set user instance ID to object index. */ + instance.instanceId = ob->get_device_index(); /* Add some of the object visibility bits to the mask. * __prim_visibility contains the combined visibility bits of all instances, so is not @@ -1417,9 +1636,22 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) instance.sbtOffset = PG_HITD_MOTION - PG_HITD; } } - else { + else if (ob->get_geometry()->geometry_type == Geometry::POINTCLOUD) { + /* Use the hit group that has an intersection program for point clouds. */ + instance.sbtOffset = PG_HITD_POINTCLOUD - PG_HITD; + + /* Also skip point clouds in local trace calls. */ + instance.visibilityMask |= 4; + } + +# if OPTIX_ABI_VERSION < 55 + /* Cannot disable any-hit program for thick curves, since it needs to filter out end-caps. */ + else +# endif + { /* Can disable __anyhit__kernel_optix_visibility_test by default (except for thick curves, - * since it needs to filter out endcaps there). + * since it needs to filter out end-caps there). + * It is enabled where necessary (visibility mask exceeds 8 bits or the other any-hit * programs like __anyhit__kernel_optix_shadow_all_hit) via OPTIX_RAY_FLAG_ENFORCE_ANYHIT. */ @@ -1436,7 +1668,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) motion_transform_offset = align_up(motion_transform_offset, OPTIX_TRANSFORM_BYTE_ALIGNMENT); - CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer + + CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data->device_pointer + motion_transform_offset; motion_transform_offset += motion_transform_size; @@ -1489,9 +1721,6 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size); delete[] reinterpret_cast<uint8_t *>(&motion_transform); - /* Disable instance transform if object uses motion transform already. */ - instance.flags |= OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; - /* Get traversable handle to motion transform. */ optixConvertPointerToTraversableHandle(context, motion_transform_gpu, @@ -1505,13 +1734,6 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) /* Set transform matrix. */ memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform)); } - else { - /* Disable instance transform if geometry already has it applied to vertex data. */ - instance.flags |= OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; - /* Non-instanced objects read ID from 'prim_object', so distinguish - * them from instanced objects with the low bit set. */ - instance.instanceId |= 1; - } } } @@ -1573,7 +1795,7 @@ void OptiXDevice::const_copy_to(const char *name, void *host, size_t size) return; \ } KERNEL_TEX(IntegratorStateGPU, __integrator_state) -# include "kernel/kernel_textures.h" +# include "kernel/textures.h" # undef KERNEL_TEX } |