git.blender.org/blender.git
author     Brecht Van Lommel <brecht@blender.org>  2021-09-20 18:59:20 +0300
committer  Brecht Van Lommel <brecht@blender.org>  2021-09-21 15:55:54 +0300
commit     08031197250aeecbaca3803254e6f25b8c7b7b37 (patch)
tree       6fe7ab045f0dc0a423d6557c4073f34309ef4740 /intern/cycles/device/optix/device_impl.cpp
parent     fa6b1007bad065440950cd67deb16a04f368856f (diff)
Cycles: merge of cycles-x branch, a major update to the renderer

This includes much improved GPU rendering performance, viewport interactivity, new shadow catcher, revamped sampling settings, subsurface scattering anisotropy, new GPU volume sampling, improved PMJ sampling pattern, and more.

Some features have also been removed or changed, breaking backwards compatibility. This includes the removal of the OpenCL backend, for which alternatives are under development.

Release notes and code docs:
https://wiki.blender.org/wiki/Reference/Release_Notes/3.0/Cycles
https://wiki.blender.org/wiki/Source/Render/Cycles

Credits:
* Sergey Sharybin
* Brecht Van Lommel
* Patrick Mours (OptiX backend)
* Christophe Hery (subsurface scattering anisotropy)
* William Leeson (PMJ sampling pattern)
* Alaska (various fixes and tweaks)
* Thomas Dinges (various fixes)

For the full commit history, see the cycles-x branch. This squashes together all the changes, since intermediate changes would often fail building or tests.

Ref T87839, T87837, T87836
Fixes T90734, T89353, T80267, T77185, T69800
Diffstat (limited to 'intern/cycles/device/optix/device_impl.cpp')
-rw-r--r--  intern/cycles/device/optix/device_impl.cpp  1573
1 files changed, 1573 insertions, 0 deletions
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
new file mode 100644
index 00000000000..cd16b8c9f01
--- /dev/null
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -0,0 +1,1573 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPTIX
+
+# include "device/optix/device_impl.h"
+
+# include "bvh/bvh.h"
+# include "bvh/bvh_optix.h"
+# include "integrator/pass_accessor_gpu.h"
+# include "render/buffers.h"
+# include "render/hair.h"
+# include "render/mesh.h"
+# include "render/object.h"
+# include "render/pass.h"
+# include "render/scene.h"
+
+# include "util/util_debug.h"
+# include "util/util_logging.h"
+# include "util/util_md5.h"
+# include "util/util_path.h"
+# include "util/util_progress.h"
+# include "util/util_time.h"
+
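+/* Compile the kernel globals header in its OptiX configuration, so that declarations
+ * such as KernelParamsOptiX below match the GPU-side layout. */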
+# undef __KERNEL_CPU__
+# define __KERNEL_OPTIX__
+# include "kernel/device/optix/globals.h"
+
+CCL_NAMESPACE_BEGIN
+
+OptiXDevice::Denoiser::Denoiser(OptiXDevice *device)
+ : device(device), queue(device), state(device, "__denoiser_state")
+{
+}
+
+OptiXDevice::Denoiser::~Denoiser()
+{
+ const CUDAContextScope scope(device);
+ if (optix_denoiser != nullptr) {
+ optixDenoiserDestroy(optix_denoiser);
+ }
+}
+
+OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+ : CUDADevice(info, stats, profiler),
+ sbt_data(this, "__sbt", MEM_READ_ONLY),
+ launch_params(this, "__params"),
+ denoiser_(this)
+{
+ /* Make the CUDA context current. */
+ if (!cuContext) {
+ /* Do not initialize if CUDA context creation failed already. */
+ return;
+ }
+ const CUDAContextScope scope(this);
+
+ /* Create OptiX context for this device. */
+ OptixDeviceContextOptions options = {};
+# ifdef WITH_CYCLES_LOGGING
+ options.logCallbackLevel = 4; /* Fatal = 1, Error = 2, Warning = 3, Print = 4. */
+ options.logCallbackFunction = [](unsigned int level, const char *, const char *message, void *) {
+ switch (level) {
+ case 1:
+ LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
+ break;
+ case 2:
+ LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
+ break;
+ case 3:
+ LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
+ break;
+ case 4:
+ LOG_IF(INFO, VLOG_IS_ON(1)) << message;
+ break;
+ }
+ };
+# endif
+ if (DebugFlags().optix.use_debug) {
+ options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL;
+ }
+ optix_assert(optixDeviceContextCreate(cuContext, &options, &context));
+# ifdef WITH_CYCLES_LOGGING
+ optix_assert(optixDeviceContextSetLogCallback(
+ context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
+# endif
+
+  /* Work around a compiler bug that assigns the wrong size. */
+ launch_params.data_elements = sizeof(KernelParamsOptiX);
+
+ /* Allocate launch parameter buffer memory on device. */
+ launch_params.alloc_to_device(1);
+}
+
+OptiXDevice::~OptiXDevice()
+{
+ /* Make CUDA context current. */
+ const CUDAContextScope scope(this);
+
+ free_bvh_memory_delayed();
+
+ sbt_data.free();
+ texture_info.free();
+ launch_params.free();
+
+ /* Unload modules. */
+ if (optix_module != NULL) {
+ optixModuleDestroy(optix_module);
+ }
+ for (unsigned int i = 0; i < 2; ++i) {
+ if (builtin_modules[i] != NULL) {
+ optixModuleDestroy(builtin_modules[i]);
+ }
+ }
+ for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+ if (pipelines[i] != NULL) {
+ optixPipelineDestroy(pipelines[i]);
+ }
+ }
+
+ optixDeviceContextDestroy(context);
+}
+
+unique_ptr<DeviceQueue> OptiXDevice::gpu_queue_create()
+{
+ return make_unique<OptiXDeviceQueue>(this);
+}
+
+BVHLayoutMask OptiXDevice::get_bvh_layout_mask() const
+{
+ /* OptiX has its own internal acceleration structure format. */
+ return BVH_LAYOUT_OPTIX;
+}
+
+string OptiXDevice::compile_kernel_get_common_cflags(const uint kernel_features)
+{
+ string common_cflags = CUDADevice::compile_kernel_get_common_cflags(kernel_features);
+
+ /* Add OptiX SDK include directory to include paths. */
+ const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR");
+ if (optix_sdk_path) {
+ common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path);
+ }
+
+ /* Specialization for shader raytracing. */
+ if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+ common_cflags += " --keep-device-functions";
+ }
+
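+  /* As a hypothetical example, OPTIX_ROOT_DIR=/opt/optix yields the CUDA flags plus
+   * ` -I"/opt/optix/include"`, with ` --keep-device-functions` appended when shader
+   * raytracing is enabled. */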
+ return common_cflags;
+}
+
+bool OptiXDevice::load_kernels(const uint kernel_features)
+{
+ if (have_error()) {
+ /* Abort early if context creation failed already. */
+ return false;
+ }
+
+ /* Load CUDA modules because we need some of the utility kernels. */
+ if (!CUDADevice::load_kernels(kernel_features)) {
+ return false;
+ }
+
+ /* Skip creating OptiX module if only doing denoising. */
+ if (!(kernel_features & (KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING))) {
+ return true;
+ }
+
+ const CUDAContextScope scope(this);
+
+ /* Unload existing OptiX module and pipelines first. */
+ if (optix_module != NULL) {
+ optixModuleDestroy(optix_module);
+ optix_module = NULL;
+ }
+ for (unsigned int i = 0; i < 2; ++i) {
+ if (builtin_modules[i] != NULL) {
+ optixModuleDestroy(builtin_modules[i]);
+ builtin_modules[i] = NULL;
+ }
+ }
+ for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+ if (pipelines[i] != NULL) {
+ optixPipelineDestroy(pipelines[i]);
+ pipelines[i] = NULL;
+ }
+ }
+
+ OptixModuleCompileOptions module_options = {};
+ module_options.maxRegisterCount = 0; /* Do not set an explicit register limit. */
+
+ if (DebugFlags().optix.use_debug) {
+ module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0;
+ module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
+ }
+ else {
+ module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
+ module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+ }
+
+ module_options.boundValues = nullptr;
+ module_options.numBoundValues = 0;
+
+ OptixPipelineCompileOptions pipeline_options = {};
+ /* Default to no motion blur and two-level graph, since it is the fastest option. */
+ pipeline_options.usesMotionBlur = false;
+ pipeline_options.traversableGraphFlags =
+ OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
+ pipeline_options.numPayloadValues = 6;
+ pipeline_options.numAttributeValues = 2; /* u, v */
+ pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
+ pipeline_options.pipelineLaunchParamsVariableName = "__params"; /* See globals.h */
+
+ pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
+ if (kernel_features & KERNEL_FEATURE_HAIR) {
+ if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
+ pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
+ }
+    else {
+      pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
+    }
+ }
+
+  /* Keep track of whether motion blur is enabled, so motion can be enabled/disabled in BVH
+   * builds. This is necessary since objects may be reported to have motion if the Vector pass
+   * is active, but may still need to be rendered without motion blur if that isn't active as
+   * well. */
+ motion_blur = (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) != 0;
+
+ if (motion_blur) {
+ pipeline_options.usesMotionBlur = true;
+    /* Motion blur can insert motion transforms into the traversal graph.
+     * It is no longer a two-level graph then, so the flags need to allow any configuration. */
+ pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
+ }
+
+ { /* Load and compile PTX module with OptiX kernels. */
+ string ptx_data, ptx_filename = path_get((kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
+ "lib/kernel_optix_shader_raytrace.ptx" :
+ "lib/kernel_optix.ptx");
+ if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
+ if (!getenv("OPTIX_ROOT_DIR")) {
+      set_error(
+          "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to "
+          "the OptiX SDK to be able to compile OptiX kernels on demand).");
+ return false;
+ }
+ ptx_filename = compile_kernel(
+ kernel_features,
+ (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? "kernel_shader_raytrace" : "kernel",
+ "optix",
+ true);
+ }
+ if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
+ set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str()));
+ return false;
+ }
+
+ const OptixResult result = optixModuleCreateFromPTX(context,
+ &module_options,
+ &pipeline_options,
+ ptx_data.data(),
+ ptx_data.size(),
+ nullptr,
+ 0,
+ &optix_module);
+ if (result != OPTIX_SUCCESS) {
+ set_error(string_printf("Failed to load OptiX kernel from '%s' (%s)",
+ ptx_filename.c_str(),
+ optixGetErrorName(result)));
+ return false;
+ }
+ }
+
+ /* Create program groups. */
+ OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
+ OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
+ OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
+ group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_closest";
+ group_descs[PG_RGEN_INTERSECT_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_shadow";
+ group_descs[PG_RGEN_INTERSECT_SUBSURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_subsurface";
+ group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_volume_stack";
+ group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
+ group_descs[PG_MISS].miss.module = optix_module;
+ group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
+ group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
+ group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
+ group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
+
+ if (kernel_features & KERNEL_FEATURE_HAIR) {
+ if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
+ /* Built-in thick curve intersection. */
+ OptixBuiltinISOptions builtin_options = {};
+ builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+ builtin_options.usesMotionBlur = false;
+
+ optix_assert(optixBuiltinISModuleGet(
+ context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));
+
+ group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
+ group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
+ group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
+ group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
+
+ if (motion_blur) {
+ builtin_options.usesMotionBlur = true;
+
+ optix_assert(optixBuiltinISModuleGet(
+ context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));
+
+ group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
+ group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
+ group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
+ group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
+ }
+ }
+ else {
+ /* Custom ribbon intersection. */
+ group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
+ group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
+ group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
+ }
+ }
+
+ if (kernel_features & (KERNEL_FEATURE_SUBSURFACE | KERNEL_FEATURE_NODE_RAYTRACE)) {
+ /* Add hit group for local intersections. */
+ group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
+ }
+
+ /* Shader raytracing replaces some functions with direct callables. */
+ if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+ group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.module = optix_module;
+ group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_shade_surface_raytrace";
+ group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module;
+ group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao";
+ group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module;
+ group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC =
+ "__direct_callable__svm_node_bevel";
+ group_descs[PG_CALL_AO_PASS].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[PG_CALL_AO_PASS].callables.moduleDC = optix_module;
+ group_descs[PG_CALL_AO_PASS].callables.entryFunctionNameDC = "__direct_callable__ao_pass";
+ }
+
+ optix_assert(optixProgramGroupCreate(
+ context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
+
+ /* Get program stack sizes. */
+ OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
+ /* Set up SBT, which in this case is used only to select between different programs. */
+ sbt_data.alloc(NUM_PROGRAM_GROUPS);
+ memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
+ for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+ optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
+ optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
+ }
+ sbt_data.copy_to_device(); /* Upload SBT to device. */
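+  /* Note that each SBT record holds only the packed program header; Cycles keeps
+   * per-primitive data in its own buffers (see the triangle build input in build_bvh). */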
+
+ /* Calculate maximum trace continuation stack size. */
+ unsigned int trace_css = stack_size[PG_HITD].cssCH;
+ /* This is based on the maximum of closest-hit and any-hit/intersection programs. */
+ trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
+ trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
+ trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
+ trace_css = std::max(trace_css,
+ stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
+ trace_css = std::max(trace_css,
+ stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
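+  /* Roughly trace_css = max(cssCH, cssIS + cssAH) taken over the hit groups. With
+   * hypothetical sizes cssCH = 64, cssIS = 32 and cssAH = 48, this picks 80 bytes, and
+   * each pipeline below reserves cssRG + maxTraceDepth * trace_css of continuation stack. */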
+
+ OptixPipelineLinkOptions link_options = {};
+ link_options.maxTraceDepth = 1;
+
+ if (DebugFlags().optix.use_debug) {
+ link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
+ }
+ else {
+ link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+ }
+
+ if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+ /* Create shader raytracing pipeline. */
+ vector<OptixProgramGroup> pipeline_groups;
+ pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
+ pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
+ pipeline_groups.push_back(groups[PG_MISS]);
+ pipeline_groups.push_back(groups[PG_HITD]);
+ pipeline_groups.push_back(groups[PG_HITS]);
+ pipeline_groups.push_back(groups[PG_HITL]);
+ if (motion_blur) {
+ pipeline_groups.push_back(groups[PG_HITD_MOTION]);
+ pipeline_groups.push_back(groups[PG_HITS_MOTION]);
+ }
+ pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
+ pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
+
+ optix_assert(optixPipelineCreate(context,
+ &pipeline_options,
+ &link_options,
+ pipeline_groups.data(),
+ pipeline_groups.size(),
+ nullptr,
+ 0,
+ &pipelines[PIP_SHADE_RAYTRACE]));
+
+ /* Combine ray generation and trace continuation stack size. */
+ const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG +
+ link_options.maxTraceDepth * trace_css;
+ const unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC,
+ stack_size[PG_CALL_SVM_BEVEL].dssDC);
+
+ /* Set stack size depending on pipeline options. */
+ optix_assert(optixPipelineSetStackSize(
+ pipelines[PIP_SHADE_RAYTRACE], 0, dss, css, motion_blur ? 3 : 2));
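+    /* The last argument is the maximum traversable graph depth: two levels for plain
+     * instance -> geometry traversal, three when motion transforms may sit in between. */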
+ }
+
+ { /* Create intersection-only pipeline. */
+ vector<OptixProgramGroup> pipeline_groups;
+ pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]);
+ pipeline_groups.push_back(groups[PG_MISS]);
+ pipeline_groups.push_back(groups[PG_HITD]);
+ pipeline_groups.push_back(groups[PG_HITS]);
+ pipeline_groups.push_back(groups[PG_HITL]);
+ if (motion_blur) {
+ pipeline_groups.push_back(groups[PG_HITD_MOTION]);
+ pipeline_groups.push_back(groups[PG_HITS_MOTION]);
+ }
+
+ optix_assert(optixPipelineCreate(context,
+ &pipeline_options,
+ &link_options,
+ pipeline_groups.data(),
+ pipeline_groups.size(),
+ nullptr,
+ 0,
+ &pipelines[PIP_INTERSECT]));
+
+ /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */
+ const unsigned int css =
+ std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG,
+ std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG,
+ std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG,
+ stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) +
+ link_options.maxTraceDepth * trace_css;
+
+ optix_assert(
+ optixPipelineSetStackSize(pipelines[PIP_INTERSECT], 0, 0, css, motion_blur ? 3 : 2));
+ }
+
+ /* Clean up program group objects. */
+ for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+ optixProgramGroupDestroy(groups[i]);
+ }
+
+ return true;
+}
+
+/* --------------------------------------------------------------------
+ * Buffer denoising.
+ */
+
+class OptiXDevice::DenoiseContext {
+ public:
+ explicit DenoiseContext(OptiXDevice *device, const DeviceDenoiseTask &task)
+ : denoise_params(task.params),
+ render_buffers(task.render_buffers),
+ buffer_params(task.buffer_params),
+ guiding_buffer(device, "denoiser guiding passes buffer"),
+ num_samples(task.num_samples)
+ {
+ num_input_passes = 1;
+ if (denoise_params.use_pass_albedo) {
+ num_input_passes += 1;
+ use_pass_albedo = true;
+ pass_denoising_albedo = buffer_params.get_pass_offset(PASS_DENOISING_ALBEDO);
+ if (denoise_params.use_pass_normal) {
+ num_input_passes += 1;
+ use_pass_normal = true;
+ pass_denoising_normal = buffer_params.get_pass_offset(PASS_DENOISING_NORMAL);
+ }
+ }
+
+ const int num_guiding_passes = num_input_passes - 1;
+
+ if (num_guiding_passes) {
+ if (task.allow_inplace_modification) {
+ guiding_params.device_pointer = render_buffers->buffer.device_pointer;
+
+ guiding_params.pass_albedo = pass_denoising_albedo;
+ guiding_params.pass_normal = pass_denoising_normal;
+
+ guiding_params.stride = buffer_params.stride;
+ guiding_params.pass_stride = buffer_params.pass_stride;
+ }
+ else {
+ guiding_params.pass_stride = 0;
+ if (use_pass_albedo) {
+ guiding_params.pass_albedo = guiding_params.pass_stride;
+ guiding_params.pass_stride += 3;
+ }
+ if (use_pass_normal) {
+ guiding_params.pass_normal = guiding_params.pass_stride;
+ guiding_params.pass_stride += 3;
+ }
+
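+        /* The guiding passes end up packed in a compact interleaved layout of
+         * pass_stride floats per pixel, e.g. [albedo.rgb][normal.rgb] when both
+         * guiding passes are enabled. */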
+ guiding_params.stride = buffer_params.width;
+
+ guiding_buffer.alloc_to_device(buffer_params.width * buffer_params.height *
+ guiding_params.pass_stride);
+ guiding_params.device_pointer = guiding_buffer.device_pointer;
+ }
+ }
+
+ pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT);
+ }
+
+ const DenoiseParams &denoise_params;
+
+ RenderBuffers *render_buffers = nullptr;
+ const BufferParams &buffer_params;
+
+ /* Device-side storage of the guiding passes. */
+ device_only_memory<float> guiding_buffer;
+
+ struct {
+ device_ptr device_pointer = 0;
+
+    /* NOTE: Only initialized when the corresponding guiding pass is enabled. */
+ int pass_albedo = PASS_UNUSED;
+ int pass_normal = PASS_UNUSED;
+
+ int stride = -1;
+ int pass_stride = -1;
+ } guiding_params;
+
+  /* Number of input passes, including the color and the extra auxiliary passes. */
+ int num_input_passes = 0;
+ bool use_pass_albedo = false;
+ bool use_pass_normal = false;
+
+ int num_samples = 0;
+
+ int pass_sample_count = PASS_UNUSED;
+
+  /* NOTE: Only initialized when the corresponding guiding pass is enabled. */
+ int pass_denoising_albedo = PASS_UNUSED;
+ int pass_denoising_normal = PASS_UNUSED;
+
+  /* For passes which don't need the albedo channel for denoising, the actual albedo is replaced
+   * with (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with
+   * such fake values, so denoising of passes which do need albedo can no longer happen. */
+ bool albedo_replaced_with_fake = false;
+};
+
+class OptiXDevice::DenoisePass {
+ public:
+ DenoisePass(const PassType type, const BufferParams &buffer_params) : type(type)
+ {
+ noisy_offset = buffer_params.get_pass_offset(type, PassMode::NOISY);
+ denoised_offset = buffer_params.get_pass_offset(type, PassMode::DENOISED);
+
+ const PassInfo pass_info = Pass::get_info(type);
+ num_components = pass_info.num_components;
+ use_compositing = pass_info.use_compositing;
+ use_denoising_albedo = pass_info.use_denoising_albedo;
+ }
+
+ PassType type;
+
+ int noisy_offset;
+ int denoised_offset;
+
+ int num_components;
+ bool use_compositing;
+ bool use_denoising_albedo;
+};
+
+bool OptiXDevice::denoise_buffer(const DeviceDenoiseTask &task)
+{
+ const CUDAContextScope scope(this);
+
+ DenoiseContext context(this, task);
+
+ if (!denoise_ensure(context)) {
+ return false;
+ }
+
+ if (!denoise_filter_guiding_preprocess(context)) {
+ LOG(ERROR) << "Error preprocessing guiding passes.";
+ return false;
+ }
+
+ /* Passes which will use real albedo when it is available. */
+ denoise_pass(context, PASS_COMBINED);
+ denoise_pass(context, PASS_SHADOW_CATCHER_MATTE);
+
+  /* Passes which do not need albedo; if the real albedo is present, it needs to become fake first. */
+ denoise_pass(context, PASS_SHADOW_CATCHER);
+
+ return true;
+}
+
+DeviceQueue *OptiXDevice::get_denoise_queue()
+{
+ return &denoiser_.queue;
+}
+
+bool OptiXDevice::denoise_filter_guiding_preprocess(DenoiseContext &context)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer),
+ const_cast<int *>(&context.guiding_params.pass_stride),
+ const_cast<int *>(&context.guiding_params.pass_albedo),
+ const_cast<int *>(&context.guiding_params.pass_normal),
+ &context.render_buffers->buffer.device_pointer,
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&buffer_params.pass_stride),
+ const_cast<int *>(&context.pass_sample_count),
+ const_cast<int *>(&context.pass_denoising_albedo),
+ const_cast<int *>(&context.pass_denoising_normal),
+ const_cast<int *>(&buffer_params.full_x),
+ const_cast<int *>(&buffer_params.full_y),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height),
+ const_cast<int *>(&context.num_samples)};
+
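+  /* The argument order must match the parameter list of the corresponding CUDA filter
+   * kernel; the queue passes these pointers through to the kernel launch unchanged. */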
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS, work_size, args);
+}
+
+bool OptiXDevice::denoise_filter_guiding_set_fake_albedo(DenoiseContext &context)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer),
+ const_cast<int *>(&context.guiding_params.pass_stride),
+ const_cast<int *>(&context.guiding_params.pass_albedo),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO, work_size, args);
+}
+
+void OptiXDevice::denoise_pass(DenoiseContext &context, PassType pass_type)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const DenoisePass pass(pass_type, buffer_params);
+
+ if (pass.noisy_offset == PASS_UNUSED) {
+ return;
+ }
+ if (pass.denoised_offset == PASS_UNUSED) {
+ LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type);
+ return;
+ }
+
+ if (pass.use_denoising_albedo) {
+ if (context.albedo_replaced_with_fake) {
+ LOG(ERROR) << "Pass which requires albedo is denoised after fake albedo has been set.";
+ return;
+ }
+ }
+ else if (!context.albedo_replaced_with_fake) {
+ context.albedo_replaced_with_fake = true;
+ if (!denoise_filter_guiding_set_fake_albedo(context)) {
+ LOG(ERROR) << "Error replacing real albedo with the fake one.";
+ return;
+ }
+ }
+
+ /* Read and preprocess noisy color input pass. */
+ denoise_color_read(context, pass);
+ if (!denoise_filter_color_preprocess(context, pass)) {
+    LOG(ERROR) << "Error converting denoising passes to RGB buffer.";
+ return;
+ }
+
+ if (!denoise_run(context, pass)) {
+ LOG(ERROR) << "Error running OptiX denoiser.";
+ return;
+ }
+
+ /* Store result in the combined pass of the render buffer.
+ *
+ * This will scale the denoiser result up to match the number of, possibly per-pixel, samples. */
+ if (!denoise_filter_color_postprocess(context, pass)) {
+ LOG(ERROR) << "Error copying denoiser result to the denoised pass.";
+ return;
+ }
+
+ denoiser_.queue.synchronize();
+}
+
+void OptiXDevice::denoise_color_read(DenoiseContext &context, const DenoisePass &pass)
+{
+ PassAccessor::PassAccessInfo pass_access_info;
+ pass_access_info.type = pass.type;
+ pass_access_info.mode = PassMode::NOISY;
+ pass_access_info.offset = pass.noisy_offset;
+
+  /* The denoiser operates on passes which are used to calculate the approximation, and is never
+   * run on the approximation itself. The latter is not even possible, because OptiX does not
+   * support denoising of semi-transparent pixels. */
+ pass_access_info.use_approximate_shadow_catcher = false;
+ pass_access_info.use_approximate_shadow_catcher_background = false;
+ pass_access_info.show_active_pixels = false;
+
+  /* TODO(sergey): Consider adding support for actual exposure, to avoid clamping in extreme
+   * cases. */
+ const PassAccessorGPU pass_accessor(
+ &denoiser_.queue, pass_access_info, 1.0f, context.num_samples);
+
+ PassAccessor::Destination destination(pass_access_info.type);
+ destination.d_pixels = context.render_buffers->buffer.device_pointer +
+ pass.denoised_offset * sizeof(float);
+ destination.num_components = 3;
+ destination.pixel_stride = context.buffer_params.pass_stride;
+
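+  /* The noisy color is written into the denoised pass slot here, so that the denoiser can
+   * later run in place on that memory (see denoise_run). */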
+ pass_accessor.get_render_tile_pixels(context.render_buffers, context.buffer_params, destination);
+}
+
+bool OptiXDevice::denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {&context.render_buffers->buffer.device_pointer,
+ const_cast<int *>(&buffer_params.full_x),
+ const_cast<int *>(&buffer_params.full_y),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&buffer_params.pass_stride),
+ const_cast<int *>(&pass.denoised_offset)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_PREPROCESS, work_size, args);
+}
+
+bool OptiXDevice::denoise_filter_color_postprocess(DenoiseContext &context,
+ const DenoisePass &pass)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {&context.render_buffers->buffer.device_pointer,
+ const_cast<int *>(&buffer_params.full_x),
+ const_cast<int *>(&buffer_params.full_y),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&buffer_params.pass_stride),
+ const_cast<int *>(&context.num_samples),
+ const_cast<int *>(&pass.noisy_offset),
+ const_cast<int *>(&pass.denoised_offset),
+ const_cast<int *>(&context.pass_sample_count),
+ const_cast<int *>(&pass.num_components),
+ const_cast<bool *>(&pass.use_compositing)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS, work_size, args);
+}
+
+bool OptiXDevice::denoise_ensure(DenoiseContext &context)
+{
+ if (!denoise_create_if_needed(context)) {
+ LOG(ERROR) << "OptiX denoiser creation has failed.";
+ return false;
+ }
+
+ if (!denoise_configure_if_needed(context)) {
+ LOG(ERROR) << "OptiX denoiser configuration has failed.";
+ return false;
+ }
+
+ return true;
+}
+
+bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context)
+{
+ const bool recreate_denoiser = (denoiser_.optix_denoiser == nullptr) ||
+ (denoiser_.use_pass_albedo != context.use_pass_albedo) ||
+ (denoiser_.use_pass_normal != context.use_pass_normal);
+ if (!recreate_denoiser) {
+ return true;
+ }
+
+ /* Destroy existing handle before creating new one. */
+ if (denoiser_.optix_denoiser) {
+ optixDenoiserDestroy(denoiser_.optix_denoiser);
+ }
+
+ /* Create OptiX denoiser handle on demand when it is first used. */
+ OptixDenoiserOptions denoiser_options = {};
+ denoiser_options.guideAlbedo = context.use_pass_albedo;
+ denoiser_options.guideNormal = context.use_pass_normal;
+ const OptixResult result = optixDenoiserCreate(
+ this->context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser_.optix_denoiser);
+
+ if (result != OPTIX_SUCCESS) {
+ set_error("Failed to create OptiX denoiser");
+ return false;
+ }
+
+ /* OptiX denoiser handle was created with the requested number of input passes. */
+ denoiser_.use_pass_albedo = context.use_pass_albedo;
+ denoiser_.use_pass_normal = context.use_pass_normal;
+
+ /* OptiX denoiser has been created, but it needs configuration. */
+ denoiser_.is_configured = false;
+
+ return true;
+}
+
+bool OptiXDevice::denoise_configure_if_needed(DenoiseContext &context)
+{
+ if (denoiser_.is_configured && (denoiser_.configured_size.x == context.buffer_params.width &&
+ denoiser_.configured_size.y == context.buffer_params.height)) {
+ return true;
+ }
+
+ const BufferParams &buffer_params = context.buffer_params;
+
+ OptixDenoiserSizes sizes = {};
+ optix_assert(optixDenoiserComputeMemoryResources(
+ denoiser_.optix_denoiser, buffer_params.width, buffer_params.height, &sizes));
+
+ denoiser_.scratch_size = sizes.withOverlapScratchSizeInBytes;
+ denoiser_.scratch_offset = sizes.stateSizeInBytes;
+
+ /* Allocate denoiser state if tile size has changed since last setup. */
+ denoiser_.state.alloc_to_device(denoiser_.scratch_offset + denoiser_.scratch_size);
+
+ /* Initialize denoiser state for the current tile size. */
+ const OptixResult result = optixDenoiserSetup(denoiser_.optix_denoiser,
+ denoiser_.queue.stream(),
+ buffer_params.width,
+ buffer_params.height,
+ denoiser_.state.device_pointer,
+ denoiser_.scratch_offset,
+ denoiser_.state.device_pointer +
+ denoiser_.scratch_offset,
+ denoiser_.scratch_size);
+ if (result != OPTIX_SUCCESS) {
+ set_error("Failed to set up OptiX denoiser");
+ return false;
+ }
+
+ denoiser_.is_configured = true;
+ denoiser_.configured_size.x = buffer_params.width;
+ denoiser_.configured_size.y = buffer_params.height;
+
+ return true;
+}
+
+bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+ const int width = buffer_params.width;
+ const int height = buffer_params.height;
+
+ /* Set up input and output layer information. */
+ OptixImage2D color_layer = {0};
+ OptixImage2D albedo_layer = {0};
+ OptixImage2D normal_layer = {0};
+
+ OptixImage2D output_layer = {0};
+
+ /* Color pass. */
+ {
+ const int pass_denoised = pass.denoised_offset;
+ const int64_t pass_stride_in_bytes = context.buffer_params.pass_stride * sizeof(float);
+
+ color_layer.data = context.render_buffers->buffer.device_pointer +
+ pass_denoised * sizeof(float);
+ color_layer.width = width;
+ color_layer.height = height;
+ color_layer.rowStrideInBytes = pass_stride_in_bytes * context.buffer_params.stride;
+ color_layer.pixelStrideInBytes = pass_stride_in_bytes;
+ color_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
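+    /* FLOAT3 pixels combined with a pixel stride of the full pass stride let the denoiser
+     * read RGB triples directly out of the wider render buffer layout. */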
+ }
+
+ device_vector<float> fake_albedo(this, "fake_albedo", MEM_READ_WRITE);
+
+  /* Optional albedo and normal passes. */
+ if (context.num_input_passes > 1) {
+ const device_ptr d_guiding_buffer = context.guiding_params.device_pointer;
+ const int64_t pixel_stride_in_bytes = context.guiding_params.pass_stride * sizeof(float);
+ const int64_t row_stride_in_bytes = context.guiding_params.stride * pixel_stride_in_bytes;
+
+ if (context.use_pass_albedo) {
+ albedo_layer.data = d_guiding_buffer + context.guiding_params.pass_albedo * sizeof(float);
+ albedo_layer.width = width;
+ albedo_layer.height = height;
+ albedo_layer.rowStrideInBytes = row_stride_in_bytes;
+ albedo_layer.pixelStrideInBytes = pixel_stride_in_bytes;
+ albedo_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+ }
+
+ if (context.use_pass_normal) {
+ normal_layer.data = d_guiding_buffer + context.guiding_params.pass_normal * sizeof(float);
+ normal_layer.width = width;
+ normal_layer.height = height;
+ normal_layer.rowStrideInBytes = row_stride_in_bytes;
+ normal_layer.pixelStrideInBytes = pixel_stride_in_bytes;
+ normal_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+ }
+ }
+
+  /* Denoise in place on top of the noisy input in the render buffers. */
+ output_layer = color_layer;
+
+  /* Finally run denoising. */
+ OptixDenoiserParams params = {}; /* All parameters are disabled/zero. */
+ OptixDenoiserLayer image_layers = {};
+ image_layers.input = color_layer;
+ image_layers.output = output_layer;
+
+ OptixDenoiserGuideLayer guide_layers = {};
+ guide_layers.albedo = albedo_layer;
+ guide_layers.normal = normal_layer;
+
+ optix_assert(optixDenoiserInvoke(denoiser_.optix_denoiser,
+ denoiser_.queue.stream(),
+ &params,
+ denoiser_.state.device_pointer,
+ denoiser_.scratch_offset,
+ &guide_layers,
+ &image_layers,
+ 1,
+ 0,
+ 0,
+ denoiser_.state.device_pointer + denoiser_.scratch_offset,
+ denoiser_.scratch_size));
+
+ return true;
+}
+
+bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
+ OptixBuildOperation operation,
+ const OptixBuildInput &build_input,
+ uint16_t num_motion_steps)
+{
+ const CUDAContextScope scope(this);
+
+ const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
+
+ /* Compute memory usage. */
+ OptixAccelBufferSizes sizes = {};
+ OptixAccelBuildOptions options = {};
+ options.operation = operation;
+ if (use_fast_trace_bvh) {
+    VLOG(2) << "Using fast-to-trace OptiX BVH";
+ options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
+ }
+ else {
+    VLOG(2) << "Using fast-to-update OptiX BVH";
+ options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
+ }
+
+ options.motionOptions.numKeys = num_motion_steps;
+ options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
+ options.motionOptions.timeBegin = 0.0f;
+ options.motionOptions.timeEnd = 1.0f;
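+  /* With the vanish flags, the geometry is simply not visible to rays whose time lies
+   * outside the [0, 1] shutter interval. */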
+
+ optix_assert(optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
+
+ /* Allocate required output buffers. */
+ device_only_memory<char> temp_mem(this, "optix temp as build mem");
+ temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
+ if (!temp_mem.device_pointer) {
+ /* Make sure temporary memory allocation succeeded. */
+ return false;
+ }
+
+ device_only_memory<char> &out_data = bvh->as_data;
+ if (operation == OPTIX_BUILD_OPERATION_BUILD) {
+ assert(out_data.device == this);
+ out_data.alloc_to_device(sizes.outputSizeInBytes);
+ if (!out_data.device_pointer) {
+ return false;
+ }
+ }
+ else {
+ assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
+ }
+
+ /* Finally build the acceleration structure. */
+ OptixAccelEmitDesc compacted_size_prop = {};
+ compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
+ /* A tiny space was allocated for this property at the end of the temporary buffer above.
+ * Make sure this pointer is 8-byte aligned. */
+ compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
+
+ OptixTraversableHandle out_handle = 0;
+ optix_assert(optixAccelBuild(context,
+ NULL,
+ &options,
+ &build_input,
+ 1,
+ temp_mem.device_pointer,
+ sizes.tempSizeInBytes,
+ out_data.device_pointer,
+ sizes.outputSizeInBytes,
+ &out_handle,
+ use_fast_trace_bvh ? &compacted_size_prop : NULL,
+ use_fast_trace_bvh ? 1 : 0));
+ bvh->traversable_handle = static_cast<uint64_t>(out_handle);
+
+ /* Wait for all operations to finish. */
+ cuda_assert(cuStreamSynchronize(NULL));
+
+ /* Compact acceleration structure to save memory (do not do this in viewport for faster builds).
+ */
+ if (use_fast_trace_bvh) {
+ uint64_t compacted_size = sizes.outputSizeInBytes;
+ cuda_assert(cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
+
+ /* Temporary memory is no longer needed, so free it now to make space. */
+ temp_mem.free();
+
+ /* There is no point compacting if the size does not change. */
+ if (compacted_size < sizes.outputSizeInBytes) {
+ device_only_memory<char> compacted_data(this, "optix compacted as");
+ compacted_data.alloc_to_device(compacted_size);
+      if (!compacted_data.device_pointer) {
+        /* Do not compact if memory allocation for compacted acceleration structure fails.
+         * Can just use the uncompacted one then, so succeed here regardless. */
+        return !have_error();
+      }
+
+ optix_assert(optixAccelCompact(
+ context, NULL, out_handle, compacted_data.device_pointer, compacted_size, &out_handle));
+ bvh->traversable_handle = static_cast<uint64_t>(out_handle);
+
+ /* Wait for compaction to finish. */
+ cuda_assert(cuStreamSynchronize(NULL));
+
+ std::swap(out_data.device_size, compacted_data.device_size);
+ std::swap(out_data.device_pointer, compacted_data.device_pointer);
+ }
+ }
+
+ return !have_error();
+}
+
+void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
+{
+ const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
+
+ free_bvh_memory_delayed();
+
+ BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
+
+ progress.set_substatus("Building OptiX acceleration structure");
+
+ if (!bvh->params.top_level) {
+ assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
+
+ /* Refit is only possible in viewport for now (because AS is built with
+ * OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above). */
+ OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
+ if (refit && !use_fast_trace_bvh) {
+ assert(bvh_optix->traversable_handle != 0);
+ operation = OPTIX_BUILD_OPERATION_UPDATE;
+ }
+ else {
+ bvh_optix->as_data.free();
+ bvh_optix->traversable_handle = 0;
+ }
+
+ /* Build bottom level acceleration structures (BLAS). */
+ Geometry *const geom = bvh->geometry[0];
+ if (geom->geometry_type == Geometry::HAIR) {
+ /* Build BLAS for curve primitives. */
+ Hair *const hair = static_cast<Hair *const>(geom);
+ if (hair->num_curves() == 0) {
+ return;
+ }
+
+ const size_t num_segments = hair->num_segments();
+
+ size_t num_motion_steps = 1;
+ Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
+ num_motion_steps = hair->get_motion_steps();
+ }
+
+ device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
+ device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
+ device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
+ /* Four control points for each curve segment. */
+ const size_t num_vertices = num_segments * 4;
+ if (hair->curve_shape == CURVE_THICK) {
+ index_data.alloc(num_segments);
+ vertex_data.alloc(num_vertices * num_motion_steps);
+ }
+      else {
+        aabb_data.alloc(num_segments * num_motion_steps);
+      }
+
+ /* Get AABBs for each motion step. */
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ /* The center step for motion vertices is not stored in the attribute. */
+ const float3 *keys = hair->get_curve_keys().data();
+ size_t center_step = (num_motion_steps - 1) / 2;
+ if (step != center_step) {
+ size_t attr_offset = (step > center_step) ? step - 1 : step;
+ /* Technically this is a float4 array, but sizeof(float3) == sizeof(float4). */
+ keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
+ }
+
+ for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
+ const Hair::Curve curve = hair->get_curve(j);
+ const array<float> &curve_radius = hair->get_curve_radius();
+
+ for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
+ if (hair->curve_shape == CURVE_THICK) {
+ int k0 = curve.first_key + segment;
+ int k1 = k0 + 1;
+ int ka = max(k0 - 1, curve.first_key);
+ int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
+
+ const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
+ const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
+ const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
+ const float4 pw = make_float4(
+ curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
+
+            /* Convert Catmull-Rom data to B-spline control points. */
+ static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
+ static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
+ static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
+ static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
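+            /* Each output control point is the dot product of one basis row with the four
+             * Catmull-Rom points, i.e. P_bsp = (1/6) * [[ 7, -4,  5, -2],
+             *                                           [-2, 11, -4,  1],
+             *                                           [ 1, -4, 11, -2],
+             *                                           [-2,  5, -4,  7]] * P_cr,
+             * applied separately to each coordinate and to the radius. */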
+
+ index_data[i] = i * 4;
+ float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
+ v[0] = make_float4(
+ dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
+ v[1] = make_float4(
+ dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
+ v[2] = make_float4(
+ dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
+ v[3] = make_float4(
+ dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
+ }
+ else {
+ BoundBox bounds = BoundBox::empty;
+ curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
+
+ const size_t index = step * num_segments + i;
+ aabb_data[index].minX = bounds.min.x;
+ aabb_data[index].minY = bounds.min.y;
+ aabb_data[index].minZ = bounds.min.z;
+ aabb_data[index].maxX = bounds.max.x;
+ aabb_data[index].maxY = bounds.max.y;
+ aabb_data[index].maxZ = bounds.max.z;
+ }
+ }
+ }
+ }
+
+ /* Upload AABB data to GPU. */
+ aabb_data.copy_to_device();
+ index_data.copy_to_device();
+ vertex_data.copy_to_device();
+
+ vector<device_ptr> aabb_ptrs;
+ aabb_ptrs.reserve(num_motion_steps);
+ vector<device_ptr> width_ptrs;
+ vector<device_ptr> vertex_ptrs;
+ width_ptrs.reserve(num_motion_steps);
+ vertex_ptrs.reserve(num_motion_steps);
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
+ const device_ptr base_ptr = vertex_data.device_pointer +
+ step * num_vertices * sizeof(float4);
+        /* Offset past the position, so this points at the radius stored in the w component. */
+        width_ptrs.push_back(base_ptr + 3 * sizeof(float));
+ vertex_ptrs.push_back(base_ptr);
+ }
+
+ /* Force a single any-hit call, so shadow record-all behavior works correctly. */
+ unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
+ OptixBuildInput build_input = {};
+ if (hair->curve_shape == CURVE_THICK) {
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
+ build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+ build_input.curveArray.numPrimitives = num_segments;
+ build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
+ build_input.curveArray.numVertices = num_vertices;
+ build_input.curveArray.vertexStrideInBytes = sizeof(float4);
+ build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
+ build_input.curveArray.widthStrideInBytes = sizeof(float4);
+ build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
+ build_input.curveArray.indexStrideInBytes = sizeof(int);
+ build_input.curveArray.flag = build_flags;
+ build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
+ }
+ else {
+ /* Disable visibility test any-hit program, since it is already checked during
+ * intersection. Those trace calls that require anyhit can force it with a ray flag. */
+ build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
+
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
+ build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+ build_input.customPrimitiveArray.numPrimitives = num_segments;
+ build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
+ build_input.customPrimitiveArray.flags = &build_flags;
+ build_input.customPrimitiveArray.numSbtRecords = 1;
+ build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
+ }
+
+ if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ }
+ else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
+ /* Build BLAS for triangle primitives. */
+ Mesh *const mesh = static_cast<Mesh *const>(geom);
+ if (mesh->num_triangles() == 0) {
+ return;
+ }
+
+ const size_t num_verts = mesh->get_verts().size();
+
+ size_t num_motion_steps = 1;
+ Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
+ num_motion_steps = mesh->get_motion_steps();
+ }
+
+ device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
+ index_data.alloc(mesh->get_triangles().size());
+ memcpy(index_data.data(),
+ mesh->get_triangles().data(),
+ mesh->get_triangles().size() * sizeof(int));
+ device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
+ vertex_data.alloc(num_verts * num_motion_steps);
+
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ const float3 *verts = mesh->get_verts().data();
+
+ size_t center_step = (num_motion_steps - 1) / 2;
+ /* The center step for motion vertices is not stored in the attribute. */
+ if (step != center_step) {
+ verts = motion_keys->data_float3() + (step > center_step ? step - 1 : step) * num_verts;
+ }
+
+ memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
+ }
+
+ /* Upload triangle data to GPU. */
+ index_data.copy_to_device();
+ vertex_data.copy_to_device();
+
+ vector<device_ptr> vertex_ptrs;
+ vertex_ptrs.reserve(num_motion_steps);
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
+ }
+
+ /* Force a single any-hit call, so shadow record-all behavior works correctly. */
+ unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
+ OptixBuildInput build_input = {};
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
+ build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
+ build_input.triangleArray.numVertices = num_verts;
+ build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
+ build_input.triangleArray.vertexStrideInBytes = sizeof(float4);
+ build_input.triangleArray.indexBuffer = index_data.device_pointer;
+ build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
+ build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
+ build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
+ build_input.triangleArray.flags = &build_flags;
+ /* The SBT does not store per primitive data since Cycles already allocates separate
+ * buffers for that purpose. OptiX does not allow this to be zero though, so just pass in
+ * one and rely on that having the same meaning in this case. */
+ build_input.triangleArray.numSbtRecords = 1;
+ build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
+
+ if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ }
+ }
+ else {
+ unsigned int num_instances = 0;
+ unsigned int max_num_instances = 0xFFFFFFFF;
+
+ bvh_optix->as_data.free();
+ bvh_optix->traversable_handle = 0;
+ bvh_optix->motion_transform_data.free();
+
+ optixDeviceContextGetProperty(context,
+ OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
+ &max_num_instances,
+ sizeof(max_num_instances));
+    /* Do not count the first bit, which is used to distinguish instanced and non-instanced
+     * objects. */
+ max_num_instances >>= 1;
+ if (bvh->objects.size() > max_num_instances) {
+ progress.set_error(
+ "Failed to build OptiX acceleration structure because there are too many instances");
+ return;
+ }
+
+ /* Fill instance descriptions. */
+ device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
+ instances.alloc(bvh->objects.size());
+
+ /* Calculate total motion transform size and allocate memory for them. */
+ size_t motion_transform_offset = 0;
+ if (motion_blur) {
+ size_t total_motion_transform_size = 0;
+ for (Object *const ob : bvh->objects) {
+ if (ob->is_traceable() && ob->use_motion()) {
+ total_motion_transform_size = align_up(total_motion_transform_size,
+ OPTIX_TRANSFORM_BYTE_ALIGNMENT);
+ const size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
+ total_motion_transform_size = total_motion_transform_size +
+ sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+ }
+ }
+
+ assert(bvh_optix->motion_transform_data.device == this);
+ bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
+ }
+
+ for (Object *ob : bvh->objects) {
+ /* Skip non-traceable objects. */
+ if (!ob->is_traceable()) {
+ continue;
+ }
+
+ BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
+ OptixTraversableHandle handle = blas->traversable_handle;
+
+ OptixInstance &instance = instances[num_instances++];
+ memset(&instance, 0, sizeof(instance));
+
+ /* Clear transform to identity matrix. */
+ instance.transform[0] = 1.0f;
+ instance.transform[5] = 1.0f;
+ instance.transform[10] = 1.0f;
+
+ /* Set user instance ID to object index (but leave low bit blank). */
+ instance.instanceId = ob->get_device_index() << 1;
+
+ /* Have to have at least one bit in the mask, or else instance would always be culled. */
+ instance.visibilityMask = 1;
+
+ if (ob->get_geometry()->has_volume) {
+ /* Volumes have a special bit set in the visibility mask so a trace can mask only volumes.
+ */
+ instance.visibilityMask |= 2;
+ }
+
+ if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
+ /* Same applies to curves (so they can be skipped in local trace calls). */
+ instance.visibilityMask |= 4;
+
+ if (motion_blur && ob->get_geometry()->has_motion_blur() &&
+ static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
+ /* Select between motion blur and non-motion blur built-in intersection module. */
+ instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
+ }
+ }
+
+ /* Insert motion traversable if object has motion. */
+ if (motion_blur && ob->use_motion()) {
+ size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
+ size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+
+ const CUDAContextScope scope(this);
+
+ motion_transform_offset = align_up(motion_transform_offset,
+ OPTIX_TRANSFORM_BYTE_ALIGNMENT);
+ CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
+ motion_transform_offset;
+ motion_transform_offset += motion_transform_size;
+
+ /* Allocate host side memory for motion transform and fill it with transform data. */
+ OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
+ new uint8_t[motion_transform_size]);
+ motion_transform.child = handle;
+ motion_transform.motionOptions.numKeys = ob->get_motion().size();
+ motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
+ motion_transform.motionOptions.timeBegin = 0.0f;
+ motion_transform.motionOptions.timeEnd = 1.0f;
+
+ OptixSRTData *const srt_data = motion_transform.srtData;
+ array<DecomposedTransform> decomp(ob->get_motion().size());
+ transform_motion_decompose(
+ decomp.data(), ob->get_motion().data(), ob->get_motion().size());
+
+ for (size_t i = 0; i < ob->get_motion().size(); ++i) {
+ /* Scale. */
+ srt_data[i].sx = decomp[i].y.w; /* scale.x.x */
+ srt_data[i].sy = decomp[i].z.w; /* scale.y.y */
+ srt_data[i].sz = decomp[i].w.w; /* scale.z.z */
+
+ /* Shear. */
+ srt_data[i].a = decomp[i].z.x; /* scale.x.y */
+ srt_data[i].b = decomp[i].z.y; /* scale.x.z */
+ srt_data[i].c = decomp[i].w.x; /* scale.y.z */
+ assert(decomp[i].z.z == 0.0f); /* scale.y.x */
+ assert(decomp[i].w.y == 0.0f); /* scale.z.x */
+ assert(decomp[i].w.z == 0.0f); /* scale.z.y */
+
+ /* Pivot point. */
+ srt_data[i].pvx = 0.0f;
+ srt_data[i].pvy = 0.0f;
+ srt_data[i].pvz = 0.0f;
+
+ /* Rotation. */
+ srt_data[i].qx = decomp[i].x.x;
+ srt_data[i].qy = decomp[i].x.y;
+ srt_data[i].qz = decomp[i].x.z;
+ srt_data[i].qw = decomp[i].x.w;
+
+ /* Translation. */
+ srt_data[i].tx = decomp[i].y.x;
+ srt_data[i].ty = decomp[i].y.y;
+ srt_data[i].tz = decomp[i].y.z;
+ }
+
+ /* Upload motion transform to GPU. */
+ cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
+ delete[] reinterpret_cast<uint8_t *>(&motion_transform);
+
+ /* Disable instance transform if object uses motion transform already. */
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+
+ /* Get traversable handle to motion transform. */
+ optixConvertPointerToTraversableHandle(context,
+ motion_transform_gpu,
+ OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
+ &instance.traversableHandle);
+ }
+ else {
+ instance.traversableHandle = handle;
+
+ if (ob->get_geometry()->is_instanced()) {
+ /* Set transform matrix. */
+ memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
+ }
+ else {
+ /* Disable instance transform if geometry already has it applied to vertex data. */
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+ /* Non-instanced objects read ID from 'prim_object', so distinguish
+ * them from instanced objects with the low bit set. */
+ instance.instanceId |= 1;
+ }
+ }
+ }
+
+ /* Upload instance descriptions. */
+ instances.resize(num_instances);
+ instances.copy_to_device();
+
+ /* Build top-level acceleration structure (TLAS) */
+ OptixBuildInput build_input = {};
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
+ build_input.instanceArray.instances = instances.device_pointer;
+ build_input.instanceArray.numInstances = num_instances;
+
+ if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ tlas_handle = bvh_optix->traversable_handle;
+ }
+}
+
+void OptiXDevice::release_optix_bvh(BVH *bvh)
+{
+ thread_scoped_lock lock(delayed_free_bvh_mutex);
+  /* Do delayed free of BVH memory, since the geometry holding the BVH might be deleted
+   * while the GPU is still rendering. */
+ BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
+
+ delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data));
+ delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data));
+ bvh_optix->traversable_handle = 0;
+}
+
+void OptiXDevice::free_bvh_memory_delayed()
+{
+ thread_scoped_lock lock(delayed_free_bvh_mutex);
+ delayed_free_bvh_memory.free_memory();
+}
+
+void OptiXDevice::const_copy_to(const char *name, void *host, size_t size)
+{
+ /* Set constant memory for CUDA module. */
+ CUDADevice::const_copy_to(name, host, size);
+
+ if (strcmp(name, "__data") == 0) {
+ assert(size <= sizeof(KernelData));
+
+ /* Update traversable handle (since it is different for each device on multi devices). */
+ KernelData *const data = (KernelData *)host;
+ *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
+
+ update_launch_params(offsetof(KernelParamsOptiX, data), host, size);
+ return;
+ }
+
+ /* Update data storage pointers in launch parameters. */
+# define KERNEL_TEX(data_type, tex_name) \
+ if (strcmp(name, #tex_name) == 0) { \
+ update_launch_params(offsetof(KernelParamsOptiX, tex_name), host, size); \
+ return; \
+ }
+ KERNEL_TEX(IntegratorStateGPU, __integrator_state)
+# include "kernel/kernel_textures.h"
+# undef KERNEL_TEX
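+  /* For a hypothetical entry KERNEL_TEX(float4, __tri_verts) the macro expands to:
+   *   if (strcmp(name, "__tri_verts") == 0) {
+   *     update_launch_params(offsetof(KernelParamsOptiX, __tri_verts), host, size);
+   *     return;
+   *   }
+   */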
+}
+
+void OptiXDevice::update_launch_params(size_t offset, void *data, size_t data_size)
+{
+ const CUDAContextScope scope(this);
+
+ cuda_assert(cuMemcpyHtoD(launch_params.device_pointer + offset, data, data_size));
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */