diff options
Diffstat (limited to 'intern/cycles/device/device_optix.cpp')
-rw-r--r-- | intern/cycles/device/device_optix.cpp | 1936 |
1 files changed, 0 insertions, 1936 deletions
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp deleted file mode 100644 index 6f9a7943722..00000000000 --- a/intern/cycles/device/device_optix.cpp +++ /dev/null @@ -1,1936 +0,0 @@ -/* - * Copyright 2019, NVIDIA Corporation. - * Copyright 2019, Blender Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifdef WITH_OPTIX - -# include "bvh/bvh.h" -# include "bvh/bvh_optix.h" -# include "device/cuda/device_cuda.h" -# include "device/device_denoising.h" -# include "device/device_intern.h" -# include "render/buffers.h" -# include "render/hair.h" -# include "render/mesh.h" -# include "render/object.h" -# include "render/scene.h" -# include "util/util_debug.h" -# include "util/util_logging.h" -# include "util/util_md5.h" -# include "util/util_path.h" -# include "util/util_progress.h" -# include "util/util_time.h" - -# ifdef WITH_CUDA_DYNLOAD -# include <cuew.h> -// Do not use CUDA SDK headers when using CUEW -# define OPTIX_DONT_INCLUDE_CUDA -# endif -# include <optix_function_table_definition.h> -# include <optix_stubs.h> - -// TODO(pmours): Disable this once drivers have native support -# define OPTIX_DENOISER_NO_PIXEL_STRIDE 1 - -CCL_NAMESPACE_BEGIN - -/* Make sure this stays in sync with kernel_globals.h */ -struct ShaderParams { - uint4 *input; - float4 *output; - int type; - int filter; - int sx; - int offset; - int sample; -}; -struct KernelParams { - WorkTile tile; - KernelData data; - ShaderParams shader; -# define KERNEL_TEX(type, name) const type *name; -# include "kernel/kernel_textures.h" -# undef KERNEL_TEX -}; - -# define check_result_cuda(stmt) \ - { \ - CUresult res = stmt; \ - if (res != CUDA_SUCCESS) { \ - const char *name; \ - cuGetErrorName(res, &name); \ - set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \ - return; \ - } \ - } \ - (void)0 -# define check_result_cuda_ret(stmt) \ - { \ - CUresult res = stmt; \ - if (res != CUDA_SUCCESS) { \ - const char *name; \ - cuGetErrorName(res, &name); \ - set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \ - return false; \ - } \ - } \ - (void)0 - -# define check_result_optix(stmt) \ - { \ - enum OptixResult res = stmt; \ - if (res != OPTIX_SUCCESS) { \ - const char *name = optixGetErrorName(res); \ - set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \ - return; \ - } \ - } \ - (void)0 -# define check_result_optix_ret(stmt) \ - { \ - enum OptixResult res = stmt; \ - if (res != OPTIX_SUCCESS) { \ - const char *name = optixGetErrorName(res); \ - set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \ - return false; \ - } \ - } \ - (void)0 - -# define launch_filter_kernel(func_name, w, h, args) \ - { \ - CUfunction func; \ - check_result_cuda_ret(cuModuleGetFunction(&func, cuFilterModule, func_name)); \ - check_result_cuda_ret(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); \ - int threads; \ - check_result_cuda_ret( \ - cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - threads = (int)sqrt((float)threads); \ - int xblocks = ((w) + threads - 1) / threads; \ - int yblocks = ((h) + threads - 1) / threads; \ - check_result_cuda_ret( \ - cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); \ - } \ - (void)0 - -class OptiXDevice : public CUDADevice { - - // List of OptiX program groups - enum { - PG_RGEN, - PG_MISS, - PG_HITD, // Default hit group - PG_HITS, // __SHADOW_RECORD_ALL__ hit group - PG_HITL, // __BVH_LOCAL__ hit group (only used for triangles) -# if OPTIX_ABI_VERSION >= 36 - PG_HITD_MOTION, - PG_HITS_MOTION, -# endif - PG_BAKE, // kernel_bake_evaluate - PG_DISP, // kernel_displace_evaluate - PG_BACK, // kernel_background_evaluate - PG_CALL, - NUM_PROGRAM_GROUPS = PG_CALL + 3 - }; - - // List of OptiX pipelines - enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES }; - - // A single shader binding table entry - struct SbtRecord { - char header[OPTIX_SBT_RECORD_HEADER_SIZE]; - }; - - // Information stored about CUDA memory allocations - struct CUDAMem { - bool free_map_host = false; - CUarray array = NULL; - CUtexObject texobject = 0; - bool use_mapped_host = false; - }; - - // Helper class to manage current CUDA context - struct CUDAContextScope { - CUDAContextScope(CUcontext ctx) - { - cuCtxPushCurrent(ctx); - } - ~CUDAContextScope() - { - cuCtxPopCurrent(NULL); - } - }; - - // Use a pool with multiple threads to support launches with multiple CUDA streams - TaskPool task_pool; - - vector<CUstream> cuda_stream; - OptixDeviceContext context = NULL; - - OptixModule optix_module = NULL; // All necessary OptiX kernels are in one module - OptixModule builtin_modules[2] = {}; - OptixPipeline pipelines[NUM_PIPELINES] = {}; - - bool motion_blur = false; - device_vector<SbtRecord> sbt_data; - device_only_memory<KernelParams> launch_params; - OptixTraversableHandle tlas_handle = 0; - - OptixDenoiser denoiser = NULL; - device_only_memory<unsigned char> denoiser_state; - int denoiser_input_passes = 0; - - vector<device_only_memory<char>> delayed_free_bvh_memory; - thread_mutex delayed_free_bvh_mutex; - - public: - OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) - : CUDADevice(info_, stats_, profiler_, background_), - sbt_data(this, "__sbt", MEM_READ_ONLY), - launch_params(this, "__params", false), - denoiser_state(this, "__denoiser_state", true) - { - // Store number of CUDA streams in device info - info.cpu_threads = DebugFlags().optix.cuda_streams; - - // Make the CUDA context current - if (!cuContext) { - return; // Do not initialize if CUDA context creation failed already - } - const CUDAContextScope scope(cuContext); - - // Create OptiX context for this device - OptixDeviceContextOptions options = {}; -# ifdef WITH_CYCLES_LOGGING - options.logCallbackLevel = 4; // Fatal = 1, Error = 2, Warning = 3, Print = 4 - options.logCallbackFunction = - [](unsigned int level, const char *, const char *message, void *) { - switch (level) { - case 1: - LOG_IF(FATAL, VLOG_IS_ON(1)) << message; - break; - case 2: - LOG_IF(ERROR, VLOG_IS_ON(1)) << message; - break; - case 3: - LOG_IF(WARNING, VLOG_IS_ON(1)) << message; - break; - case 4: - LOG_IF(INFO, VLOG_IS_ON(1)) << message; - break; - } - }; -# endif - check_result_optix(optixDeviceContextCreate(cuContext, &options, &context)); -# ifdef WITH_CYCLES_LOGGING - check_result_optix(optixDeviceContextSetLogCallback( - context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel)); -# endif - - // Create launch streams - cuda_stream.resize(info.cpu_threads); - for (int i = 0; i < info.cpu_threads; ++i) - check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING)); - - // Fix weird compiler bug that assigns wrong size - launch_params.data_elements = sizeof(KernelParams); - // Allocate launch parameter buffer memory on device - launch_params.alloc_to_device(info.cpu_threads); - } - ~OptiXDevice() - { - // Stop processing any more tasks - task_pool.cancel(); - - // Make CUDA context current - const CUDAContextScope scope(cuContext); - - free_bvh_memory_delayed(); - - sbt_data.free(); - texture_info.free(); - launch_params.free(); - denoiser_state.free(); - - // Unload modules - if (optix_module != NULL) - optixModuleDestroy(optix_module); - for (unsigned int i = 0; i < 2; ++i) - if (builtin_modules[i] != NULL) - optixModuleDestroy(builtin_modules[i]); - for (unsigned int i = 0; i < NUM_PIPELINES; ++i) - if (pipelines[i] != NULL) - optixPipelineDestroy(pipelines[i]); - - // Destroy launch streams - for (CUstream stream : cuda_stream) - cuStreamDestroy(stream); - - if (denoiser != NULL) - optixDenoiserDestroy(denoiser); - - optixDeviceContextDestroy(context); - } - - private: - bool show_samples() const override - { - // Only show samples if not rendering multiple tiles in parallel - return info.cpu_threads == 1; - } - - BVHLayoutMask get_bvh_layout_mask() const override - { - // CUDA kernels are used when doing baking, so need to build a BVH those can understand too! - if (optix_module == NULL) - return CUDADevice::get_bvh_layout_mask(); - - // OptiX has its own internal acceleration structure format - return BVH_LAYOUT_OPTIX; - } - - string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features, - bool filter, - bool /*split*/) override - { - // Split kernel is not supported in OptiX - string common_cflags = CUDADevice::compile_kernel_get_common_cflags( - requested_features, filter, false); - - // Add OptiX SDK include directory to include paths - const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR"); - if (optix_sdk_path) { - common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path); - } - - // Specialization for shader raytracing - if (requested_features.use_shader_raytrace) { - common_cflags += " --keep-device-functions"; - } - else { - common_cflags += " -D __NO_SHADER_RAYTRACE__"; - } - - return common_cflags; - } - - bool load_kernels(const DeviceRequestedFeatures &requested_features) override - { - if (have_error()) { - // Abort early if context creation failed already - return false; - } - - // Load CUDA modules because we need some of the utility kernels - if (!CUDADevice::load_kernels(requested_features)) { - return false; - } - - // Baking is currently performed using CUDA, so no need to load OptiX kernels - if (requested_features.use_baking) { - return true; - } - - const CUDAContextScope scope(cuContext); - - // Unload existing OptiX module and pipelines first - if (optix_module != NULL) { - optixModuleDestroy(optix_module); - optix_module = NULL; - } - for (unsigned int i = 0; i < 2; ++i) { - if (builtin_modules[i] != NULL) { - optixModuleDestroy(builtin_modules[i]); - builtin_modules[i] = NULL; - } - } - for (unsigned int i = 0; i < NUM_PIPELINES; ++i) { - if (pipelines[i] != NULL) { - optixPipelineDestroy(pipelines[i]); - pipelines[i] = NULL; - } - } - - OptixModuleCompileOptions module_options = {}; - module_options.maxRegisterCount = 0; // Do not set an explicit register limit - module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3; - module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; - -# if OPTIX_ABI_VERSION >= 41 - module_options.boundValues = nullptr; - module_options.numBoundValues = 0; -# endif - - OptixPipelineCompileOptions pipeline_options = {}; - // Default to no motion blur and two-level graph, since it is the fastest option - pipeline_options.usesMotionBlur = false; - pipeline_options.traversableGraphFlags = - OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING; - pipeline_options.numPayloadValues = 6; - pipeline_options.numAttributeValues = 2; // u, v - pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE; - pipeline_options.pipelineLaunchParamsVariableName = "__params"; // See kernel_globals.h - -# if OPTIX_ABI_VERSION >= 36 - pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE; - if (requested_features.use_hair) { - if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) { - pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE; - } - else { - pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM; - } - } -# endif - - // Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds - // This is necessary since objects may be reported to have motion if the Vector pass is - // active, but may still need to be rendered without motion blur if that isn't active as well - motion_blur = requested_features.use_object_motion; - - if (motion_blur) { - pipeline_options.usesMotionBlur = true; - // Motion blur can insert motion transforms into the traversal graph - // It is no longer a two-level graph then, so need to set flags to allow any configuration - pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY; - } - - { // Load and compile PTX module with OptiX kernels - string ptx_data, ptx_filename = path_get(requested_features.use_shader_raytrace ? - "lib/kernel_optix_shader_raytrace.ptx" : - "lib/kernel_optix.ptx"); - if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) { - if (!getenv("OPTIX_ROOT_DIR")) { - set_error( - "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to " - "the Optix SDK to be able to compile Optix kernels on demand)."); - return false; - } - ptx_filename = compile_kernel(requested_features, "kernel_optix", "optix", true); - } - if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) { - set_error("Failed to load OptiX kernel from '" + ptx_filename + "'"); - return false; - } - - check_result_optix_ret(optixModuleCreateFromPTX(context, - &module_options, - &pipeline_options, - ptx_data.data(), - ptx_data.size(), - nullptr, - 0, - &optix_module)); - } - - // Create program groups - OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {}; - OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {}; - OptixProgramGroupOptions group_options = {}; // There are no options currently - group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; - group_descs[PG_RGEN].raygen.module = optix_module; - // Ignore branched integrator for now (see "requested_features.use_integrator_branched") - group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace"; - group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS; - group_descs[PG_MISS].miss.module = optix_module; - group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss"; - group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; - group_descs[PG_HITD].hitgroup.moduleCH = optix_module; - group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit"; - group_descs[PG_HITD].hitgroup.moduleAH = optix_module; - group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test"; - group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; - group_descs[PG_HITS].hitgroup.moduleAH = optix_module; - group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit"; - - if (requested_features.use_hair) { - group_descs[PG_HITD].hitgroup.moduleIS = optix_module; - group_descs[PG_HITS].hitgroup.moduleIS = optix_module; - - // Add curve intersection programs - if (requested_features.use_hair_thick) { - // Slower programs for thick hair since that also slows down ribbons. - // Ideally this should not be needed. - group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_all"; - group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_all"; - } - else { - group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; - group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; - } - -# if OPTIX_ABI_VERSION >= 36 - if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) { - OptixBuiltinISOptions builtin_options = {}; - builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; - builtin_options.usesMotionBlur = false; - - check_result_optix_ret(optixBuiltinISModuleGet( - context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0])); - - group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0]; - group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr; - group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0]; - group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr; - - if (motion_blur) { - builtin_options.usesMotionBlur = true; - - check_result_optix_ret(optixBuiltinISModuleGet( - context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1])); - - group_descs[PG_HITD_MOTION] = group_descs[PG_HITD]; - group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1]; - group_descs[PG_HITS_MOTION] = group_descs[PG_HITS]; - group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1]; - } - } -# endif - } - - if (requested_features.use_subsurface || requested_features.use_shader_raytrace) { - // Add hit group for local intersections - group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; - group_descs[PG_HITL].hitgroup.moduleAH = optix_module; - group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit"; - } - - if (requested_features.use_baking) { - group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; - group_descs[PG_BAKE].raygen.module = optix_module; - group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake"; - } - - if (requested_features.use_true_displacement) { - group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; - group_descs[PG_DISP].raygen.module = optix_module; - group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace"; - } - - if (requested_features.use_background_light) { - group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; - group_descs[PG_BACK].raygen.module = optix_module; - group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background"; - } - - // Shader raytracing replaces some functions with direct callables - if (requested_features.use_shader_raytrace) { - group_descs[PG_CALL + 0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; - group_descs[PG_CALL + 0].callables.moduleDC = optix_module; - group_descs[PG_CALL + 0].callables.entryFunctionNameDC = "__direct_callable__svm_eval_nodes"; - group_descs[PG_CALL + 1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; - group_descs[PG_CALL + 1].callables.moduleDC = optix_module; - group_descs[PG_CALL + 1].callables.entryFunctionNameDC = - "__direct_callable__kernel_volume_shadow"; - group_descs[PG_CALL + 2].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; - group_descs[PG_CALL + 2].callables.moduleDC = optix_module; - group_descs[PG_CALL + 2].callables.entryFunctionNameDC = - "__direct_callable__subsurface_scatter_multi_setup"; - } - - check_result_optix_ret(optixProgramGroupCreate( - context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups)); - - // Get program stack sizes - OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {}; - // Set up SBT, which in this case is used only to select between different programs - sbt_data.alloc(NUM_PROGRAM_GROUPS); - memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS); - for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { - check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i])); - check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i])); - } - sbt_data.copy_to_device(); // Upload SBT to device - - // Calculate maximum trace continuation stack size - unsigned int trace_css = stack_size[PG_HITD].cssCH; - // This is based on the maximum of closest-hit and any-hit/intersection programs - trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH); - trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH); - trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH); -# if OPTIX_ABI_VERSION >= 36 - trace_css = std::max(trace_css, - stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH); - trace_css = std::max(trace_css, - stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH); -# endif - - OptixPipelineLinkOptions link_options = {}; - link_options.maxTraceDepth = 1; - link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; -# if OPTIX_ABI_VERSION < 24 - link_options.overrideUsesMotionBlur = motion_blur; -# endif - - { // Create path tracing pipeline - vector<OptixProgramGroup> pipeline_groups; - pipeline_groups.reserve(NUM_PROGRAM_GROUPS); - pipeline_groups.push_back(groups[PG_RGEN]); - pipeline_groups.push_back(groups[PG_MISS]); - pipeline_groups.push_back(groups[PG_HITD]); - pipeline_groups.push_back(groups[PG_HITS]); - pipeline_groups.push_back(groups[PG_HITL]); -# if OPTIX_ABI_VERSION >= 36 - if (motion_blur) { - pipeline_groups.push_back(groups[PG_HITD_MOTION]); - pipeline_groups.push_back(groups[PG_HITS_MOTION]); - } -# endif - if (requested_features.use_shader_raytrace) { - pipeline_groups.push_back(groups[PG_CALL + 0]); - pipeline_groups.push_back(groups[PG_CALL + 1]); - pipeline_groups.push_back(groups[PG_CALL + 2]); - } - - check_result_optix_ret(optixPipelineCreate(context, - &pipeline_options, - &link_options, - pipeline_groups.data(), - pipeline_groups.size(), - nullptr, - 0, - &pipelines[PIP_PATH_TRACE])); - - // Combine ray generation and trace continuation stack size - const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css; - // Max direct callable depth is one of the following, so combine accordingly - // - __raygen__ -> svm_eval_nodes - // - __raygen__ -> kernel_volume_shadow -> svm_eval_nodes - // - __raygen__ -> subsurface_scatter_multi_setup -> svm_eval_nodes - const unsigned int dss = stack_size[PG_CALL + 0].dssDC + - std::max(stack_size[PG_CALL + 1].dssDC, - stack_size[PG_CALL + 2].dssDC); - - // Set stack size depending on pipeline options - check_result_optix_ret( - optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE], - 0, - requested_features.use_shader_raytrace ? dss : 0, - css, - motion_blur ? 3 : 2)); - } - - // Only need to create shader evaluation pipeline if one of these features is used: - const bool use_shader_eval_pipeline = requested_features.use_baking || - requested_features.use_background_light || - requested_features.use_true_displacement; - - if (use_shader_eval_pipeline) { // Create shader evaluation pipeline - vector<OptixProgramGroup> pipeline_groups; - pipeline_groups.reserve(NUM_PROGRAM_GROUPS); - pipeline_groups.push_back(groups[PG_BAKE]); - pipeline_groups.push_back(groups[PG_DISP]); - pipeline_groups.push_back(groups[PG_BACK]); - pipeline_groups.push_back(groups[PG_MISS]); - pipeline_groups.push_back(groups[PG_HITD]); - pipeline_groups.push_back(groups[PG_HITS]); - pipeline_groups.push_back(groups[PG_HITL]); -# if OPTIX_ABI_VERSION >= 36 - if (motion_blur) { - pipeline_groups.push_back(groups[PG_HITD_MOTION]); - pipeline_groups.push_back(groups[PG_HITS_MOTION]); - } -# endif - if (requested_features.use_shader_raytrace) { - pipeline_groups.push_back(groups[PG_CALL + 0]); - pipeline_groups.push_back(groups[PG_CALL + 1]); - pipeline_groups.push_back(groups[PG_CALL + 2]); - } - - check_result_optix_ret(optixPipelineCreate(context, - &pipeline_options, - &link_options, - pipeline_groups.data(), - pipeline_groups.size(), - nullptr, - 0, - &pipelines[PIP_SHADER_EVAL])); - - // Calculate continuation stack size based on the maximum of all ray generation stack sizes - const unsigned int css = std::max(stack_size[PG_BAKE].cssRG, - std::max(stack_size[PG_DISP].cssRG, - stack_size[PG_BACK].cssRG)) + - link_options.maxTraceDepth * trace_css; - const unsigned int dss = stack_size[PG_CALL + 0].dssDC + - std::max(stack_size[PG_CALL + 1].dssDC, - stack_size[PG_CALL + 2].dssDC); - - check_result_optix_ret( - optixPipelineSetStackSize(pipelines[PIP_SHADER_EVAL], - 0, - requested_features.use_shader_raytrace ? dss : 0, - css, - motion_blur ? 3 : 2)); - } - - // Clean up program group objects - for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { - optixProgramGroupDestroy(groups[i]); - } - - return true; - } - - void thread_run(DeviceTask &task, int thread_index) // Main task entry point - { - if (have_error()) - return; // Abort early if there was an error previously - - if (task.type == DeviceTask::RENDER) { - if (thread_index != 0) { - // Only execute denoising in a single thread (see also 'task_add') - task.tile_types &= ~RenderTile::DENOISE; - } - - RenderTile tile; - while (task.acquire_tile(this, tile, task.tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) - launch_render(task, tile, thread_index); - else if (tile.task == RenderTile::BAKE) { - // Perform baking using CUDA, since it is not currently implemented in OptiX - device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); - CUDADevice::render(task, tile, work_tiles); - } - else if (tile.task == RenderTile::DENOISE) - launch_denoise(task, tile); - task.release_tile(tile); - if (task.get_cancel() && !task.need_finish_queue) - break; // User requested cancellation - else if (have_error()) - break; // Abort rendering when encountering an error - } - } - else if (task.type == DeviceTask::SHADER) { - // CUDA kernels are used when doing baking - if (optix_module == NULL) - CUDADevice::shader(task); - else - launch_shader_eval(task, thread_index); - } - else if (task.type == DeviceTask::DENOISE_BUFFER) { - // Set up a single tile that covers the whole task and denoise it - RenderTile tile; - tile.x = task.x; - tile.y = task.y; - tile.w = task.w; - tile.h = task.h; - tile.buffer = task.buffer; - tile.num_samples = task.num_samples; - tile.start_sample = task.sample; - tile.offset = task.offset; - tile.stride = task.stride; - tile.buffers = task.buffers; - - launch_denoise(task, tile); - } - } - - void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index) - { - assert(thread_index < launch_params.data_size); - - // Keep track of total render time of this tile - const scoped_timer timer(&rtile.buffers->render_time); - - WorkTile wtile; - wtile.x = rtile.x; - wtile.y = rtile.y; - wtile.w = rtile.w; - wtile.h = rtile.h; - wtile.offset = rtile.offset; - wtile.stride = rtile.stride; - wtile.buffer = (float *)rtile.buffer; - - const int end_sample = rtile.start_sample + rtile.num_samples; - // Keep this number reasonable to avoid running into TDRs - int step_samples = (info.display_device ? 8 : 32); - - // Offset into launch params buffer so that streams use separate data - device_ptr launch_params_ptr = launch_params.device_pointer + - thread_index * launch_params.data_elements; - - const CUDAContextScope scope(cuContext); - - for (int sample = rtile.start_sample; sample < end_sample;) { - // Copy work tile information to device - wtile.start_sample = sample; - wtile.num_samples = step_samples; - if (task.adaptive_sampling.use) { - wtile.num_samples = task.adaptive_sampling.align_samples(sample, step_samples); - } - wtile.num_samples = min(wtile.num_samples, end_sample - sample); - device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile); - check_result_cuda( - cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index])); - - OptixShaderBindingTable sbt_params = {}; - sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord); - sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord); - sbt_params.missRecordStrideInBytes = sizeof(SbtRecord); - sbt_params.missRecordCount = 1; - sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord); - sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); -# if OPTIX_ABI_VERSION >= 36 - sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL -# else - sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL -# endif - sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord); - sbt_params.callablesRecordCount = 3; - sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord); - - // Launch the ray generation program - check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE], - cuda_stream[thread_index], - launch_params_ptr, - launch_params.data_elements, - &sbt_params, - // Launch with samples close to each other for better locality - wtile.w * wtile.num_samples, - wtile.h, - 1)); - - // Run the adaptive sampling kernels at selected samples aligned to step samples. - uint filter_sample = wtile.start_sample + wtile.num_samples - 1; - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { - adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]); - } - - // Wait for launch to finish - check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); - - // Update current sample, so it is displayed correctly - sample += wtile.num_samples; - rtile.sample = sample; - // Update task progress after the kernel completed rendering - task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples); - - if (task.get_cancel() && !task.need_finish_queue) - return; // Cancel rendering - } - - // Finalize adaptive sampling - if (task.adaptive_sampling.use) { - device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile); - adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]); - check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); - task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples); - } - } - - bool launch_denoise(DeviceTask &task, RenderTile &rtile) - { - // Update current sample (for display and NLM denoising task) - rtile.sample = rtile.start_sample + rtile.num_samples; - - // Make CUDA context current now, since it is used for both denoising tasks - const CUDAContextScope scope(cuContext); - - // Choose between OptiX and NLM denoising - if (task.denoising.type == DENOISER_OPTIX) { - // Map neighboring tiles onto this device, indices are as following: - // Where index 4 is the center tile and index 9 is the target for the result. - // 0 1 2 - // 3 4 5 - // 6 7 8 9 - RenderTileNeighbors neighbors(rtile); - task.map_neighbor_tiles(neighbors, this); - RenderTile ¢er_tile = neighbors.tiles[RenderTileNeighbors::CENTER]; - RenderTile &target_tile = neighbors.target; - rtile = center_tile; // Tile may have been modified by mapping code - - // Calculate size of the tile to denoise (including overlap) - int4 rect = center_tile.bounds(); - // Overlap between tiles has to be at least 64 pixels - // TODO(pmours): Query this value from OptiX - rect = rect_expand(rect, 64); - int4 clip_rect = neighbors.bounds(); - rect = rect_clip(rect, clip_rect); - int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y); - int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y); - - // Calculate byte offsets and strides - int pixel_stride = task.pass_stride * (int)sizeof(float); - int pixel_offset = (rtile.offset + rtile.x + rtile.y * rtile.stride) * pixel_stride; - const int pass_offset[3] = { - (task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float), - (task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float), - (task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)}; - - // Start with the current tile pointer offset - int input_stride = pixel_stride; - device_ptr input_ptr = rtile.buffer + pixel_offset; - - // Copy tile data into a common buffer if necessary - device_only_memory<float> input(this, "denoiser input", true); - device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_ONLY); - - bool contiguous_memory = true; - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) { - contiguous_memory = false; - } - } - - if (contiguous_memory) { - // Tiles are in continous memory, so can just subtract overlap offset - input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride; - // Stride covers the whole width of the image and not just a single tile - input_stride *= rtile.stride; - } - else { - // Adjacent tiles are in separate memory regions, so need to copy them into a single one - input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride); - // Start with the new input buffer - input_ptr = input.device_pointer; - // Stride covers the width of the new input buffer, which includes tile width and overlap - input_stride *= rect_size.x; - - TileInfo *tile_info = tile_info_mem.alloc(1); - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - tile_info->offsets[i] = neighbors.tiles[i].offset; - tile_info->strides[i] = neighbors.tiles[i].stride; - tile_info->buffers[i] = neighbors.tiles[i].buffer; - } - tile_info->x[0] = neighbors.tiles[3].x; - tile_info->x[1] = neighbors.tiles[4].x; - tile_info->x[2] = neighbors.tiles[5].x; - tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w; - tile_info->y[0] = neighbors.tiles[1].y; - tile_info->y[1] = neighbors.tiles[4].y; - tile_info->y[2] = neighbors.tiles[7].y; - tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h; - tile_info_mem.copy_to_device(); - - void *args[] = { - &input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride}; - launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args); - } - -# if OPTIX_DENOISER_NO_PIXEL_STRIDE - device_only_memory<float> input_rgb(this, "denoiser input rgb", true); - input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes); - - void *input_args[] = {&input_rgb.device_pointer, - &input_ptr, - &rect_size.x, - &rect_size.y, - &input_stride, - &task.pass_stride, - const_cast<int *>(pass_offset), - &task.denoising.input_passes, - &rtile.sample}; - launch_filter_kernel( - "kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args); - - input_ptr = input_rgb.device_pointer; - pixel_stride = 3 * sizeof(float); - input_stride = rect_size.x * pixel_stride; -# endif - - const bool recreate_denoiser = (denoiser == NULL) || - (task.denoising.input_passes != denoiser_input_passes); - if (recreate_denoiser) { - // Destroy existing handle before creating new one - if (denoiser != NULL) { - optixDenoiserDestroy(denoiser); - } - - // Create OptiX denoiser handle on demand when it is first used - OptixDenoiserOptions denoiser_options = {}; - assert(task.denoising.input_passes >= 1 && task.denoising.input_passes <= 3); -# if OPTIX_ABI_VERSION >= 47 - denoiser_options.guideAlbedo = task.denoising.input_passes >= 2; - denoiser_options.guideNormal = task.denoising.input_passes >= 3; - check_result_optix_ret(optixDenoiserCreate( - context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser)); -# else - denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>( - OPTIX_DENOISER_INPUT_RGB + (task.denoising.input_passes - 1)); -# if OPTIX_ABI_VERSION < 28 - denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3; -# endif - check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser)); - check_result_optix_ret( - optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0)); -# endif - - // OptiX denoiser handle was created with the requested number of input passes - denoiser_input_passes = task.denoising.input_passes; - } - - OptixDenoiserSizes sizes = {}; - check_result_optix_ret( - optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes)); - -# if OPTIX_ABI_VERSION < 28 - const size_t scratch_size = sizes.recommendedScratchSizeInBytes; -# else - const size_t scratch_size = sizes.withOverlapScratchSizeInBytes; -# endif - const size_t scratch_offset = sizes.stateSizeInBytes; - - // Allocate denoiser state if tile size has changed since last setup - if (recreate_denoiser || (denoiser_state.data_width != rect_size.x || - denoiser_state.data_height != rect_size.y)) { - denoiser_state.alloc_to_device(scratch_offset + scratch_size); - - // Initialize denoiser state for the current tile size - check_result_optix_ret(optixDenoiserSetup(denoiser, - 0, - rect_size.x, - rect_size.y, - denoiser_state.device_pointer, - scratch_offset, - denoiser_state.device_pointer + scratch_offset, - scratch_size)); - - denoiser_state.data_width = rect_size.x; - denoiser_state.data_height = rect_size.y; - } - - // Set up input and output layer information - OptixImage2D input_layers[3] = {}; - OptixImage2D output_layers[1] = {}; - - for (int i = 0; i < 3; ++i) { -# if OPTIX_DENOISER_NO_PIXEL_STRIDE - input_layers[i].data = input_ptr + (rect_size.x * rect_size.y * pixel_stride * i); -# else - input_layers[i].data = input_ptr + pass_offset[i]; -# endif - input_layers[i].width = rect_size.x; - input_layers[i].height = rect_size.y; - input_layers[i].rowStrideInBytes = input_stride; - input_layers[i].pixelStrideInBytes = pixel_stride; - input_layers[i].format = OPTIX_PIXEL_FORMAT_FLOAT3; - } - -# if OPTIX_DENOISER_NO_PIXEL_STRIDE - output_layers[0].data = input_ptr; - output_layers[0].width = rect_size.x; - output_layers[0].height = rect_size.y; - output_layers[0].rowStrideInBytes = input_stride; - output_layers[0].pixelStrideInBytes = pixel_stride; - int2 output_offset = overlap_offset; - overlap_offset = make_int2(0, 0); // Not supported by denoiser API, so apply manually -# else - output_layers[0].data = target_tile.buffer + pixel_offset; - output_layers[0].width = target_tile.w; - output_layers[0].height = target_tile.h; - output_layers[0].rowStrideInBytes = target_tile.stride * pixel_stride; - output_layers[0].pixelStrideInBytes = pixel_stride; -# endif - output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3; - -# if OPTIX_ABI_VERSION >= 47 - OptixDenoiserLayer image_layers = {}; - image_layers.input = input_layers[0]; - image_layers.output = output_layers[0]; - - OptixDenoiserGuideLayer guide_layers = {}; - guide_layers.albedo = input_layers[1]; - guide_layers.normal = input_layers[2]; -# endif - - // Finally run denonising - OptixDenoiserParams params = {}; // All parameters are disabled/zero -# if OPTIX_ABI_VERSION >= 47 - check_result_optix_ret(optixDenoiserInvoke(denoiser, - NULL, - ¶ms, - denoiser_state.device_pointer, - scratch_offset, - &guide_layers, - &image_layers, - 1, - overlap_offset.x, - overlap_offset.y, - denoiser_state.device_pointer + scratch_offset, - scratch_size)); -# else - check_result_optix_ret(optixDenoiserInvoke(denoiser, - NULL, - ¶ms, - denoiser_state.device_pointer, - scratch_offset, - input_layers, - task.denoising.input_passes, - overlap_offset.x, - overlap_offset.y, - output_layers, - denoiser_state.device_pointer + scratch_offset, - scratch_size)); -# endif - -# if OPTIX_DENOISER_NO_PIXEL_STRIDE - void *output_args[] = {&input_ptr, - &target_tile.buffer, - &output_offset.x, - &output_offset.y, - &rect_size.x, - &rect_size.y, - &target_tile.x, - &target_tile.y, - &target_tile.w, - &target_tile.h, - &target_tile.offset, - &target_tile.stride, - &task.pass_stride, - &rtile.sample}; - launch_filter_kernel( - "kernel_cuda_filter_convert_from_rgb", target_tile.w, target_tile.h, output_args); -# endif - - check_result_cuda_ret(cuStreamSynchronize(0)); - - task.unmap_neighbor_tiles(neighbors, this); - } - else { - // Run CUDA denoising kernels - DenoisingTask denoising(this, task); - CUDADevice::denoise(rtile, denoising); - } - - // Update task progress after the denoiser completed processing - task.update_progress(&rtile, rtile.w * rtile.h); - - return true; - } - - void launch_shader_eval(DeviceTask &task, int thread_index) - { - unsigned int rgen_index = PG_BACK; - if (task.shader_eval_type >= SHADER_EVAL_BAKE) - rgen_index = PG_BAKE; - if (task.shader_eval_type == SHADER_EVAL_DISPLACE) - rgen_index = PG_DISP; - - const CUDAContextScope scope(cuContext); - - device_ptr launch_params_ptr = launch_params.device_pointer + - thread_index * launch_params.data_elements; - - for (int sample = 0; sample < task.num_samples; ++sample) { - ShaderParams params; - params.input = (uint4 *)task.shader_input; - params.output = (float4 *)task.shader_output; - params.type = task.shader_eval_type; - params.filter = task.shader_filter; - params.sx = task.shader_x; - params.offset = task.offset; - params.sample = sample; - - check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader), - ¶ms, - sizeof(params), - cuda_stream[thread_index])); - - OptixShaderBindingTable sbt_params = {}; - sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord); - sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord); - sbt_params.missRecordStrideInBytes = sizeof(SbtRecord); - sbt_params.missRecordCount = 1; - sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord); - sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); -# if OPTIX_ABI_VERSION >= 36 - sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL -# else - sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL -# endif - sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord); - sbt_params.callablesRecordCount = 3; - sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord); - - check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL], - cuda_stream[thread_index], - launch_params_ptr, - launch_params.data_elements, - &sbt_params, - task.shader_w, - 1, - 1)); - - check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); - - task.update_progress(NULL); - } - } - - bool build_optix_bvh(BVHOptiX *bvh, - OptixBuildOperation operation, - const OptixBuildInput &build_input, - uint16_t num_motion_steps) - { - /* Allocate and build acceleration structures only one at a time, to prevent parallel builds - * from running out of memory (since both original and compacted acceleration structure memory - * may be allocated at the same time for the duration of this function). The builds would - * otherwise happen on the same CUDA stream anyway. */ - static thread_mutex mutex; - thread_scoped_lock lock(mutex); - - const CUDAContextScope scope(cuContext); - - const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC); - - // Compute memory usage - OptixAccelBufferSizes sizes = {}; - OptixAccelBuildOptions options = {}; - options.operation = operation; - if (use_fast_trace_bvh) { - VLOG(2) << "Using fast to trace OptiX BVH"; - options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION; - } - else { - VLOG(2) << "Using fast to update OptiX BVH"; - options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE; - } - - options.motionOptions.numKeys = num_motion_steps; - options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH; - options.motionOptions.timeBegin = 0.0f; - options.motionOptions.timeEnd = 1.0f; - - check_result_optix_ret( - optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes)); - - // Allocate required output buffers - device_only_memory<char> temp_mem(this, "optix temp as build mem", true); - temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8); - if (!temp_mem.device_pointer) - return false; // Make sure temporary memory allocation succeeded - - // Acceleration structure memory has to be allocated on the device (not allowed to be on host) - device_only_memory<char> &out_data = bvh->as_data; - if (operation == OPTIX_BUILD_OPERATION_BUILD) { - assert(out_data.device == this); - out_data.alloc_to_device(sizes.outputSizeInBytes); - if (!out_data.device_pointer) - return false; - } - else { - assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes); - } - - // Finally build the acceleration structure - OptixAccelEmitDesc compacted_size_prop = {}; - compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE; - // A tiny space was allocated for this property at the end of the temporary buffer above - // Make sure this pointer is 8-byte aligned - compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8); - - OptixTraversableHandle out_handle = 0; - check_result_optix_ret(optixAccelBuild(context, - NULL, - &options, - &build_input, - 1, - temp_mem.device_pointer, - sizes.tempSizeInBytes, - out_data.device_pointer, - sizes.outputSizeInBytes, - &out_handle, - use_fast_trace_bvh ? &compacted_size_prop : NULL, - use_fast_trace_bvh ? 1 : 0)); - bvh->traversable_handle = static_cast<uint64_t>(out_handle); - - // Wait for all operations to finish - check_result_cuda_ret(cuStreamSynchronize(NULL)); - - // Compact acceleration structure to save memory (only if using fast trace as the - // OPTIX_BUILD_FLAG_ALLOW_COMPACTION flag is only set in this case). - if (use_fast_trace_bvh) { - uint64_t compacted_size = sizes.outputSizeInBytes; - check_result_cuda_ret( - cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size))); - - // Temporary memory is no longer needed, so free it now to make space - temp_mem.free(); - - // There is no point compacting if the size does not change - if (compacted_size < sizes.outputSizeInBytes) { - device_only_memory<char> compacted_data(this, "optix compacted as", false); - compacted_data.alloc_to_device(compacted_size); - if (!compacted_data.device_pointer) - // Do not compact if memory allocation for compacted acceleration structure fails - // Can just use the uncompacted one then, so succeed here regardless - return true; - - check_result_optix_ret(optixAccelCompact(context, - NULL, - out_handle, - compacted_data.device_pointer, - compacted_size, - &out_handle)); - bvh->traversable_handle = static_cast<uint64_t>(out_handle); - - // Wait for compaction to finish - check_result_cuda_ret(cuStreamSynchronize(NULL)); - - std::swap(out_data.device_size, compacted_data.device_size); - std::swap(out_data.device_pointer, compacted_data.device_pointer); - // Original acceleration structure memory is freed when 'compacted_data' goes out of scope - } - } - - return true; - } - - void build_bvh(BVH *bvh, Progress &progress, bool refit) override - { - if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) { - /* For baking CUDA is used, build appropriate BVH for that. */ - Device::build_bvh(bvh, progress, refit); - return; - } - - const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC); - - free_bvh_memory_delayed(); - - BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh); - - progress.set_substatus("Building OptiX acceleration structure"); - - if (!bvh->params.top_level) { - assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1); - - OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD; - /* Refit is only possible when using fast to trace BVH (because AS is built with - * OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above). */ - if (refit && !use_fast_trace_bvh) { - assert(bvh_optix->traversable_handle != 0); - operation = OPTIX_BUILD_OPERATION_UPDATE; - } - else { - bvh_optix->as_data.free(); - bvh_optix->traversable_handle = 0; - } - - // Build bottom level acceleration structures (BLAS) - Geometry *const geom = bvh->geometry[0]; - if (geom->geometry_type == Geometry::HAIR) { - // Build BLAS for curve primitives - Hair *const hair = static_cast<Hair *const>(geom); - if (hair->num_curves() == 0) { - return; - } - - const size_t num_segments = hair->num_segments(); - - size_t num_motion_steps = 1; - Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - if (motion_blur && hair->get_use_motion_blur() && motion_keys) { - num_motion_steps = hair->get_motion_steps(); - } - - device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY); -# if OPTIX_ABI_VERSION >= 36 - device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); - device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY); - // Four control points for each curve segment - const size_t num_vertices = num_segments * 4; - if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { - index_data.alloc(num_segments); - vertex_data.alloc(num_vertices * num_motion_steps); - } - else -# endif - aabb_data.alloc(num_segments * num_motion_steps); - - // Get AABBs for each motion step - for (size_t step = 0; step < num_motion_steps; ++step) { - // The center step for motion vertices is not stored in the attribute - const float3 *keys = hair->get_curve_keys().data(); - size_t center_step = (num_motion_steps - 1) / 2; - if (step != center_step) { - size_t attr_offset = (step > center_step) ? step - 1 : step; - // Technically this is a float4 array, but sizeof(float3) == sizeof(float4) - keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size(); - } - - for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) { - const Hair::Curve curve = hair->get_curve(j); -# if OPTIX_ABI_VERSION >= 36 - const array<float> &curve_radius = hair->get_curve_radius(); -# endif - - for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) { -# if OPTIX_ABI_VERSION >= 36 - if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { - int k0 = curve.first_key + segment; - int k1 = k0 + 1; - int ka = max(k0 - 1, curve.first_key); - int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1); - - const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x); - const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y); - const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z); - const float4 pw = make_float4( - curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]); - - // Convert Catmull-Rom data to Bezier spline - static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f; - static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f; - static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f; - static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f; - - index_data[i] = i * 4; - float4 *const v = vertex_data.data() + step * num_vertices + index_data[i]; - v[0] = make_float4( - dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw)); - v[1] = make_float4( - dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw)); - v[2] = make_float4( - dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw)); - v[3] = make_float4( - dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw)); - } - else -# endif - { - BoundBox bounds = BoundBox::empty; - curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds); - - const size_t index = step * num_segments + i; - aabb_data[index].minX = bounds.min.x; - aabb_data[index].minY = bounds.min.y; - aabb_data[index].minZ = bounds.min.z; - aabb_data[index].maxX = bounds.max.x; - aabb_data[index].maxY = bounds.max.y; - aabb_data[index].maxZ = bounds.max.z; - } - } - } - } - - // Upload AABB data to GPU - aabb_data.copy_to_device(); -# if OPTIX_ABI_VERSION >= 36 - index_data.copy_to_device(); - vertex_data.copy_to_device(); -# endif - - vector<device_ptr> aabb_ptrs; - aabb_ptrs.reserve(num_motion_steps); -# if OPTIX_ABI_VERSION >= 36 - vector<device_ptr> width_ptrs; - vector<device_ptr> vertex_ptrs; - width_ptrs.reserve(num_motion_steps); - vertex_ptrs.reserve(num_motion_steps); -# endif - for (size_t step = 0; step < num_motion_steps; ++step) { - aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb)); -# if OPTIX_ABI_VERSION >= 36 - const device_ptr base_ptr = vertex_data.device_pointer + - step * num_vertices * sizeof(float4); - width_ptrs.push_back(base_ptr + 3 * sizeof(float)); // Offset by vertex size - vertex_ptrs.push_back(base_ptr); -# endif - } - - // Force a single any-hit call, so shadow record-all behavior works correctly - unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; - OptixBuildInput build_input = {}; -# if OPTIX_ABI_VERSION >= 36 - if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { - build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES; - build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; - build_input.curveArray.numPrimitives = num_segments; - build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); - build_input.curveArray.numVertices = num_vertices; - build_input.curveArray.vertexStrideInBytes = sizeof(float4); - build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data(); - build_input.curveArray.widthStrideInBytes = sizeof(float4); - build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer; - build_input.curveArray.indexStrideInBytes = sizeof(int); - build_input.curveArray.flag = build_flags; - build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset; - } - else -# endif - { - // Disable visibility test any-hit program, since it is already checked during - // intersection. Those trace calls that require anyhit can force it with a ray flag. - build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT; - - build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES; -# if OPTIX_ABI_VERSION < 23 - build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); - build_input.aabbArray.numPrimitives = num_segments; - build_input.aabbArray.strideInBytes = sizeof(OptixAabb); - build_input.aabbArray.flags = &build_flags; - build_input.aabbArray.numSbtRecords = 1; - build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset; -# else - build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); - build_input.customPrimitiveArray.numPrimitives = num_segments; - build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb); - build_input.customPrimitiveArray.flags = &build_flags; - build_input.customPrimitiveArray.numSbtRecords = 1; - build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset; -# endif - } - - if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { - progress.set_error("Failed to build OptiX acceleration structure"); - } - } - else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) { - // Build BLAS for triangle primitives - Mesh *const mesh = static_cast<Mesh *const>(geom); - if (mesh->num_triangles() == 0) { - return; - } - - const size_t num_verts = mesh->get_verts().size(); - - size_t num_motion_steps = 1; - Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - if (motion_blur && mesh->get_use_motion_blur() && motion_keys) { - num_motion_steps = mesh->get_motion_steps(); - } - - device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); - index_data.alloc(mesh->get_triangles().size()); - memcpy(index_data.data(), - mesh->get_triangles().data(), - mesh->get_triangles().size() * sizeof(int)); - device_vector<float3> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY); - vertex_data.alloc(num_verts * num_motion_steps); - - for (size_t step = 0; step < num_motion_steps; ++step) { - const float3 *verts = mesh->get_verts().data(); - - size_t center_step = (num_motion_steps - 1) / 2; - // The center step for motion vertices is not stored in the attribute - if (step != center_step) { - verts = motion_keys->data_float3() + - (step > center_step ? step - 1 : step) * num_verts; - } - - memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3)); - } - - // Upload triangle data to GPU - index_data.copy_to_device(); - vertex_data.copy_to_device(); - - vector<device_ptr> vertex_ptrs; - vertex_ptrs.reserve(num_motion_steps); - for (size_t step = 0; step < num_motion_steps; ++step) { - vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3)); - } - - // Force a single any-hit call, so shadow record-all behavior works correctly - unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; - OptixBuildInput build_input = {}; - build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES; - build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); - build_input.triangleArray.numVertices = num_verts; - build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3; - build_input.triangleArray.vertexStrideInBytes = sizeof(float3); - build_input.triangleArray.indexBuffer = index_data.device_pointer; - build_input.triangleArray.numIndexTriplets = mesh->num_triangles(); - build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3; - build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int); - build_input.triangleArray.flags = &build_flags; - // The SBT does not store per primitive data since Cycles already allocates separate - // buffers for that purpose. OptiX does not allow this to be zero though, so just pass in - // one and rely on that having the same meaning in this case. - build_input.triangleArray.numSbtRecords = 1; - build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset; - - if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { - progress.set_error("Failed to build OptiX acceleration structure"); - } - } - } - else { - unsigned int num_instances = 0; - unsigned int max_num_instances = 0xFFFFFFFF; - - bvh_optix->as_data.free(); - bvh_optix->traversable_handle = 0; - bvh_optix->motion_transform_data.free(); - - optixDeviceContextGetProperty(context, - OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID, - &max_num_instances, - sizeof(max_num_instances)); - // Do not count first bit, which is used to distinguish instanced and non-instanced objects - max_num_instances >>= 1; - if (bvh->objects.size() > max_num_instances) { - progress.set_error( - "Failed to build OptiX acceleration structure because there are too many instances"); - return; - } - - // Fill instance descriptions -# if OPTIX_ABI_VERSION < 41 - device_vector<OptixAabb> aabbs(this, "optix tlas aabbs", MEM_READ_ONLY); - aabbs.alloc(bvh->objects.size()); -# endif - device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY); - instances.alloc(bvh->objects.size()); - - // Calculate total motion transform size and allocate memory for them - size_t motion_transform_offset = 0; - if (motion_blur) { - size_t total_motion_transform_size = 0; - for (Object *const ob : bvh->objects) { - if (ob->is_traceable() && ob->use_motion()) { - total_motion_transform_size = align_up(total_motion_transform_size, - OPTIX_TRANSFORM_BYTE_ALIGNMENT); - const size_t motion_keys = max(ob->get_motion().size(), 2) - 2; - total_motion_transform_size = total_motion_transform_size + - sizeof(OptixSRTMotionTransform) + - motion_keys * sizeof(OptixSRTData); - } - } - - assert(bvh_optix->motion_transform_data.device == this); - bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size); - } - - for (Object *ob : bvh->objects) { - // Skip non-traceable objects - if (!ob->is_traceable()) - continue; - - BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh); - OptixTraversableHandle handle = blas->traversable_handle; - -# if OPTIX_ABI_VERSION < 41 - OptixAabb &aabb = aabbs[num_instances]; - aabb.minX = ob->bounds.min.x; - aabb.minY = ob->bounds.min.y; - aabb.minZ = ob->bounds.min.z; - aabb.maxX = ob->bounds.max.x; - aabb.maxY = ob->bounds.max.y; - aabb.maxZ = ob->bounds.max.z; -# endif - - OptixInstance &instance = instances[num_instances++]; - memset(&instance, 0, sizeof(instance)); - - // Clear transform to identity matrix - instance.transform[0] = 1.0f; - instance.transform[5] = 1.0f; - instance.transform[10] = 1.0f; - - // Set user instance ID to object index (but leave low bit blank) - instance.instanceId = ob->get_device_index() << 1; - - // Have to have at least one bit in the mask, or else instance would always be culled - instance.visibilityMask = 1; - - if (ob->get_geometry()->has_volume) { - // Volumes have a special bit set in the visibility mask so a trace can mask only volumes - instance.visibilityMask |= 2; - } - - if (ob->get_geometry()->geometry_type == Geometry::HAIR) { - // Same applies to curves (so they can be skipped in local trace calls) - instance.visibilityMask |= 4; - -# if OPTIX_ABI_VERSION >= 36 - if (motion_blur && ob->get_geometry()->has_motion_blur() && - DebugFlags().optix.curves_api && - static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) { - // Select between motion blur and non-motion blur built-in intersection module - instance.sbtOffset = PG_HITD_MOTION - PG_HITD; - } -# endif - } - - // Insert motion traversable if object has motion - if (motion_blur && ob->use_motion()) { - size_t motion_keys = max(ob->get_motion().size(), 2) - 2; - size_t motion_transform_size = sizeof(OptixSRTMotionTransform) + - motion_keys * sizeof(OptixSRTData); - - const CUDAContextScope scope(cuContext); - - motion_transform_offset = align_up(motion_transform_offset, - OPTIX_TRANSFORM_BYTE_ALIGNMENT); - CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer + - motion_transform_offset; - motion_transform_offset += motion_transform_size; - - // Allocate host side memory for motion transform and fill it with transform data - OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>( - new uint8_t[motion_transform_size]); - motion_transform.child = handle; - motion_transform.motionOptions.numKeys = ob->get_motion().size(); - motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE; - motion_transform.motionOptions.timeBegin = 0.0f; - motion_transform.motionOptions.timeEnd = 1.0f; - - OptixSRTData *const srt_data = motion_transform.srtData; - array<DecomposedTransform> decomp(ob->get_motion().size()); - transform_motion_decompose( - decomp.data(), ob->get_motion().data(), ob->get_motion().size()); - - for (size_t i = 0; i < ob->get_motion().size(); ++i) { - // Scale - srt_data[i].sx = decomp[i].y.w; // scale.x.x - srt_data[i].sy = decomp[i].z.w; // scale.y.y - srt_data[i].sz = decomp[i].w.w; // scale.z.z - - // Shear - srt_data[i].a = decomp[i].z.x; // scale.x.y - srt_data[i].b = decomp[i].z.y; // scale.x.z - srt_data[i].c = decomp[i].w.x; // scale.y.z - assert(decomp[i].z.z == 0.0f); // scale.y.x - assert(decomp[i].w.y == 0.0f); // scale.z.x - assert(decomp[i].w.z == 0.0f); // scale.z.y - - // Pivot point - srt_data[i].pvx = 0.0f; - srt_data[i].pvy = 0.0f; - srt_data[i].pvz = 0.0f; - - // Rotation - srt_data[i].qx = decomp[i].x.x; - srt_data[i].qy = decomp[i].x.y; - srt_data[i].qz = decomp[i].x.z; - srt_data[i].qw = decomp[i].x.w; - - // Translation - srt_data[i].tx = decomp[i].y.x; - srt_data[i].ty = decomp[i].y.y; - srt_data[i].tz = decomp[i].y.z; - } - - // Upload motion transform to GPU - cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size); - delete[] reinterpret_cast<uint8_t *>(&motion_transform); - - // Disable instance transform if object uses motion transform already - instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; - - // Get traversable handle to motion transform - optixConvertPointerToTraversableHandle(context, - motion_transform_gpu, - OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM, - &instance.traversableHandle); - } - else { - instance.traversableHandle = handle; - - if (ob->get_geometry()->is_instanced()) { - // Set transform matrix - memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform)); - } - else { - // Disable instance transform if geometry already has it applied to vertex data - instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; - // Non-instanced objects read ID from 'prim_object', so distinguish - // them from instanced objects with the low bit set - instance.instanceId |= 1; - } - } - } - - // Upload instance descriptions -# if OPTIX_ABI_VERSION < 41 - aabbs.resize(num_instances); - aabbs.copy_to_device(); -# endif - instances.resize(num_instances); - instances.copy_to_device(); - - // Build top-level acceleration structure (TLAS) - OptixBuildInput build_input = {}; - build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES; -# if OPTIX_ABI_VERSION < 41 // Instance AABBs no longer need to be set since OptiX 7.2 - build_input.instanceArray.aabbs = aabbs.device_pointer; - build_input.instanceArray.numAabbs = num_instances; -# endif - build_input.instanceArray.instances = instances.device_pointer; - build_input.instanceArray.numInstances = num_instances; - - if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) { - progress.set_error("Failed to build OptiX acceleration structure"); - } - tlas_handle = bvh_optix->traversable_handle; - } - } - - void release_optix_bvh(BVH *bvh) override - { - thread_scoped_lock lock(delayed_free_bvh_mutex); - /* Do delayed free of BVH memory, since geometry holding BVH might be deleted - * while GPU is still rendering. */ - BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh); - - delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data)); - delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data)); - bvh_optix->traversable_handle = 0; - } - - void free_bvh_memory_delayed() - { - thread_scoped_lock lock(delayed_free_bvh_mutex); - delayed_free_bvh_memory.free_memory(); - } - - void const_copy_to(const char *name, void *host, size_t size) override - { - // Set constant memory for CUDA module - // TODO(pmours): This is only used for tonemapping (see 'film_convert'). - // Could be removed by moving those functions to filter CUDA module. - CUDADevice::const_copy_to(name, host, size); - - if (strcmp(name, "__data") == 0) { - assert(size <= sizeof(KernelData)); - - // Update traversable handle (since it is different for each device on multi devices) - KernelData *const data = (KernelData *)host; - *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle; - - update_launch_params(offsetof(KernelParams, data), host, size); - return; - } - - // Update data storage pointers in launch parameters -# define KERNEL_TEX(data_type, tex_name) \ - if (strcmp(name, #tex_name) == 0) { \ - update_launch_params(offsetof(KernelParams, tex_name), host, size); \ - return; \ - } -# include "kernel/kernel_textures.h" -# undef KERNEL_TEX - } - - void update_launch_params(size_t offset, void *data, size_t data_size) - { - const CUDAContextScope scope(cuContext); - - for (int i = 0; i < info.cpu_threads; ++i) - check_result_cuda( - cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset, - data, - data_size)); - } - - void task_add(DeviceTask &task) override - { - // Upload texture information to device if it has changed since last launch - load_texture_info(); - - if (task.type == DeviceTask::FILM_CONVERT) { - // Execute in main thread because of OpenGL access - film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); - return; - } - - if (task.type == DeviceTask::DENOISE_BUFFER) { - // Execute denoising in a single thread (e.g. to avoid race conditions during creation) - task_pool.push([=] { - DeviceTask task_copy = task; - thread_run(task_copy, 0); - }); - return; - } - - // Split task into smaller ones - list<DeviceTask> tasks; - task.split(tasks, info.cpu_threads); - - // Queue tasks in internal task pool - int task_index = 0; - for (DeviceTask &task : tasks) { - task_pool.push([=] { - // Using task index parameter instead of thread index, since number of CUDA streams may - // differ from number of threads - DeviceTask task_copy = task; - thread_run(task_copy, task_index); - }); - task_index++; - } - } - - void task_wait() override - { - // Wait for all queued tasks to finish - task_pool.wait_work(); - } - - void task_cancel() override - { - // Cancel any remaining tasks in the internal pool - task_pool.cancel(); - } -}; - -bool device_optix_init() -{ - if (g_optixFunctionTable.optixDeviceContextCreate != NULL) - return true; // Already initialized function table - - // Need to initialize CUDA as well - if (!device_cuda_init()) - return false; - - const OptixResult result = optixInit(); - - if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) { - VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. " - "Please update to the latest driver first!"; - return false; - } - else if (result != OPTIX_SUCCESS) { - VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result; - return false; - } - - // Loaded OptiX successfully! - return true; -} - -void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices) -{ - devices.reserve(cuda_devices.size()); - - // Simply add all supported CUDA devices as OptiX devices again - for (DeviceInfo info : cuda_devices) { - assert(info.type == DEVICE_CUDA); - - int major; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num); - if (major < 5) { - continue; // Only Maxwell and up are supported by OptiX - } - - info.type = DEVICE_OPTIX; - info.id += "_OptiX"; - info.denoisers |= DENOISER_OPTIX; - info.has_branched_path = false; - - devices.push_back(info); - } -} - -Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) -{ - return new OptiXDevice(info, stats, profiler, background); -} - -CCL_NAMESPACE_END - -#endif |