diff options
Diffstat (limited to 'intern/cycles/device')
-rw-r--r-- | intern/cycles/device/cuda/device_impl.cpp | 6 | ||||
-rw-r--r-- | intern/cycles/device/cuda/device_impl.h | 4 | ||||
-rw-r--r-- | intern/cycles/device/device.h | 5 | ||||
-rw-r--r-- | intern/cycles/device/hip/device_impl.h | 2 | ||||
-rw-r--r-- | intern/cycles/device/kernel.cpp | 24 | ||||
-rw-r--r-- | intern/cycles/device/kernel.h | 3 | ||||
-rw-r--r-- | intern/cycles/device/metal/kernel.mm | 68 | ||||
-rw-r--r-- | intern/cycles/device/multi/device.cpp | 15 | ||||
-rw-r--r-- | intern/cycles/device/optix/device.cpp | 7 | ||||
-rw-r--r-- | intern/cycles/device/optix/device_impl.cpp | 525 | ||||
-rw-r--r-- | intern/cycles/device/optix/device_impl.h | 32 | ||||
-rw-r--r-- | intern/cycles/device/optix/queue.cpp | 90 |
12 files changed, 621 insertions, 160 deletions
diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp index 01c021551f3..c9764d1c21b 100644 --- a/intern/cycles/device/cuda/device_impl.cpp +++ b/intern/cycles/device/cuda/device_impl.cpp @@ -232,7 +232,7 @@ string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features) return cflags; } -string CUDADevice::compile_kernel(const uint kernel_features, +string CUDADevice::compile_kernel(const string &common_cflags, const char *name, const char *base, bool force_ptx) @@ -281,7 +281,6 @@ string CUDADevice::compile_kernel(const uint kernel_features, /* We include cflags into md5 so changing cuda toolkit or changing other * compiler command line arguments makes sure cubin gets re-built. */ - string common_cflags = compile_kernel_get_common_cflags(kernel_features); const string kernel_md5 = util_md5_string(source_md5 + common_cflags); const char *const kernel_ext = force_ptx ? "ptx" : "cubin"; @@ -417,7 +416,8 @@ bool CUDADevice::load_kernels(const uint kernel_features) /* get kernel */ const char *kernel_name = "kernel"; - string cubin = compile_kernel(kernel_features, kernel_name); + string cflags = compile_kernel_get_common_cflags(kernel_features); + string cubin = compile_kernel(cflags, kernel_name); if (cubin.empty()) return false; diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h index a754c33f79d..c18f2811161 100644 --- a/intern/cycles/device/cuda/device_impl.h +++ b/intern/cycles/device/cuda/device_impl.h @@ -77,9 +77,9 @@ class CUDADevice : public Device { bool use_adaptive_compilation(); - virtual string compile_kernel_get_common_cflags(const uint kernel_features); + string compile_kernel_get_common_cflags(const uint kernel_features); - string compile_kernel(const uint kernel_features, + string compile_kernel(const string &cflags, const char *name, const char *base = "cuda", bool force_ptx = false); diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 2e4d18241cf..06a2f5c7b01 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -160,6 +160,11 @@ class Device { return true; } + virtual bool load_osl_kernels() + { + return true; + } + /* GPU device only functions. * These may not be used on CPU or multi-devices. */ diff --git a/intern/cycles/device/hip/device_impl.h b/intern/cycles/device/hip/device_impl.h index 9afef3789af..efdc15dca79 100644 --- a/intern/cycles/device/hip/device_impl.h +++ b/intern/cycles/device/hip/device_impl.h @@ -74,7 +74,7 @@ class HIPDevice : public Device { bool use_adaptive_compilation(); - virtual string compile_kernel_get_common_cflags(const uint kernel_features); + string compile_kernel_get_common_cflags(const uint kernel_features); string compile_kernel(const uint kernel_features, const char *name, const char *base = "hip"); diff --git a/intern/cycles/device/kernel.cpp b/intern/cycles/device/kernel.cpp index 96a99cd62cd..27ca0d81817 100644 --- a/intern/cycles/device/kernel.cpp +++ b/intern/cycles/device/kernel.cpp @@ -7,6 +7,30 @@ CCL_NAMESPACE_BEGIN +bool device_kernel_has_shading(DeviceKernel kernel) +{ + return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW || + kernel == DEVICE_KERNEL_SHADER_EVAL_DISPLACE || + kernel == DEVICE_KERNEL_SHADER_EVAL_BACKGROUND || + kernel == DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY); +} + +bool device_kernel_has_intersection(DeviceKernel kernel) +{ + return (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE); +} + const char *device_kernel_as_string(DeviceKernel kernel) { switch (kernel) { diff --git a/intern/cycles/device/kernel.h b/intern/cycles/device/kernel.h index 4ae461f1f67..b829a891260 100644 --- a/intern/cycles/device/kernel.h +++ b/intern/cycles/device/kernel.h @@ -11,6 +11,9 @@ CCL_NAMESPACE_BEGIN +bool device_kernel_has_shading(DeviceKernel kernel); +bool device_kernel_has_intersection(DeviceKernel kernel); + const char *device_kernel_as_string(DeviceKernel kernel); std::ostream &operator<<(std::ostream &os, DeviceKernel kernel); diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index 55938d1a03a..35cf832c537 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -45,6 +45,36 @@ bool kernel_has_intersection(DeviceKernel device_kernel) struct ShaderCache { ShaderCache(id<MTLDevice> _mtlDevice) : mtlDevice(_mtlDevice) { + /* Initialize occupancy tuning LUT. */ + if (MetalInfo::get_device_vendor(mtlDevice) == METAL_GPU_APPLE) { + switch (MetalInfo::get_apple_gpu_architecture(mtlDevice)) { + default: + case APPLE_M2: + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {32, 32}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {832, 32}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {64, 64}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {64, 64}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {704, 32}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {1024, 256}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {64, 32}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {256, 256}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {448, 384}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {1024, 1024}; + break; + case APPLE_M1: + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {256, 128}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {768, 32}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {512, 128}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {384, 128}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {512, 64}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {512, 256}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {384, 32}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {576, 384}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {832, 832}; + break; + } + } } ~ShaderCache(); @@ -73,6 +103,11 @@ struct ShaderCache { std::function<void(MetalKernelPipeline *)> completionHandler; }; + struct OccupancyTuningParameters { + int threads_per_threadgroup = 0; + int num_threads_per_block = 0; + } occupancy_tuning[DEVICE_KERNEL_NUM]; + std::mutex cache_mutex; PipelineCollection pipelines[DEVICE_KERNEL_NUM]; @@ -230,6 +265,13 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel, request.pipeline->device_kernel = device_kernel; request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup; + if (occupancy_tuning[device_kernel].threads_per_threadgroup) { + request.pipeline->threads_per_threadgroup = + occupancy_tuning[device_kernel].threads_per_threadgroup; + request.pipeline->num_threads_per_block = + occupancy_tuning[device_kernel].num_threads_per_block; + } + /* metalrt options */ request.pipeline->use_metalrt = device->use_metalrt; request.pipeline->metalrt_hair = device->use_metalrt && @@ -374,13 +416,6 @@ void MetalKernelPipeline::compile() const std::string function_name = std::string("cycles_metal_") + device_kernel_as_string(device_kernel); - int threads_per_threadgroup = this->threads_per_threadgroup; - if (device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL && - device_kernel < DEVICE_KERNEL_INTEGRATOR_RESET) { - /* Always use 512 for the sorting kernels */ - threads_per_threadgroup = 512; - } - NSString *entryPoint = [@(function_name.c_str()) copy]; NSError *error = NULL; @@ -583,7 +618,9 @@ void MetalKernelPipeline::compile() metalbin_path = path_cache_get(path_join("kernels", metalbin_name)); path_create_directories(metalbin_path); - if (path_exists(metalbin_path) && use_binary_archive) { + /* Retrieve shader binary from disk, and update the file timestamp for LRU purging to work as + * intended. */ + if (use_binary_archive && path_cache_kernel_exists_and_mark_used(metalbin_path)) { if (@available(macOS 11.0, *)) { MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init]; archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())]; @@ -644,12 +681,14 @@ void MetalKernelPipeline::compile() return; } - int num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup, - computePipelineState.threadExecutionWidth); - num_threads_per_block = std::max(num_threads_per_block, - (int)computePipelineState.threadExecutionWidth); + if (!num_threads_per_block) { + num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup, + computePipelineState.threadExecutionWidth); + num_threads_per_block = std::max(num_threads_per_block, + (int)computePipelineState.threadExecutionWidth); + } + this->pipeline = computePipelineState; - this->num_threads_per_block = num_threads_per_block; if (@available(macOS 11.0, *)) { if (creating_new_archive || recreate_archive) { @@ -658,6 +697,9 @@ void MetalKernelPipeline::compile() metal_printf("Failed to save binary archive, error:\n%s\n", [[error localizedDescription] UTF8String]); } + else { + path_cache_kernel_mark_added_and_clear_old(metalbin_path); + } } } }; diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp index 6904d2c2dc6..9605c6a7538 100644 --- a/intern/cycles/device/multi/device.cpp +++ b/intern/cycles/device/multi/device.cpp @@ -138,6 +138,15 @@ class MultiDevice : public Device { return true; } + bool load_osl_kernels() override + { + foreach (SubDevice &sub, devices) + if (!sub.device->load_osl_kernels()) + return false; + + return true; + } + void build_bvh(BVH *bvh, Progress &progress, bool refit) override { /* Try to build and share a single acceleration structure, if possible */ @@ -204,10 +213,12 @@ class MultiDevice : public Device { virtual void *get_cpu_osl_memory() override { - if (devices.size() > 1) { + /* Always return the OSL memory of the CPU device (this works since the constructor above + * guarantees that CPU devices are always added to the back). */ + if (devices.size() > 1 && devices.back().device->info.type != DEVICE_CPU) { return NULL; } - return devices.front().device->get_cpu_osl_memory(); + return devices.back().device->get_cpu_osl_memory(); } bool is_resident(device_ptr key, Device *sub_device) override diff --git a/intern/cycles/device/optix/device.cpp b/intern/cycles/device/optix/device.cpp index 68ca21374fd..58b72374a7d 100644 --- a/intern/cycles/device/optix/device.cpp +++ b/intern/cycles/device/optix/device.cpp @@ -9,6 +9,10 @@ #include "util/log.h" +#ifdef WITH_OSL +# include <OSL/oslversion.h> +#endif + #ifdef WITH_OPTIX # include <optix_function_table_definition.h> #endif @@ -65,6 +69,9 @@ void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo info.type = DEVICE_OPTIX; info.id += "_OptiX"; +# if defined(WITH_OSL) && (OSL_VERSION_MINOR >= 13 || OSL_VERSION_MAJOR > 1) + info.has_osl = true; +# endif info.denoisers |= DENOISER_OPTIX; devices.push_back(info); diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp index 6c64e7106d5..02f34bf3bd0 100644 --- a/intern/cycles/device/optix/device_impl.cpp +++ b/intern/cycles/device/optix/device_impl.cpp @@ -312,16 +312,34 @@ OptiXDevice::~OptiXDevice() if (optix_module != NULL) { optixModuleDestroy(optix_module); } - for (unsigned int i = 0; i < 2; ++i) { + for (int i = 0; i < 2; ++i) { if (builtin_modules[i] != NULL) { optixModuleDestroy(builtin_modules[i]); } } - for (unsigned int i = 0; i < NUM_PIPELINES; ++i) { + for (int i = 0; i < NUM_PIPELINES; ++i) { if (pipelines[i] != NULL) { optixPipelineDestroy(pipelines[i]); } } + for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { + if (groups[i] != NULL) { + optixProgramGroupDestroy(groups[i]); + } + } + +# ifdef WITH_OSL + for (const OptixModule &module : osl_modules) { + if (module != NULL) { + optixModuleDestroy(module); + } + } + for (const OptixProgramGroup &group : osl_groups) { + if (group != NULL) { + optixProgramGroupDestroy(group); + } + } +# endif /* Make sure denoiser is destroyed before device context! */ if (denoiser_.optix_denoiser != nullptr) { @@ -381,13 +399,51 @@ bool OptiXDevice::load_kernels(const uint kernel_features) return false; } +# ifdef WITH_OSL + const bool use_osl = (kernel_features & KERNEL_FEATURE_OSL); +# else + const bool use_osl = false; +# endif + + /* Skip creating OptiX module if only doing denoising. */ + const bool need_optix_kernels = (kernel_features & + (KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING)); + + /* Detect existence of OptiX kernel and SDK here early. So we can error out + * before compiling the CUDA kernels, to avoid failing right after when + * compiling the OptiX kernel. */ + string suffix = use_osl ? "_osl" : + (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ? + "_shader_raytrace" : + ""; + string ptx_filename; + if (need_optix_kernels) { + ptx_filename = path_get("lib/kernel_optix" + suffix + ".ptx"); + if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) { + std::string optix_include_dir = get_optix_include_dir(); + if (optix_include_dir.empty()) { + set_error( + "Unable to compile OptiX kernels at runtime. Set OPTIX_ROOT_DIR environment variable " + "to a directory containing the OptiX SDK."); + return false; + } + else if (!path_is_directory(optix_include_dir)) { + set_error(string_printf( + "OptiX headers not found at %s, unable to compile OptiX kernels at runtime. Install " + "OptiX SDK in the specified location, or set OPTIX_ROOT_DIR environment variable to a " + "directory containing the OptiX SDK.", + optix_include_dir.c_str())); + return false; + } + } + } + /* Load CUDA modules because we need some of the utility kernels. */ if (!CUDADevice::load_kernels(kernel_features)) { return false; } - /* Skip creating OptiX module if only doing denoising. */ - if (!(kernel_features & (KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING))) { + if (!need_optix_kernels) { return true; } @@ -398,18 +454,41 @@ bool OptiXDevice::load_kernels(const uint kernel_features) optixModuleDestroy(optix_module); optix_module = NULL; } - for (unsigned int i = 0; i < 2; ++i) { + for (int i = 0; i < 2; ++i) { if (builtin_modules[i] != NULL) { optixModuleDestroy(builtin_modules[i]); builtin_modules[i] = NULL; } } - for (unsigned int i = 0; i < NUM_PIPELINES; ++i) { + for (int i = 0; i < NUM_PIPELINES; ++i) { if (pipelines[i] != NULL) { optixPipelineDestroy(pipelines[i]); pipelines[i] = NULL; } } + for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { + if (groups[i] != NULL) { + optixProgramGroupDestroy(groups[i]); + groups[i] = NULL; + } + } + +# ifdef WITH_OSL + /* Recreating base OptiX module invalidates all OSL modules too, since they link against it. */ + for (const OptixModule &module : osl_modules) { + if (module != NULL) { + optixModuleDestroy(module); + } + } + osl_modules.clear(); + + for (const OptixProgramGroup &group : osl_groups) { + if (group != NULL) { + optixProgramGroupDestroy(group); + } + } + osl_groups.clear(); +# endif OptixModuleCompileOptions module_options = {}; module_options.maxRegisterCount = 0; /* Do not set an explicit register limit. */ @@ -430,7 +509,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features) module_options.numPayloadTypes = 0; # endif - OptixPipelineCompileOptions pipeline_options = {}; /* Default to no motion blur and two-level graph, since it is the fastest option. */ pipeline_options.usesMotionBlur = false; pipeline_options.traversableGraphFlags = @@ -459,9 +537,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features) /* Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds * This is necessary since objects may be reported to have motion if the Vector pass is * active, but may still need to be rendered without motion blur if that isn't active as well. */ - motion_blur = (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) != 0; - - if (motion_blur) { + if (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) { pipeline_options.usesMotionBlur = true; /* Motion blur can insert motion transforms into the traversal graph. * It is no longer a two-level graph then, so need to set flags to allow any configuration. */ @@ -469,33 +545,10 @@ bool OptiXDevice::load_kernels(const uint kernel_features) } { /* Load and compile PTX module with OptiX kernels. */ - string ptx_data, ptx_filename = path_get( - (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ? - "lib/kernel_optix_shader_raytrace.ptx" : - "lib/kernel_optix.ptx"); + string ptx_data; if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) { - std::string optix_include_dir = get_optix_include_dir(); - if (optix_include_dir.empty()) { - set_error( - "Unable to compile OptiX kernels at runtime. Set OPTIX_ROOT_DIR environment variable " - "to a directory containing the OptiX SDK."); - return false; - } - else if (!path_is_directory(optix_include_dir)) { - set_error(string_printf( - "OptiX headers not found at %s, unable to compile OptiX kernels at runtime. Install " - "OptiX SDK in the specified location, or set OPTIX_ROOT_DIR environment variable to a " - "directory containing the OptiX SDK.", - optix_include_dir.c_str())); - return false; - } - ptx_filename = compile_kernel( - kernel_features, - (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ? - "kernel_shader_raytrace" : - "kernel", - "optix", - true); + string cflags = compile_kernel_get_common_cflags(kernel_features); + ptx_filename = compile_kernel(cflags, ("kernel" + suffix).c_str(), "optix", true); } if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) { set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str())); @@ -537,7 +590,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features) } /* Create program groups. */ - OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {}; OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {}; OptixProgramGroupOptions group_options = {}; /* There are no options currently. */ group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; @@ -595,7 +647,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features) group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0]; group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr; - if (motion_blur) { + if (pipeline_options.usesMotionBlur) { builtin_options.usesMotionBlur = true; optix_assert(optixBuiltinISModuleGet( @@ -616,7 +668,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features) } } - /* Pointclouds */ if (kernel_features & KERNEL_FEATURE_POINTCLOUD) { group_descs[PG_HITD_POINTCLOUD] = group_descs[PG_HITD]; group_descs[PG_HITD_POINTCLOUD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; @@ -628,8 +679,8 @@ bool OptiXDevice::load_kernels(const uint kernel_features) group_descs[PG_HITS_POINTCLOUD].hitgroup.entryFunctionNameIS = "__intersection__point"; } + /* Add hit group for local intersections. */ if (kernel_features & (KERNEL_FEATURE_SUBSURFACE | KERNEL_FEATURE_NODE_RAYTRACE)) { - /* Add hit group for local intersections. */ group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; group_descs[PG_HITL].hitgroup.moduleAH = optix_module; group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit"; @@ -641,16 +692,19 @@ bool OptiXDevice::load_kernels(const uint kernel_features) group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.module = optix_module; group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.entryFunctionName = "__raygen__kernel_optix_integrator_shade_surface_raytrace"; - group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; - group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module; - group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao"; - group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; - group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module; - group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC = - "__direct_callable__svm_node_bevel"; + + /* Kernels with OSL support are built without SVM, so can skip those direct callables there. */ + if (!use_osl) { + group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module; + group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao"; + group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module; + group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC = + "__direct_callable__svm_node_bevel"; + } } - /* MNEE. */ if (kernel_features & KERNEL_FEATURE_MNEE) { group_descs[PG_RGEN_SHADE_SURFACE_MNEE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; group_descs[PG_RGEN_SHADE_SURFACE_MNEE].raygen.module = optix_module; @@ -658,6 +712,42 @@ bool OptiXDevice::load_kernels(const uint kernel_features) "__raygen__kernel_optix_integrator_shade_surface_mnee"; } + /* OSL uses direct callables to execute, so shading needs to be done in OptiX if OSL is used. */ + if (use_osl) { + group_descs[PG_RGEN_SHADE_BACKGROUND].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_SHADE_BACKGROUND].raygen.module = optix_module; + group_descs[PG_RGEN_SHADE_BACKGROUND].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_shade_background"; + group_descs[PG_RGEN_SHADE_LIGHT].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_SHADE_LIGHT].raygen.module = optix_module; + group_descs[PG_RGEN_SHADE_LIGHT].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_shade_light"; + group_descs[PG_RGEN_SHADE_SURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_SHADE_SURFACE].raygen.module = optix_module; + group_descs[PG_RGEN_SHADE_SURFACE].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_shade_surface"; + group_descs[PG_RGEN_SHADE_VOLUME].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_SHADE_VOLUME].raygen.module = optix_module; + group_descs[PG_RGEN_SHADE_VOLUME].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_shade_volume"; + group_descs[PG_RGEN_SHADE_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_SHADE_SHADOW].raygen.module = optix_module; + group_descs[PG_RGEN_SHADE_SHADOW].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_shade_shadow"; + group_descs[PG_RGEN_EVAL_DISPLACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_EVAL_DISPLACE].raygen.module = optix_module; + group_descs[PG_RGEN_EVAL_DISPLACE].raygen.entryFunctionName = + "__raygen__kernel_optix_shader_eval_displace"; + group_descs[PG_RGEN_EVAL_BACKGROUND].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_EVAL_BACKGROUND].raygen.module = optix_module; + group_descs[PG_RGEN_EVAL_BACKGROUND].raygen.entryFunctionName = + "__raygen__kernel_optix_shader_eval_background"; + group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].raygen.module = optix_module; + group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].raygen.entryFunctionName = + "__raygen__kernel_optix_shader_eval_curve_shadow_transparency"; + } + optix_assert(optixProgramGroupCreate( context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups)); @@ -666,7 +756,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features) /* Set up SBT, which in this case is used only to select between different programs. */ sbt_data.alloc(NUM_PROGRAM_GROUPS); memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS); - for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { + for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i])); optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i])); } @@ -690,25 +780,26 @@ bool OptiXDevice::load_kernels(const uint kernel_features) OptixPipelineLinkOptions link_options = {}; link_options.maxTraceDepth = 1; + link_options.debugLevel = module_options.debugLevel; - if (DebugFlags().optix.use_debug) { - link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL; - } - else { - link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE; - } - - if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { - /* Create shader raytracing pipeline. */ + if (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE) && !use_osl) { + /* Create shader raytracing and MNEE pipeline. */ vector<OptixProgramGroup> pipeline_groups; pipeline_groups.reserve(NUM_PROGRAM_GROUPS); - pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]); + if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { + pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]); + pipeline_groups.push_back(groups[PG_CALL_SVM_AO]); + pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]); + } + if (kernel_features & KERNEL_FEATURE_MNEE) { + pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]); + } pipeline_groups.push_back(groups[PG_MISS]); pipeline_groups.push_back(groups[PG_HITD]); pipeline_groups.push_back(groups[PG_HITS]); pipeline_groups.push_back(groups[PG_HITL]); pipeline_groups.push_back(groups[PG_HITV]); - if (motion_blur) { + if (pipeline_options.usesMotionBlur) { pipeline_groups.push_back(groups[PG_HITD_MOTION]); pipeline_groups.push_back(groups[PG_HITS_MOTION]); } @@ -716,8 +807,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features) pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]); pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]); } - pipeline_groups.push_back(groups[PG_CALL_SVM_AO]); - pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]); optix_assert(optixPipelineCreate(context, &pipeline_options, @@ -726,30 +815,33 @@ bool OptiXDevice::load_kernels(const uint kernel_features) pipeline_groups.size(), nullptr, 0, - &pipelines[PIP_SHADE_RAYTRACE])); + &pipelines[PIP_SHADE])); /* Combine ray generation and trace continuation stack size. */ - const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG + + const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG, + stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG) + link_options.maxTraceDepth * trace_css; const unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC, stack_size[PG_CALL_SVM_BEVEL].dssDC); /* Set stack size depending on pipeline options. */ optix_assert(optixPipelineSetStackSize( - pipelines[PIP_SHADE_RAYTRACE], 0, dss, css, motion_blur ? 3 : 2)); + pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2)); } - if (kernel_features & KERNEL_FEATURE_MNEE) { - /* Create MNEE pipeline. */ + { /* Create intersection-only pipeline. */ vector<OptixProgramGroup> pipeline_groups; pipeline_groups.reserve(NUM_PROGRAM_GROUPS); - pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]); pipeline_groups.push_back(groups[PG_MISS]); pipeline_groups.push_back(groups[PG_HITD]); pipeline_groups.push_back(groups[PG_HITS]); pipeline_groups.push_back(groups[PG_HITL]); pipeline_groups.push_back(groups[PG_HITV]); - if (motion_blur) { + if (pipeline_options.usesMotionBlur) { pipeline_groups.push_back(groups[PG_HITD_MOTION]); pipeline_groups.push_back(groups[PG_HITS_MOTION]); } @@ -757,8 +849,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features) pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]); pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]); } - pipeline_groups.push_back(groups[PG_CALL_SVM_AO]); - pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]); optix_assert(optixPipelineCreate(context, &pipeline_options, @@ -767,37 +857,234 @@ bool OptiXDevice::load_kernels(const uint kernel_features) pipeline_groups.size(), nullptr, 0, - &pipelines[PIP_SHADE_MNEE])); + &pipelines[PIP_INTERSECT])); - /* Combine ray generation and trace continuation stack size. */ - const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG + - link_options.maxTraceDepth * trace_css; - const unsigned int dss = 0; + /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */ + const unsigned int css = + std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG, + std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG, + std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG, + stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) + + link_options.maxTraceDepth * trace_css; - /* Set stack size depending on pipeline options. */ - optix_assert( - optixPipelineSetStackSize(pipelines[PIP_SHADE_MNEE], 0, dss, css, motion_blur ? 3 : 2)); + optix_assert(optixPipelineSetStackSize( + pipelines[PIP_INTERSECT], 0, 0, css, pipeline_options.usesMotionBlur ? 3 : 2)); } - { /* Create intersection-only pipeline. */ + return !have_error(); +} + +bool OptiXDevice::load_osl_kernels() +{ +# ifdef WITH_OSL + if (have_error()) { + return false; + } + + struct OSLKernel { + string ptx; + string init_entry; + string exec_entry; + }; + + /* This has to be in the same order as the ShaderType enum, so that the index calculation in + * osl_eval_nodes checks out */ + vector<OSLKernel> osl_kernels; + + for (ShaderType type = SHADER_TYPE_SURFACE; type <= SHADER_TYPE_BUMP; + type = static_cast<ShaderType>(type + 1)) { + const vector<OSL::ShaderGroupRef> &groups = (type == SHADER_TYPE_SURFACE ? + osl_globals.surface_state : + type == SHADER_TYPE_VOLUME ? + osl_globals.volume_state : + type == SHADER_TYPE_DISPLACEMENT ? + osl_globals.displacement_state : + osl_globals.bump_state); + for (const OSL::ShaderGroupRef &group : groups) { + if (group) { + string osl_ptx, init_name, entry_name; + osl_globals.ss->getattribute(group.get(), "group_init_name", init_name); + osl_globals.ss->getattribute(group.get(), "group_entry_name", entry_name); + osl_globals.ss->getattribute( + group.get(), "ptx_compiled_version", OSL::TypeDesc::PTR, &osl_ptx); + + int groupdata_size = 0; + osl_globals.ss->getattribute(group.get(), "groupdata_size", groupdata_size); + if (groupdata_size > 2048) { /* See 'group_data' array in kernel/osl/osl.h */ + set_error( + string_printf("Requested OSL group data size (%d) is greater than the maximum " + "supported with OptiX (2048)", + groupdata_size)); + return false; + } + + osl_kernels.push_back({std::move(osl_ptx), std::move(init_name), std::move(entry_name)}); + } + else { + /* Add empty entry for non-existent shader groups, so that the index stays stable. */ + osl_kernels.emplace_back(); + } + } + } + + const CUDAContextScope scope(this); + + if (pipelines[PIP_SHADE]) { + optixPipelineDestroy(pipelines[PIP_SHADE]); + } + + for (OptixModule &module : osl_modules) { + if (module != NULL) { + optixModuleDestroy(module); + module = NULL; + } + } + for (OptixProgramGroup &group : osl_groups) { + if (group != NULL) { + optixProgramGroupDestroy(group); + group = NULL; + } + } + + OptixProgramGroupOptions group_options = {}; /* There are no options currently. */ + OptixModuleCompileOptions module_options = {}; + module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3; + module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE; + + osl_groups.resize(osl_kernels.size() * 2 + 1); + osl_modules.resize(osl_kernels.size() + 1); + + { /* Load and compile PTX module with OSL services. */ + string ptx_data, ptx_filename = path_get("lib/kernel_optix_osl_services.ptx"); + if (!path_read_text(ptx_filename, ptx_data)) { + set_error(string_printf("Failed to load OptiX OSL services kernel from '%s'", + ptx_filename.c_str())); + return false; + } + + const OptixResult result = optixModuleCreateFromPTX(context, + &module_options, + &pipeline_options, + ptx_data.data(), + ptx_data.size(), + nullptr, + 0, + &osl_modules.back()); + if (result != OPTIX_SUCCESS) { + set_error(string_printf("Failed to load OptiX OSL services kernel from '%s' (%s)", + ptx_filename.c_str(), + optixGetErrorName(result))); + return false; + } + + OptixProgramGroupDesc group_desc = {}; + group_desc.kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_desc.callables.entryFunctionNameDC = "__direct_callable__dummy_services"; + group_desc.callables.moduleDC = osl_modules.back(); + + optix_assert(optixProgramGroupCreate( + context, &group_desc, 1, &group_options, nullptr, 0, &osl_groups.back())); + } + + TaskPool pool; + vector<OptixResult> results(osl_kernels.size(), OPTIX_SUCCESS); + + for (size_t i = 0; i < osl_kernels.size(); ++i) { + if (osl_kernels[i].ptx.empty()) { + continue; + } + +# if OPTIX_ABI_VERSION >= 55 + OptixTask task = nullptr; + results[i] = optixModuleCreateFromPTXWithTasks(context, + &module_options, + &pipeline_options, + osl_kernels[i].ptx.data(), + osl_kernels[i].ptx.size(), + nullptr, + nullptr, + &osl_modules[i], + &task); + if (results[i] == OPTIX_SUCCESS) { + execute_optix_task(pool, task, results[i]); + } +# else + pool.push([this, &results, i, &module_options, &osl_kernels]() { + results[i] = optixModuleCreateFromPTX(context, + &module_options, + &pipeline_options, + osl_kernels[i].ptx.data(), + osl_kernels[i].ptx.size(), + nullptr, + 0, + &osl_modules[i]); + }); +# endif + } + + pool.wait_work(); + + for (size_t i = 0; i < osl_kernels.size(); ++i) { + if (osl_kernels[i].ptx.empty()) { + continue; + } + + if (results[i] != OPTIX_SUCCESS) { + set_error(string_printf("Failed to load OptiX OSL kernel for %s (%s)", + osl_kernels[i].init_entry.c_str(), + optixGetErrorName(results[i]))); + return false; + } + + OptixProgramGroupDesc group_descs[2] = {}; + group_descs[0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[0].callables.entryFunctionNameDC = osl_kernels[i].init_entry.c_str(); + group_descs[0].callables.moduleDC = osl_modules[i]; + group_descs[1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[1].callables.entryFunctionNameDC = osl_kernels[i].exec_entry.c_str(); + group_descs[1].callables.moduleDC = osl_modules[i]; + + optix_assert(optixProgramGroupCreate( + context, group_descs, 2, &group_options, nullptr, 0, &osl_groups[i * 2])); + } + + vector<OptixStackSizes> osl_stack_size(osl_groups.size()); + + /* Update SBT with new entries. */ + sbt_data.alloc(NUM_PROGRAM_GROUPS + osl_groups.size()); + for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { + optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i])); + } + for (size_t i = 0; i < osl_groups.size(); ++i) { + if (osl_groups[i] != NULL) { + optix_assert(optixSbtRecordPackHeader(osl_groups[i], &sbt_data[NUM_PROGRAM_GROUPS + i])); + optix_assert(optixProgramGroupGetStackSize(osl_groups[i], &osl_stack_size[i])); + } + } + sbt_data.copy_to_device(); /* Upload updated SBT to device. */ + + OptixPipelineLinkOptions link_options = {}; + link_options.maxTraceDepth = 0; + link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE; + + { vector<OptixProgramGroup> pipeline_groups; pipeline_groups.reserve(NUM_PROGRAM_GROUPS); - pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]); - pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]); - pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]); - pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]); - pipeline_groups.push_back(groups[PG_MISS]); - pipeline_groups.push_back(groups[PG_HITD]); - pipeline_groups.push_back(groups[PG_HITS]); - pipeline_groups.push_back(groups[PG_HITL]); - pipeline_groups.push_back(groups[PG_HITV]); - if (motion_blur) { - pipeline_groups.push_back(groups[PG_HITD_MOTION]); - pipeline_groups.push_back(groups[PG_HITS_MOTION]); - } - if (kernel_features & KERNEL_FEATURE_POINTCLOUD) { - pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]); - pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]); + pipeline_groups.push_back(groups[PG_RGEN_SHADE_BACKGROUND]); + pipeline_groups.push_back(groups[PG_RGEN_SHADE_LIGHT]); + pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE]); + pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]); + pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]); + pipeline_groups.push_back(groups[PG_RGEN_SHADE_VOLUME]); + pipeline_groups.push_back(groups[PG_RGEN_SHADE_SHADOW]); + pipeline_groups.push_back(groups[PG_RGEN_EVAL_DISPLACE]); + pipeline_groups.push_back(groups[PG_RGEN_EVAL_BACKGROUND]); + pipeline_groups.push_back(groups[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY]); + + for (const OptixProgramGroup &group : osl_groups) { + if (group != NULL) { + pipeline_groups.push_back(group); + } } optix_assert(optixPipelineCreate(context, @@ -807,26 +1094,30 @@ bool OptiXDevice::load_kernels(const uint kernel_features) pipeline_groups.size(), nullptr, 0, - &pipelines[PIP_INTERSECT])); + &pipelines[PIP_SHADE])); - /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */ - const unsigned int css = - std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG, - std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG, - std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG, - stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) + - link_options.maxTraceDepth * trace_css; + unsigned int dss = 0; + for (unsigned int i = 0; i < osl_stack_size.size(); ++i) { + dss = std::max(dss, osl_stack_size[i].dssDC); + } - optix_assert( - optixPipelineSetStackSize(pipelines[PIP_INTERSECT], 0, 0, css, motion_blur ? 3 : 2)); + optix_assert(optixPipelineSetStackSize( + pipelines[PIP_SHADE], 0, dss, 0, pipeline_options.usesMotionBlur ? 3 : 2)); } - /* Clean up program group objects. */ - for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { - optixProgramGroupDestroy(groups[i]); - } + return !have_error(); +# else + return false; +# endif +} - return true; +void *OptiXDevice::get_cpu_osl_memory() +{ +# ifdef WITH_OSL + return &osl_globals; +# else + return NULL; +# endif } /* -------------------------------------------------------------------- @@ -1553,7 +1844,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) size_t num_motion_steps = 1; Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - if (motion_blur && hair->get_use_motion_blur() && motion_keys) { + if (pipeline_options.usesMotionBlur && hair->get_use_motion_blur() && motion_keys) { num_motion_steps = hair->get_motion_steps(); } @@ -1707,7 +1998,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) size_t num_motion_steps = 1; Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - if (motion_blur && mesh->get_use_motion_blur() && motion_keys) { + if (pipeline_options.usesMotionBlur && mesh->get_use_motion_blur() && motion_keys) { num_motion_steps = mesh->get_motion_steps(); } @@ -1774,7 +2065,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) size_t num_motion_steps = 1; Attribute *motion_points = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - if (motion_blur && pointcloud->get_use_motion_blur() && motion_points) { + if (pipeline_options.usesMotionBlur && pointcloud->get_use_motion_blur() && motion_points) { num_motion_steps = pointcloud->get_motion_steps(); } @@ -1871,7 +2162,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) /* Calculate total motion transform size and allocate memory for them. */ size_t motion_transform_offset = 0; - if (motion_blur) { + if (pipeline_options.usesMotionBlur) { size_t total_motion_transform_size = 0; for (Object *const ob : bvh->objects) { if (ob->is_traceable() && ob->use_motion()) { @@ -1922,7 +2213,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) if (ob->get_geometry()->geometry_type == Geometry::HAIR && static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) { - if (motion_blur && ob->get_geometry()->has_motion_blur()) { + if (pipeline_options.usesMotionBlur && ob->get_geometry()->has_motion_blur()) { /* Select between motion blur and non-motion blur built-in intersection module. */ instance.sbtOffset = PG_HITD_MOTION - PG_HITD; } @@ -1950,7 +2241,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) } /* Insert motion traversable if object has motion. */ - if (motion_blur && ob->use_motion()) { + if (pipeline_options.usesMotionBlur && ob->use_motion()) { size_t motion_keys = max(ob->get_motion().size(), (size_t)2) - 2; size_t motion_transform_size = sizeof(OptixSRTMotionTransform) + motion_keys * sizeof(OptixSRTData); diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h index 817afdc8384..ad0e7b93454 100644 --- a/intern/cycles/device/optix/device_impl.h +++ b/intern/cycles/device/optix/device_impl.h @@ -9,6 +9,7 @@ # include "device/cuda/device_impl.h" # include "device/optix/queue.h" # include "device/optix/util.h" +# include "kernel/osl/globals.h" # include "kernel/types.h" # include "util/unique_ptr.h" @@ -23,8 +24,16 @@ enum { PG_RGEN_INTERSECT_SHADOW, PG_RGEN_INTERSECT_SUBSURFACE, PG_RGEN_INTERSECT_VOLUME_STACK, + PG_RGEN_SHADE_BACKGROUND, + PG_RGEN_SHADE_LIGHT, + PG_RGEN_SHADE_SURFACE, PG_RGEN_SHADE_SURFACE_RAYTRACE, PG_RGEN_SHADE_SURFACE_MNEE, + PG_RGEN_SHADE_VOLUME, + PG_RGEN_SHADE_SHADOW, + PG_RGEN_EVAL_DISPLACE, + PG_RGEN_EVAL_BACKGROUND, + PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY, PG_MISS, PG_HITD, /* Default hit group. */ PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */ @@ -40,14 +49,14 @@ enum { }; static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS; -static const int NUM_MIS_PROGRAM_GROUPS = 1; +static const int NUM_MISS_PROGRAM_GROUPS = 1; static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD; static const int NUM_HIT_PROGRAM_GROUPS = 8; static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO; static const int NUM_CALLABLE_PROGRAM_GROUPS = 2; /* List of OptiX pipelines. */ -enum { PIP_SHADE_RAYTRACE, PIP_SHADE_MNEE, PIP_INTERSECT, NUM_PIPELINES }; +enum { PIP_SHADE, PIP_INTERSECT, NUM_PIPELINES }; /* A single shader binding table entry. */ struct SbtRecord { @@ -61,12 +70,20 @@ class OptiXDevice : public CUDADevice { OptixModule optix_module = NULL; /* All necessary OptiX kernels are in one module. */ OptixModule builtin_modules[2] = {}; OptixPipeline pipelines[NUM_PIPELINES] = {}; + OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {}; + OptixPipelineCompileOptions pipeline_options = {}; - bool motion_blur = false; device_vector<SbtRecord> sbt_data; device_only_memory<KernelParamsOptiX> launch_params; - OptixTraversableHandle tlas_handle = 0; +# ifdef WITH_OSL + OSLGlobals osl_globals; + vector<OptixModule> osl_modules; + vector<OptixProgramGroup> osl_groups; +# endif + + private: + OptixTraversableHandle tlas_handle = 0; vector<unique_ptr<device_only_memory<char>>> delayed_free_bvh_memory; thread_mutex delayed_free_bvh_mutex; @@ -100,13 +117,14 @@ class OptiXDevice : public CUDADevice { OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler); ~OptiXDevice(); - private: BVHLayoutMask get_bvh_layout_mask() const override; - string compile_kernel_get_common_cflags(const uint kernel_features) override; + string compile_kernel_get_common_cflags(const uint kernel_features); bool load_kernels(const uint kernel_features) override; + bool load_osl_kernels() override; + bool build_optix_bvh(BVHOptiX *bvh, OptixBuildOperation operation, const OptixBuildInput &build_input, @@ -123,6 +141,8 @@ class OptiXDevice : public CUDADevice { virtual unique_ptr<DeviceQueue> gpu_queue_create() override; + void *get_cpu_osl_memory() override; + /* -------------------------------------------------------------------- * Denoising. */ diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp index 3bc547ed11d..1bfd154d449 100644 --- a/intern/cycles/device/optix/queue.cpp +++ b/intern/cycles/device/optix/queue.cpp @@ -24,21 +24,33 @@ void OptiXDeviceQueue::init_execution() CUDADeviceQueue::init_execution(); } -static bool is_optix_specific_kernel(DeviceKernel kernel) +static bool is_optix_specific_kernel(DeviceKernel kernel, bool use_osl) { - return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || - kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE || - kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || - kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || - kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE || - kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK); +# ifdef WITH_OSL + /* OSL uses direct callables to execute, so shading needs to be done in OptiX if OSL is used. */ + if (use_osl && device_kernel_has_shading(kernel)) { + return true; + } +# else + (void)use_osl; +# endif + + return device_kernel_has_intersection(kernel); } bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, DeviceKernelArguments const &args) { - if (!is_optix_specific_kernel(kernel)) { + OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_); + +# ifdef WITH_OSL + const bool use_osl = static_cast<OSLGlobals *>(optix_device->get_cpu_osl_memory())->use; +# else + const bool use_osl = false; +# endif + + if (!is_optix_specific_kernel(kernel, use_osl)) { return CUDADeviceQueue::enqueue(kernel, work_size, args); } @@ -50,8 +62,6 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const CUDAContextScope scope(cuda_device_); - OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_); - const device_ptr sbt_data_ptr = optix_device->sbt_data.device_pointer; const device_ptr launch_params_ptr = optix_device->launch_params.device_pointer; @@ -62,9 +72,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, sizeof(device_ptr), cuda_stream_)); - if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || - kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || - kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) { + if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || device_kernel_has_shading(kernel)) { cuda_device_assert( cuda_device_, cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer), @@ -72,6 +80,15 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, sizeof(device_ptr), cuda_stream_)); } + if (kernel == DEVICE_KERNEL_SHADER_EVAL_DISPLACE || + kernel == DEVICE_KERNEL_SHADER_EVAL_BACKGROUND || + kernel == DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY) { + cuda_device_assert(cuda_device_, + cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, offset), + args.values[2], // &d_offset + sizeof(int32_t), + cuda_stream_)); + } cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_)); @@ -79,14 +96,35 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, OptixShaderBindingTable sbt_params = {}; switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: + pipeline = optix_device->pipelines[PIP_SHADE]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_BACKGROUND * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: + pipeline = optix_device->pipelines[PIP_SHADE]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_LIGHT * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: + pipeline = optix_device->pipelines[PIP_SHADE]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE * sizeof(SbtRecord); + break; case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: - pipeline = optix_device->pipelines[PIP_SHADE_RAYTRACE]; + pipeline = optix_device->pipelines[PIP_SHADE]; sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_RAYTRACE * sizeof(SbtRecord); break; case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE: - pipeline = optix_device->pipelines[PIP_SHADE_MNEE]; + pipeline = optix_device->pipelines[PIP_SHADE]; sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_MNEE * sizeof(SbtRecord); break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: + pipeline = optix_device->pipelines[PIP_SHADE]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_VOLUME * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: + pipeline = optix_device->pipelines[PIP_SHADE]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SHADOW * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: pipeline = optix_device->pipelines[PIP_INTERSECT]; sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord); @@ -104,6 +142,20 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_VOLUME_STACK * sizeof(SbtRecord); break; + case DEVICE_KERNEL_SHADER_EVAL_DISPLACE: + pipeline = optix_device->pipelines[PIP_SHADE]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_EVAL_DISPLACE * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND: + pipeline = optix_device->pipelines[PIP_SHADE]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_EVAL_BACKGROUND * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY: + pipeline = optix_device->pipelines[PIP_SHADE]; + sbt_params.raygenRecord = sbt_data_ptr + + PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY * sizeof(SbtRecord); + break; + default: LOG(ERROR) << "Invalid kernel " << device_kernel_as_string(kernel) << " is attempted to be enqueued."; @@ -112,7 +164,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, sbt_params.missRecordBase = sbt_data_ptr + MISS_PROGRAM_GROUP_OFFSET * sizeof(SbtRecord); sbt_params.missRecordStrideInBytes = sizeof(SbtRecord); - sbt_params.missRecordCount = NUM_MIS_PROGRAM_GROUPS; + sbt_params.missRecordCount = NUM_MISS_PROGRAM_GROUPS; sbt_params.hitgroupRecordBase = sbt_data_ptr + HIT_PROGAM_GROUP_OFFSET * sizeof(SbtRecord); sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); sbt_params.hitgroupRecordCount = NUM_HIT_PROGRAM_GROUPS; @@ -120,6 +172,12 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, sbt_params.callablesRecordCount = NUM_CALLABLE_PROGRAM_GROUPS; sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord); +# ifdef WITH_OSL + if (use_osl) { + sbt_params.callablesRecordCount += static_cast<unsigned int>(optix_device->osl_groups.size()); + } +# endif + /* Launch the ray generation program. */ optix_device_assert(optix_device, optixLaunch(pipeline, |