6 files changed, 168 insertions, 74 deletions
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index 95234845f98..682540a51fd 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -141,7 +141,8 @@ class OptiXDevice : public CUDADevice {
     PG_BAKE,  // kernel_bake_evaluate
     PG_DISP,  // kernel_displace_evaluate
     PG_BACK,  // kernel_background_evaluate
-    NUM_PROGRAM_GROUPS
+    PG_CALL,
+    NUM_PROGRAM_GROUPS = PG_CALL + 3
   };
 
   // List of OptiX pipelines
@@ -334,11 +335,6 @@ class OptiXDevice : public CUDADevice {
       set_error("OptiX backend does not support baking yet");
       return false;
     }
-    // Disable shader raytracing support for now, since continuation callables are slow
-    if (requested_features.use_shader_raytrace) {
-      set_error("OptiX backend does not support 'Ambient Occlusion' and 'Bevel' shader nodes yet");
-      return false;
-    }
 
     const CUDAContextScope scope(cuContext);
 
@@ -410,7 +406,9 @@ class OptiXDevice : public CUDADevice {
     }
 
     {  // Load and compile PTX module with OptiX kernels
-      string ptx_data, ptx_filename = path_get("lib/kernel_optix.ptx");
+      string ptx_data, ptx_filename = path_get(requested_features.use_shader_raytrace ?
+                                                   "lib/kernel_optix_shader_raytrace.ptx" :
+                                                   "lib/kernel_optix.ptx");
       if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
         if (!getenv("OPTIX_ROOT_DIR")) {
           set_error(
@@ -525,6 +523,21 @@ class OptiXDevice : public CUDADevice {
       group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background";
     }
 
+    // Shader raytracing replaces some functions with direct callables
+    if (requested_features.use_shader_raytrace) {
+      group_descs[PG_CALL + 0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+      group_descs[PG_CALL + 0].callables.moduleDC = optix_module;
+      group_descs[PG_CALL + 0].callables.entryFunctionNameDC = "__direct_callable__svm_eval_nodes";
+      group_descs[PG_CALL + 1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+      group_descs[PG_CALL + 1].callables.moduleDC = optix_module;
+      group_descs[PG_CALL + 1].callables.entryFunctionNameDC =
+          "__direct_callable__kernel_volume_shadow";
+      group_descs[PG_CALL + 2].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+      group_descs[PG_CALL + 2].callables.moduleDC = optix_module;
+      group_descs[PG_CALL + 2].callables.entryFunctionNameDC =
+          "__direct_callable__subsurface_scatter_multi_setup";
+    }
+
     check_result_optix_ret(optixProgramGroupCreate(
         context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
 
@@ -564,33 +577,51 @@ class OptiXDevice : public CUDADevice {
 #  endif
 
     {  // Create path tracing pipeline
-      OptixProgramGroup pipeline_groups[] = {
-        groups[PG_RGEN],
-        groups[PG_MISS],
-        groups[PG_HITD],
-        groups[PG_HITS],
-        groups[PG_HITL],
+      vector<OptixProgramGroup> pipeline_groups;
+      pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
+      pipeline_groups.push_back(groups[PG_RGEN]);
+      pipeline_groups.push_back(groups[PG_MISS]);
+      pipeline_groups.push_back(groups[PG_HITD]);
+      pipeline_groups.push_back(groups[PG_HITS]);
+      pipeline_groups.push_back(groups[PG_HITL]);
 #  if OPTIX_ABI_VERSION >= 36
-        groups[PG_HITD_MOTION],
-        groups[PG_HITS_MOTION],
+      if (motion_blur) {
+        pipeline_groups.push_back(groups[PG_HITD_MOTION]);
+        pipeline_groups.push_back(groups[PG_HITS_MOTION]);
+      }
 #  endif
-      };
-      check_result_optix_ret(
-          optixPipelineCreate(context,
-                              &pipeline_options,
-                              &link_options,
-                              pipeline_groups,
-                              (sizeof(pipeline_groups) / sizeof(pipeline_groups[0])),
-                              nullptr,
-                              0,
-                              &pipelines[PIP_PATH_TRACE]));
+      if (requested_features.use_shader_raytrace) {
+        pipeline_groups.push_back(groups[PG_CALL + 0]);
+        pipeline_groups.push_back(groups[PG_CALL + 1]);
+        pipeline_groups.push_back(groups[PG_CALL + 2]);
+      }
+
+      check_result_optix_ret(optixPipelineCreate(context,
+                                                 &pipeline_options,
+                                                 &link_options,
+                                                 pipeline_groups.data(),
+                                                 pipeline_groups.size(),
+                                                 nullptr,
+                                                 0,
+                                                 &pipelines[PIP_PATH_TRACE]));
 
       // Combine ray generation and trace continuation stack size
       const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css;
+      // Max direct callable depth is one of the following, so combine accordingly
+      // - __raygen__ -> svm_eval_nodes
+      // - __raygen__ -> kernel_volume_shadow -> svm_eval_nodes
+      // - __raygen__ -> subsurface_scatter_multi_setup -> svm_eval_nodes
+      const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
+                               std::max(stack_size[PG_CALL + 1].dssDC,
+                                        stack_size[PG_CALL + 2].dssDC);
 
       // Set stack size depending on pipeline options
       check_result_optix_ret(
-          optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE], 0, 0, css, (motion_blur ? 3 : 2)));
+          optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE],
+                                    0,
+                                    requested_features.use_shader_raytrace ? dss : 0,
+                                    css,
+                                    motion_blur ? 3 : 2));
     }
 
     // Only need to create shader evaluation pipeline if one of these features is used:
@@ -599,37 +630,51 @@ class OptiXDevice : public CUDADevice {
                                           requested_features.use_true_displacement;
 
     if (use_shader_eval_pipeline) {  // Create shader evaluation pipeline
-      OptixProgramGroup pipeline_groups[] = {
-        groups[PG_BAKE],
-        groups[PG_DISP],
-        groups[PG_BACK],
-        groups[PG_MISS],
-        groups[PG_HITD],
-        groups[PG_HITS],
-        groups[PG_HITL],
+      vector<OptixProgramGroup> pipeline_groups;
+      pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
+      pipeline_groups.push_back(groups[PG_BAKE]);
+      pipeline_groups.push_back(groups[PG_DISP]);
+      pipeline_groups.push_back(groups[PG_BACK]);
+      pipeline_groups.push_back(groups[PG_MISS]);
+      pipeline_groups.push_back(groups[PG_HITD]);
+      pipeline_groups.push_back(groups[PG_HITS]);
+      pipeline_groups.push_back(groups[PG_HITL]);
 #  if OPTIX_ABI_VERSION >= 36
-        groups[PG_HITD_MOTION],
-        groups[PG_HITS_MOTION],
+      if (motion_blur) {
+        pipeline_groups.push_back(groups[PG_HITD_MOTION]);
+        pipeline_groups.push_back(groups[PG_HITS_MOTION]);
+      }
 #  endif
-      };
-      check_result_optix_ret(
-          optixPipelineCreate(context,
-                              &pipeline_options,
-                              &link_options,
-                              pipeline_groups,
-                              (sizeof(pipeline_groups) / sizeof(pipeline_groups[0])),
-                              nullptr,
-                              0,
-                              &pipelines[PIP_SHADER_EVAL]));
+      if (requested_features.use_shader_raytrace) {
+        pipeline_groups.push_back(groups[PG_CALL + 0]);
+        pipeline_groups.push_back(groups[PG_CALL + 1]);
+        pipeline_groups.push_back(groups[PG_CALL + 2]);
+      }
+
+      check_result_optix_ret(optixPipelineCreate(context,
+                                                 &pipeline_options,
+                                                 &link_options,
+                                                 pipeline_groups.data(),
+                                                 pipeline_groups.size(),
+                                                 nullptr,
+                                                 0,
+                                                 &pipelines[PIP_SHADER_EVAL]));
 
       // Calculate continuation stack size based on the maximum of all ray generation stack sizes
       const unsigned int css = std::max(stack_size[PG_BAKE].cssRG,
                                         std::max(stack_size[PG_DISP].cssRG,
                                                  stack_size[PG_BACK].cssRG)) +
                                link_options.maxTraceDepth * trace_css;
+      const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
+                               std::max(stack_size[PG_CALL + 1].dssDC,
+                                        stack_size[PG_CALL + 2].dssDC);
 
-      check_result_optix_ret(optixPipelineSetStackSize(
-          pipelines[PIP_SHADER_EVAL], 0, 0, css, (pipeline_options.usesMotionBlur ? 3 : 2)));
+      check_result_optix_ret(
+          optixPipelineSetStackSize(pipelines[PIP_SHADER_EVAL],
+                                    0,
+                                    requested_features.use_shader_raytrace ? dss : 0,
+                                    css,
+                                    motion_blur ? 3 : 2));
     }
 
     // Clean up program group objects
@@ -734,6 +779,9 @@ class OptiXDevice : public CUDADevice {
 #  else
       sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITS, PG_HITL
 #  endif
+      sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
+      sbt_params.callablesRecordCount = 3;
+      sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
 
       // Launch the ray generation program
       check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE],
@@ -1061,6 +1109,9 @@ class OptiXDevice : public CUDADevice {
 #  else
       sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITS, PG_HITL
 #  endif
+      sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
+      sbt_params.callablesRecordCount = 3;
+      sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
 
       check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL],
                                      cuda_stream[thread_index],
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index c39c67afb5a..f6b4b963a7a 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -423,7 +423,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
 
     set(cuda_kernel_src "/kernels/cuda/${name}.cu")
 
-    set(cuda_flags
+    set(cuda_flags ${flags}
       -D CCL_NAMESPACE_BEGIN=
       -D CCL_NAMESPACE_END=
       -D NVCC
@@ -545,11 +545,11 @@ endif()
 # OptiX PTX modules
 
 if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
-  foreach(input ${SRC_OPTIX_KERNELS})
-    get_filename_component(input_we ${input} NAME_WE)
+  macro(CYCLES_OPTIX_KERNEL_ADD name flags)
+    set(input "kernels/optix/kernel_optix.cu")
+    set(output "${CMAKE_CURRENT_BINARY_DIR}/${name}.ptx")
 
-    set(output "${CMAKE_CURRENT_BINARY_DIR}/${input_we}.ptx")
-    set(cuda_flags
+    set(cuda_flags ${flags}
       -I "${OPTIX_INCLUDE_DIR}"
       -I "${CMAKE_CURRENT_SOURCE_DIR}/.."
       -I "${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda"
@@ -625,7 +625,10 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
     list(APPEND optix_ptx ${output})
 
     delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output}" ${CYCLES_INSTALL_PATH}/lib)
-  endforeach()
+  endmacro()
+
+  CYCLES_OPTIX_KERNEL_ADD(kernel_optix "-D __NO_SHADER_RAYTRACE__")
+  CYCLES_OPTIX_KERNEL_ADD(kernel_optix_shader_raytrace "--keep-device-functions")
 
   add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx})
   cycles_set_solution_folder(cycles_kernel_optix)
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index ed8572467ea..917f35d37dc 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -281,13 +281,28 @@ ccl_device_inline int subsurface_scatter_disk(KernelGlobals *kg,
   return num_eval_hits;
 }
 
-ccl_device_noinline void subsurface_scatter_multi_setup(KernelGlobals *kg,
-                                                        LocalIntersection *ss_isect,
-                                                        int hit,
-                                                        ShaderData *sd,
-                                                        ccl_addr_space PathState *state,
-                                                        ClosureType type,
-                                                        float roughness)
+#if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__)
+ccl_device_inline void subsurface_scatter_multi_setup(KernelGlobals *kg,
+                                                      LocalIntersection *ss_isect,
+                                                      int hit,
+                                                      ShaderData *sd,
+                                                      ccl_addr_space PathState *state,
+                                                      ClosureType type,
+                                                      float roughness)
+{
+  optixDirectCall<void>(2, kg, ss_isect, hit, sd, state, type, roughness);
+}
+extern "C" __device__ void __direct_callable__subsurface_scatter_multi_setup(
+#else
+ccl_device_noinline void subsurface_scatter_multi_setup(
+#endif
+    KernelGlobals *kg,
+    LocalIntersection *ss_isect,
+    int hit,
+    ShaderData *sd,
+    ccl_addr_space PathState *state,
+    ClosureType type,
+    float roughness)
 {
 #ifdef __SPLIT_KERNEL__
   Ray ray_object = ss_isect->ray;
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 8e2b0e46a66..6beabebb92f 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -139,8 +139,6 @@ CCL_NAMESPACE_BEGIN
 #ifdef __KERNEL_OPTIX__
 #  undef __BAKING__
 #  undef __BRANCHED_PATH__
-/* TODO(pmours): Cannot use optixTrace in non-inlined functions */
-#  undef __SHADER_RAYTRACE__
 #endif /* __KERNEL_OPTIX__ */
 
 #ifdef __KERNEL_OPENCL__
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index f5d10c0ca8a..fdf712293e7 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -274,11 +274,24 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg,
 
 /* get the volume attenuation over line segment defined by ray, with the
  * assumption that there are no surfaces blocking light between the endpoints */
-ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg,
-                                              ShaderData *shadow_sd,
-                                              ccl_addr_space PathState *state,
-                                              Ray *ray,
-                                              float3 *throughput)
+#  if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__)
+ccl_device_inline void kernel_volume_shadow(KernelGlobals *kg,
+                                            ShaderData *shadow_sd,
+                                            ccl_addr_space PathState *state,
+                                            Ray *ray,
+                                            float3 *throughput)
+{
+  optixDirectCall<void>(1, kg, shadow_sd, state, ray, throughput);
+}
+extern "C" __device__ void __direct_callable__kernel_volume_shadow(
+#  else
+ccl_device_noinline void kernel_volume_shadow(
+#  endif
+    KernelGlobals *kg,
+    ShaderData *shadow_sd,
+    ccl_addr_space PathState *state,
+    Ray *ray,
+    float3 *throughput)
 {
   shader_setup_from_volume(kg, shadow_sd, ray);
 
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 6c849f5b2fc..000da1fa615 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -217,12 +217,26 @@ CCL_NAMESPACE_END
 CCL_NAMESPACE_BEGIN
 
 /* Main Interpreter Loop */
-ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
-                                        ShaderData *sd,
-                                        ccl_addr_space PathState *state,
-                                        ccl_global float *buffer,
-                                        ShaderType type,
-                                        int path_flag)
+#if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__)
+ccl_device_inline void svm_eval_nodes(KernelGlobals *kg,
+                                      ShaderData *sd,
+                                      ccl_addr_space PathState *state,
+                                      ccl_global float *buffer,
+                                      ShaderType type,
+                                      int path_flag)
+{
+  optixDirectCall<void>(0, kg, sd, state, buffer, type, path_flag);
+}
+extern "C" __device__ void __direct_callable__svm_eval_nodes(
+#else
+ccl_device_noinline void svm_eval_nodes(
+#endif
+    KernelGlobals *kg,
+    ShaderData *sd,
+    ccl_addr_space PathState *state,
+    ccl_global float *buffer,
+    ShaderType type,
+    int path_flag)
 {
   float stack[SVM_STACK_SIZE];
   int offset = sd->shader & SHADER_MASK;