Cycles: bake transparent shadows for hair

These transparent shadows can be expansive to evaluate. Especially on the GPU they can lead to poor occupancy when only some pixels require many kernel launches to trace and evaluate many layers of transparency. Baked transparency allows tracing a single ray in many cases by accumulating the throughput directly in the intersection program without recording hits or evaluating shaders. Transparency is baked at curve vertices and interpolated, for most shaders this will look practically the same as actual shader evaluation. Fixes T91428, performance regression with spring demo file due to transparent hair, and makes it render significantly faster than Blender 2.93. Differential Revision: https://developer.blender.org/D12880
author: Brecht Van Lommel <brecht@blender.org> 2021-09-20 17:16:11 +0300
committer: Brecht Van Lommel <brecht@blender.org> 2021-10-19 16:11:09 +0300
commit: fd77a28031daff3122ded3a1cb37a7fb44feedf6 (patch)
tree: f54967b7f5f1175555aa21d613137fe436d7fc8c /intern/cycles/kernel/device
parent: d06828f0b8ebb083de59fd2cb8c5f8fe6af1da22 (diff)
4 files changed, 63 insertions, 9 deletions
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h
index ae7fab65100..7a438b58e73 100644
--- a/intern/cycles/kernel/device/cpu/kernel_arch.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch.h
@@ -64,6 +64,11 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobalsCPU *kg,
                                                      const KernelShaderEvalInput *input,
                                                      float *output,
                                                      const int offset);
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)(
+    const KernelGlobalsCPU *kg,
+    const KernelShaderEvalInput *input,
+    float *output,
+    const int offset);
 
 /* --------------------------------------------------------------------
  * Adaptive sampling.
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
index 2b0eea4fb61..ac606c768db 100644
--- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -150,6 +150,19 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobalsCPU *k
 #endif
 }
 
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)(
+    const KernelGlobalsCPU *kg,
+    const KernelShaderEvalInput *input,
+    float *output,
+    const int offset)
+{
+#ifdef KERNEL_STUB
+  STUB_ASSERT(KERNEL_ARCH, shader_eval_curve_shadow_transparency);
+#else
+  kernel_curve_shadow_transparency_evaluate(kg, input, output, offset);
+#endif
+}
+
 /* --------------------------------------------------------------------
  * Adaptive sampling.
  */
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index 6b4d79ed5b7..b6df74e835a 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -621,7 +621,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   }
 }
 
-/* Background Shader Evaluation */
+/* Background */
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
     kernel_gpu_shader_eval_background(KernelShaderEvalInput *input,
@@ -635,6 +635,20 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   }
 }
 
+/* Curve Shadow Transparency */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+    kernel_gpu_shader_eval_curve_shadow_transparency(KernelShaderEvalInput *input,
+                                                     float *output,
+                                                     const int offset,
+                                                     const int work_size)
+{
+  int i = ccl_gpu_global_id_x();
+  if (i < work_size) {
+    kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i);
+  }
+}
+
 /* --------------------------------------------------------------------
  * Denoising.
  */
diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu
index 574f66ab708..a3bafb9846c 100644
--- a/intern/cycles/kernel/device/optix/kernel.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -210,29 +210,50 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
   optixSetPayload_5(true);
   return optixTerminateRay();
 #  else
-  const int max_hits = optixGetPayload_3();
+  const uint max_hits = optixGetPayload_3();
+  const uint num_hits_packed = optixGetPayload_2();
+  const uint num_recorded_hits = uint16_unpack_from_uint_0(num_hits_packed);
+  const uint num_hits = uint16_unpack_from_uint_1(num_hits_packed);
 
   /* If no transparent shadows, all light is blocked and we can stop immediately. */
-  if (max_hits == 0 ||
+  if (num_hits >= max_hits ||
       !(intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) {
     optixSetPayload_5(true);
     return optixTerminateRay();
   }
 
+  /* Always use baked shadow transparency for curves. */
+  if (type & PRIMITIVE_ALL_CURVE) {
+    float throughput = __uint_as_float(optixGetPayload_1());
+    throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, u);
+    optixSetPayload_1(__float_as_uint(throughput));
+    optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits, num_hits + 1));
+
+    if (throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) {
+      optixSetPayload_4(true);
+      return optixTerminateRay();
+    }
+    else {
+      /* Continue tracing. */
+      optixIgnoreIntersection();
+      return;
+    }
+  }
+
   /* Record transparent intersection. */
-  const int num_hits = optixGetPayload_2();
-  int record_index = num_hits;
+  optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits + 1, num_hits + 1));
 
-  optixSetPayload_2(num_hits + 1);
+  uint record_index = num_recorded_hits;
 
   const IntegratorShadowState state = optixGetPayload_0();
 
-  if (record_index >= max_hits) {
+  const uint max_record_hits = min(max_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
+  if (record_index >= max_record_hits) {
     /* If maximum number of hits reached, find a hit to replace. */
     float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t);
-    int max_recorded_hit = 0;
+    uint max_recorded_hit = 0;
 
-    for (int i = 1; i < max_hits; i++) {
+    for (int i = 1; i < max_record_hits; i++) {
       const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t);
       if (isect_t > max_recorded_t) {
         max_recorded_t = isect_t;
@@ -256,6 +277,7 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
   INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object;
   INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type;
 
+  /* Continue tracing. */
   optixIgnoreIntersection();
 #  endif /* __TRANSPARENT_SHADOWS__ */
 #endif   /* __SHADOW_RECORD_ALL__ */
author	Brecht Van Lommel <brecht@blender.org>	2021-09-20 17:16:11 +0300
committer	Brecht Van Lommel <brecht@blender.org>	2021-10-19 16:11:09 +0300
commit	fd77a28031daff3122ded3a1cb37a7fb44feedf6 (patch)
tree	f54967b7f5f1175555aa21d613137fe436d7fc8c /intern/cycles/kernel/device
parent	d06828f0b8ebb083de59fd2cb8c5f8fe6af1da22 (diff)