1 files changed, 439 insertions, 222 deletions
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index 2981f6ac566..0426e0a62c9 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -16,9 +16,84 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifdef __SHADOW_RECORD_ALL__
+/* Attenuate throughput accordingly to the given intersection event.
+ * Returns true if the throughput is zero and traversal can be aborted.
+ */
+ccl_device_forceinline bool shadow_handle_transparent_isect(
+        KernelGlobals *kg,
+        ShaderData *shadow_sd,
+        ccl_addr_space PathState *state,
+#    ifdef __VOLUME__
+        ccl_addr_space struct PathState *volume_state,
+#    endif
+        Intersection *isect,
+        Ray *ray,
+        float3 *throughput)
+{
+#ifdef __VOLUME__
+	/* Attenuation between last surface and next surface. */
+	if(volume_state->volume_stack[0].shader != SHADER_NONE) {
+		Ray segment_ray = *ray;
+		segment_ray.t = isect->t;
+		kernel_volume_shadow(kg,
+		                     shadow_sd,
+		                     volume_state,
+		                     &segment_ray,
+		                     throughput);
+	}
+#endif
+	/* Setup shader data at surface. */
+	shader_setup_from_ray(kg, shadow_sd, isect, ray);
+	/* Attenuation from transparent surface. */
+	if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
+		path_state_modify_bounce(state, true);
+		shader_eval_surface(kg,
+		                    shadow_sd,
+		                    NULL,
+		                    state,
+		                    0.0f,
+		                    PATH_RAY_SHADOW,
+		                    SHADER_CONTEXT_SHADOW);
+		path_state_modify_bounce(state, false);
+		*throughput *= shader_bsdf_transparency(kg, shadow_sd);
+	}
+	/* Stop if all light is blocked. */
+	if(is_zero(*throughput)) {
+		return true;
+	}
+#ifdef __VOLUME__
+	/* Exit/enter volume. */
+	kernel_volume_stack_enter_exit(kg, shadow_sd, volume_state->volume_stack);
+#endif
+	return false;
+}
+
+/* Special version which only handles opaque shadows. */
+ccl_device bool shadow_blocked_opaque(KernelGlobals *kg,
+                                      ShaderData *shadow_sd,
+                                      ccl_addr_space PathState *state,
+                                      Ray *ray,
+                                      Intersection *isect,
+                                      float3 *shadow)
+{
+	const bool blocked = scene_intersect(kg,
+	                                     *ray,
+	                                     PATH_RAY_SHADOW_OPAQUE,
+	                                     isect,
+	                                     NULL,
+	                                     0.0f, 0.0f);
+#ifdef __VOLUME__
+	if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
+		/* Apply attenuation from current volume shader. */
+		kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
+	}
+#endif
+	return blocked;
+}
 
-/* Shadow function to compute how much light is blocked, CPU variation.
+#ifdef __TRANSPARENT_SHADOWS__
+#  ifdef __SHADOW_RECORD_ALL__
+/* Shadow function to compute how much light is blocked,
  *
  * We trace a single ray. If it hits any opaque surface, or more than a given
  * number of transparent surfaces is hit, then we consider the geometry to be
@@ -36,261 +111,403 @@ CCL_NAMESPACE_BEGIN
  * or there is a performance increase anyway due to avoiding the need to send
  * two rays with transparent shadows.
  *
- * This is CPU only because of qsort, and malloc or high stack space usage to
- * record all these intersections. */
+ * On CPU it'll handle all transparent bounces (by allocating storage for
+ * intersections when they don't fit into the stack storage).
+ *
+ * On GPU it'll only handle SHADOW_STACK_MAX_HITS-1 intersections, so this
+ * is something to be kept an eye on.
+ */
 
-#define STACK_MAX_HITS 64
+#    define SHADOW_STACK_MAX_HITS 64
 
-ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd, PathState *state, Ray *ray, float3 *shadow)
+/* Actual logic with traversal loop implementation which is free from device
+ * specific tweaks.
+ *
+ * Note that hits array should be as big as max_hits+1.
+ */
+ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
+                                                    ShaderData *shadow_sd,
+                                                    ccl_addr_space PathState *state,
+                                                    const int skip_object,
+                                                    Ray *ray,
+                                                    Intersection *hits,
+                                                    uint max_hits,
+                                                    float3 *shadow)
 {
-	*shadow = make_float3(1.0f, 1.0f, 1.0f);
-
-	if(ray->t == 0.0f)
-		return false;
-	
-	bool blocked;
-
-	if(kernel_data.integrator.transparent_shadows) {
-		/* check transparent bounces here, for volume scatter which can do
-		 * lighting before surface path termination is checked */
-		if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce)
-			return true;
-
-		/* intersect to find an opaque surface, or record all transparent surface hits */
-		Intersection hits_stack[STACK_MAX_HITS];
-		Intersection *hits = hits_stack;
-		const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
-		uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
-
-		/* prefer to use stack but use dynamic allocation if too deep max hits
-		 * we need max_hits + 1 storage space due to the logic in
-		 * scene_intersect_shadow_all which will first store and then check if
-		 * the limit is exceeded */
-		if(max_hits + 1 > STACK_MAX_HITS) {
-			if(kg->transparent_shadow_intersections == NULL) {
-				kg->transparent_shadow_intersections =
-				    (Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1));
+	/* Intersect to find an opaque surface, or record all transparent
+	 * surface hits.
+	 */
+	uint num_hits;
+	const bool blocked = scene_intersect_shadow_all(kg,
+	                                                ray,
+	                                                hits,
+	                                                skip_object,
+	                                                max_hits,
+	                                                &num_hits);
+	/* If no opaque surface found but we did find transparent hits,
+	 * shade them.
+	 */
+	if(!blocked && num_hits > 0) {
+		float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+		float3 Pend = ray->P + ray->D*ray->t;
+		float last_t = 0.0f;
+		int bounce = state->transparent_bounce;
+		Intersection *isect = hits;
+#    ifdef __VOLUME__
+#      ifdef __SPLIT_KERNEL__
+		ccl_addr_space PathState *ps = &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)];
+#      else
+		PathState ps_object;
+		PathState *ps = &ps_object;
+#      endif
+		*ps = *state;
+#    endif
+		sort_intersections(hits, num_hits);
+		for(int hit = 0; hit < num_hits; hit++, isect++) {
+			/* Adjust intersection distance for moving ray forward. */
+			float new_t = isect->t;
+			isect->t -= last_t;
+			/* Skip hit if we did not move forward, step by step raytracing
+			 * would have skipped it as well then.
+			 */
+			if(last_t == new_t) {
+				continue;
 			}
-			hits = kg->transparent_shadow_intersections;
-		}
-
-		uint num_hits;
-		blocked = scene_intersect_shadow_all(kg, ray, hits, max_hits, &num_hits);
-
-		/* if no opaque surface found but we did find transparent hits, shade them */
-		if(!blocked && num_hits > 0) {
-			float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-			float3 Pend = ray->P + ray->D*ray->t;
-			float last_t = 0.0f;
-			int bounce = state->transparent_bounce;
-			Intersection *isect = hits;
+			last_t = new_t;
+			/* Attenuate the throughput. */
+			if(shadow_handle_transparent_isect(kg,
+			                                   shadow_sd,
+			                                   state,
 #ifdef __VOLUME__
-			PathState ps = *state;
+			                                   ps,
 #endif
-
-			qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
-
-			for(int hit = 0; hit < num_hits; hit++, isect++) {
-				/* adjust intersection distance for moving ray forward */
-				float new_t = isect->t;
-				isect->t -= last_t;
-
-				/* skip hit if we did not move forward, step by step raytracing
-				 * would have skipped it as well then */
-				if(last_t == new_t)
-					continue;
-
-				last_t = new_t;
-
-#ifdef __VOLUME__
-				/* attenuation between last surface and next surface */
-				if(ps.volume_stack[0].shader != SHADER_NONE) {
-					Ray segment_ray = *ray;
-					segment_ray.t = isect->t;
-					kernel_volume_shadow(kg, shadow_sd, &ps, &segment_ray, &throughput);
-				}
-#endif
-
-				/* setup shader data at surface */
-				shader_setup_from_ray(kg, shadow_sd, isect, ray);
-
-				/* attenuation from transparent surface */
-				if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
-					path_state_modify_bounce(state, true);
-					shader_eval_surface(kg, shadow_sd, NULL, state, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
-					path_state_modify_bounce(state, false);
-
-					throughput *= shader_bsdf_transparency(kg, shadow_sd);
-				}
-
-				/* stop if all light is blocked */
-				if(is_zero(throughput)) {
-					return true;
-				}
-
-				/* move ray forward */
-				ray->P = shadow_sd->P;
-				if(ray->t != FLT_MAX) {
-					ray->D = normalize_len(Pend - ray->P, &ray->t);
-				}
-
-#ifdef __VOLUME__
-				/* exit/enter volume */
-				kernel_volume_stack_enter_exit(kg, shadow_sd, ps.volume_stack);
-#endif
-
-				bounce++;
+			                                   isect,
+			                                   ray,
+			                                   &throughput))
+			{
+				return true;
 			}
-
-#ifdef __VOLUME__
-			/* attenuation for last line segment towards light */
-			if(ps.volume_stack[0].shader != SHADER_NONE)
-				kernel_volume_shadow(kg, shadow_sd, &ps, ray, &throughput);
-#endif
-
-			*shadow = throughput;
-
-			return is_zero(throughput);
+			/* Move ray forward. */
+			ray->P = shadow_sd->P;
+			if(ray->t != FLT_MAX) {
+				ray->D = normalize_len(Pend - ray->P, &ray->t);
+			}
+			bounce++;
 		}
+#    ifdef __VOLUME__
+		/* Attenuation for last line segment towards light. */
+		if(ps->volume_stack[0].shader != SHADER_NONE) {
+			kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput);
+		}
+#    endif
+		*shadow = throughput;
+		return is_zero(throughput);
 	}
-	else {
-		Intersection isect;
-		blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
-	}
-
-#ifdef __VOLUME__
+#    ifdef __VOLUME__
 	if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
-		/* apply attenuation from current volume shader */
+		/* Apply attenuation from current volume shader/ */
 		kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
 	}
-#endif
-
+#    endif
 	return blocked;
 }
 
-#undef STACK_MAX_HITS
-
-#else
+/* Here we do all device specific trickery before invoking actual traversal
+ * loop to help readability of the actual logic.
+ */
+ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg,
+                                               ShaderData *shadow_sd,
+                                               ccl_addr_space PathState *state,
+                                               const int skip_object,
+                                               Ray *ray,
+                                               uint max_hits,
+                                               float3 *shadow)
+{
+#    ifdef __SPLIT_KERNEL__
+	Intersection hits_[SHADOW_STACK_MAX_HITS];
+	Intersection *hits = &hits_[0];
+#    elif defined(__KERNEL_CUDA__)
+	Intersection *hits = kg->hits_stack;
+#    else
+	Intersection hits_stack[SHADOW_STACK_MAX_HITS];
+	Intersection *hits = hits_stack;
+#    endif
+#    ifndef __KERNEL_GPU__
+	/* Prefer to use stack but use dynamic allocation if too deep max hits
+	 * we need max_hits + 1 storage space due to the logic in
+	 * scene_intersect_shadow_all which will first store and then check if
+	 * the limit is exceeded.
+	 *
+	 * Ignore this on GPU because of slow/unavailable malloc().
+	 */
+	if(max_hits + 1 > SHADOW_STACK_MAX_HITS) {
+		if(kg->transparent_shadow_intersections == NULL) {
+			const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
+			kg->transparent_shadow_intersections =
+				(Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1));
+		}
+		hits = kg->transparent_shadow_intersections;
+	}
+#    endif  /* __KERNEL_GPU__ */
+	/* Invoke actual traversal. */
+	return shadow_blocked_transparent_all_loop(kg,
+	                                           shadow_sd,
+	                                           state,
+	                                           skip_object,
+	                                           ray,
+	                                           hits,
+	                                           max_hits,
+	                                           shadow);
+}
+#  endif  /* __SHADOW_RECORD_ALL__ */
 
-/* Shadow function to compute how much light is blocked, GPU variation.
+#  if defined(__KERNEL_GPU__) || !defined(__SHADOW_RECORD_ALL__)
+/* Shadow function to compute how much light is blocked,
  *
  * Here we raytrace from one transparent surface to the next step by step.
  * To minimize overhead in cases where we don't need transparent shadows, we
  * first trace a regular shadow ray. We check if the hit primitive was
  * potentially transparent, and only in that case start marching. this gives
- * one extra ray cast for the cases were we do want transparency. */
+ * one extra ray cast for the cases were we do want transparency.
+ */
 
-ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
-                                        ShaderData *shadow_sd,
-                                        ccl_addr_space PathState *state,
-                                        ccl_addr_space Ray *ray_input,
-                                        float3 *shadow)
+/* This function is only implementing device-independent traversal logic
+ * which requires some precalculation done.
+ */
+ccl_device bool shadow_blocked_transparent_stepped_loop(
+        KernelGlobals *kg,
+        ShaderData *shadow_sd,
+        ccl_addr_space PathState *state,
+        const int skip_object,
+        Ray *ray,
+        Intersection *isect,
+        const bool blocked,
+        const bool is_transparent_isect,
+        float3 *shadow)
 {
-	*shadow = make_float3(1.0f, 1.0f, 1.0f);
-
-	if(ray_input->t == 0.0f)
-		return false;
-
-#ifdef __SPLIT_KERNEL__
-	Ray private_ray = *ray_input;
-	Ray *ray = &private_ray;
-#else
-	Ray *ray = ray_input;
-#endif
-
-#ifdef __SPLIT_KERNEL__
-	Intersection *isect = &kg->isect_shadow[SD_THREAD];
-#else
-	Intersection isect_object;
-	Intersection *isect = &isect_object;
-#endif
-
-	bool blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f);
-
-#ifdef __TRANSPARENT_SHADOWS__
-	if(blocked && kernel_data.integrator.transparent_shadows) {
-		if(shader_transparent_shadow(kg, isect)) {
-			float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-			float3 Pend = ray->P + ray->D*ray->t;
-			int bounce = state->transparent_bounce;
-#ifdef __VOLUME__
-			PathState ps = *state;
-#endif
-
-			for(;;) {
-				if(bounce >= kernel_data.integrator.transparent_max_bounce)
-					return true;
-
-				if(!scene_intersect(kg, *ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f))
-				{
-#ifdef __VOLUME__
-					/* attenuation for last line segment towards light */
-					if(ps.volume_stack[0].shader != SHADER_NONE)
-						kernel_volume_shadow(kg, shadow_sd, &ps, ray, &throughput);
-#endif
-
-					*shadow *= throughput;
-
-					return false;
-				}
-
-				if(!shader_transparent_shadow(kg, isect)) {
-					return true;
-				}
-
-#ifdef __VOLUME__
-				/* attenuation between last surface and next surface */
-				if(ps.volume_stack[0].shader != SHADER_NONE) {
-					Ray segment_ray = *ray;
-					segment_ray.t = isect->t;
-					kernel_volume_shadow(kg, shadow_sd, &ps, &segment_ray, &throughput);
+	if((blocked && is_transparent_isect) || skip_object != OBJECT_NONE) {
+		float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+		float3 Pend = ray->P + ray->D*ray->t;
+		int bounce = state->transparent_bounce;
+#    ifdef __VOLUME__
+#      ifdef __SPLIT_KERNEL__
+		ccl_addr_space PathState *ps = &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)];
+#      else
+		PathState ps_object;
+		PathState *ps = &ps_object;
+#      endif
+		*ps = *state;
+#    endif
+		for(;;) {
+			if(bounce >= kernel_data.integrator.transparent_max_bounce) {
+				return true;
+			}
+			if(!scene_intersect(kg,
+			                    *ray,
+			                    PATH_RAY_SHADOW_TRANSPARENT,
+			                    isect,
+			                    NULL,
+			                    0.0f, 0.0f))
+			{
+				break;
+			}
+#ifdef __SHADOW_TRICKS__
+			if(skip_object != OBJECT_NONE) {
+				const int isect_object = (isect->object == PRIM_NONE)
+				        ? kernel_tex_fetch(__prim_object, isect->prim)
+				        : isect->object;
+				if(isect_object == skip_object) {
+					shader_setup_from_ray(kg, shadow_sd, isect, ray);
+					/* Move ray forward. */
+					ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng);
+					if(ray->t != FLT_MAX) {
+						ray->D = normalize_len(Pend - ray->P, &ray->t);
+					}
+					bounce++;
+					continue;
 				}
+			}
 #endif
-
-				/* setup shader data at surface */
-				shader_setup_from_ray(kg, shadow_sd, isect, ray);
-
-				/* attenuation from transparent surface */
-				if(!(ccl_fetch(shadow_sd, flag) & SD_HAS_ONLY_VOLUME)) {
-					path_state_modify_bounce(state, true);
-					shader_eval_surface(kg, shadow_sd, NULL, state, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
-					path_state_modify_bounce(state, false);
-
-					throughput *= shader_bsdf_transparency(kg, shadow_sd);
-				}
-
-				/* stop if all light is blocked */
-				if(is_zero(throughput)) {
-					return true;
-				}
-
-				/* move ray forward */
-				ray->P = ray_offset(ccl_fetch(shadow_sd, P), -ccl_fetch(shadow_sd, Ng));
-				if(ray->t != FLT_MAX) {
-					ray->D = normalize_len(Pend - ray->P, &ray->t);
-				}
-
+			if(!shader_transparent_shadow(kg, isect)) {
+				return true;
+			}
+			/* Attenuate the throughput. */
+			if(shadow_handle_transparent_isect(kg,
+			                                   shadow_sd,
+			                                   state,
 #ifdef __VOLUME__
-				/* exit/enter volume */
-				kernel_volume_stack_enter_exit(kg, shadow_sd, ps.volume_stack);
+			                                   ps,
 #endif
-
-				bounce++;
+			                                   isect,
+			                                   ray,
+			                                   &throughput))
+			{
+				return true;
 			}
+			/* Move ray forward. */
+			ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng);
+			if(ray->t != FLT_MAX) {
+				ray->D = normalize_len(Pend - ray->P, &ray->t);
+			}
+			bounce++;
+		}
+#    ifdef __VOLUME__
+		/* Attenuation for last line segment towards light. */
+		if(ps->volume_stack[0].shader != SHADER_NONE) {
+			kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput);
 		}
+#    endif
+		*shadow *= throughput;
+		return is_zero(throughput);
 	}
-#ifdef __VOLUME__
-	else if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
-		/* apply attenuation from current volume shader */
+#    ifdef __VOLUME__
+	if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
+		/* Apply attenuation from current volume shader. */
 		kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
 	}
-#endif
-#endif
-
+#    endif
 	return blocked;
 }
 
+ccl_device bool shadow_blocked_transparent_stepped(
+        KernelGlobals *kg,
+        ShaderData *shadow_sd,
+        ccl_addr_space PathState *state,
+        const int skip_object,
+        Ray *ray,
+        Intersection *isect,
+        float3 *shadow)
+{
+	bool blocked, is_transparent_isect;
+	if (skip_object == OBJECT_NONE) {
+		blocked = scene_intersect(kg,
+		                          *ray,
+		                          PATH_RAY_SHADOW_OPAQUE,
+		                          isect,
+		                          NULL,
+		                          0.0f, 0.0f);
+		is_transparent_isect = blocked
+			        ? shader_transparent_shadow(kg, isect)
+			        : false;
+	}
+	else {
+		blocked = false;
+		is_transparent_isect = false;
+	}
+	return shadow_blocked_transparent_stepped_loop(kg,
+	                                               shadow_sd,
+	                                               state,
+	                                               skip_object,
+	                                               ray,
+	                                               isect,
+	                                               blocked,
+	                                               is_transparent_isect,
+	                                               shadow);
+}
+
+#  endif  /* __KERNEL_GPU__ || !__SHADOW_RECORD_ALL__ */
+#endif /* __TRANSPARENT_SHADOWS__ */
+
+ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
+                                      ShaderData *shadow_sd,
+                                      ccl_addr_space PathState *state,
+                                      Ray *ray_input,
+                                      float3 *shadow)
+{
+	Ray *ray = ray_input;
+	Intersection isect;
+	/* Some common early checks. */
+	*shadow = make_float3(1.0f, 1.0f, 1.0f);
+	if(ray->t == 0.0f) {
+		return false;
+	}
+#ifdef __SHADOW_TRICKS__
+    const int skip_object = state->catcher_object;
+#else
+    const int skip_object = OBJECT_NONE;
 #endif
+	/* Do actual shadow shading. */
+	/* First of all, we check if integrator requires transparent shadows.
+	 * if not, we use simplest and fastest ever way to calculate occlusion.
+	 *
+	 * NOTE: We can't do quick opaque test here if we are on shadow-catcher
+	 * path because we don't want catcher object to be casting shadow here.
+	 */
+#ifdef __TRANSPARENT_SHADOWS__
+	if(!kernel_data.integrator.transparent_shadows &&
+	   skip_object == OBJECT_NONE)
+#endif
+	{
+		return shadow_blocked_opaque(kg,
+		                             shadow_sd,
+		                             state,
+		                             ray,
+		                             &isect,
+		                             shadow);
+	}
+#ifdef __TRANSPARENT_SHADOWS__
+#  ifdef __SHADOW_RECORD_ALL__
+	/* For the transparent shadows we try to use record-all logic on the
+	 * devices which supports this.
+	 */
+	const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
+	/* Check transparent bounces here, for volume scatter which can do
+	 * lighting before surface path termination is checked.
+	 */
+	if(state->transparent_bounce >= transparent_max_bounce) {
+		return true;
+	}
+	const uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
+#    ifdef __KERNEL_GPU__
+	/* On GPU we do trickey with tracing opaque ray first, this avoids speed
+	 * regressions in some files.
+	 *
+	 * TODO(sergey): Check why using record-all behavior causes slowdown in such
+	 * cases. Could that be caused by a higher spill pressure?
+	 */
+	const bool blocked = scene_intersect(kg,
+	                                     *ray,
+	                                     PATH_RAY_SHADOW_OPAQUE,
+	                                     &isect,
+	                                     NULL,
+	                                     0.0f, 0.0f);
+	const bool is_transparent_isect = blocked
+	        ? shader_transparent_shadow(kg, &isect)
+	        : false;
+	if(!blocked || !is_transparent_isect ||
+	   max_hits + 1 >= SHADOW_STACK_MAX_HITS)
+	{
+		return shadow_blocked_transparent_stepped_loop(kg,
+		                                               shadow_sd,
+		                                               state,
+		                                               skip_object,
+		                                               ray,
+		                                               &isect,
+		                                               blocked,
+		                                               is_transparent_isect,
+		                                               shadow);
+	}
+#    endif  /* __KERNEL_GPU__ */
+	return shadow_blocked_transparent_all(kg,
+	                                      shadow_sd,
+	                                      state,
+	                                      skip_object,
+	                                      ray,
+	                                      max_hits,
+	                                      shadow);
+#  else  /* __SHADOW_RECORD_ALL__ */
+	/* Fallback to a slowest version which works on all devices. */
+	return shadow_blocked_transparent_stepped(kg,
+	                                          shadow_sd,
+	                                          state,
+	                                          skip_object,
+	                                          ray,
+	                                          &isect,
+	                                          shadow);
+#  endif  /* __SHADOW_RECORD_ALL__ */
+#endif  /* __TRANSPARENT_SHADOWS__ */
+}
 
-CCL_NAMESPACE_END
+#undef SHADOW_STACK_MAX_HITS
 
+CCL_NAMESPACE_END