1 files changed, 277 insertions, 298 deletions
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index c84727ace99..42df7e85b41 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -22,8 +22,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
                                                ShaderData *sd,
                                                ShaderData *emission_sd,
                                                PathRadiance *L,
-                                               PathState *state,
-                                               RNG *rng,
+                                               ccl_addr_space PathState *state,
                                                float3 throughput)
 {
 	int num_samples = kernel_data.integrator.ao_samples;
@@ -35,46 +34,225 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
 
 	for(int j = 0; j < num_samples; j++) {
 		float bsdf_u, bsdf_v;
-		path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+		path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 
 		float3 ao_D;
 		float ao_pdf;
 
 		sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
 
-		if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+		if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
 			Ray light_ray;
 			float3 ao_shadow;
 
-			light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+			light_ray.P = ray_offset(sd->P, sd->Ng);
 			light_ray.D = ao_D;
 			light_ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
-			light_ray.time = ccl_fetch(sd, time);
-#endif  /* __OBJECT_MOTION__ */
-			light_ray.dP = ccl_fetch(sd, dP);
+			light_ray.time = sd->time;
+			light_ray.dP = sd->dP;
 			light_ray.dD = differential3_zero();
 
-			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow))
-				path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+			if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) {
+				path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
+			}
+			else {
+				path_radiance_accum_total_ao(L, state, throughput*num_samples_inv, ao_bsdf);
+			}
 		}
 	}
 }
 
+#ifndef __SPLIT_KERNEL__
+
+#ifdef __VOLUME__
+ccl_device_forceinline void kernel_branched_path_volume(
+	KernelGlobals *kg,
+	ShaderData *sd,
+	PathState *state,
+	Ray *ray,
+	float3 *throughput,
+	ccl_addr_space Intersection *isect,
+	bool hit,
+	ShaderData *indirect_sd,
+	ShaderData *emission_sd,
+	PathRadiance *L)
+{
+	/* Sanitize volume stack. */
+	if(!hit) {
+		kernel_volume_clean_stack(kg, state->volume_stack);
+	}
+
+	if(state->volume_stack[0].shader == SHADER_NONE) {
+		return;
+	}
+
+	/* volume attenuation, emission, scatter */
+	Ray volume_ray = *ray;
+	volume_ray.t = (hit)? isect->t: FLT_MAX;
+
+	bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+
+#  ifdef __VOLUME_DECOUPLED__
+	/* decoupled ray marching only supported on CPU */
+	if(kernel_data.integrator.volume_decoupled) {
+		/* cache steps along volume for repeated sampling */
+		VolumeSegment volume_segment;
+
+		shader_setup_from_volume(kg, sd, &volume_ray);
+		kernel_volume_decoupled_record(kg, state,
+			&volume_ray, sd, &volume_segment, heterogeneous);
+
+		/* direct light sampling */
+		if(volume_segment.closure_flag & SD_SCATTER) {
+			volume_segment.sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
+
+			int all = kernel_data.integrator.sample_all_lights_direct;
+
+			kernel_branched_path_volume_connect_light(kg, sd,
+				emission_sd, *throughput, state, L, all,
+				&volume_ray, &volume_segment);
+
+			/* indirect light sampling */
+			int num_samples = kernel_data.integrator.volume_samples;
+			float num_samples_inv = 1.0f/num_samples;
+
+			for(int j = 0; j < num_samples; j++) {
+				PathState ps = *state;
+				Ray pray = *ray;
+				float3 tp = *throughput;
+
+				/* branch RNG state */
+				path_state_branch(&ps, j, num_samples);
+
+				/* scatter sample. if we use distance sampling and take just one
+				 * sample for direct and indirect light, we could share this
+				 * computation, but makes code a bit complex */
+				float rphase = path_state_rng_1D(kg, &ps, PRNG_PHASE_CHANNEL);
+				float rscatter = path_state_rng_1D(kg, &ps, PRNG_SCATTER_DISTANCE);
+
+				VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+					&ps, &pray, sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
+
+				if(result == VOLUME_PATH_SCATTERED &&
+				   kernel_path_volume_bounce(kg,
+				                             sd,
+				                             &tp,
+				                             &ps,
+				                             &L->state,
+				                             &pray))
+				{
+					kernel_path_indirect(kg,
+					                     indirect_sd,
+					                     emission_sd,
+					                     &pray,
+					                     tp*num_samples_inv,
+					                     &ps,
+					                     L);
+
+					/* for render passes, sum and reset indirect light pass variables
+					 * for the next samples */
+					path_radiance_sum_indirect(L);
+					path_radiance_reset_indirect(L);
+				}
+			}
+		}
+
+		/* emission and transmittance */
+		if(volume_segment.closure_flag & SD_EMISSION)
+			path_radiance_accum_emission(L, state, *throughput, volume_segment.accum_emission);
+		*throughput *= volume_segment.accum_transmittance;
+
+		/* free cached steps */
+		kernel_volume_decoupled_free(kg, &volume_segment);
+	}
+	else
+#  endif  /* __VOLUME_DECOUPLED__ */
+	{
+		/* GPU: no decoupled ray marching, scatter probalistically */
+		int num_samples = kernel_data.integrator.volume_samples;
+		float num_samples_inv = 1.0f/num_samples;
+
+		/* todo: we should cache the shader evaluations from stepping
+		 * through the volume, for now we redo them multiple times */
+
+		for(int j = 0; j < num_samples; j++) {
+			PathState ps = *state;
+			Ray pray = *ray;
+			float3 tp = (*throughput) * num_samples_inv;
+
+			/* branch RNG state */
+			path_state_branch(&ps, j, num_samples);
+
+			VolumeIntegrateResult result = kernel_volume_integrate(
+				kg, &ps, sd, &volume_ray, L, &tp, heterogeneous);
+
+#  ifdef __VOLUME_SCATTER__
+			if(result == VOLUME_PATH_SCATTERED) {
+				/* todo: support equiangular, MIS and all light sampling.
+				 * alternatively get decoupled ray marching working on the GPU */
+				kernel_path_volume_connect_light(kg, sd, emission_sd, tp, state, L);
+
+				if(kernel_path_volume_bounce(kg,
+				                             sd,
+				                             &tp,
+				                             &ps,
+				                             &L->state,
+				                             &pray))
+				{
+					kernel_path_indirect(kg,
+					                     indirect_sd,
+					                     emission_sd,
+					                     &pray,
+					                     tp,
+					                     &ps,
+					                     L);
+
+					/* for render passes, sum and reset indirect light pass variables
+					 * for the next samples */
+					path_radiance_sum_indirect(L);
+					path_radiance_reset_indirect(L);
+				}
+			}
+# endif  /* __VOLUME_SCATTER__ */
+		}
+
+		/* todo: avoid this calculation using decoupled ray marching */
+		kernel_volume_shadow(kg, emission_sd, state, &volume_ray, throughput);
+	}
+}
+#endif  /* __VOLUME__ */
 
 /* bounce off surface and integrate indirect light */
 ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
-	RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
+	ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
 	float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
 {
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		const ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
+	float sum_sample_weight = 0.0f;
+#ifdef __DENOISING_FEATURES__
+	if(state->denoising_feature_weight > 0.0f) {
+		for(int i = 0; i < sd->num_closure; i++) {
+			const ShaderClosure *sc = &sd->closure[i];
+
+			/* transparency is not handled here, but in outer loop */
+			if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+				continue;
+			}
+
+			sum_sample_weight += sc->sample_weight;
+		}
+	}
+	else {
+		sum_sample_weight = 1.0f;
+	}
+#endif  /* __DENOISING_FEATURES__ */
+
+	for(int i = 0; i < sd->num_closure; i++) {
+		const ShaderClosure *sc = &sd->closure[i];
 
-		if(!CLOSURE_IS_BSDF(sc->type))
-			continue;
 		/* transparency is not handled here, but in outer loop */
-		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
+		if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
 			continue;
+		}
 
 		int num_samples;
 
@@ -90,34 +268,38 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
 		num_samples = ceil_to_int(num_samples_adjust*num_samples);
 
 		float num_samples_inv = num_samples_adjust/num_samples;
-		RNG bsdf_rng = cmj_hash(*rng, i);
 
 		for(int j = 0; j < num_samples; j++) {
 			PathState ps = *state;
 			float3 tp = throughput;
 			Ray bsdf_ray;
+#ifdef __SHADOW_TRICKS__
+			float shadow_transparency = L->shadow_transparency;
+#endif
+
+			ps.rng_hash = cmj_hash(state->rng_hash, i);
 
 			if(!kernel_branched_path_surface_bounce(kg,
-			                                        &bsdf_rng,
 			                                        sd,
 			                                        sc,
 			                                        j,
 			                                        num_samples,
 			                                        &tp,
 			                                        &ps,
-			                                        L,
-			                                        &bsdf_ray))
+			                                        &L->state,
+			                                        &bsdf_ray,
+			                                        sum_sample_weight))
 			{
 				continue;
 			}
 
+			ps.rng_hash = state->rng_hash;
+
 			kernel_path_indirect(kg,
 			                     indirect_sd,
 			                     emission_sd,
-			                     rng,
 			                     &bsdf_ray,
 			                     tp*num_samples_inv,
-			                     num_samples,
 			                     &ps,
 			                     L);
 
@@ -125,6 +307,10 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
 			 * for the next samples */
 			path_radiance_sum_indirect(L);
 			path_radiance_reset_indirect(L);
+
+#ifdef __SHADOW_TRICKS__
+			L->shadow_transparency = shadow_transparency;
+#endif
 		}
 	}
 }
@@ -136,28 +322,27 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
                                                         ShaderData *emission_sd,
                                                         PathRadiance *L,
                                                         PathState *state,
-                                                        RNG *rng,
                                                         Ray *ray,
                                                         float3 throughput)
 {
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(!CLOSURE_IS_BSSRDF(sc->type))
 			continue;
 
 		/* set up random number generator */
-		uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
+		uint lcg_state = lcg_state_init(state, 0x68bc21eb);
 		int num_samples = kernel_data.integrator.subsurface_samples;
 		float num_samples_inv = 1.0f/num_samples;
-		RNG bssrdf_rng = cmj_hash(*rng, i);
+		uint bssrdf_rng_hash = cmj_hash(state->rng_hash, i);
 
 		/* do subsurface scatter step with copy of shader data, this will
 		 * replace the BSSRDF with a diffuse BSDF closure */
 		for(int j = 0; j < num_samples; j++) {
 			SubsurfaceIntersection ss_isect;
 			float bssrdf_u, bssrdf_v;
-			path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+			path_branched_rng_2D(kg, bssrdf_rng_hash, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
 			int num_hits = subsurface_scatter_multi_intersect(kg,
 			                                                  &ss_isect,
 			                                                  sd,
@@ -167,8 +352,9 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 			                                                  true);
 #ifdef __VOLUME__
 			Ray volume_ray = *ray;
-			bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
-			                                ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
+			bool need_update_volume_stack =
+			        kernel_data.integrator.use_volumes &&
+			        sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
 #endif  /* __VOLUME__ */
 
 			/* compute lighting with the BSDF closure */
@@ -205,10 +391,10 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 #ifdef __EMISSION__
 				/* direct light */
 				if(kernel_data.integrator.use_direct_light) {
-					int all = kernel_data.integrator.sample_all_lights_direct;
+					int all = (kernel_data.integrator.sample_all_lights_direct) ||
+					          (state->flag & PATH_RAY_SHADOW_CATCHER);
 					kernel_branched_path_surface_connect_light(
 					        kg,
-					        rng,
 					        &bssrdf_sd,
 					        emission_sd,
 					        &hit_state,
@@ -222,7 +408,6 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 				/* indirect light */
 				kernel_branched_path_surface_indirect_light(
 				        kg,
-				        rng,
 				        &bssrdf_sd,
 				        indirect_sd,
 				        emission_sd,
@@ -236,14 +421,17 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 }
 #endif  /* __SUBSURFACE__ */
 
-ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
+ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
+                                               uint rng_hash,
+                                               int sample,
+                                               Ray ray,
+                                               ccl_global float *buffer,
+                                               PathRadiance *L)
 {
 	/* initialize */
-	PathRadiance L;
 	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-	float L_transparent = 0.0f;
 
-	path_radiance_init(&L, kernel_data.film.use_light_pass);
+	path_radiance_init(L, kernel_data.film.use_light_pass);
 
 	/* shader data memory used for both volumes and surfaces, saves stack space */
 	ShaderData sd;
@@ -251,264 +439,67 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 	ShaderData emission_sd, indirect_sd;
 
 	PathState state;
-	path_state_init(kg, &emission_sd, &state, rng, sample, &ray);
-
-#ifdef __KERNEL_DEBUG__
-	DebugData debug_data;
-	debug_data_init(&debug_data);
-#endif  /* __KERNEL_DEBUG__ */
+	path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray);
 
 	/* Main Loop
 	 * Here we only handle transparency intersections from the camera ray.
 	 * Indirect bounces are handled in kernel_branched_path_surface_indirect_light().
 	 */
 	for(;;) {
-		/* intersect scene */
+		/* Find intersection with objects in scene. */
 		Intersection isect;
-		uint visibility = path_state_ray_visibility(kg, &state);
-
-#ifdef __HAIR__
-		float difl = 0.0f, extmax = 0.0f;
-		uint lcg_state = 0;
-
-		if(kernel_data.bvh.have_curves) {
-			if(kernel_data.cam.resolution == 1) {
-				float3 pixdiff = ray.dD.dx + ray.dD.dy;
-				/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
-				difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
-			}
-
-			extmax = kernel_data.curve.maximum_width;
-			lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
-		}
-
-		bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
-#else
-		bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#endif  /* __HAIR__ */
-
-#ifdef __KERNEL_DEBUG__
-		debug_data.num_bvh_traversal_steps += isect.num_traversal_steps;
-		debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
-		debug_data.num_ray_bounces++;
-#endif  /* __KERNEL_DEBUG__ */
+		bool hit = kernel_path_scene_intersect(kg, &state, &ray, &isect, L);
 
 #ifdef __VOLUME__
-		/* volume attenuation, emission, scatter */
-		if(state.volume_stack[0].shader != SHADER_NONE) {
-			Ray volume_ray = ray;
-			volume_ray.t = (hit)? isect.t: FLT_MAX;
-			
-			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
-
-#ifdef __VOLUME_DECOUPLED__
-			/* decoupled ray marching only supported on CPU */
-
-			/* cache steps along volume for repeated sampling */
-			VolumeSegment volume_segment;
-
-			shader_setup_from_volume(kg, &sd, &volume_ray);
-			kernel_volume_decoupled_record(kg, &state,
-				&volume_ray, &sd, &volume_segment, heterogeneous);
-
-			/* direct light sampling */
-			if(volume_segment.closure_flag & SD_SCATTER) {
-				volume_segment.sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
-
-				int all = kernel_data.integrator.sample_all_lights_direct;
-
-				kernel_branched_path_volume_connect_light(kg, rng, &sd,
-					&emission_sd, throughput, &state, &L, all,
-					&volume_ray, &volume_segment);
-
-				/* indirect light sampling */
-				int num_samples = kernel_data.integrator.volume_samples;
-				float num_samples_inv = 1.0f/num_samples;
-
-				for(int j = 0; j < num_samples; j++) {
-					/* workaround to fix correlation bug in T38710, can find better solution
-					 * in random number generator later, for now this is done here to not impact
-					 * performance of rendering without volumes */
-					RNG tmp_rng = cmj_hash(*rng, state.rng_offset);
-
-					PathState ps = state;
-					Ray pray = ray;
-					float3 tp = throughput;
-
-					/* branch RNG state */
-					path_state_branch(&ps, j, num_samples);
-
-					/* scatter sample. if we use distance sampling and take just one
-					 * sample for direct and indirect light, we could share this
-					 * computation, but makes code a bit complex */
-					float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE);
-					float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE);
-
-					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
-						&ps, &pray, &sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
-
-					(void)result;
-					kernel_assert(result == VOLUME_PATH_SCATTERED);
-
-					if(kernel_path_volume_bounce(kg,
-					                             rng,
-					                             &sd,
-					                             &tp,
-					                             &ps,
-					                             &L,
-					                             &pray))
-					{
-						kernel_path_indirect(kg,
-						                     &indirect_sd,
-						                     &emission_sd,
-						                     rng,
-						                     &pray,
-						                     tp*num_samples_inv,
-						                     num_samples,
-						                     &ps,
-						                     &L);
-
-						/* for render passes, sum and reset indirect light pass variables
-						 * for the next samples */
-						path_radiance_sum_indirect(&L);
-						path_radiance_reset_indirect(&L);
-					}
-				}
-			}
-
-			/* emission and transmittance */
-			if(volume_segment.closure_flag & SD_EMISSION)
-				path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
-			throughput *= volume_segment.accum_transmittance;
-
-			/* free cached steps */
-			kernel_volume_decoupled_free(kg, &volume_segment);
-#else
-			/* GPU: no decoupled ray marching, scatter probalistically */
-			int num_samples = kernel_data.integrator.volume_samples;
-			float num_samples_inv = 1.0f/num_samples;
-
-			/* todo: we should cache the shader evaluations from stepping
-			 * through the volume, for now we redo them multiple times */
-
-			for(int j = 0; j < num_samples; j++) {
-				PathState ps = state;
-				Ray pray = ray;
-				float3 tp = throughput * num_samples_inv;
-
-				/* branch RNG state */
-				path_state_branch(&ps, j, num_samples);
-
-				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous);
-
-#ifdef __VOLUME_SCATTER__
-				if(result == VOLUME_PATH_SCATTERED) {
-					/* todo: support equiangular, MIS and all light sampling.
-					 * alternatively get decoupled ray marching working on the GPU */
-					kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L);
-
-					if(kernel_path_volume_bounce(kg,
-					                             rng,
-					                             &sd,
-					                             &tp,
-					                             &ps,
-					                             &L,
-					                             &pray))
-					{
-						kernel_path_indirect(kg,
-						                     &indirect_sd,
-						                     &emission_sd,
-						                     rng,
-						                     &pray,
-						                     tp,
-						                     num_samples,
-						                     &ps,
-						                     &L);
-
-						/* for render passes, sum and reset indirect light pass variables
-						 * for the next samples */
-						path_radiance_sum_indirect(&L);
-						path_radiance_reset_indirect(&L);
-					}
-				}
-#endif  /* __VOLUME_SCATTER__ */
-			}
-
-			/* todo: avoid this calculation using decoupled ray marching */
-			kernel_volume_shadow(kg, &emission_sd, &state, &volume_ray, &throughput);
-#endif  /* __VOLUME_DECOUPLED__ */
-		}
+		/* Volume integration. */
+		kernel_branched_path_volume(kg,
+		                            &sd,
+		                            &state,
+		                            &ray,
+		                            &throughput,
+		                            &isect,
+		                            hit,
+		                            &indirect_sd,
+		                            &emission_sd,
+		                            L);
 #endif  /* __VOLUME__ */
 
+		/* Shade background. */
 		if(!hit) {
-			/* eval background shader if nothing hit */
-			if(kernel_data.background.transparent) {
-				L_transparent += average(throughput);
-
-#ifdef __PASSES__
-				if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif  /* __PASSES__ */
-					break;
-			}
-
-#ifdef __BACKGROUND__
-			/* sample background shader */
-			float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
-			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
-#endif  /* __BACKGROUND__ */
-
+			kernel_path_background(kg, &state, &ray, throughput, &emission_sd, L);
 			break;
 		}
 
-		/* setup shading */
+		/* Setup and evaluate shader. */
 		shader_setup_from_ray(kg, &sd, &isect, &ray);
-		shader_eval_surface(kg, &sd, rng, &state, 0.0f, state.flag, SHADER_CONTEXT_MAIN);
+		shader_eval_surface(kg, &sd, &state, state.flag);
 		shader_merge_closures(&sd);
 
-		/* holdout */
-#ifdef __HOLDOUT__
-		if(sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK)) {
-			if(kernel_data.background.transparent) {
-				float3 holdout_weight;
-				
-				if(sd.flag & SD_HOLDOUT_MASK)
-					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
-				else
-					holdout_weight = shader_holdout_eval(kg, &sd);
-
-				/* any throughput is ok, should all be identical here */
-				L_transparent += average(holdout_weight*throughput);
-			}
-
-			if(sd.flag & SD_HOLDOUT_MASK)
-				break;
-		}
-#endif  /* __HOLDOUT__ */
-
-		/* holdout mask objects do not write data passes */
-		kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
-
-#ifdef __EMISSION__
-		/* emission */
-		if(sd.flag & SD_EMISSION) {
-			float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
-			path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+		/* Apply shadow catcher, holdout, emission. */
+		if(!kernel_path_shader_apply(kg,
+		                             &sd,
+		                             &state,
+		                             &ray,
+		                             throughput,
+		                             &emission_sd,
+		                             L,
+		                             buffer))
+		{
+			break;
 		}
-#endif  /* __EMISSION__ */
 
 		/* transparency termination */
 		if(state.flag & PATH_RAY_TRANSPARENT) {
 			/* path termination. this is a strange place to put the termination, it's
 			 * mainly due to the mixed in MIS that we use. gives too many unneeded
 			 * shader evaluations, only need emission if we are going to terminate */
-			float probability = path_state_terminate_probability(kg, &state, throughput);
+			float probability = path_state_continuation_probability(kg, &state, throughput);
 
 			if(probability == 0.0f) {
 				break;
 			}
 			else if(probability != 1.0f) {
-				float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
+				float terminate = path_state_rng_1D(kg, &state, PRNG_TERMINATE);
 
 				if(terminate >= probability)
 					break;
@@ -517,10 +508,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			}
 		}
 
+		kernel_update_denoising_features(kg, &sd, &state, L);
+
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
-			kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput);
+			kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, throughput);
 		}
 #endif  /* __AO__ */
 
@@ -528,7 +521,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 		/* bssrdf scatter to a different location on the same object */
 		if(sd.flag & SD_BSSRDF) {
 			kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
-			                                        &L, &state, rng, &ray, throughput);
+			                                        L, &state, &ray, throughput);
 		}
 #endif  /* __SUBSURFACE__ */
 
@@ -538,15 +531,16 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 #ifdef __EMISSION__
 			/* direct light */
 			if(kernel_data.integrator.use_direct_light) {
-				int all = kernel_data.integrator.sample_all_lights_direct;
-				kernel_branched_path_surface_connect_light(kg, rng,
-					&sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all);
+				int all = (kernel_data.integrator.sample_all_lights_direct) ||
+				          (state.flag & PATH_RAY_SHADOW_CATCHER);
+				kernel_branched_path_surface_connect_light(kg,
+					&sd, &emission_sd, &hit_state, throughput, 1.0f, L, all);
 			}
 #endif  /* __EMISSION__ */
 
 			/* indirect light */
-			kernel_branched_path_surface_indirect_light(kg, rng,
-				&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L);
+			kernel_branched_path_surface_indirect_light(kg,
+				&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L);
 
 			/* continue in case of transparency */
 			throughput *= shader_bsdf_transparency(kg, &sd);
@@ -574,50 +568,35 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 		kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
 #endif  /* __VOLUME__ */
 	}
-
-	float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
-
-	kernel_write_light_passes(kg, buffer, &L, sample);
-
-#ifdef __KERNEL_DEBUG__
-	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
-#endif  /* __KERNEL_DEBUG__ */
-
-	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
 }
 
 ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
-	ccl_global float *buffer, ccl_global uint *rng_state,
+	ccl_global float *buffer,
 	int sample, int x, int y, int offset, int stride)
 {
 	/* buffer offset */
 	int index = offset + x + y*stride;
 	int pass_stride = kernel_data.film.pass_stride;
 
-	rng_state += index;
 	buffer += index*pass_stride;
 
 	/* initialize random numbers and ray */
-	RNG rng;
+	uint rng_hash;
 	Ray ray;
 
-	kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
+	kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray);
 
 	/* integrate */
-	float4 L;
-
-	if(ray.t != 0.0f)
-		L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer);
-	else
-		L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-
-	/* accumulate result in output buffer */
-	kernel_write_pass_float4(buffer, sample, L);
+	PathRadiance L;
 
-	path_rng_end(kg, rng_state, rng);
+	if(ray.t != 0.0f) {
+		kernel_branched_path_integrate(kg, rng_hash, sample, ray, buffer, &L);
+		kernel_write_result(kg, buffer, sample, &L);
+	}
 }
 
+#endif  /* __SPLIT_KERNEL__ */
+
 #endif  /* __BRANCHED_PATH__ */
 
 CCL_NAMESPACE_END
-