1 files changed, 279 insertions, 55 deletions
diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
index 0b4d50c70ee..d5083b23f80 100644
--- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h
+++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
@@ -16,82 +16,306 @@
 
 CCL_NAMESPACE_BEGIN
 
+#if defined(__BRANCHED_PATH__) && defined(__SUBSURFACE__)
 
-ccl_device void kernel_subsurface_scatter(KernelGlobals *kg,
-                                          ccl_local_param unsigned int* local_queue_atomics)
+ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg, int ray_index)
 {
-#ifdef __SUBSURFACE__
-	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
-		*local_queue_atomics = 0;
+	kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+	/* Reset the per-ray subsurface resume state that the matching _iter function reads. */
+	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+	/* start the iteration at the first closure and the first sample */
+	branched_state->ss_next_closure = 0;
+	branched_state->ss_next_sample = 0;
+	/* no subsurface hits recorded yet, so hit iteration starts at index 0 */
+	branched_state->num_hits = 0;
+	branched_state->next_hit = 0;
+	/* tag the ray with the subsurface-indirect flag on top of its current state */
+	ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT);
+}
+
+ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(KernelGlobals *kg, int ray_index)
+{ /* Resumable BSSRDF closure/sample/hit loop: returns true while work is pending (progress saved), false when done. */
+	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+	ShaderData *sd = &branched_state->sd;
+	RNG rng = kernel_split_state.rng[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+	for(int i = branched_state->ss_next_closure; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
+
+		if(!CLOSURE_IS_BSSRDF(sc->type))
+			continue;
+
+		/* set up random number generator; skipped unless all four resume counters below are still zero */
+		if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 &&
+		   branched_state->next_closure == 0 && branched_state->next_sample == 0)
+		{
+			branched_state->lcg_state = lcg_state_init(&rng,
+			                                           branched_state->path_state.rng_offset,
+			                                           branched_state->path_state.sample,
+			                                           0x68bc21eb);
+		}
+		int num_samples = kernel_data.integrator.subsurface_samples;
+		float num_samples_inv = 1.0f/num_samples;
+		RNG bssrdf_rng = cmj_hash(rng, i);
+
+		/* do subsurface scatter step with copy of shader data, this will
+		 * replace the BSSRDF with a diffuse BSDF closure; j resumes from ss_next_sample */
+		for(int j = branched_state->ss_next_sample; j < num_samples; j++) {
+			ccl_global SubsurfaceIntersection *ss_isect = &branched_state->ss_isect;
+			float bssrdf_u, bssrdf_v;
+			path_branched_rng_2D(kg,
+			                     &bssrdf_rng,
+			                     &branched_state->path_state,
+			                     j,
+			                     num_samples,
+			                     PRNG_BSDF_U,
+			                     &bssrdf_u,
+			                     &bssrdf_v);
+
+			/* intersection is expensive so only run it when next_hit, next_closure and next_sample are all zero; hits and LCG state are cached in branched_state */
+			if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+				RNG lcg_state = branched_state->lcg_state;
+				SubsurfaceIntersection ss_isect_private;
+
+				branched_state->num_hits = subsurface_scatter_multi_intersect(kg,
+				                                                              &ss_isect_private,
+				                                                              sd,
+				                                                              sc,
+				                                                              &lcg_state,
+				                                                              bssrdf_u, bssrdf_v,
+				                                                              true);
+
+				branched_state->lcg_state = lcg_state;
+				*ss_isect = ss_isect_private;
+			}
+
+#ifdef __VOLUME__
+			Ray volume_ray = branched_state->ray;
+			bool need_update_volume_stack =
+			        kernel_data.integrator.use_volumes &&
+			        sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
+#endif  /* __VOLUME__ */
+
+			/* compute lighting with the BSDF closure, once per cached hit starting at next_hit */
+			for(int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) {
+				ShaderData *bssrdf_sd = &kernel_split_state.sd[ray_index];
+				*bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is
+				                   * important as the indirect path will write into bssrdf_sd */
+
+				SubsurfaceIntersection ss_isect_private = *ss_isect;
+				subsurface_scatter_multi_setup(kg,
+				                               &ss_isect_private,
+				                               hit,
+				                               bssrdf_sd,
+				                               &branched_state->path_state,
+				                               branched_state->path_state.flag,
+				                               sc,
+				                               true);
+				*ss_isect = ss_isect_private;
+
+				ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index];
+				*hit_state = branched_state->path_state;
+
+				path_state_branch(hit_state, j, num_samples);
+
+#ifdef __VOLUME__
+				if(need_update_volume_stack) {
+					/* Setup ray from previous surface point to the new one. */
+					float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng);
+					volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t);
+
+					/* this next part is expensive as it does scene intersection so only do once; the result is kept in branched_state->volume_stack */
+					if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+						for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
+							branched_state->volume_stack[k] = hit_state->volume_stack[k];
+						}
+
+						kernel_volume_stack_update_for_subsurface(kg,
+						                                          emission_sd,
+						                                          &volume_ray,
+						                                          branched_state->volume_stack);
+					}
+
+					for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
+						hit_state->volume_stack[k] = branched_state->volume_stack[k];
+					}
+				}
+#endif  /* __VOLUME__ */
+
+#ifdef __EMISSION__
+				if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+					/* direct light; the guard above skips it when next_closure or next_sample is non-zero */
+					if(kernel_data.integrator.use_direct_light) {
+						int all = (kernel_data.integrator.sample_all_lights_direct) ||
+							      (branched_state->path_state.flag & PATH_RAY_SHADOW_CATCHER);
+						kernel_branched_path_surface_connect_light(kg,
+						                                           &rng,
+						                                           bssrdf_sd,
+						                                           emission_sd,
+						                                           hit_state,
+						                                           branched_state->throughput,
+						                                           num_samples_inv,
+						                                           L,
+						                                           all);
+					}
+				}
+#endif  /* __EMISSION__ */
+
+				/* indirect light; on a true return, save the closure/sample/hit positions and return true so the loops restart there */
+				if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+				                                                          ray_index,
+				                                                          num_samples_inv,
+				                                                          bssrdf_sd,
+				                                                          false,
+				                                                          false))
+				{
+					branched_state->ss_next_closure = i;
+					branched_state->ss_next_sample = j;
+					branched_state->next_hit = hit;
+
+					return true;
+				}
+
+				branched_state->next_closure = 0;
+			}
+
+			branched_state->next_hit = 0; /* hits exhausted for this sample; the next sample starts at hit 0 */
+		}
+
+		branched_state->ss_next_sample = 0; /* samples exhausted for this closure; the next closure starts at sample 0 */
+	}
+
+	branched_state->ss_next_closure = sd->num_closure; /* a later call will skip the closure loop entirely */
+	/* report true (not finished) while shared_sample_count is still positive */
+	branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+	if(branched_state->waiting_on_shared_samples) {
+		return true;
+	}
+
+	kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+	return false;
+}
+
+#endif  /* __BRANCHED_PATH__ && __SUBSURFACE__ */
+
+ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
+{
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(thread_index == 0) {
+		/* We will empty both queues in this kernel. */
+		kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+		kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
 	}
-	ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
 	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
 	ray_index = get_ray_index(kg, ray_index,
 	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
 	                          kernel_split_state.queue_data,
 	                          kernel_split_params.queue_size,
-	                          0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not
-	 * required.
-	 *
-	 * If we are executing on a CPU device, then we need to keep all threads
-	 * active since we have barrier() calls later in the kernel. CPU devices,
-	 * expect all threads to execute barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-#endif
-
-	char enqueue_flag = 0;
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
+	                          1);
+	get_ray_index(kg, thread_index,
+	              QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	              kernel_split_state.queue_data,
+	              kernel_split_params.queue_size,
+	              1);
 
+#ifdef __SUBSURFACE__
 	ccl_global char *ray_state = kernel_split_state.ray_state;
-	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-	RNG rng = kernel_split_state.rng[ray_index];
-	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
-	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
-	ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
-	ShaderData *sd = &kernel_split_state.sd[ray_index];
-	ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
 
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+		RNG rng = kernel_split_state.rng[ray_index];
+		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+		ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+		ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
+		ShaderData *sd = &kernel_split_state.sd[ray_index];
+		ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
 		if(sd->flag & SD_BSSRDF) {
-			if(kernel_path_subsurface_scatter(kg,
-			                                  sd,
-			                                  emission_sd,
-			                                  L,
-			                                  state,
-			                                  &rng,
-			                                  ray,
-			                                  throughput,
-			                                  ss_indirect)) {
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-				enqueue_flag = 1;
+
+#ifdef __BRANCHED_PATH__
+			if(!kernel_data.integrator.branched) {
+#endif
+				if(kernel_path_subsurface_scatter(kg,
+				                                  sd,
+				                                  emission_sd,
+				                                  L,
+				                                  state,
+				                                  &rng,
+				                                  ray,
+				                                  throughput,
+				                                  ss_indirect))
+				{
+					kernel_split_path_end(kg, ray_index);
+				}
+#ifdef __BRANCHED_PATH__
+			}
+			else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+				float bssrdf_probability;
+				ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+
+				/* modify throughput for picking bssrdf or bsdf */
+				*throughput *= bssrdf_probability;
+
+				/* do bssrdf scatter step if we picked a bssrdf closure */
+				if(sc) {
+					uint lcg_state = lcg_state_init(&rng, state->rng_offset, state->sample, 0x68bc21eb);
+					float bssrdf_u, bssrdf_v;
+					path_state_rng_2D(kg,
+					                  &rng,
+					                  state,
+					                  PRNG_BSDF_U,
+					                  &bssrdf_u, &bssrdf_v);
+					subsurface_scatter_step(kg,
+					                        sd,
+					                        state,
+					                        state->flag,
+					                        sc,
+					                        &lcg_state,
+					                        bssrdf_u, bssrdf_v,
+					                        false);
+				}
+			}
+			else {
+				kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index);
+
+				if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
+					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+				}
 			}
+#endif
 		}
 		kernel_split_state.rng[ray_index] = rng;
 	}
 
-#ifndef __COMPUTE_DEVICE_GPU__
+#  ifdef __BRANCHED_PATH__
+	if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+		kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0;
 	}
-#endif
 
-	/* Enqueue RAY_UPDATE_BUFFER rays. */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-	                        enqueue_flag,
-	                        kernel_split_params.queue_size,
-	                        local_queue_atomics,
-	                        kernel_split_state.queue_data,
-	                        kernel_split_params.queue_index);
+	/* iter loop */
+	ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+	                          QUEUE_SUBSURFACE_INDIRECT_ITER,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          1);
+
+	if(IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) {
+		/* for render passes, sum and reset indirect light pass variables
+		 * for the next samples */
+		path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
+		path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
+
+		if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+		}
+	}
+#  endif  /* __BRANCHED_PATH__ */
 
 #endif  /* __SUBSURFACE__ */