Cycles: Delay shooting SSS indirect rays

The idea is to delay shooting indirect rays for the SSS sampling and trace them after the main integration loop was finished. This reduces GPU stack usage even further and brings it down to around 652MB (comparing to 722MB before the change and 946MB with previous stable release). This also solves the speed regression happened in the previous commit and now simple SSS scene (SSS suzanne on the floor) renders in 0:50 (comparing to 1:16 with previous commit and 1:03 with official release).
author: Sergey Sharybin <sergey.vfx@gmail.com> 2015-11-22 13:48:33 +0300
committer: Sergey Sharybin <sergey.vfx@gmail.com> 2015-11-25 11:01:22 +0300
commit: 2a5c1fc9ccbabfaef4eeaf90093dfb2ac0acfc90 (patch)
tree: 98ee26786419e78d9fa0e325c527bf2e1fa3d905 /intern/cycles/kernel/kernel_path.h
parent: 8bca34fe326d10cc2f20df7fa541179e9ba835d2 (diff)
1 files changed, 264 insertions, 89 deletions
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 87d36efa4d4..1f385b8d304 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -52,47 +52,64 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
-	float3 throughput, int num_samples, PathState state, PathRadiance *L)
+ccl_device void kernel_path_indirect(KernelGlobals *kg,
+                                     RNG *rng,
+                                     Ray *ray,
+                                     float3 throughput,
+                                     int num_samples,
+                                     PathState *state,
+                                     PathRadiance *L)
 {
 	/* path iteration */
 	for(;;) {
 		/* intersect scene */
 		Intersection isect;
-		uint visibility = path_state_ray_visibility(kg, &state);
-		bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
+		uint visibility = path_state_ray_visibility(kg, state);
+		bool hit = scene_intersect(kg,
+		                           ray,
+		                           visibility,
+		                           &isect,
+		                           NULL,
+		                           0.0f, 0.0f);
 
 #ifdef __LAMP_MIS__
-		if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
+		if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
 			/* ray starting from previous non-transparent bounce */
 			Ray light_ray;
 
-			light_ray.P = ray.P - state.ray_t*ray.D;
-			state.ray_t += isect.t;
-			light_ray.D = ray.D;
-			light_ray.t = state.ray_t;
-			light_ray.time = ray.time;
-			light_ray.dD = ray.dD;
-			light_ray.dP = ray.dP;
+			light_ray.P = ray->P - state->ray_t*ray->D;
+			state->ray_t += isect.t;
+			light_ray.D = ray->D;
+			light_ray.t = state->ray_t;
+			light_ray.time = ray->time;
+			light_ray.dD = ray->dD;
+			light_ray.dP = ray->dP;
 
 			/* intersect with lamp */
 			float3 emission;
-
-			if(indirect_lamp_emission(kg, &state, &light_ray, &emission))
-				path_radiance_accum_emission(L, throughput, emission, state.bounce);
+			if(indirect_lamp_emission(kg, state, &light_ray, &emission)) {
+				path_radiance_accum_emission(L,
+				                             throughput,
+				                             emission,
+				                             state->bounce);
+			}
 		}
 #endif
 
 #ifdef __VOLUME__
 		/* volume attenuation, emission, scatter */
-		if(state.volume_stack[0].shader != SHADER_NONE) {
-			Ray volume_ray = ray;
+		if(state->volume_stack[0].shader != SHADER_NONE) {
+			Ray volume_ray = *ray;
 			volume_ray.t = (hit)? isect.t: FLT_MAX;
 
-			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
+			bool heterogeneous =
+			        volume_stack_is_heterogeneous(kg,
+			                                      state->volume_stack);
 
 #ifdef __VOLUME_DECOUPLED__
-			int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
+			int sampling_method =
+			        volume_stack_sampling_method(kg,
+			                                     state->volume_stack);
 			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, false, sampling_method);
 
 			if(decoupled) {
@@ -100,15 +117,27 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 				VolumeSegment volume_segment;
 				ShaderData volume_sd;
 
-				shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
-				kernel_volume_decoupled_record(kg, &state,
-					&volume_ray, &volume_sd, &volume_segment, heterogeneous);
-				
+				shader_setup_from_volume(kg,
+				                         &volume_sd,
+				                         &volume_ray,
+				                         state->bounce,
+				                         state->transparent_bounce);
+				kernel_volume_decoupled_record(kg,
+				                               state,
+				                               &volume_ray,
+				                               &volume_sd,
+				                               &volume_segment,
+				                               heterogeneous);
+
 				volume_segment.sampling_method = sampling_method;
 
 				/* emission */
-				if(volume_segment.closure_flag & SD_EMISSION)
-					path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
+				if(volume_segment.closure_flag & SD_EMISSION) {
+					path_radiance_accum_emission(L,
+					                             throughput,
+					                             volume_segment.accum_emission,
+					                             state->bounce);
+				}
 
 				/* scattering */
 				VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
@@ -117,28 +146,51 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 					bool all = kernel_data.integrator.sample_all_lights_indirect;
 
 					/* direct light sampling */
-					kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
-						throughput, &state, L, all, &volume_ray, &volume_segment);
+					kernel_branched_path_volume_connect_light(kg,
+					                                          rng,
+					                                          &volume_sd,
+					                                          throughput,
+					                                          state,
+					                                          L,
+					                                          all,
+					                                          &volume_ray,
+					                                          &volume_segment);
 
 					/* indirect sample. if we use distance sampling and take just
 					 * one sample for direct and indirect light, we could share
 					 * this computation, but makes code a bit complex */
-					float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
-					float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
+					float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
+					float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
 
 					result = kernel_volume_decoupled_scatter(kg,
-						&state, &volume_ray, &volume_sd, &throughput,
-						rphase, rscatter, &volume_segment, NULL, true);
+					                                         state,
+					                                         &volume_ray,
+					                                         &volume_sd,
+					                                         &throughput,
+					                                         rphase,
+					                                         rscatter,
+					                                         &volume_segment,
+					                                         NULL,
+					                                         true);
 				}
 
 				/* free cached steps */
 				kernel_volume_decoupled_free(kg, &volume_segment);
 
 				if(result == VOLUME_PATH_SCATTERED) {
-					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray))
+					if(kernel_path_volume_bounce(kg,
+					                             rng,
+					                             &volume_sd,
+					                             &throughput,
+					                             state,
+					                             L,
+					                             ray))
+					{
 						continue;
-					else
+					}
+					else {
 						break;
+					}
 				}
 				else {
 					throughput *= volume_segment.accum_transmittance;
@@ -150,18 +202,32 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 				/* integrate along volume segment with distance sampling */
 				ShaderData volume_sd;
 				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, &state, &volume_sd, &volume_ray, L, &throughput, rng, heterogeneous);
+					kg, state, &volume_sd, &volume_ray, L, &throughput, rng, heterogeneous);
 
 #ifdef __VOLUME_SCATTER__
 				if(result == VOLUME_PATH_SCATTERED) {
 					/* direct lighting */
-					kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, L);
+					kernel_path_volume_connect_light(kg,
+					                                 rng,
+					                                 &volume_sd,
+					                                 throughput,
+					                                 state,
+					                                 L);
 
 					/* indirect light bounce */
-					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray))
+					if(kernel_path_volume_bounce(kg,
+					                             rng,
+					                             &volume_sd,
+					                             &throughput,
+					                             state,
+					                             L,
+					                             ray))
+					{
 						continue;
-					else
+					}
+					else {
 						break;
+					}
 				}
 #endif
 			}
@@ -171,8 +237,11 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 		if(!hit) {
 #ifdef __BACKGROUND__
 			/* sample background shader */
-			float3 L_background = indirect_background(kg, &state, &ray);
-			path_radiance_accum_background(L, throughput, L_background, state.bounce);
+			float3 L_background = indirect_background(kg, state, ray);
+			path_radiance_accum_background(L,
+			                               throughput,
+			                               L_background,
+			                               state->bounce);
 #endif
 
 			break;
@@ -180,9 +249,14 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 
 		/* setup shading */
 		ShaderData sd;
-		shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce);
-		float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF);
-		shader_eval_surface(kg, &sd, rbsdf, state.flag, SHADER_CONTEXT_INDIRECT);
+		shader_setup_from_ray(kg,
+		                      &sd,
+		                      &isect,
+		                      ray,
+		                      state->bounce,
+		                      state->transparent_bounce);
+		float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF);
+		shader_eval_surface(kg, &sd, rbsdf, state->flag, SHADER_CONTEXT_INDIRECT);
 #ifdef __BRANCHED_PATH__
 		shader_merge_closures(&sd);
 #endif
@@ -190,7 +264,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 		/* blurring of bsdf after bounces, for rays that have a small likelihood
 		 * of following this particular path (diffuse, rough glossy) */
 		if(kernel_data.integrator.filter_glossy != FLT_MAX) {
-			float blur_pdf = kernel_data.integrator.filter_glossy*state.min_ray_pdf;
+			float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
 
 			if(blur_pdf < 1.0f) {
 				float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
@@ -201,21 +275,28 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 #ifdef __EMISSION__
 		/* emission */
 		if(sd.flag & SD_EMISSION) {
-			float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
-			path_radiance_accum_emission(L, throughput, emission, state.bounce);
+			float3 emission = indirect_primitive_emission(kg,
+			                                              &sd,
+			                                              isect.t,
+			                                              state->flag,
+			                                              state->ray_pdf);
+			path_radiance_accum_emission(L, throughput, emission, state->bounce);
 		}
 #endif
 
 		/* path termination. this is a strange place to put the termination, it's
 		 * mainly due to the mixed in MIS that we use. gives too many unneeded
 		 * shader evaluations, only need emission if we are going to terminate */
-		float probability = path_state_terminate_probability(kg, &state, throughput*num_samples);
+		float probability =
+		        path_state_terminate_probability(kg,
+		                                         state,
+		                                         throughput*num_samples);
 
 		if(probability == 0.0f) {
 			break;
 		}
 		else if(probability != 1.0f) {
-			float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
+			float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
 
 			if(terminate >= probability)
 				break;
@@ -227,7 +308,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
 			float bsdf_u, bsdf_v;
-			path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+			path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 
 			float ao_factor = kernel_data.background.ao_factor;
 			float3 ao_N;
@@ -251,8 +332,14 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 				light_ray.dP = sd.dP;
 				light_ray.dD = differential3_zero();
 
-				if(!shadow_blocked(kg, &state, &light_ray, &ao_shadow))
-					path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state.bounce);
+				if(!shadow_blocked(kg, state, &light_ray, &ao_shadow)) {
+					path_radiance_accum_ao(L,
+					                       throughput,
+					                       ao_alpha,
+					                       ao_bsdf,
+					                       ao_shadow,
+					                       state->bounce);
+				}
 			}
 		}
 #endif
@@ -269,11 +356,21 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 
 			/* do bssrdf scatter step if we picked a bssrdf closure */
 			if(sc) {
-				uint lcg_state = lcg_state_init(rng, &state, 0x68bc21eb);
+				uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
 
 				float bssrdf_u, bssrdf_v;
-				path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-				subsurface_scatter_step(kg, &sd, state.flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false);
+				path_state_rng_2D(kg,
+				                  rng,
+				                  state,
+				                  PRNG_BSDF_U,
+				                  &bssrdf_u, &bssrdf_v);
+				subsurface_scatter_step(kg,
+				                        &sd,
+				                        state->flag,
+				                        sc,
+				                        &lcg_state,
+				                        bssrdf_u, bssrdf_v,
+				                        false);
 			}
 		}
 #endif
@@ -281,11 +378,18 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 #if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
 		if(kernel_data.integrator.use_direct_light) {
 			bool all = kernel_data.integrator.sample_all_lights_indirect;
-			kernel_branched_path_surface_connect_light(kg, rng, &sd, &state, throughput, 1.0f, L, all);
+			kernel_branched_path_surface_connect_light(kg,
+			                                           rng,
+			                                           &sd,
+			                                           state,
+			                                           throughput,
+			                                           1.0f,
+			                                           L,
+			                                           all);
 		}
 #endif
 
-		if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
+		if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, state, L, ray))
 			break;
 	}
 }
@@ -326,7 +430,15 @@ ccl_device void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *
 
 #ifdef __SUBSURFACE__
 
-ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, Ray *ray, float3 *throughput)
+ccl_device bool kernel_path_subsurface_scatter(
+        KernelGlobals *kg,
+        ShaderData *sd,
+        PathRadiance *L,
+        PathState *state,
+        RNG *rng,
+        Ray *ray,
+        float3 *throughput,
+        SubsurfaceIndirectRays *ss_indirect)
 {
 	float bssrdf_probability;
 	ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
@@ -349,9 +461,9 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
 		                                                  bssrdf_u, bssrdf_v,
 		                                                  false);
 #ifdef __VOLUME__
-		Ray volume_ray = *ray;
-		bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
-		                                ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
+		ss_indirect->need_update_volume_stack =
+		        kernel_data.integrator.use_volumes &&
+		        ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
 #endif
 
 		/* compute lighting with the BSDF closure */
@@ -367,46 +479,81 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
 			                               sc,
 			                               false);
 
-			float3 tp = *throughput;
-			PathState hit_state = *state;
-			Ray hit_ray = *ray;
+			PathState *hit_state = &ss_indirect->state;
+			Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays];
+			float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays];
+
+			*hit_state = *state;
+			*hit_ray = *ray;
+			*hit_tp = *throughput;
 
-			hit_state.rng_offset += PRNG_BOUNCE_NUM;
+			hit_state->rng_offset += PRNG_BOUNCE_NUM;
 
-			kernel_path_surface_connect_light(kg, rng, sd, tp, state, L);
+			kernel_path_surface_connect_light(kg, rng, sd, *hit_tp, state, L);
 
-			if(kernel_path_surface_bounce(kg, rng, sd, &tp, &hit_state, L, &hit_ray)) {
+			if(kernel_path_surface_bounce(kg,
+			                              rng,
+			                              sd,
+			                              hit_tp,
+			                              hit_state,
+			                              L,
+			                              hit_ray))
+			{
 #ifdef __LAMP_MIS__
-				hit_state.ray_t = 0.0f;
+				hit_state->ray_t = 0.0f;
 #endif
+				ss_indirect->num_rays++;
+			}
+		}
+		return true;
+	}
+	return false;
+}
+
+/* Trace subsurface indirect rays separately after the path loop, to reduce
+ * GPU stack memory usage. this way ShaderData and other data structures
+ * used during the loop are not needed during kernel_path_indirect.
+ */
+ccl_device void kernel_path_subsurface_scatter_indirect(
+        KernelGlobals *kg,
+        PathRadiance *L,
+        PathState *state,
+        RNG *rng,
+        Ray *ray,
+        SubsurfaceIndirectRays *ss_indirect)
+{
+	for (int i = 0; i < ss_indirect->num_rays; i++) {
+		Ray *indirect_ray = &ss_indirect->rays[i];
+		float3 indirect_throughput = ss_indirect->throughputs[i];
+
+		*state = ss_indirect->state;
 
 #ifdef __VOLUME__
-				if(need_update_volume_stack) {
-					/* Setup ray from previous surface point to the new one. */
-					volume_ray.D = normalize_len(hit_ray.P - volume_ray.P,
-					                             &volume_ray.t);
-
-					kernel_volume_stack_update_for_subsurface(
-					    kg,
-					    &volume_ray,
-					    hit_state.volume_stack);
-
-					/* Move volume ray forward. */
-					volume_ray.P = hit_ray.P;
-				}
-#endif
+		if(ss_indirect->need_update_volume_stack) {
+			/* TODO(sergey): Single assignment per scatter. */
+			Ray volume_ray = *ray;
 
-				kernel_path_indirect(kg, rng, hit_ray, tp, state->num_samples, hit_state, L);
+			/* Setup ray from previous surface point to the new one. */
+			volume_ray.D = normalize_len(indirect_ray->P - volume_ray.P,
+			                             &volume_ray.t);
+
+			kernel_volume_stack_update_for_subsurface(
+			        kg,
+			        &volume_ray,
+			        state->volume_stack);
 
-				/* for render passes, sum and reset indirect light pass variables
-				 * for the next samples */
-				path_radiance_sum_indirect(L);
-				path_radiance_reset_indirect(L);
-			}
 		}
-		return true;
+#endif
+
+		/* Note that this modifies state. */
+		kernel_path_indirect(kg, rng, indirect_ray, indirect_throughput, state->num_samples, state, L);
+
+		/* For render passes, sum and reset indirect light pass variables
+		 * for the next samples.
+		 */
+		path_radiance_sum_indirect(L);
+		path_radiance_reset_indirect(L);
 	}
-	return false;
 }
 #endif
 
@@ -427,6 +574,11 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 	debug_data_init(&debug_data);
 #endif
 
+#ifdef __SUBSURFACE__
+	SubsurfaceIndirectRays ss_indirect;
+	ss_indirect.num_rays = 0;
+#endif
+
 	/* path iteration */
 	for(;;) {
 		/* intersect scene */
@@ -664,8 +816,17 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 		/* bssrdf scatter to a different location on the same object, replacing
 		 * the closures with a diffuse BSDF */
 		if(sd.flag & SD_BSSRDF) {
-			if(kernel_path_subsurface_scatter(kg, &sd, &L, &state, rng, &ray, &throughput))
+			if(kernel_path_subsurface_scatter(kg,
+			                                  &sd,
+			                                  &L,
+			                                  &state,
+			                                  rng,
+			                                  &ray,
+			                                  &throughput,
+			                                  &ss_indirect))
+			{
 				break;
+			}
 		}
 #endif
 
@@ -677,6 +838,20 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 			break;
 	}
 
+#ifdef __SUBSURFACE__
+	/* Trace indirect subsurface afterwards to reduce GPU stack size.
+	 * note that this modifies state.
+	 */
+	if (ss_indirect.num_rays) {
+		kernel_path_subsurface_scatter_indirect(kg,
+		                                        &L,
+		                                        &state,
+		                                        rng,
+		                                        &ray,
+		                                        &ss_indirect);
+	}
+#endif
+
 	float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
 
 	kernel_write_light_passes(kg, buffer, &L, sample);
author	Sergey Sharybin <sergey.vfx@gmail.com>	2015-11-22 13:48:33 +0300
committer	Sergey Sharybin <sergey.vfx@gmail.com>	2015-11-25 11:01:22 +0300
commit	2a5c1fc9ccbabfaef4eeaf90093dfb2ac0acfc90 (patch)
tree	98ee26786419e78d9fa0e325c527bf2e1fa3d905 /intern/cycles/kernel/kernel_path.h
parent	8bca34fe326d10cc2f20df7fa541179e9ba835d2 (diff)