From b49185df99d92ba1334ac107177690e27deb3182 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Tue, 24 May 2016 22:28:03 +0200 Subject: Cycles CUDA: reduce branched path stack memory by sharing indirect ShaderData. Saves about 15% for the branched path kernel. --- intern/cycles/kernel/kernel_bake.h | 12 ++++--- intern/cycles/kernel/kernel_path.h | 54 ++++++++++++++--------------- intern/cycles/kernel/kernel_path_branched.h | 19 ++++++---- 3 files changed, 45 insertions(+), 40 deletions(-) (limited to 'intern/cycles') diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h index 77982ee548a..3966a06fe33 100644 --- a/intern/cycles/kernel/kernel_bake.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -30,8 +30,8 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian Ray ray; float3 throughput = make_float3(1.0f, 1.0f, 1.0f); - /* emission shader data memory used by various functions */ - ShaderData emission_sd; + /* emission and indirect shader data memory used by various functions */ + ShaderData emission_sd, indirect_sd; ray.P = sd->P + sd->Ng; ray.D = -sd->Ng; @@ -94,6 +94,7 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian &L_sample, &throughput); kernel_path_indirect(kg, + &indirect_sd, &emission_sd, &rng, &ray, @@ -117,7 +118,7 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian state.ray_t = 0.0f; #endif /* compute indirect light */ - kernel_path_indirect(kg, &emission_sd, &rng, &ray, throughput, 1, &state, &L_sample); + kernel_path_indirect(kg, &indirect_sd, &emission_sd, &rng, &ray, throughput, 1, &state, &L_sample); /* sum and reset indirect light pass variables for the next samples */ path_radiance_sum_indirect(&L_sample); @@ -144,7 +145,8 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian /* sample subsurface scattering */ if((pass_filter & BAKE_FILTER_SUBSURFACE) && (sd->flag & SD_BSSRDF)) { /* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */ - kernel_branched_path_subsurface_scatter(kg, sd, &emission_sd, &L_sample, &state, &rng, &ray, throughput); + kernel_branched_path_subsurface_scatter(kg, sd, &indirect_sd, + &emission_sd, &L_sample, &state, &rng, &ray, throughput); } #endif @@ -161,7 +163,7 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian /* indirect light */ kernel_branched_path_surface_indirect_light(kg, &rng, - sd, &emission_sd, throughput, 1.0f, &state, &L_sample); + sd, &indirect_sd, &emission_sd, throughput, 1.0f, &state, &L_sample); } } #endif diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index 5527d8aa861..0dded397ffa 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -53,6 +53,7 @@ CCL_NAMESPACE_BEGIN ccl_device void kernel_path_indirect(KernelGlobals *kg, + ShaderData *sd, ShaderData *emission_sd, RNG *rng, Ray *ray, @@ -61,9 +62,6 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, PathState *state, PathRadiance *L) { - /* shader data memory used for both volumes and surfaces, saves stack space */ - ShaderData sd; - /* path iteration */ for(;;) { /* intersect scene */ @@ -121,12 +119,12 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, VolumeSegment volume_segment; shader_setup_from_volume(kg, - &sd, + sd, &volume_ray); kernel_volume_decoupled_record(kg, state, &volume_ray, - &sd, + sd, &volume_segment, heterogeneous); @@ -149,7 +147,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, /* direct light sampling */ kernel_branched_path_volume_connect_light(kg, rng, - &sd, + sd, emission_sd, throughput, state, @@ -167,7 +165,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, result = kernel_volume_decoupled_scatter(kg, state, &volume_ray, - &sd, + sd, &throughput, rphase, rscatter, @@ -182,7 +180,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, if(result == VOLUME_PATH_SCATTERED) { if(kernel_path_volume_bounce(kg, rng, - &sd, + sd, &throughput, state, L, @@ -203,14 +201,14 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, { /* integrate along volume segment with distance sampling */ VolumeIntegrateResult result = kernel_volume_integrate( - kg, state, &sd, &volume_ray, L, &throughput, rng, heterogeneous); + kg, state, sd, &volume_ray, L, &throughput, rng, heterogeneous); # ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* direct lighting */ kernel_path_volume_connect_light(kg, rng, - &sd, + sd, emission_sd, throughput, state, @@ -219,7 +217,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, /* indirect light bounce */ if(kernel_path_volume_bounce(kg, rng, - &sd, + sd, &throughput, state, L, @@ -251,13 +249,13 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, /* setup shading */ shader_setup_from_ray(kg, - &sd, + sd, &isect, ray); float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF); - shader_eval_surface(kg, &sd, state, rbsdf, state->flag, SHADER_CONTEXT_INDIRECT); + shader_eval_surface(kg, sd, state, rbsdf, state->flag, SHADER_CONTEXT_INDIRECT); #ifdef __BRANCHED_PATH__ - shader_merge_closures(&sd); + shader_merge_closures(sd); #endif /* blurring of bsdf after bounces, for rays that have a small likelihood @@ -267,15 +265,15 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, if(blur_pdf < 1.0f) { float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; - shader_bsdf_blur(kg, &sd, blur_roughness); + shader_bsdf_blur(kg, sd, blur_roughness); } } #ifdef __EMISSION__ /* emission */ - if(sd.flag & SD_EMISSION) { + if(sd->flag & SD_EMISSION) { float3 emission = indirect_primitive_emission(kg, - &sd, + sd, isect.t, state->flag, state->ray_pdf); @@ -305,30 +303,30 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, #ifdef __AO__ /* ambient occlusion */ - if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { + if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) { float bsdf_u, bsdf_v; path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); float ao_factor = kernel_data.background.ao_factor; float3 ao_N; - float3 ao_bsdf = shader_bsdf_ao(kg, &sd, ao_factor, &ao_N); + float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); float3 ao_D; float ao_pdf; float3 ao_alpha = make_float3(0.0f, 0.0f, 0.0f); sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - if(dot(sd.Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { + if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { Ray light_ray; float3 ao_shadow; - light_ray.P = ray_offset(sd.P, sd.Ng); + light_ray.P = ray_offset(sd->P, sd->Ng); light_ray.D = ao_D; light_ray.t = kernel_data.background.ao_distance; # ifdef __OBJECT_MOTION__ - light_ray.time = sd.time; + light_ray.time = sd->time; # endif - light_ray.dP = sd.dP; + light_ray.dP = sd->dP; light_ray.dD = differential3_zero(); if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { @@ -346,9 +344,9 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, #ifdef __SUBSURFACE__ /* bssrdf scatter to a different location on the same object, replacing * the closures with a diffuse BSDF */ - if(sd.flag & SD_BSSRDF) { + if(sd->flag & SD_BSSRDF) { float bssrdf_probability; - ShaderClosure *sc = subsurface_scatter_pick_closure(kg, &sd, &bssrdf_probability); + ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability); /* modify throughput for picking bssrdf or bsdf */ throughput *= bssrdf_probability; @@ -364,7 +362,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); subsurface_scatter_step(kg, - &sd, + sd, state, state->flag, sc, @@ -380,7 +378,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, int all = kernel_data.integrator.sample_all_lights_indirect; kernel_branched_path_surface_connect_light(kg, rng, - &sd, + sd, emission_sd, state, throughput, @@ -390,7 +388,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, } #endif - if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, state, L, ray)) + if(!kernel_path_surface_bounce(kg, rng, sd, &throughput, state, L, ray)) break; } } diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h index b4dee220aa5..fdba1a7b025 100644 --- a/intern/cycles/kernel/kernel_path_branched.h +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -64,8 +64,8 @@ ccl_device void kernel_branched_path_ao(KernelGlobals *kg, /* bounce off surface and integrate indirect light */ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg, - RNG *rng, ShaderData *sd, ShaderData *emission_sd, float3 throughput, - float num_samples_adjust, PathState *state, PathRadiance *L) + RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd, + float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L) { for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { const ShaderClosure *sc = &ccl_fetch(sd, closure)[i]; @@ -112,6 +112,7 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba } kernel_path_indirect(kg, + indirect_sd, emission_sd, rng, &bsdf_ray, @@ -131,6 +132,7 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba #ifdef __SUBSURFACE__ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd, + ShaderData *indirect_sd, ShaderData *emission_sd, PathRadiance *L, PathState *state, @@ -222,6 +224,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, kg, rng, &bssrdf_sd, + indirect_sd, emission_sd, throughput, num_samples_inv, @@ -244,8 +247,8 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in /* shader data memory used for both volumes and surfaces, saves stack space */ ShaderData sd; - /* shader data used by emission, shadows, volume stacks */ - ShaderData emission_sd; + /* shader data used by emission, shadows, volume stacks, indirect path */ + ShaderData emission_sd, indirect_sd; PathState state; path_state_init(kg, &emission_sd, &state, rng, sample, &ray); @@ -356,6 +359,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in &pray)) { kernel_path_indirect(kg, + &indirect_sd, &emission_sd, rng, &pray, @@ -413,6 +417,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in &pray)) { kernel_path_indirect(kg, + &indirect_sd, &emission_sd, rng, &pray, @@ -522,8 +527,8 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #ifdef __SUBSURFACE__ /* bssrdf scatter to a different location on the same object */ if(sd.flag & SD_BSSRDF) { - kernel_branched_path_subsurface_scatter(kg, &sd, &emission_sd, &L, &state, - rng, &ray, throughput); + kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd, + &L, &state, rng, &ray, throughput); } #endif @@ -541,7 +546,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in /* indirect light */ kernel_branched_path_surface_indirect_light(kg, rng, - &sd, &emission_sd, throughput, 1.0f, &hit_state, &L); + &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L); /* continue in case of transparency */ throughput *= shader_bsdf_transparency(kg, &sd); -- cgit v1.2.3