diff options
author | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2016-05-22 23:35:47 +0300 |
---|---|---|
committer | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2016-05-23 23:29:24 +0300 |
commit | 999d5a67852b5958b9361c9888734ebc889e4a22 (patch) | |
tree | 5f3c5ad0409c77fc6ae3486420b3888fa1e2fea8 /intern/cycles/kernel/kernel_path.h | |
parent | af4a04eae07184f7437a8c51858a4ddb8a2e3e4c (diff) |
Cycles CUDA: reduce stack memory by reusing ShaderData.
Stack memory use is 57% less for path and 48% less for branched path.
Diffstat (limited to 'intern/cycles/kernel/kernel_path.h')
-rw-r--r-- | intern/cycles/kernel/kernel_path.h | 77 |
1 file changed, 44 insertions, 33 deletions
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index c136c85df59..5527d8aa861 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -53,6 +53,7 @@ CCL_NAMESPACE_BEGIN ccl_device void kernel_path_indirect(KernelGlobals *kg, + ShaderData *emission_sd, RNG *rng, Ray *ray, float3 throughput, @@ -60,6 +61,9 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, PathState *state, PathRadiance *L) { + /* shader data memory used for both volumes and surfaces, saves stack space */ + ShaderData sd; + /* path iteration */ for(;;) { /* intersect scene */ @@ -87,7 +91,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, /* intersect with lamp */ float3 emission; - if(indirect_lamp_emission(kg, state, &light_ray, &emission)) { + if(indirect_lamp_emission(kg, emission_sd, state, &light_ray, &emission)) { path_radiance_accum_emission(L, throughput, emission, @@ -115,15 +119,14 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, if(decoupled) { /* cache steps along volume for repeated sampling */ VolumeSegment volume_segment; - ShaderData volume_sd; shader_setup_from_volume(kg, - &volume_sd, + &sd, &volume_ray); kernel_volume_decoupled_record(kg, state, &volume_ray, - &volume_sd, + &sd, &volume_segment, heterogeneous); @@ -146,7 +149,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, /* direct light sampling */ kernel_branched_path_volume_connect_light(kg, rng, - &volume_sd, + &sd, + emission_sd, throughput, state, L, @@ -163,7 +167,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, result = kernel_volume_decoupled_scatter(kg, state, &volume_ray, - &volume_sd, + &sd, &throughput, rphase, rscatter, @@ -178,7 +182,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, if(result == VOLUME_PATH_SCATTERED) { if(kernel_path_volume_bounce(kg, rng, - &volume_sd, + &sd, &throughput, state, L, @@ -198,16 +202,16 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, # 
endif { /* integrate along volume segment with distance sampling */ - ShaderData volume_sd; VolumeIntegrateResult result = kernel_volume_integrate( - kg, state, &volume_sd, &volume_ray, L, &throughput, rng, heterogeneous); + kg, state, &sd, &volume_ray, L, &throughput, rng, heterogeneous); # ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* direct lighting */ kernel_path_volume_connect_light(kg, rng, - &volume_sd, + &sd, + emission_sd, throughput, state, L); @@ -215,7 +219,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, /* indirect light bounce */ if(kernel_path_volume_bounce(kg, rng, - &volume_sd, + &sd, &throughput, state, L, @@ -235,7 +239,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, if(!hit) { #ifdef __BACKGROUND__ /* sample background shader */ - float3 L_background = indirect_background(kg, state, ray); + float3 L_background = indirect_background(kg, emission_sd, state, ray); path_radiance_accum_background(L, throughput, L_background, @@ -246,7 +250,6 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, } /* setup shading */ - ShaderData sd; shader_setup_from_ray(kg, &sd, &isect, @@ -328,7 +331,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, light_ray.dP = sd.dP; light_ray.dD = differential3_zero(); - if(!shadow_blocked(kg, state, &light_ray, &ao_shadow)) { + if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { path_radiance_accum_ao(L, throughput, ao_alpha, @@ -378,6 +381,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, kernel_branched_path_surface_connect_light(kg, rng, &sd, + emission_sd, state, throughput, 1.0f, @@ -393,6 +397,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, + ShaderData *emission_sd, PathRadiance *L, PathState *state, RNG *rng, @@ -425,7 +430,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, light_ray.dP = ccl_fetch(sd, dP); light_ray.dD = 
differential3_zero(); - if(!shadow_blocked(kg, state, &light_ray, &ao_shadow)) + if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce); } } @@ -435,6 +440,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, ccl_device bool kernel_path_subsurface_scatter( KernelGlobals *kg, ShaderData *sd, + ShaderData *emission_sd, PathRadiance *L, PathState *state, RNG *rng, @@ -503,7 +509,7 @@ ccl_device bool kernel_path_subsurface_scatter( hit_L->direct_throughput = L->direct_throughput; path_radiance_copy_indirect(hit_L, L); - kernel_path_surface_connect_light(kg, rng, sd, *hit_tp, state, hit_L); + kernel_path_surface_connect_light(kg, rng, sd, emission_sd, *hit_tp, state, hit_L); if(kernel_path_surface_bounce(kg, rng, @@ -526,6 +532,7 @@ ccl_device bool kernel_path_subsurface_scatter( kernel_volume_stack_update_for_subsurface( kg, + emission_sd, &volume_ray, hit_state->volume_stack); } @@ -604,8 +611,13 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, path_radiance_init(&L, kernel_data.film.use_light_pass); + /* shader data memory used for both volumes and surfaces, saves stack space */ + ShaderData sd; + /* shader data used by emission, shadows, volume stacks */ + ShaderData emission_sd; + PathState state; - path_state_init(kg, &state, rng, sample, &ray); + path_state_init(kg, &emission_sd, &state, rng, sample, &ray); #ifdef __KERNEL_DEBUG__ DebugData debug_data; @@ -669,7 +681,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, /* intersect with lamp */ float3 emission; - if(indirect_lamp_emission(kg, &state, &light_ray, &emission)) + if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission)) path_radiance_accum_emission(&L, throughput, emission, state.bounce); } #endif @@ -689,11 +701,10 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, if(decoupled) { /* cache steps along volume for repeated 
sampling */ VolumeSegment volume_segment; - ShaderData volume_sd; - shader_setup_from_volume(kg, &volume_sd, &volume_ray); + shader_setup_from_volume(kg, &sd, &volume_ray); kernel_volume_decoupled_record(kg, &state, - &volume_ray, &volume_sd, &volume_segment, heterogeneous); + &volume_ray, &sd, &volume_segment, heterogeneous); volume_segment.sampling_method = sampling_method; @@ -708,8 +719,9 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, int all = false; /* direct light sampling */ - kernel_branched_path_volume_connect_light(kg, rng, &volume_sd, - throughput, &state, &L, all, &volume_ray, &volume_segment); + kernel_branched_path_volume_connect_light(kg, rng, &sd, + &emission_sd, throughput, &state, &L, all, + &volume_ray, &volume_segment); /* indirect sample. if we use distance sampling and take just * one sample for direct and indirect light, we could share @@ -718,7 +730,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE); result = kernel_volume_decoupled_scatter(kg, - &state, &volume_ray, &volume_sd, &throughput, + &state, &volume_ray, &sd, &throughput, rphase, rscatter, &volume_segment, NULL, true); } @@ -726,7 +738,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, kernel_volume_decoupled_free(kg, &volume_segment); if(result == VOLUME_PATH_SCATTERED) { - if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray)) + if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) continue; else break; @@ -739,17 +751,16 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, # endif { /* integrate along volume segment with distance sampling */ - ShaderData volume_sd; VolumeIntegrateResult result = kernel_volume_integrate( - kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng, heterogeneous); + kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous); # 
ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* direct lighting */ - kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L); + kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L); /* indirect light bounce */ - if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray)) + if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) continue; else break; @@ -772,7 +783,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #ifdef __BACKGROUND__ /* sample background shader */ - float3 L_background = indirect_background(kg, &state, &ray); + float3 L_background = indirect_background(kg, &emission_sd, &state, &ray); path_radiance_accum_background(&L, throughput, L_background, state.bounce); #endif @@ -780,7 +791,6 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, } /* setup shading */ - ShaderData sd; shader_setup_from_ray(kg, &sd, &isect, &ray); float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF); shader_eval_surface(kg, &sd, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN); @@ -848,7 +858,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_path_ao(kg, &sd, &L, &state, rng, throughput); + kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput); } #endif @@ -858,6 +868,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, if(sd.flag & SD_BSSRDF) { if(kernel_path_subsurface_scatter(kg, &sd, + &emission_sd, &L, &state, rng, @@ -871,7 +882,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #endif /* __SUBSURFACE__ */ /* direct lighting */ - kernel_path_surface_connect_light(kg, rng, &sd, throughput, &state, &L); + kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L); /* compute direct lighting and 
next bounce */ if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) |