diff options
author | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2017-09-13 01:12:51 +0300 |
---|---|---|
committer | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2017-09-13 16:24:14 +0300 |
commit | c4c450045d072c79d02a1857f56ecf94689375c8 (patch) | |
tree | dca97bb0b1aa073f55fe599d1306351eae0b7461 | |
parent | 76e7c039ec501e64f55b293a17aee1968cdb7aeb (diff) |
Code cleanup: tweak inlining for 2% better CUDA performance with hair.
-rw-r--r-- | intern/cycles/kernel/kernel_path.h | 191 |
1 files changed, 100 insertions, 91 deletions
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index bc8f8ec09f8..bfde96ec270 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -424,26 +424,18 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, #endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */ -ccl_device_inline void kernel_path_integrate(KernelGlobals *kg, - uint rng_hash, - int sample, - Ray ray, - ccl_global float *buffer, - PathRadiance *L, - bool *is_shadow_catcher) +ccl_device_forceinline void kernel_path_integrate( + KernelGlobals *kg, + PathState *state, + float3 throughput, + Ray *ray, + PathRadiance *L, + float *buffer, + ShaderData *emission_sd, + bool *is_shadow_catcher) { - /* initialize */ - float3 throughput = make_float3(1.0f, 1.0f, 1.0f); - - path_radiance_init(L, kernel_data.film.use_light_pass); - - /* shader data memory used for both volumes and surfaces, saves stack space */ + /* Shader data memory used for both volumes and surfaces, saves stack space. */ ShaderData sd; - /* shader data used by emission, shadows, volume stacks */ - ShaderData emission_sd; - - PathState state; - path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray); #ifdef __SUBSURFACE__ SubsurfaceIndirectRays ss_indirect; @@ -456,40 +448,40 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg, for(;;) { /* intersect scene */ Intersection isect; - uint visibility = path_state_ray_visibility(kg, &state); + uint visibility = path_state_ray_visibility(kg, state); #ifdef __HAIR__ float difl = 0.0f, extmax = 0.0f; uint lcg_state = 0; if(kernel_data.bvh.have_curves) { - if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) { - float3 pixdiff = ray.dD.dx + ray.dD.dy; - /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/ + if((kernel_data.cam.resolution == 1) && (state->flag & PATH_RAY_CAMERA)) { + float3 pixdiff = ray->dD.dx + ray->dD.dy; + /*pixdiff = pixdiff - dot(pixdiff, ray->D)*ray->D;*/ difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f; } extmax = kernel_data.curve.maximum_width; - lcg_state = lcg_state_init(&state, 0x51633e2d); + lcg_state = lcg_state_init(state, 0x51633e2d); } - if(path_state_ao_bounce(kg, &state)) { + if(path_state_ao_bounce(kg, state)) { visibility = PATH_RAY_SHADOW; - ray.t = kernel_data.background.ao_distance; + ray->t = kernel_data.background.ao_distance; } - bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax); + bool hit = scene_intersect(kg, *ray, visibility, &isect, &lcg_state, difl, extmax); #else if(path_state_ao_bounce(kg, state)) { visibility = PATH_RAY_SHADOW; - ray.t = kernel_data.background.ao_distance; + ray->t = kernel_data.background.ao_distance; } bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f); #endif /* __HAIR__ */ #ifdef __KERNEL_DEBUG__ - if(state.flag & PATH_RAY_CAMERA) { + if(state->flag & PATH_RAY_CAMERA) { L->debug_data.num_bvh_traversed_nodes += isect.num_traversed_nodes; L->debug_data.num_bvh_traversed_instances += isect.num_traversed_instances; L->debug_data.num_bvh_intersections += isect.num_intersections; @@ -498,40 +490,40 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg, #endif /* __KERNEL_DEBUG__ */ #ifdef __LAMP_MIS__ - if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) { + if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) { /* ray starting from previous non-transparent bounce */ Ray light_ray; - light_ray.P = ray.P - state.ray_t*ray.D; - state.ray_t += isect.t; - light_ray.D = ray.D; - light_ray.t = state.ray_t; - light_ray.time = ray.time; - light_ray.dD = ray.dD; - light_ray.dP = ray.dP; + light_ray.P = ray->P - state->ray_t*ray->D; + state->ray_t += isect.t; + light_ray.D = ray->D; + light_ray.t = state->ray_t; + light_ray.time = ray->time; + light_ray.dD = ray->dD; + light_ray.dP = ray->dP; /* intersect with lamp */ float3 emission; - if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission)) - path_radiance_accum_emission(L, throughput, emission, state.bounce); + if(indirect_lamp_emission(kg, emission_sd, state, &light_ray, &emission)) + path_radiance_accum_emission(L, throughput, emission, state->bounce); } #endif /* __LAMP_MIS__ */ #ifdef __VOLUME__ /* Sanitize volume stack. */ if(!hit) { - kernel_volume_clean_stack(kg, state.volume_stack); + kernel_volume_clean_stack(kg, state->volume_stack); } /* volume attenuation, emission, scatter */ - if(state.volume_stack[0].shader != SHADER_NONE) { - Ray volume_ray = ray; + if(state->volume_stack[0].shader != SHADER_NONE) { + Ray volume_ray = *ray; volume_ray.t = (hit)? isect.t: FLT_MAX; - bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); + bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack); # ifdef __VOLUME_DECOUPLED__ - int sampling_method = volume_stack_sampling_method(kg, state.volume_stack); + int sampling_method = volume_stack_sampling_method(kg, state->volume_stack); bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method); if(decoupled) { @@ -539,14 +531,14 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg, VolumeSegment volume_segment; shader_setup_from_volume(kg, &sd, &volume_ray); - kernel_volume_decoupled_record(kg, &state, + kernel_volume_decoupled_record(kg, state, &volume_ray, &sd, &volume_segment, heterogeneous); volume_segment.sampling_method = sampling_method; /* emission */ if(volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce); + path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state->bounce); /* scattering */ VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; @@ -556,17 +548,17 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg, /* direct light sampling */ kernel_branched_path_volume_connect_light(kg, &sd, - &emission_sd, throughput, &state, L, all, + emission_sd, throughput, state, L, all, &volume_ray, &volume_segment); /* indirect sample. if we use distance sampling and take just * one sample for direct and indirect light, we could share * this computation, but makes code a bit complex */ - float rphase = path_state_rng_1D_for_decision(kg, &state, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, &state, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D_for_decision(kg, state, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, state, PRNG_SCATTER_DISTANCE); result = kernel_volume_decoupled_scatter(kg, - &state, &volume_ray, &sd, &throughput, + state, &volume_ray, &sd, &throughput, rphase, rscatter, &volume_segment, NULL, true); } @@ -574,7 +566,7 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg, kernel_volume_decoupled_free(kg, &volume_segment); if(result == VOLUME_PATH_SCATTERED) { - if(kernel_path_volume_bounce(kg, &sd, &throughput, &state, L, &ray)) + if(kernel_path_volume_bounce(kg, &sd, &throughput, state, L, ray)) continue; else break; @@ -588,15 +580,15 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg, { /* integrate along volume segment with distance sampling */ VolumeIntegrateResult result = kernel_volume_integrate( - kg, &state, &sd, &volume_ray, L, &throughput, heterogeneous); + kg, state, &sd, &volume_ray, L, &throughput, heterogeneous); # ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* direct lighting */ - kernel_path_volume_connect_light(kg, &sd, &emission_sd, throughput, &state, L); + kernel_path_volume_connect_light(kg, &sd, emission_sd, throughput, state, L); /* indirect light bounce */ - if(kernel_path_volume_bounce(kg, &sd, &throughput, &state, L, &ray)) + if(kernel_path_volume_bounce(kg, &sd, &throughput, state, L, ray)) continue; else break; @@ -608,7 +600,7 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg, if(!hit) { /* eval background shader if nothing hit */ - if(kernel_data.background.transparent && (state.flag & PATH_RAY_CAMERA)) { + if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) { L->transparent += average(throughput); #ifdef __PASSES__ @@ -619,35 +611,35 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg, #ifdef __BACKGROUND__ /* sample background shader */ - float3 L_background = indirect_background(kg, &emission_sd, &state, &ray); - path_radiance_accum_background(L, &state, throughput, L_background); + float3 L_background = indirect_background(kg, emission_sd, state, ray); + path_radiance_accum_background(L, state, throughput, L_background); #endif /* __BACKGROUND__ */ break; } - else if(path_state_ao_bounce(kg, &state)) { + else if(path_state_ao_bounce(kg, state)) { break; } /* setup shading */ - shader_setup_from_ray(kg, &sd, &isect, &ray); - float rbsdf = path_state_rng_1D_for_decision(kg, &state, PRNG_BSDF); - shader_eval_surface(kg, &sd, &state, rbsdf, state.flag); + shader_setup_from_ray(kg, &sd, &isect, ray); + float rbsdf = path_state_rng_1D_for_decision(kg, state, PRNG_BSDF); + shader_eval_surface(kg, &sd, state, rbsdf, state->flag); #ifdef __SHADOW_TRICKS__ if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) { - if(state.flag & PATH_RAY_CAMERA) { - state.flag |= (PATH_RAY_SHADOW_CATCHER | + if(state->flag & PATH_RAY_CAMERA) { + state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_STORE_SHADOW_INFO); if(!kernel_data.background.transparent) { L->shadow_background_color = - indirect_background(kg, &emission_sd, &state, &ray); + indirect_background(kg, emission_sd, state, ray); } L->shadow_radiance_sum = path_radiance_clamp_and_sum(kg, L); L->shadow_throughput = average(throughput); } } - else if(state.flag & PATH_RAY_SHADOW_CATCHER) { + else if(state->flag & PATH_RAY_SHADOW_CATCHER) { /* Only update transparency after shadow catcher bounce. */ L->shadow_transparency *= average(shader_bsdf_transparency(kg, &sd)); @@ -658,7 +650,7 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg, #ifdef __HOLDOUT__ if(((sd.flag & SD_HOLDOUT) || (sd.object_flag & SD_OBJECT_HOLDOUT_MASK)) && - (state.flag & PATH_RAY_CAMERA)) + (state->flag & PATH_RAY_CAMERA)) { if(kernel_data.background.transparent) { float3 holdout_weight; @@ -679,12 +671,12 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg, #endif /* __HOLDOUT__ */ /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput); + kernel_write_data_passes(kg, buffer, L, &sd, state->sample, state, throughput); /* blurring of bsdf after bounces, for rays that have a small likelihood * of following this particular path (diffuse, rough glossy) */ if(kernel_data.integrator.filter_glossy != FLT_MAX) { - float blur_pdf = kernel_data.integrator.filter_glossy*state.min_ray_pdf; + float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf; if(blur_pdf < 1.0f) { float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; @@ -696,33 +688,33 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg, /* emission */ if(sd.flag & SD_EMISSION) { /* todo: is isect.t wrong here for transparent surfaces? */ - float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); - path_radiance_accum_emission(L, throughput, emission, state.bounce); + float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state->flag, state->ray_pdf); + path_radiance_accum_emission(L, throughput, emission, state->bounce); } #endif /* __EMISSION__ */ /* path termination. this is a strange place to put the termination, it's * mainly due to the mixed in MIS that we use. gives too many unneeded * shader evaluations, only need emission if we are going to terminate */ - float probability = path_state_continuation_probability(kg, &state, throughput); + float probability = path_state_continuation_probability(kg, state, throughput); if(probability == 0.0f) { break; } else if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, &state, PRNG_TERMINATE); + float terminate = path_state_rng_1D_for_decision(kg, state, PRNG_TERMINATE); if(terminate >= probability) break; throughput /= probability; } - kernel_update_denoising_features(kg, &sd, &state, L); + kernel_update_denoising_features(kg, &sd, state, L); #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_path_ao(kg, &sd, &emission_sd, L, &state, throughput, shader_bsdf_alpha(kg, &sd)); + kernel_path_ao(kg, &sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, &sd)); } #endif /* __AO__ */ @@ -732,10 +724,10 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg, if(sd.flag & SD_BSSRDF) { if(kernel_path_subsurface_scatter(kg, &sd, - &emission_sd, + emission_sd, L, - &state, - &ray, + state, + ray, &throughput, &ss_indirect)) { @@ -745,10 +737,10 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg, #endif /* __SUBSURFACE__ */ /* direct lighting */ - kernel_path_surface_connect_light(kg, &sd, &emission_sd, throughput, &state, L); + kernel_path_surface_connect_light(kg, &sd, emission_sd, throughput, state, L); /* compute direct lighting and next bounce */ - if(!kernel_path_surface_bounce(kg, &sd, &throughput, &state, L, &ray)) + if(!kernel_path_surface_bounce(kg, &sd, &throughput, state, L, ray)) break; } @@ -761,8 +753,8 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg, if(ss_indirect.num_rays) { kernel_path_subsurface_setup_indirect(kg, &ss_indirect, - &state, - &ray, + state, + ray, L, &throughput); } @@ -773,7 +765,7 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg, #endif /* __SUBSURFACE__ */ #ifdef __SHADOW_TRICKS__ - *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER) != 0; + *is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER) != 0; #endif /* __SHADOW_TRICKS__ */ } @@ -788,23 +780,40 @@ ccl_device void kernel_path_trace(KernelGlobals *kg, rng_state += index; buffer += index*pass_stride; - /* initialize random numbers and ray */ + /* Initialize random numbers and sample ray. */ uint rng_hash; Ray ray; kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng_hash, &ray); - /* integrate */ - PathRadiance L; - bool is_shadow_catcher; - - if(ray.t != 0.0f) { - kernel_path_integrate(kg, rng_hash, sample, ray, buffer, &L, &is_shadow_catcher); - kernel_write_result(kg, buffer, sample, &L, is_shadow_catcher); - } - else { + if(ray.t == 0.0f) { kernel_write_result(kg, buffer, sample, NULL, false); + return; } + + /* Initialize state. */ + float3 throughput = make_float3(1.0f, 1.0f, 1.0f); + + PathRadiance L; + path_radiance_init(&L, kernel_data.film.use_light_pass); + + ShaderData emission_sd; + + PathState state; + path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray); + + /* Integrate. */ + bool is_shadow_catcher; + kernel_path_integrate(kg, + &state, + throughput, + &ray, + &L, + buffer, + &emission_sd, + &is_shadow_catcher); + + kernel_write_result(kg, buffer, sample, &L, is_shadow_catcher); } #endif /* __SPLIT_KERNEL__ */ |