diff options
author | Mai Lavelle <mai.lavelle@gmail.com> | 2017-03-21 05:31:54 +0300 |
---|---|---|
committer | Mai Lavelle <mai.lavelle@gmail.com> | 2017-05-02 21:26:46 +0300 |
commit | 915766f42df56a0fee47e8eb21f8b024f01afe18 (patch) | |
tree | 52b29b9ff2385e42221e2d39741a220354937e63 /intern/cycles/kernel/split/kernel_next_iteration_setup.h | |
parent | 89b1805df624bf665336baaf51650928c556456b (diff) |
Cycles: Branched path tracing for the split kernel
This implements branched path tracing for the split kernel.
General approach is to store the ray state at a branch point, trace the
branched ray as normal, then restore the state as necessary before iterating
to the next part of the path. A state machine is used to advance the indirect
loop state, which avoids the need to add any new kernels. Each iteration the
state machine recreates as much state as possible from the stored ray to keep
overall storage down.
It's kind of hard to keep all the different integration loops in sync, so this
needs lots of testing to make sure everything is working correctly. We should
probably start trying to deduplicate the integration loops more now.
Non-branched BMW is ~2% slower, while classroom is ~2% faster; other scenes
could use more testing still.
Reviewers: sergey, nirved
Reviewed By: nirved
Subscribers: Blendify, bliblubli
Differential Revision: https://developer.blender.org/D2611
Diffstat (limited to 'intern/cycles/kernel/split/kernel_next_iteration_setup.h')
-rw-r--r-- | intern/cycles/kernel/split/kernel_next_iteration_setup.h | 220 |
1 files changed, 144 insertions, 76 deletions
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h index 1bebc16e25b..71017fed19e 100644 --- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h +++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h @@ -44,6 +44,52 @@ CCL_NAMESPACE_BEGIN * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with * RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays. */ + +#ifdef __BRANCHED_PATH__ +ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index) +{ + kernel_split_branched_path_indirect_loop_init(kg, ray_index); + + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT); +} + +ccl_device void kernel_split_branched_indirect_light_end(KernelGlobals *kg, int ray_index) +{ + kernel_split_branched_path_indirect_loop_end(kg, ray_index); + + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + + /* continue in case of transparency */ + *throughput *= shader_bsdf_transparency(kg, sd); + + if(is_zero(*throughput)) { + kernel_split_path_end(kg, ray_index); + } + else { + /* Update Path State */ + state->flag |= PATH_RAY_TRANSPARENT; + state->transparent_bounce++; + + ray->P = ray_offset(sd->P, -sd->Ng); + ray->t -= sd->ray_length; /* clipping works through transparent */ + +# ifdef __RAY_DIFFERENTIALS__ + ray->dP = sd->dP; + ray->dD.dx = -sd->dI.dx; + ray->dD.dy = -sd->dI.dy; +# endif /* __RAY_DIFFERENTIALS__ */ + +# ifdef __VOLUME__ + /* enter/exit volume */ + kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); +# endif /* __VOLUME__ */ + } +} +#endif /* __BRANCHED_PATH__ */ + ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, ccl_local_param unsigned int *local_queue_atomics) { @@ -67,7 
+113,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; } - char enqueue_flag = 0; int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); ray_index = get_ray_index(kg, ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, @@ -75,102 +120,125 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, kernel_split_params.queue_size, 0); -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - - /* Load ShaderData structure. */ - PathRadiance *L = NULL; - ccl_global PathState *state = NULL; ccl_global char *ray_state = kernel_split_state.ray_state; - /* Path radiance update for AO/Direct_lighting's shadow blocked. 
*/ - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || - IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) - { - state = &kernel_split_state.path_state[ray_index]; - L = &kernel_split_state.path_radiance[ray_index]; - float3 _throughput = kernel_split_state.throughput[ray_index]; - - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { - float3 shadow = kernel_split_state.ao_light_ray[ray_index].P; - // TODO(mai): investigate correctness here - char update_path_radiance = (char)kernel_split_state.ao_light_ray[ray_index].t; - if(update_path_radiance) { - path_radiance_accum_ao(L, - _throughput, - kernel_split_state.ao_alpha[ray_index], - kernel_split_state.ao_bsdf[ray_index], - shadow, - state->bounce); - } - else { - path_radiance_accum_total_ao(L, _throughput, kernel_split_state.ao_bsdf[ray_index]); + bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE); + if(active) { + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { +#endif + /* Compute direct lighting and next bounce. 
*/ + if(!kernel_path_surface_bounce(kg, &rng, sd, throughput, state, L, ray)) { + kernel_split_path_end(kg, ray_index); } - REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); +#ifdef __BRANCHED_PATH__ } - - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) { - float3 shadow = kernel_split_state.light_ray[ray_index].P; - // TODO(mai): investigate correctness here - char update_path_radiance = (char)kernel_split_state.light_ray[ray_index].t; - BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index]; - if(update_path_radiance) { - path_radiance_accum_light(L, - _throughput, - &L_light, - shadow, - 1.0f, - state->bounce, - kernel_split_state.is_lamp[ray_index]); + else { + kernel_split_branched_indirect_light_init(kg, ray_index); + + if(kernel_split_branched_path_surface_indirect_light_iter(kg, + ray_index, + 1.0f, + &kernel_split_state.branched_state[ray_index].sd, + true)) + { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); } else { - path_radiance_accum_total_light(L, _throughput, &L_light); + kernel_split_branched_indirect_light_end(kg, ray_index); } - REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); } +#endif /* __BRANCHED_PATH__ */ + + kernel_split_state.rng[ray_index] = rng; } - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; - state = &kernel_split_state.path_state[ray_index]; - L = &kernel_split_state.path_radiance[ray_index]; + /* Enqueue RAY_UPDATE_BUFFER rays. 
*/ + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +#ifdef __BRANCHED_PATH__ + /* iter loop */ + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0; + } - /* Compute direct lighting and next bounce. */ - if(!kernel_path_surface_bounce(kg, &rng, &kernel_split_state.sd[ray_index], throughput, state, L, ray)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), + QUEUE_LIGHT_INDIRECT_ITER, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + + if(IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) { + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); + + if(kernel_split_branched_path_surface_indirect_light_iter(kg, + ray_index, + 1.0f, + &kernel_split_state.branched_state[ray_index].sd, + true)) + { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + else { + kernel_split_branched_indirect_light_end(kg, ray_index); } - kernel_split_state.rng[ray_index] = rng; } -#ifndef __COMPUTE_DEVICE_GPU__ +# ifdef __VOLUME__ + /* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; } -#endif + ccl_barrier(CCL_LOCAL_MEM_FENCE); - /* Enqueue RAY_UPDATE_BUFFER rays. 
*/ + ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, + QUEUE_VOLUME_INDIRECT_ITER, + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER), + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +# endif /* __VOLUME__ */ + +# ifdef __SUBSURFACE__ + /* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + enqueue_ray_index_local(ray_index, + QUEUE_SUBSURFACE_INDIRECT_ITER, + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER), kernel_split_params.queue_size, local_queue_atomics, kernel_split_state.queue_data, kernel_split_params.queue_index); +# endif /* __SUBSURFACE__ */ +#endif /* __BRANCHED_PATH__ */ } CCL_NAMESPACE_END |