From 915766f42df56a0fee47e8eb21f8b024f01afe18 Mon Sep 17 00:00:00 2001
From: Mai Lavelle
Date: Mon, 20 Mar 2017 22:31:54 -0400
Subject: Cycles: Branched path tracing for the split kernel

This implements branched path tracing for the split kernel.

The general approach is to store the ray state at a branch point, trace the
branched ray as normal, then restore the state as necessary before iterating
to the next part of the path. A state machine is used to advance the indirect
loop state, which avoids the need to add any new kernels. Each iteration, the
state machine recreates as much state as possible from the stored ray to keep
overall storage down.

It's hard to keep all the different integration loops in sync, so this needs
lots of testing to make sure everything is working correctly. We should
probably start trying to deduplicate the integration loops more now.

Non-branched BMW is ~2% slower, while classroom is ~2% faster; other scenes
still need more testing.

Reviewers: sergey, nirved

Reviewed By: nirved

Subscribers: Blendify, bliblubli

Differential Revision: https://developer.blender.org/D2611
---
 .../cycles/kernel/split/kernel_direct_lighting.h   | 66 +++++++++++-----------
 1 file changed, 32 insertions(+), 34 deletions(-)

(limited to 'intern/cycles/kernel/split/kernel_direct_lighting.h')

diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
index bdbf7387b95..3336c968a44 100644
--- a/intern/cycles/kernel/split/kernel_direct_lighting.h
+++ b/intern/cycles/kernel/split/kernel_direct_lighting.h
@@ -56,23 +56,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
                               kernel_split_params.queue_size,
                               0);
 
-#ifdef __COMPUTE_DEVICE_GPU__
-    /* If we are executing on a GPU device, we exit all threads that are not
-     * required.
-     *
-     * If we are executing on a CPU device, then we need to keep all threads
-     * active since we have barrier() calls later in the kernel. CPU devices,
-     * expect all threads to execute barrier statement.
-     */
-    if(ray_index == QUEUE_EMPTY_SLOT) {
-        return;
-    }
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
-    if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-
     if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
         ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
         ShaderData *sd = &kernel_split_state.sd[ray_index];
@@ -80,25 +63,24 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
         /* direct lighting */
 #ifdef __EMISSION__
         RNG rng = kernel_split_state.rng[ray_index];
+
         bool flag = (kernel_data.integrator.use_direct_light &&
                      (sd->flag & SD_BSDF_HAS_EVAL));
+
+#  ifdef __BRANCHED_PATH__
+        if(flag && kernel_data.integrator.branched) {
+            flag = false;
+            enqueue_flag = 1;
+        }
+#  endif  /* __BRANCHED_PATH__ */
+
 #  ifdef __SHADOW_TRICKS__
         if(flag && state->flag & PATH_RAY_SHADOW_CATCHER) {
             flag = false;
-            ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
-            float3 throughput = kernel_split_state.throughput[ray_index];
-            PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-            kernel_branched_path_surface_connect_light(kg,
-                                                       &rng,
-                                                       sd,
-                                                       emission_sd,
-                                                       state,
-                                                       throughput,
-                                                       1.0f,
-                                                       L,
-                                                       1);
+            enqueue_flag = 1;
         }
 #  endif  /* __SHADOW_TRICKS__ */
+
         if(flag) {
             /* Sample illumination from lights to find path contribution. */
             float light_t = path_state_rng_1D(kg, &rng, state, PRNG_LIGHT);
@@ -129,7 +111,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
                 kernel_split_state.bsdf_eval[ray_index] = L_light;
                 kernel_split_state.is_lamp[ray_index] = is_lamp;
                 /* Mark ray state for next shadow kernel. */
-                ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
                 enqueue_flag = 1;
             }
         }
@@ -138,10 +119,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
 #endif  /* __EMISSION__ */
     }
 
-#ifndef __COMPUTE_DEVICE_GPU__
-    }
-#endif
-
 #ifdef __EMISSION__
     /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
     enqueue_ray_index_local(ray_index,
@@ -152,6 +129,27 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
                             kernel_split_state.queue_data,
                             kernel_split_params.queue_index);
 #endif
+
+#ifdef __BRANCHED_PATH__
+    /* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays
+     * this is the last kernel before next_iteration_setup that uses local atomics so we do this here
+     */
+    ccl_barrier(CCL_LOCAL_MEM_FENCE);
+    if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+        *local_queue_atomics = 0;
+    }
+    ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+    ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+    enqueue_ray_index_local(ray_index,
+                            QUEUE_LIGHT_INDIRECT_ITER,
+                            IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER),
+                            kernel_split_params.queue_size,
+                            local_queue_atomics,
+                            kernel_split_state.queue_data,
+                            kernel_split_params.queue_index);
+
+#endif  /* __BRANCHED_PATH__ */
 }
 
 CCL_NAMESPACE_END
--
cgit v1.2.3
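To make the store/trace/restore state machine described in the commit message
concrete, here is a minimal host-side C sketch of the pattern. Every name in
it (BranchStage, StoredRay, branch_state_advance) is a hypothetical stand-in
for illustration only; the real kernels instead drive kernel_split_state
through next_iteration_setup and the QUEUE_LIGHT_INDIRECT_ITER enqueue added
in this patch.

/* Minimal sketch of an indirect-loop state machine; all names here are
 * hypothetical stand-ins, not actual Cycles types or fields. */
#include <stdio.h>

typedef enum {
    STAGE_LIGHT_INDIRECT,    /* branched light sampling iterations */
    STAGE_SURFACE_INDIRECT,  /* branched BSDF sampling iterations */
    STAGE_DONE
} BranchStage;

typedef struct {
    float origin[3];     /* just enough of the ray to recreate the rest */
    float direction[3];
    int sample;          /* current branch sample at this bounce */
    int num_samples;     /* total branch samples at this bounce */
    BranchStage stage;
} StoredRay;

/* Advance to the next branch sample, or to the next stage once the current
 * stage's samples are exhausted. Returns 0 when the indirect loop is done. */
static int branch_state_advance(StoredRay *stored)
{
    if(++stored->sample < stored->num_samples) {
        return 1;  /* more samples in this stage: restore and trace again */
    }
    stored->sample = 0;
    stored->stage = (stored->stage == STAGE_LIGHT_INDIRECT)
                        ? STAGE_SURFACE_INDIRECT
                        : STAGE_DONE;
    return stored->stage != STAGE_DONE;
}

int main(void)
{
    StoredRay stored = {{0.0f, 0.0f, 0.0f}, {0.0f, 0.0f, 1.0f},
                        0, 2, STAGE_LIGHT_INDIRECT};
    do {
        /* restore as much ray state as possible from `stored`, then trace
         * the branched ray as normal (stubbed out as a print here) */
        printf("stage %d, sample %d\n", stored.stage, stored.sample);
    } while(branch_state_advance(&stored));
    return 0;
}

The property mirrored here is the one the commit message relies on: each
iteration recreates its position in the loop from the small stored record
rather than keeping full per-branch state resident, which is what keeps
overall storage down without adding new kernels.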