diff options
25 files changed, 1043 insertions, 389 deletions
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 19ab9053e9a..ab1fe8924bc 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -78,7 +78,7 @@ def use_cuda(context): def use_branched_path(context): cscene = context.scene.cycles - return (cscene.progressive == 'BRANCHED_PATH' and not use_opencl(context)) + return (cscene.progressive == 'BRANCHED_PATH') def use_sample_all_lights(context): @@ -156,7 +156,6 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel): row = layout.row() sub = row.row() - sub.active = get_device_type(context) != 'OPENCL' or use_cpu(context) sub.prop(cscene, "progressive", text="") row.prop(cscene, "use_square_samples") diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp index 71d52bb8097..abd26f5be49 100644 --- a/intern/cycles/device/device_split_kernel.cpp +++ b/intern/cycles/device/device_split_kernel.cpp @@ -240,6 +240,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size); ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size); ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size); diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 3750225571d..a92e8bc4aee 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -235,6 +235,7 @@ set(SRC_UTIL_HEADERS ) set(SRC_SPLIT_HEADERS + split/kernel_branched.h split/kernel_buffer_update.h split/kernel_data_init.h split/kernel_direct_lighting.h diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index e7957042182..58da141aed3 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -58,7 +58,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, PathRadiance *L, - PathState *state, + ccl_addr_space PathState *state, RNG *rng, float3 throughput, float3 ao_alpha) @@ -98,6 +98,8 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, } } +#ifndef __SPLIT_KERNEL__ + ccl_device void kernel_path_indirect(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, @@ -818,5 +820,7 @@ ccl_device void kernel_path_trace(KernelGlobals *kg, path_rng_end(kg, rng_state, rng); } +#endif /* __SPLIT_KERNEL__ */ + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h index 085eb42325d..ddcb57161ea 100644 --- a/intern/cycles/kernel/kernel_path_branched.h +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -22,7 +22,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, PathRadiance *L, - PathState *state, + ccl_addr_space PathState *state, RNG *rng, float3 throughput) { @@ -65,6 +65,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, } } +#ifndef __SPLIT_KERNEL__ /* bounce off surface and integrate indirect light */ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg, @@ -648,6 +649,8 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg, path_rng_end(kg, rng_state, rng); } +#endif /* __SPLIT_KERNEL__ */ + #endif /* __BRANCHED_PATH__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h index 076c82f3853..bd4ba775b4d 100644 --- a/intern/cycles/kernel/kernel_path_surface.h +++ b/intern/cycles/kernel/kernel_path_surface.h @@ -155,7 +155,7 @@ ccl_device bool kernel_branched_path_surface_bounce( ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, PathRadiance *L, - Ray *ray) + ccl_addr_space Ray *ray) { /* sample BSDF */ float bsdf_pdf; diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h index baf629342b9..274713addc2 100644 --- a/intern/cycles/kernel/kernel_subsurface.h +++ b/intern/cycles/kernel/kernel_subsurface.h @@ -417,9 +417,8 @@ ccl_device_noinline void subsurface_scatter_multi_setup( subsurface_scatter_setup_diffuse_bsdf(sd, sc, weight, true, N); } -#ifndef __SPLIT_KERNEL__ /* subsurface scattering step, from a point on the surface to another nearby point on the same object */ -ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathState *state, +ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_global PathState *state, int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); @@ -507,7 +506,6 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS /* setup diffuse bsdf */ subsurface_scatter_setup_diffuse_bsdf(sd, sc, eval, (ss_isect.num_hits > 0), N); } -#endif /* ! __SPLIT_KERNEL__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index b1269cdb6b4..6417f621c8f 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -71,22 +71,18 @@ CCL_NAMESPACE_BEGIN # endif # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# ifndef __SPLIT_KERNEL__ -# define __BRANCHED_PATH__ -# endif +# define __BRANCHED_PATH__ # ifdef WITH_OSL # define __OSL__ # endif -# define __SUBSURFACE__ # define __PRINCIPLED__ +# define __SUBSURFACE__ # define __CMJ__ # define __VOLUME__ # define __VOLUME_SCATTER__ # define __SHADOW_RECORD_ALL__ -# ifndef __SPLIT_KERNEL__ -# define __VOLUME_DECOUPLED__ -# define __VOLUME_RECORD_ALL__ -# endif +# define __VOLUME_DECOUPLED__ +# define __VOLUME_RECORD_ALL__ #endif /* __KERNEL_CPU__ */ #ifdef __KERNEL_CUDA__ @@ -138,6 +134,7 @@ CCL_NAMESPACE_BEGIN # define __VOLUME_SCATTER__ # define __SHADOW_RECORD_ALL__ # define __CMJ__ +# define __BRANCHED_PATH__ # endif /* __KERNEL_OPENCL_AMD__ */ # ifdef __KERNEL_OPENCL_INTEL_CPU__ @@ -1300,7 +1297,6 @@ typedef ccl_addr_space struct DebugData { * Queue 3 - Shadow ray cast kernel - AO * Queeu 4 - Shadow ray cast kernel - direct lighting */ -#define NUM_QUEUES 4 /* Queue names */ enum QueueNumber { @@ -1313,22 +1309,37 @@ enum QueueNumber { * 3. Rays to be regenerated * are enqueued here. */ - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS = 1, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, /* All rays for which a shadow ray should be cast to determine radiance * contribution for AO are enqueued here. */ - QUEUE_SHADOW_RAY_CAST_AO_RAYS = 2, + QUEUE_SHADOW_RAY_CAST_AO_RAYS, /* All rays for which a shadow ray should be cast to determine radiance * contributing for direct lighting are enqueued here. */ - QUEUE_SHADOW_RAY_CAST_DL_RAYS = 3, + QUEUE_SHADOW_RAY_CAST_DL_RAYS, + +#ifdef __BRANCHED_PATH__ + /* All rays moving to next iteration of the indirect loop for light */ + QUEUE_LIGHT_INDIRECT_ITER, +# ifdef __VOLUME__ + /* All rays moving to next iteration of the indirect loop for volumes */ + QUEUE_VOLUME_INDIRECT_ITER, +# endif +# ifdef __SUBSURFACE__ + /* All rays moving to next iteration of the indirect loop for subsurface */ + QUEUE_SUBSURFACE_INDIRECT_ITER, +# endif +#endif /* __BRANCHED_PATH__ */ + + NUM_QUEUES }; -/* We use RAY_STATE_MASK to get ray_state (enums 0 to 5) */ -#define RAY_STATE_MASK 0x007 -#define RAY_FLAG_MASK 0x0F8 +/* We use RAY_STATE_MASK to get ray_state */ +#define RAY_STATE_MASK 0x0F +#define RAY_FLAG_MASK 0xF0 enum RayState { RAY_INVALID = 0, /* Denotes ray is actively involved in path-iteration. */ @@ -1343,14 +1354,22 @@ enum RayState { RAY_TO_REGENERATE, /* Denotes ray has been regenerated */ RAY_REGENERATED, - /* Flag's ray has to execute shadow blocked function in AO part */ - RAY_SHADOW_RAY_CAST_AO = 16, - /* Flag's ray has to execute shadow blocked function in direct lighting part. */ - RAY_SHADOW_RAY_CAST_DL = 32, + /* Denotes ray is moving to next iteration of the branched indirect loop */ + RAY_LIGHT_INDIRECT_NEXT_ITER, + RAY_VOLUME_INDIRECT_NEXT_ITER, + RAY_SUBSURFACE_INDIRECT_NEXT_ITER, + + /* Ray flags */ + + /* Flags to denote that the ray is currently evaluating the branched indirect loop */ + RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4), + RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5), + RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6), + RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT | RAY_BRANCHED_SUBSURFACE_INDIRECT), }; #define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state)) -#define IS_STATE(ray_state, ray_index, state) ((ray_state[ray_index] & RAY_STATE_MASK) == state) +#define IS_STATE(ray_state, ray_index, state) ((ray_index) != QUEUE_EMPTY_SLOT && ((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state)) #define ADD_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] | flag)) #define REMOVE_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] & (~flag))) #define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag) diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index 148b2eef568..96f54bb427e 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -183,7 +183,7 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu index a679eff8409..585b91876a9 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -110,7 +110,7 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl index 7a1838e485f..99b74a1802b 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl @@ -22,6 +22,5 @@ __kernel void kernel_ocl_path_trace_subsurface_scatter( ccl_global char *kg, ccl_constant KernelData *data) { - ccl_local unsigned int local_queue_atomics; - kernel_subsurface_scatter((KernelGlobals*)kg, &local_queue_atomics); + kernel_subsurface_scatter((KernelGlobals*)kg); } diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h new file mode 100644 index 00000000000..c7bc1b4df0a --- /dev/null +++ b/intern/cycles/kernel/split/kernel_branched.h @@ -0,0 +1,150 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +#ifdef __BRANCHED_PATH__ + +/* sets up the various state needed to do an indirect loop */ +ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + /* save a copy of the state to restore later */ +#define BRANCHED_STORE(name) \ + branched_state->name = kernel_split_state.name[ray_index]; + + BRANCHED_STORE(path_state); + BRANCHED_STORE(throughput); + BRANCHED_STORE(ray); + BRANCHED_STORE(sd); + BRANCHED_STORE(isect); + BRANCHED_STORE(ray_state); + +#undef BRANCHED_STORE + + /* set loop counters to intial position */ + branched_state->next_closure = 0; + branched_state->next_sample = 0; +} + +/* ends an indirect loop and restores the previous state */ +ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + /* restore state */ +#define BRANCHED_RESTORE(name) \ + kernel_split_state.name[ray_index] = branched_state->name; + + BRANCHED_RESTORE(path_state); + BRANCHED_RESTORE(throughput); + BRANCHED_RESTORE(ray); + BRANCHED_RESTORE(sd); + BRANCHED_RESTORE(isect); + BRANCHED_RESTORE(ray_state); + +#undef BRANCHED_RESTORE + + /* leave indirect loop */ + REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT); +} + +/* bounce off surface and integrate indirect light */ +ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(KernelGlobals *kg, + int ray_index, + float num_samples_adjust, + ShaderData *saved_sd, + bool reset_path_state) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + ShaderData *sd = saved_sd; + RNG rng = kernel_split_state.rng[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + float3 throughput = branched_state->throughput; + + for(int i = branched_state->next_closure; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if(!CLOSURE_IS_BSDF(sc->type)) + continue; + /* transparency is not handled here, but in outer loop */ + if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) + continue; + + int num_samples; + + if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) + num_samples = kernel_data.integrator.diffuse_samples; + else if(CLOSURE_IS_BSDF_BSSRDF(sc->type)) + num_samples = 1; + else if(CLOSURE_IS_BSDF_GLOSSY(sc->type)) + num_samples = kernel_data.integrator.glossy_samples; + else + num_samples = kernel_data.integrator.transmission_samples; + + num_samples = ceil_to_int(num_samples_adjust*num_samples); + + float num_samples_inv = num_samples_adjust/num_samples; + RNG bsdf_rng = cmj_hash(rng, i); + + for(int j = branched_state->next_sample; j < num_samples; j++) { + ccl_global PathState *ps = &kernel_split_state.path_state[ray_index]; + if(reset_path_state) { + *ps = branched_state->path_state; + } + + ccl_global float3 *tp = &kernel_split_state.throughput[ray_index]; + *tp = throughput; + + ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index]; + + if(!kernel_branched_path_surface_bounce(kg, + &bsdf_rng, + sd, + sc, + j, + num_samples, + tp, + ps, + L, + bsdf_ray)) + { + continue; + } + + /* update state for next iteration */ + branched_state->next_closure = i; + branched_state->next_sample = j+1; + branched_state->num_samples = num_samples; + + /* start the indirect path */ + *tp *= num_samples_inv; + + return true; + } + + branched_state->next_sample = 0; + } + + return false; +} + +#endif /* __BRANCHED_PATH__ */ + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h index 9d3d01fff75..642ccac8239 100644 --- a/intern/cycles/kernel/split/kernel_data_init.h +++ b/intern/cycles/kernel/split/kernel_data_init.h @@ -105,21 +105,16 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)( /* Initialize queue data and queue index. */ if(thread_index < queuesize) { - /* Initialize active ray queue. */ - kernel_split_state.queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - /* Initialize background and buffer update queue. */ - kernel_split_state.queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - /* Initialize shadow ray cast of AO queue. */ - kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - /* Initialize shadow ray cast of direct lighting queue. */ - kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + for(int i = 0; i < NUM_QUEUES; i++) { + kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + } } if(thread_index == 0) { - Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; - Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; - Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; - Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; + for(int i = 0; i < NUM_QUEUES; i++) { + Queue_index[i] = 0; + } + /* The scene-intersect kernel should not use the queues very first time. * since the queue would be empty. */ diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h index bdbf7387b95..3336c968a44 100644 --- a/intern/cycles/kernel/split/kernel_direct_lighting.h +++ b/intern/cycles/kernel/split/kernel_direct_lighting.h @@ -56,23 +56,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, kernel_split_params.queue_size, 0); -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; ShaderData *sd = &kernel_split_state.sd[ray_index]; @@ -80,25 +63,24 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, /* direct lighting */ #ifdef __EMISSION__ RNG rng = kernel_split_state.rng[ray_index]; + bool flag = (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)); + +# ifdef __BRANCHED_PATH__ + if(flag && kernel_data.integrator.branched) { + flag = false; + enqueue_flag = 1; + } +# endif /* __BRANCHED_PATH__ */ + # ifdef __SHADOW_TRICKS__ if(flag && state->flag & PATH_RAY_SHADOW_CATCHER) { flag = false; - ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; - float3 throughput = kernel_split_state.throughput[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - kernel_branched_path_surface_connect_light(kg, - &rng, - sd, - emission_sd, - state, - throughput, - 1.0f, - L, - 1); + enqueue_flag = 1; } # endif /* __SHADOW_TRICKS__ */ + if(flag) { /* Sample illumination from lights to find path contribution. */ float light_t = path_state_rng_1D(kg, &rng, state, PRNG_LIGHT); @@ -129,7 +111,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, kernel_split_state.bsdf_eval[ray_index] = L_light; kernel_split_state.is_lamp[ray_index] = is_lamp; /* Mark ray state for next shadow kernel. */ - ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); enqueue_flag = 1; } } @@ -138,10 +119,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, #endif /* __EMISSION__ */ } -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - #ifdef __EMISSION__ /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */ enqueue_ray_index_local(ray_index, @@ -152,6 +129,27 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, kernel_split_state.queue_data, kernel_split_params.queue_index); #endif + +#ifdef __BRANCHED_PATH__ + /* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays + * this is the last kernel before next_iteration_setup that uses local atomics so we do this here + */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + enqueue_ray_index_local(ray_index, + QUEUE_LIGHT_INDIRECT_ITER, + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER), + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +#endif /* __BRANCHED_PATH__ */ } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h index 47d3c280831..182e6c6e4fa 100644 --- a/intern/cycles/kernel/split/kernel_do_volume.h +++ b/intern/cycles/kernel/split/kernel_do_volume.h @@ -16,6 +16,81 @@ CCL_NAMESPACE_BEGIN +#if defined(__BRANCHED_PATH__) && defined(__VOLUME__) + +ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg, int ray_index) +{ + kernel_split_branched_path_indirect_loop_init(kg, ray_index); + + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT); +} + +ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + ShaderData *sd = &kernel_split_state.sd[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + + /* GPU: no decoupled ray marching, scatter probalistically */ + int num_samples = kernel_data.integrator.volume_samples; + float num_samples_inv = 1.0f/num_samples; + + Ray volume_ray = branched_state->ray; + volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ? branched_state->isect.t : FLT_MAX; + + bool heterogeneous = volume_stack_is_heterogeneous(kg, branched_state->path_state.volume_stack); + + for(int j = branched_state->next_sample; j < num_samples; j++) { + ccl_global PathState *ps = &kernel_split_state.path_state[ray_index]; + *ps = branched_state->path_state; + + ccl_global Ray *pray = &kernel_split_state.ray[ray_index]; + *pray = branched_state->ray; + + ccl_global float3 *tp = &kernel_split_state.throughput[ray_index]; + *tp = branched_state->throughput * num_samples_inv; + + /* branch RNG state */ + path_state_branch(ps, j, num_samples); + + /* integrate along volume segment with distance sampling */ + VolumeIntegrateResult result = kernel_volume_integrate( + kg, ps, sd, &volume_ray, L, tp, &rng, heterogeneous); + +# ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *tp, &branched_state->path_state, L); + + /* indirect light bounce */ + if(!kernel_path_volume_bounce(kg, &rng, sd, tp, ps, L, pray)) { + continue; + } + + /* start the indirect path */ + branched_state->next_closure = 0; + branched_state->next_sample = j+1; + branched_state->num_samples = num_samples; + + return true; + } +# endif + } + + kernel_split_branched_path_indirect_loop_end(kg, ray_index); + + /* todo: avoid this calculation using decoupled ray marching */ + float3 throughput = kernel_split_state.throughput[ray_index]; + kernel_volume_shadow(kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput); + kernel_split_state.throughput[ray_index] = throughput; + + return false; +} + +#endif /* __BRANCHED_PATH__ && __VOLUME__ */ ccl_device void kernel_do_volume(KernelGlobals *kg) { @@ -23,37 +98,37 @@ ccl_device void kernel_do_volume(KernelGlobals *kg) /* We will empty this queue in this kernel. */ if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; +# ifdef __BRANCHED_PATH__ + kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0; +# endif /* __BRANCHED_PATH__ */ } - /* Fetch use_queues_flag. */ - char local_use_queues_flag = *kernel_split_params.use_queues_flag; - ccl_barrier(CCL_LOCAL_MEM_FENCE); int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if(local_use_queues_flag) { + + if(*kernel_split_params.use_queues_flag) { ray_index = get_ray_index(kg, ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, kernel_split_state.queue_data, kernel_split_params.queue_size, 1); - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } } - if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) { + ccl_global char *ray_state = kernel_split_state.ray_state; - bool hit = ! IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND); + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + ccl_global Intersection *isect = &kernel_split_state.isect[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; - ccl_global Intersection *isect = &kernel_split_state.isect[ray_index]; - ShaderData *sd = &kernel_split_state.sd[ray_index]; - ShaderData *sd_input = &kernel_split_state.sd_DL_shadow[ray_index]; + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || + IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { + + bool hit = ! IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND); /* Sanitize volume stack. */ if(!hit) { @@ -64,31 +139,68 @@ ccl_device void kernel_do_volume(KernelGlobals *kg) Ray volume_ray = *ray; volume_ray.t = (hit)? isect->t: FLT_MAX; - bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack); +# ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { +# endif /* __BRANCHED_PATH__ */ + bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack); - { - /* integrate along volume segment with distance sampling */ - VolumeIntegrateResult result = kernel_volume_integrate( - kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous); + { + /* integrate along volume segment with distance sampling */ + VolumeIntegrateResult result = kernel_volume_integrate( + kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous); # ifdef __VOLUME_SCATTER__ - if(result == VOLUME_PATH_SCATTERED) { - /* direct lighting */ - kernel_path_volume_connect_light(kg, &rng, sd, sd_input, *throughput, state, L); - - /* indirect light bounce */ - if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray)) - ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED); - else - ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER); + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *throughput, state, L); + + /* indirect light bounce */ + if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + else { + kernel_split_path_end(kg, ray_index); + } + } +# endif /* __VOLUME_SCATTER__ */ + } + +# ifdef __BRANCHED_PATH__ + } + else { + kernel_split_branched_path_volume_indirect_light_init(kg, ray_index); + + if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); } -# endif } +# endif /* __BRANCHED_PATH__ */ } + kernel_split_state.rng[ray_index] = rng; } -#endif +# ifdef __BRANCHED_PATH__ + /* iter loop */ + ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), + QUEUE_VOLUME_INDIRECT_ITER, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + + if(IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) { + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]); + path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]); + + if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +# endif /* __BRANCHED_PATH__ */ + +#endif /* __VOLUME__ */ } diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h index 89adeb64c8a..87498910d38 100644 --- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h +++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h @@ -52,6 +52,7 @@ CCL_NAMESPACE_BEGIN * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with * flag RAY_SHADOW_RAY_CAST_AO */ + ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( KernelGlobals *kg, ccl_local_param BackgroundAOLocals *locals) @@ -62,8 +63,9 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( } ccl_barrier(CCL_LOCAL_MEM_FENCE); +#ifdef __AO__ char enqueue_flag = 0; - char enqueue_flag_AO_SHADOW_RAY_CAST = 0; +#endif int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); ray_index = get_ray_index(kg, ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, @@ -155,8 +157,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput); } if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + kernel_split_path_end(kg, ray_index); } } #endif /* __HOLDOUT__ */ @@ -164,18 +165,31 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - /* Holdout mask objects do not write data passes. */ - kernel_write_data_passes(kg, - buffer, - L, - sd, - sample, - state, - throughput); + +#ifdef __BRANCHED_PATH__ + if(!IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) +#endif /* __BRANCHED_PATH__ */ + { + /* Holdout mask objects do not write data passes. */ + kernel_write_data_passes(kg, + buffer, + L, + sd, + sample, + state, + throughput); + } + /* Blurring of bsdf after bounces, for rays that have a small likelihood * of following this particular path (diffuse, rough glossy. */ - if(kernel_data.integrator.filter_glossy != FLT_MAX) { +#ifndef __BRANCHED_PATH__ + if(kernel_data.integrator.filter_glossy != FLT_MAX) +#else + if(kernel_data.integrator.filter_glossy != FLT_MAX && + (!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT))) +#endif /* __BRANCHED_PATH__ */ + { float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf; if(blur_pdf < 1.0f) { float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; @@ -201,19 +215,32 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( * mainly due to the mixed in MIS that we use. gives too many unneeded * shader evaluations, only need emission if we are going to terminate. */ +#ifndef __BRANCHED_PATH__ float probability = path_state_terminate_probability(kg, state, throughput); +#else + float probability = 1.0f; + + if(!kernel_data.integrator.branched) { + probability = path_state_terminate_probability(kg, state, throughput); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + int num_samples = kernel_split_state.branched_state[ray_index].num_samples; + probability = path_state_terminate_probability(kg, state, throughput*num_samples); + } + else if(state->flag & PATH_RAY_TRANSPARENT) { + probability = path_state_terminate_probability(kg, state, throughput); + } +#endif if(probability == 0.0f) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + kernel_split_path_end(kg, ray_index); } if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { if(probability != 1.0f) { float terminate = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_TERMINATE); if(terminate >= probability) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + kernel_split_path_end(kg, ray_index); } else { kernel_split_state.throughput[ray_index] = throughput/probability; @@ -225,61 +252,23 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( #ifdef __AO__ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { /* ambient occlusion */ - if(kernel_data.integrator.use_ambient_occlusion || - (sd->flag & SD_AO)) - { - /* todo: solve correlation */ - float bsdf_u, bsdf_v; - path_state_rng_2D(kg, &rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - - float ao_factor = kernel_data.background.ao_factor; - float3 ao_N; - kernel_split_state.ao_bsdf[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); - kernel_split_state.ao_alpha[ray_index] = shader_bsdf_alpha(kg, sd); - - float3 ao_D; - float ao_pdf; - sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - - if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { - Ray _ray; - _ray.P = ray_offset(sd->P, sd->Ng); - _ray.D = ao_D; - _ray.t = kernel_data.background.ao_distance; -#ifdef __OBJECT_MOTION__ - _ray.time = sd->time; -#endif - _ray.dP = sd->dP; - _ray.dD = differential3_zero(); - kernel_split_state.ao_light_ray[ray_index] = _ray; - - ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); - enqueue_flag_AO_SHADOW_RAY_CAST = 1; - } + if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) { + enqueue_flag = 1; } } #endif /* __AO__ */ - kernel_split_state.rng[ray_index] = rng; + kernel_split_state.rng[ray_index] = rng; #ifndef __COMPUTE_DEVICE_GPU__ } #endif - /* Enqueue RAY_UPDATE_BUFFER rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - &locals->queue_atomics_bg, - kernel_split_state.queue_data, - kernel_split_params.queue_index); - #ifdef __AO__ /* Enqueue to-shadow-ray-cast rays. */ enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, - enqueue_flag_AO_SHADOW_RAY_CAST, + enqueue_flag, kernel_split_params.queue_size, &locals->queue_atomics_ao, kernel_split_state.queue_data, diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h index 8192528622e..6fbc888e358 100644 --- a/intern/cycles/kernel/split/kernel_indirect_background.h +++ b/intern/cycles/kernel/split/kernel_indirect_background.h @@ -34,7 +34,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg) if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; if(state->bounce > kernel_data.integrator.ao_bounces) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + kernel_split_path_end(kg, ray_index); } } } @@ -63,7 +63,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg) #ifdef __PASSES__ if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) #endif - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + kernel_split_path_end(kg, ray_index); } if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { @@ -72,7 +72,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg) float3 L_background = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray); path_radiance_accum_background(L, state, (*throughput), L_background); #endif - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + kernel_split_path_end(kg, ray_index); } } diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h index a56e85abeb9..82bc2f01fd7 100644 --- a/intern/cycles/kernel/split/kernel_indirect_subsurface.h +++ b/intern/cycles/kernel/split/kernel_indirect_subsurface.h @@ -49,26 +49,29 @@ ccl_device void kernel_indirect_subsurface(KernelGlobals *kg) ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { - ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; - kernel_path_subsurface_accum_indirect(ss_indirect, L); +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched) { +#endif + if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { + ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; + kernel_path_subsurface_accum_indirect(ss_indirect, L); - /* Trace indirect subsurface rays by restarting the loop. this uses less - * stack memory than invoking kernel_path_indirect. - */ - if(ss_indirect->num_rays) { - kernel_path_subsurface_setup_indirect(kg, - ss_indirect, - state, - ray, - L, - throughput); - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - } - else { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + /* Trace indirect subsurface rays by restarting the loop. this uses less + * stack memory than invoking kernel_path_indirect. + */ + if(ss_indirect->num_rays) { + kernel_path_subsurface_setup_indirect(kg, + ss_indirect, + state, + ray, + L, + throughput); + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } } +#ifdef __BRANCHED_PATH__ } +#endif #endif /* __SUBSURFACE__ */ diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h index 1bebc16e25b..71017fed19e 100644 --- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h +++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h @@ -44,6 +44,52 @@ CCL_NAMESPACE_BEGIN * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with * RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays. */ + +#ifdef __BRANCHED_PATH__ +ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index) +{ + kernel_split_branched_path_indirect_loop_init(kg, ray_index); + + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT); +} + +ccl_device void kernel_split_branched_indirect_light_end(KernelGlobals *kg, int ray_index) +{ + kernel_split_branched_path_indirect_loop_end(kg, ray_index); + + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + + /* continue in case of transparency */ + *throughput *= shader_bsdf_transparency(kg, sd); + + if(is_zero(*throughput)) { + kernel_split_path_end(kg, ray_index); + } + else { + /* Update Path State */ + state->flag |= PATH_RAY_TRANSPARENT; + state->transparent_bounce++; + + ray->P = ray_offset(sd->P, -sd->Ng); + ray->t -= sd->ray_length; /* clipping works through transparent */ + +# ifdef __RAY_DIFFERENTIALS__ + ray->dP = sd->dP; + ray->dD.dx = -sd->dI.dx; + ray->dD.dy = -sd->dI.dy; +# endif /* __RAY_DIFFERENTIALS__ */ + +# ifdef __VOLUME__ + /* enter/exit volume */ + kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); +# endif /* __VOLUME__ */ + } +} +#endif /* __BRANCHED_PATH__ */ + ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, ccl_local_param unsigned int *local_queue_atomics) { @@ -67,7 +113,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; } - char enqueue_flag = 0; int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); ray_index = get_ray_index(kg, ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, @@ -75,102 +120,125 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, kernel_split_params.queue_size, 0); -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - - /* Load ShaderData structure. */ - PathRadiance *L = NULL; - ccl_global PathState *state = NULL; ccl_global char *ray_state = kernel_split_state.ray_state; - /* Path radiance update for AO/Direct_lighting's shadow blocked. */ - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || - IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) - { - state = &kernel_split_state.path_state[ray_index]; - L = &kernel_split_state.path_radiance[ray_index]; - float3 _throughput = kernel_split_state.throughput[ray_index]; - - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { - float3 shadow = kernel_split_state.ao_light_ray[ray_index].P; - // TODO(mai): investigate correctness here - char update_path_radiance = (char)kernel_split_state.ao_light_ray[ray_index].t; - if(update_path_radiance) { - path_radiance_accum_ao(L, - _throughput, - kernel_split_state.ao_alpha[ray_index], - kernel_split_state.ao_bsdf[ray_index], - shadow, - state->bounce); - } - else { - path_radiance_accum_total_ao(L, _throughput, kernel_split_state.ao_bsdf[ray_index]); + bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE); + if(active) { + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { +#endif + /* Compute direct lighting and next bounce. */ + if(!kernel_path_surface_bounce(kg, &rng, sd, throughput, state, L, ray)) { + kernel_split_path_end(kg, ray_index); } - REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); +#ifdef __BRANCHED_PATH__ } - - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) { - float3 shadow = kernel_split_state.light_ray[ray_index].P; - // TODO(mai): investigate correctness here - char update_path_radiance = (char)kernel_split_state.light_ray[ray_index].t; - BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index]; - if(update_path_radiance) { - path_radiance_accum_light(L, - _throughput, - &L_light, - shadow, - 1.0f, - state->bounce, - kernel_split_state.is_lamp[ray_index]); + else { + kernel_split_branched_indirect_light_init(kg, ray_index); + + if(kernel_split_branched_path_surface_indirect_light_iter(kg, + ray_index, + 1.0f, + &kernel_split_state.branched_state[ray_index].sd, + true)) + { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); } else { - path_radiance_accum_total_light(L, _throughput, &L_light); + kernel_split_branched_indirect_light_end(kg, ray_index); } - REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); } +#endif /* __BRANCHED_PATH__ */ + + kernel_split_state.rng[ray_index] = rng; } - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; - state = &kernel_split_state.path_state[ray_index]; - L = &kernel_split_state.path_radiance[ray_index]; + /* Enqueue RAY_UPDATE_BUFFER rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +#ifdef __BRANCHED_PATH__ + /* iter loop */ + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0; + } - /* Compute direct lighting and next bounce. */ - if(!kernel_path_surface_bounce(kg, &rng, &kernel_split_state.sd[ray_index], throughput, state, L, ray)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), + QUEUE_LIGHT_INDIRECT_ITER, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + + if(IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) { + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); + + if(kernel_split_branched_path_surface_indirect_light_iter(kg, + ray_index, + 1.0f, + &kernel_split_state.branched_state[ray_index].sd, + true)) + { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + else { + kernel_split_branched_indirect_light_end(kg, ray_index); } - kernel_split_state.rng[ray_index] = rng; } -#ifndef __COMPUTE_DEVICE_GPU__ +# ifdef __VOLUME__ + /* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; } -#endif + ccl_barrier(CCL_LOCAL_MEM_FENCE); - /* Enqueue RAY_UPDATE_BUFFER rays. */ + ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, + QUEUE_VOLUME_INDIRECT_ITER, + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER), + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +# endif /* __VOLUME__ */ + +# ifdef __SUBSURFACE__ + /* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + enqueue_ray_index_local(ray_index, + QUEUE_SUBSURFACE_INDIRECT_ITER, + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER), kernel_split_params.queue_size, local_queue_atomics, kernel_split_state.queue_data, kernel_split_params.queue_index); +# endif /* __SUBSURFACE__ */ +#endif /* __BRANCHED_PATH__ */ } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h index 0f1696e34a0..957d70958d9 100644 --- a/intern/cycles/kernel/split/kernel_shader_eval.h +++ b/intern/cycles/kernel/split/kernel_shader_eval.h @@ -38,8 +38,10 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg, kernel_split_params.queue_size, 0); + ccl_global char *ray_state = kernel_split_state.ray_state; + char enqueue_flag = 0; - if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) { + if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { enqueue_flag = 1; } @@ -52,7 +54,7 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg, kernel_split_params.queue_index); /* Continue on with shader evaluation. */ - if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { Intersection isect = kernel_split_state.isect[ray_index]; RNG rng = kernel_split_state.rng[ray_index]; ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; @@ -62,8 +64,27 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg, &kernel_split_state.sd[ray_index], &isect, &ray); + +#ifndef __BRANCHED_PATH__ float rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF); shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN); +#else + ShaderContext ctx = SHADER_CONTEXT_MAIN; + float rbsdf = 0.0f; + + if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF); + + } + + if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + ctx = SHADER_CONTEXT_INDIRECT; + } + + shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, ctx); + shader_merge_closures(&kernel_split_state.sd[ray_index]); +#endif /* __BRANCHED_PATH__ */ + kernel_split_state.rng[ray_index] = rng; } } diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h index 4243e18de72..474286285a9 100644 --- a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h +++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h @@ -29,31 +29,29 @@ ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg) kernel_split_state.queue_data, kernel_split_params.queue_size, 1); } - if(ray_index == QUEUE_EMPTY_SLOT) + if(ray_index == QUEUE_EMPTY_SLOT) { return; + } - /* Flag determining if we need to update L. */ - char update_path_radiance = 0; - - if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - ccl_global Ray *light_ray_global = &kernel_split_state.ao_light_ray[ray_index]; - - float3 shadow; - Ray ray = *light_ray_global; - update_path_radiance = !(shadow_blocked(kg, - &kernel_split_state.sd_DL_shadow[ray_index], - state, - &ray, - &shadow)); - - *light_ray_global = ray; - /* We use light_ray_global's P and t to store shadow and - * update_path_radiance. - */ - light_ray_global->P = shadow; - light_ray_global->t = update_path_radiance; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + float3 throughput = kernel_split_state.throughput[ray_index]; + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched || IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { +#endif + kernel_path_ao(kg, sd, emission_sd, L, state, &rng, throughput, shader_bsdf_alpha(kg, sd)); +#ifdef __BRANCHED_PATH__ + } + else { + kernel_branched_path_ao(kg, sd, emission_sd, L, state, &rng, throughput); } +#endif + + kernel_split_state.rng[ray_index] = rng; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h index bb8f0157965..452b6e45a36 100644 --- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h +++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h @@ -32,28 +32,71 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg) if(ray_index == QUEUE_EMPTY_SLOT) return; - /* Flag determining if we need to update L. */ - char update_path_radiance = 0; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + Ray ray = kernel_split_state.light_ray[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + float3 throughput = kernel_split_state.throughput[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; - if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) { - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - ccl_global Ray *light_ray_global = &kernel_split_state.light_ray[ray_index]; + BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + bool is_lamp = kernel_split_state.is_lamp[ray_index]; +# if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__) + bool use_branched = false; + int all = 0; + + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + use_branched = true; + all = 1; + } +# if defined(__BRANCHED_PATH__) + else if(kernel_data.integrator.branched) { + use_branched = true; + + if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + all = (kernel_data.integrator.sample_all_lights_indirect); + } + else + { + all = (kernel_data.integrator.sample_all_lights_direct); + } + } +# endif /* __BRANCHED_PATH__ */ + + if(use_branched) { + kernel_branched_path_surface_connect_light(kg, + &rng, + sd, + emission_sd, + state, + throughput, + 1.0f, + L, + all); + } + else +# endif /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)*/ + { + /* trace shadow ray */ float3 shadow; - Ray ray = *light_ray_global; - update_path_radiance = !(shadow_blocked(kg, - &kernel_split_state.sd_DL_shadow[ray_index], - state, - &ray, - &shadow)); - - *light_ray_global = ray; - /* We use light_ray_global's P and t to store shadow and - * update_path_radiance. - */ - light_ray_global->P = shadow; - light_ray_global->t = update_path_radiance; + + if(!shadow_blocked(kg, + emission_sd, + state, + &ray, + &shadow)) + { + /* accumulate */ + path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + } + else { + path_radiance_accum_total_light(L, throughput, &L_light); + } } + + kernel_split_state.rng[ray_index] = rng; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h index 4303ba0a905..57f070d51e0 100644 --- a/intern/cycles/kernel/split/kernel_split_common.h +++ b/intern/cycles/kernel/split/kernel_split_common.h @@ -37,41 +37,42 @@ #include "util/util_atomic.h" -#include "kernel/kernel_random.h" -#include "kernel/kernel_projection.h" -#include "kernel/kernel_montecarlo.h" -#include "kernel/kernel_differential.h" -#include "kernel/kernel_camera.h" - -#include "kernel/geom/geom.h" -#include "kernel/bvh/bvh.h" - -#include "kernel/kernel_accumulate.h" -#include "kernel/kernel_shader.h" -#include "kernel/kernel_light.h" -#include "kernel/kernel_passes.h" - -#ifdef __SUBSURFACE__ -# include "kernel/kernel_subsurface.h" +#include "kernel/kernel_path.h" +#ifdef __BRANCHED_PATH__ +# include "kernel/kernel_path_branched.h" #endif -#ifdef __VOLUME__ -# include "kernel/kernel_volume.h" -#endif +#include "kernel/kernel_queues.h" +#include "kernel/kernel_work_stealing.h" -#include "kernel/kernel_path_state.h" -#include "kernel/kernel_shadow.h" -#include "kernel/kernel_emission.h" -#include "kernel/kernel_path_common.h" -#include "kernel/kernel_path_surface.h" -#include "kernel/kernel_path_volume.h" -#include "kernel/kernel_path_subsurface.h" +#ifdef __BRANCHED_PATH__ +# include "kernel/split/kernel_branched.h" +#endif -#ifdef __KERNEL_DEBUG__ -# include "kernel/kernel_debug.h" +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index) +{ + ccl_global char *ray_state = kernel_split_state.ray_state; + +#ifdef __BRANCHED_PATH__ + if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER); + } + else { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + } +#else + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); #endif +} -#include "kernel/kernel_queues.h" -#include "kernel/kernel_work_stealing.h" +CCL_NAMESPACE_END #endif /* __KERNEL_SPLIT_H__ */ diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h index 0af8bfc89d5..a2cb4f6ae98 100644 --- a/intern/cycles/kernel/split/kernel_split_data_types.h +++ b/intern/cycles/kernel/split/kernel_split_data_types.h @@ -62,7 +62,46 @@ typedef struct SplitParams { SPLIT_DATA_ENTRY(DebugData, debug_data, 1) #else # define SPLIT_DATA_DEBUG_ENTRIES -#endif +#endif /* DEBUG */ + +#ifdef __BRANCHED_PATH__ + +typedef ccl_global struct SplitBranchedState { + /* various state that must be kept and restored after an indirect loop */ + PathState path_state; + float3 throughput; + Ray ray; + + struct ShaderData sd; + Intersection isect; + + char ray_state; + + /* indirect loop state */ + int next_closure; + int next_sample; + int num_samples; + +#ifdef __SUBSURFACE__ + int ss_next_closure; + int ss_next_sample; + int next_hit; + int num_hits; + + uint lcg_state; + SubsurfaceIntersection ss_isect; + +# ifdef __VOLUME__ + VolumeStack volume_stack[VOLUME_STACK_SIZE]; +# endif /* __VOLUME__ */ +#endif /*__SUBSURFACE__ */ +} SplitBranchedState; + +#define SPLIT_DATA_BRANCHED_ENTRIES \ + SPLIT_DATA_ENTRY( SplitBranchedState, branched_state, 1) +#else +#define SPLIT_DATA_BRANCHED_ENTRIES +#endif /* __BRANCHED_PATH__ */ #define SPLIT_DATA_ENTRIES \ SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \ @@ -72,9 +111,6 @@ typedef struct SplitParams { SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \ - SPLIT_DATA_ENTRY(ccl_global float3, ao_alpha, 1) \ - SPLIT_DATA_ENTRY(ccl_global float3, ao_bsdf, 1) \ - SPLIT_DATA_ENTRY(ccl_global Ray, ao_light_ray, 1) \ SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \ SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ @@ -82,6 +118,7 @@ typedef struct SplitParams { SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \ SPLIT_DATA_ENTRY(ShaderData, sd, 1) \ SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \ + SPLIT_DATA_BRANCHED_ENTRIES \ SPLIT_DATA_DEBUG_ENTRIES \ /* struct that holds pointers to data in the shared state buffer */ diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h index 0b4d50c70ee..8364f185d75 100644 --- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h +++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h @@ -16,42 +16,206 @@ CCL_NAMESPACE_BEGIN +#if defined(__BRANCHED_PATH__) && defined(__SUBSURFACE__) -ccl_device void kernel_subsurface_scatter(KernelGlobals *kg, - ccl_local_param unsigned int* local_queue_atomics) +ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg, int ray_index) { -#ifdef __SUBSURFACE__ - if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; + kernel_split_branched_path_indirect_loop_init(kg, ray_index); + + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + branched_state->ss_next_closure = 0; + branched_state->ss_next_sample = 0; + + branched_state->num_hits = 0; + branched_state->next_hit = 0; + + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT); +} + +ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + ShaderData *sd = &branched_state->sd; + RNG rng = kernel_split_state.rng[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + + for(int i = branched_state->ss_next_closure; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + + if(!CLOSURE_IS_BSSRDF(sc->type)) + continue; + + /* set up random number generator */ + if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 && + branched_state->next_closure == 0 && branched_state->next_sample == 0) + { + branched_state->lcg_state = lcg_state_init(&rng, + branched_state->path_state.rng_offset, + branched_state->path_state.sample, + 0x68bc21eb); + } + int num_samples = kernel_data.integrator.subsurface_samples; + float num_samples_inv = 1.0f/num_samples; + RNG bssrdf_rng = cmj_hash(rng, i); + + /* do subsurface scatter step with copy of shader data, this will + * replace the BSSRDF with a diffuse BSDF closure */ + for(int j = branched_state->ss_next_sample; j < num_samples; j++) { + ccl_global SubsurfaceIntersection *ss_isect = &branched_state->ss_isect; + float bssrdf_u, bssrdf_v; + path_branched_rng_2D(kg, + &bssrdf_rng, + &branched_state->path_state, + j, + num_samples, + PRNG_BSDF_U, + &bssrdf_u, + &bssrdf_v); + + /* intersection is expensive so avoid doing multiple times for the same input */ + if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) { + RNG lcg_state = branched_state->lcg_state; + SubsurfaceIntersection ss_isect_private; + + branched_state->num_hits = subsurface_scatter_multi_intersect(kg, + &ss_isect_private, + sd, + sc, + &lcg_state, + bssrdf_u, bssrdf_v, + true); + + branched_state->lcg_state = lcg_state; + *ss_isect = ss_isect_private; + } + +#ifdef __VOLUME__ + Ray volume_ray = branched_state->ray; + bool need_update_volume_stack = + kernel_data.integrator.use_volumes && + sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; +#endif /* __VOLUME__ */ + + /* compute lighting with the BSDF closure */ + for(int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) { + ShaderData *bssrdf_sd = &kernel_split_state.sd[ray_index]; + *bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is + * important as the indirect path will write into bssrdf_sd */ + + SubsurfaceIntersection ss_isect_private = *ss_isect; + subsurface_scatter_multi_setup(kg, + &ss_isect_private, + hit, + bssrdf_sd, + &branched_state->path_state, + branched_state->path_state.flag, + sc, + true); + *ss_isect = ss_isect_private; + + ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index]; + *hit_state = branched_state->path_state; + + path_state_branch(hit_state, j, num_samples); + +#ifdef __VOLUME__ + if(need_update_volume_stack) { + /* Setup ray from previous surface point to the new one. */ + float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng); + volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t); + + /* this next part is expensive as it does scene intersection so only do once */ + if(branched_state->next_closure == 0 && branched_state->next_sample == 0) { + for(int k = 0; k < VOLUME_STACK_SIZE; k++) { + branched_state->volume_stack[k] = hit_state->volume_stack[k]; + } + + kernel_volume_stack_update_for_subsurface(kg, + emission_sd, + &volume_ray, + branched_state->volume_stack); + } + + for(int k = 0; k < VOLUME_STACK_SIZE; k++) { + hit_state->volume_stack[k] = branched_state->volume_stack[k]; + } + } +#endif /* __VOLUME__ */ + +#ifdef __EMISSION__ + if(branched_state->next_closure == 0 && branched_state->next_sample == 0) { + /* direct light */ + if(kernel_data.integrator.use_direct_light) { + int all = (kernel_data.integrator.sample_all_lights_direct) || + (branched_state->path_state.flag & PATH_RAY_SHADOW_CATCHER); + kernel_branched_path_surface_connect_light(kg, + &rng, + bssrdf_sd, + emission_sd, + hit_state, + branched_state->throughput, + num_samples_inv, + L, + all); + } + } +#endif /* __EMISSION__ */ + + /* indirect light */ + if(kernel_split_branched_path_surface_indirect_light_iter(kg, + ray_index, + num_samples_inv, + bssrdf_sd, + false)) + { + branched_state->ss_next_closure = i; + branched_state->ss_next_sample = j; + branched_state->next_hit = hit; + + return true; + } + + branched_state->next_closure = 0; + } + + branched_state->next_hit = 0; + } + + branched_state->ss_next_sample = 0; + } + + kernel_split_branched_path_indirect_loop_end(kg, ray_index); + + return false; +} + +#endif /* __BRANCHED_PATH__ && __SUBSURFACE__ */ + +ccl_device void kernel_subsurface_scatter(KernelGlobals *kg) +{ + int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(thread_index == 0) { + /* We will empty both queues in this kernel. */ + kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; } - ccl_barrier(CCL_LOCAL_MEM_FENCE); int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); ray_index = get_ray_index(kg, ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, kernel_split_state.queue_data, kernel_split_params.queue_size, - 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - - char enqueue_flag = 0; - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif + 1); + get_ray_index(kg, thread_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); +#ifdef __SUBSURFACE__ ccl_global char *ray_state = kernel_split_state.ray_state; ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; @@ -64,34 +228,85 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg, if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { if(sd->flag & SD_BSSRDF) { - if(kernel_path_subsurface_scatter(kg, - sd, - emission_sd, - L, - state, - &rng, - ray, - throughput, - ss_indirect)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched) { +#endif + if(kernel_path_subsurface_scatter(kg, + sd, + emission_sd, + L, + state, + &rng, + ray, + throughput, + ss_indirect)) { + kernel_split_path_end(kg, ray_index); + } +#ifdef __BRANCHED_PATH__ + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + float bssrdf_probability; + ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability); + + /* modify throughput for picking bssrdf or bsdf */ + *throughput *= bssrdf_probability; + + /* do bssrdf scatter step if we picked a bssrdf closure */ + if(sc) { + uint lcg_state = lcg_state_init(&rng, state->rng_offset, state->sample, 0x68bc21eb); + + float bssrdf_u, bssrdf_v; + path_state_rng_2D(kg, + &rng, + state, + PRNG_BSDF_U, + &bssrdf_u, &bssrdf_v); + subsurface_scatter_step(kg, + sd, + state, + state->flag, + sc, + &lcg_state, + bssrdf_u, bssrdf_v, + false); + } + } + else { + kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index); + + if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } } +#endif } kernel_split_state.rng[ray_index] = rng; } -#ifndef __COMPUTE_DEVICE_GPU__ +# ifdef __BRANCHED_PATH__ + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0; } -#endif - /* Enqueue RAY_UPDATE_BUFFER rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); + /* iter loop */ + ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), + QUEUE_SUBSURFACE_INDIRECT_ITER, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + + if(IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) { + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]); + path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]); + + if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +# endif /* __BRANCHED_PATH__ */ #endif /* __SUBSURFACE__ */ |