Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/kernel/split')
-rw-r--r--intern/cycles/kernel/split/kernel_background_buffer_update.h248
-rw-r--r--intern/cycles/kernel/split/kernel_branched.h219
-rw-r--r--intern/cycles/kernel/split/kernel_buffer_update.h154
-rw-r--r--intern/cycles/kernel/split/kernel_data_init.h261
-rw-r--r--intern/cycles/kernel/split/kernel_direct_lighting.h157
-rw-r--r--intern/cycles/kernel/split/kernel_do_volume.h220
-rw-r--r--intern/cycles/kernel/split/kernel_enqueue_inactive.h46
-rw-r--r--intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h318
-rw-r--r--intern/cycles/kernel/split/kernel_indirect_background.h65
-rw-r--r--intern/cycles/kernel/split/kernel_indirect_subsurface.h79
-rw-r--r--intern/cycles/kernel/split/kernel_lamp_emission.h99
-rw-r--r--intern/cycles/kernel/split/kernel_next_iteration_setup.h314
-rw-r--r--intern/cycles/kernel/split/kernel_path_init.h81
-rw-r--r--intern/cycles/kernel/split/kernel_queue_enqueue.h91
-rw-r--r--intern/cycles/kernel/split/kernel_scene_intersect.h135
-rw-r--r--intern/cycles/kernel/split/kernel_shader_eval.h92
-rw-r--r--intern/cycles/kernel/split/kernel_shader_setup.h70
-rw-r--r--intern/cycles/kernel/split/kernel_shader_sort.h97
-rw-r--r--intern/cycles/kernel/split/kernel_shadow_blocked.h85
-rw-r--r--intern/cycles/kernel/split/kernel_shadow_blocked_ao.h55
-rw-r--r--intern/cycles/kernel/split/kernel_shadow_blocked_dl.h107
-rw-r--r--intern/cycles/kernel/split/kernel_split_common.h102
-rw-r--r--intern/cycles/kernel/split/kernel_split_data.h57
-rw-r--r--intern/cycles/kernel/split/kernel_split_data_types.h175
-rw-r--r--intern/cycles/kernel/split/kernel_subsurface_scatter.h313
-rw-r--r--intern/cycles/kernel/split/kernel_sum_all_radiance.h59
26 files changed, 2513 insertions, 1186 deletions
diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h
deleted file mode 100644
index 9bfa71c75ef..00000000000
--- a/intern/cycles/kernel/split/kernel_background_buffer_update.h
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_split_common.h"
-
-/* Note on kernel_background_buffer_update kernel.
- * This is the fourth kernel in the ray tracing logic, and the third
- * of the path iteration kernels. This kernel takes care of rays that hit
- * the background (sceneintersect kernel), and for the rays of
- * state RAY_UPDATE_BUFFER it updates the ray's accumulated radiance in
- * the output buffer. This kernel also takes care of rays that have been determined
- * to-be-regenerated.
- *
- * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel
- *
- * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER
- * will be eventually set to RAY_TO_REGENERATE state in this kernel. Finally all rays of ray_state
- * RAY_TO_REGENERATE will be regenerated and put in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * The input and output are as follows,
- *
- * rng_coop ---------------------------------------------|--- kernel_background_buffer_update --|--- PathRadiance_coop
- * throughput_coop --------------------------------------| |--- L_transparent_coop
- * per_sample_output_buffers ----------------------------| |--- per_sample_output_buffers
- * Ray_coop ---------------------------------------------| |--- ray_state
- * PathState_coop ---------------------------------------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * L_transparent_coop -----------------------------------| |--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * ray_state --------------------------------------------| |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ----| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- work_array
- * parallel_samples -------------------------------------| |--- PathState_coop
- * end_sample -------------------------------------------| |--- throughput_coop
- * kg (globals) -----------------------------------------| |--- rng_coop
- * rng_state --------------------------------------------| |--- Ray
- * PathRadiance_coop ------------------------------------| |
- * sw ---------------------------------------------------| |
- * sh ---------------------------------------------------| |
- * sx ---------------------------------------------------| |
- * sy ---------------------------------------------------| |
- * stride -----------------------------------------------| |
- * work_array -------------------------------------------| |--- work_array
- * queuesize --------------------------------------------| |
- * start_sample -----------------------------------------| |--- work_pool_wgs
- * work_pool_wgs ----------------------------------------| |
- * num_samples ------------------------------------------| |
- *
- * note on sd : sd argument is neither an input nor an output for this kernel. It is just filled and consumed here itself.
- * Note on Queues :
- * This kernel fetches rays from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
- *
- * State of queues when this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty
- */
-ccl_device char kernel_background_buffer_update(
- KernelGlobals *kg,
- ccl_global float *per_sample_output_buffers,
- ccl_global uint *rng_state,
- ccl_global uint *rng_coop, /* Required for buffer Update */
- ccl_global float3 *throughput_coop, /* Required for background hit processing */
- PathRadiance *PathRadiance_coop, /* Required for background hit processing and buffer Update */
- ccl_global Ray *Ray_coop, /* Required for background hit processing */
- ccl_global PathState *PathState_coop, /* Required for background hit processing */
- ccl_global float *L_transparent_coop, /* Required for background hit processing and buffer Update */
- ccl_global char *ray_state, /* Stores information on the current state of a ray */
- int sw, int sh, int sx, int sy, int stride,
- int rng_state_offset_x,
- int rng_state_offset_y,
- int rng_state_stride,
- ccl_global unsigned int *work_array, /* Denotes work of each ray */
- int end_sample,
- int start_sample,
-#ifdef __WORK_STEALING__
- ccl_global unsigned int *work_pool_wgs,
- unsigned int num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
- DebugData *debugdata_coop,
-#endif
- int parallel_samples, /* Number of samples to be processed in parallel */
- int ray_index)
-{
- char enqueue_flag = 0;
-#ifdef __KERNEL_DEBUG__
- DebugData *debug_data = &debugdata_coop[ray_index];
-#endif
- ccl_global PathState *state = &PathState_coop[ray_index];
- PathRadiance *L = L = &PathRadiance_coop[ray_index];
- ccl_global Ray *ray = &Ray_coop[ray_index];
- ccl_global float3 *throughput = &throughput_coop[ray_index];
- ccl_global float *L_transparent = &L_transparent_coop[ray_index];
- ccl_global uint *rng = &rng_coop[ray_index];
-
-#ifdef __WORK_STEALING__
- unsigned int my_work;
- ccl_global float *initial_per_sample_output_buffers;
- ccl_global uint *initial_rng;
-#endif
- unsigned int sample;
- unsigned int tile_x;
- unsigned int tile_y;
- unsigned int pixel_x;
- unsigned int pixel_y;
- unsigned int my_sample_tile;
-
-#ifdef __WORK_STEALING__
- my_work = work_array[ray_index];
- sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
- get_pixel_tile_position(&pixel_x, &pixel_y,
- &tile_x, &tile_y,
- my_work,
- sw, sh, sx, sy,
- parallel_samples,
- ray_index);
- my_sample_tile = 0;
- initial_per_sample_output_buffers = per_sample_output_buffers;
- initial_rng = rng_state;
-#else /* __WORK_STEALING__ */
- sample = work_array[ray_index];
- int tile_index = ray_index / parallel_samples;
- /* buffer and rng_state's stride is "stride". Find x and y using ray_index */
- tile_x = tile_index % sw;
- tile_y = tile_index / sw;
- my_sample_tile = ray_index - (tile_index * parallel_samples);
-#endif /* __WORK_STEALING__ */
-
- rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
- per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
-
- if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
- /* eval background shader if nothing hit */
- if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
- *L_transparent = (*L_transparent) + average((*throughput));
-#ifdef __PASSES__
- if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- }
-
- if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-#ifdef __BACKGROUND__
- /* sample background shader */
- float3 L_background = indirect_background(kg, kg->sd_input, state, ray);
- path_radiance_accum_background(L, (*throughput), L_background, state->bounce);
-#endif
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- }
- }
-
- if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
- float3 L_sum = path_radiance_clamp_and_sum(kg, L);
- kernel_write_light_passes(kg, per_sample_output_buffers, L, sample);
-#ifdef __KERNEL_DEBUG__
- kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample);
-#endif
- float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
-
- /* accumulate result in output buffer */
- kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
- path_rng_end(kg, rng_state, *rng);
-
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
- }
-
- if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
-#ifdef __WORK_STEALING__
- /* We have completed current work; So get next work */
- int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
- if(!valid_work) {
- /* If work is invalid, this means no more work is available and the thread may exit */
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
- }
-#else /* __WORK_STEALING__ */
- if((sample + parallel_samples) >= end_sample) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
- }
-#endif /* __WORK_STEALING__ */
-
- if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
-#ifdef __WORK_STEALING__
- work_array[ray_index] = my_work;
- /* Get the sample associated with the current work */
- sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
- /* Get pixel and tile position associated with current work */
- get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
- my_sample_tile = 0;
-
- /* Remap rng_state according to the current work */
- rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride);
- /* Remap per_sample_output_buffers according to the current work */
- per_sample_output_buffers = initial_per_sample_output_buffers
- + (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
-#else /* __WORK_STEALING__ */
- work_array[ray_index] = sample + parallel_samples;
- sample = work_array[ray_index];
-
- /* Get ray position from ray index */
- pixel_x = sx + ((ray_index / parallel_samples) % sw);
- pixel_y = sy + ((ray_index / parallel_samples) / sw);
-#endif /* __WORK_STEALING__ */
-
- /* Initialize random numbers and ray. */
- kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray);
-
- if(ray->t != 0.0f) {
- /* Initialize throughput, L_transparent, Ray, PathState;
- * These rays proceed with path-iteration.
- */
- *throughput = make_float3(1.0f, 1.0f, 1.0f);
- *L_transparent = 0.0f;
- path_radiance_init(L, kernel_data.film.use_light_pass);
- path_state_init(kg, kg->sd_input, state, rng, sample, ray);
-#ifdef __KERNEL_DEBUG__
- debug_data_init(debug_data);
-#endif
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- enqueue_flag = 1;
- }
- else {
- /* These rays do not participate in path-iteration. */
- float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- /* Accumulate result in output buffer. */
- kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
- path_rng_end(kg, rng_state, *rng);
-
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
- }
- }
- }
- return enqueue_flag;
-}
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
new file mode 100644
index 00000000000..2313feac089
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_branched.h
@@ -0,0 +1,219 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __BRANCHED_PATH__
+
+/* sets up the various state needed to do an indirect loop */
+ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg, int ray_index)
+{
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ /* save a copy of the state to restore later */
+#define BRANCHED_STORE(name) \
+ branched_state->name = kernel_split_state.name[ray_index];
+
+ BRANCHED_STORE(path_state);
+ BRANCHED_STORE(throughput);
+ BRANCHED_STORE(ray);
+ BRANCHED_STORE(sd);
+ BRANCHED_STORE(isect);
+ BRANCHED_STORE(ray_state);
+
+#undef BRANCHED_STORE
+
+ /* set loop counters to intial position */
+ branched_state->next_closure = 0;
+ branched_state->next_sample = 0;
+}
+
+/* ends an indirect loop and restores the previous state */
+ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg, int ray_index)
+{
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ /* restore state */
+#define BRANCHED_RESTORE(name) \
+ kernel_split_state.name[ray_index] = branched_state->name;
+
+ BRANCHED_RESTORE(path_state);
+ BRANCHED_RESTORE(throughput);
+ BRANCHED_RESTORE(ray);
+ BRANCHED_RESTORE(sd);
+ BRANCHED_RESTORE(isect);
+ BRANCHED_RESTORE(ray_state);
+
+#undef BRANCHED_RESTORE
+
+ /* leave indirect loop */
+ REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT);
+}
+
+ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals *kg, int ray_index)
+{
+ ccl_global char *ray_state = kernel_split_state.ray_state;
+
+ int inactive_ray = dequeue_ray_index(QUEUE_INACTIVE_RAYS,
+ kernel_split_state.queue_data, kernel_split_params.queue_size, kernel_split_params.queue_index);
+
+ if(!IS_STATE(ray_state, inactive_ray, RAY_INACTIVE)) {
+ return false;
+ }
+
+#define SPLIT_DATA_ENTRY(type, name, num) \
+ kernel_split_state.name[inactive_ray] = kernel_split_state.name[ray_index];
+ SPLIT_DATA_ENTRIES_BRANCHED_SHARED
+#undef SPLIT_DATA_ENTRY
+
+ kernel_split_state.branched_state[inactive_ray].shared_sample_count = 0;
+ kernel_split_state.branched_state[inactive_ray].original_ray = ray_index;
+ kernel_split_state.branched_state[inactive_ray].waiting_on_shared_samples = false;
+
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray];
+
+ path_radiance_init(inactive_L, kernel_data.film.use_light_pass);
+ path_radiance_copy_indirect(inactive_L, L);
+
+ ray_state[inactive_ray] = RAY_REGENERATED;
+ ADD_RAY_FLAG(ray_state, inactive_ray, RAY_BRANCHED_INDIRECT_SHARED);
+ ADD_RAY_FLAG(ray_state, inactive_ray, IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT));
+
+ atomic_fetch_and_inc_uint32((ccl_global uint*)&kernel_split_state.branched_state[ray_index].shared_sample_count);
+
+ return true;
+}
+
+/* bounce off surface and integrate indirect light */
+ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(KernelGlobals *kg,
+ int ray_index,
+ float num_samples_adjust,
+ ShaderData *saved_sd,
+ bool reset_path_state,
+ bool wait_for_shared)
+{
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ ShaderData *sd = saved_sd;
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ float3 throughput = branched_state->throughput;
+ ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
+
+ float sum_sample_weight = 0.0f;
+#ifdef __DENOISING_FEATURES__
+ if(ps->denoising_feature_weight > 0.0f) {
+ for(int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ /* transparency is not handled here, but in outer loop */
+ if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+ continue;
+ }
+
+ sum_sample_weight += sc->sample_weight;
+ }
+ }
+ else {
+ sum_sample_weight = 1.0f;
+ }
+#endif /* __DENOISING_FEATURES__ */
+
+ for(int i = branched_state->next_closure; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ if(!CLOSURE_IS_BSDF(sc->type))
+ continue;
+ /* transparency is not handled here, but in outer loop */
+ if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
+ continue;
+
+ int num_samples;
+
+ if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
+ num_samples = kernel_data.integrator.diffuse_samples;
+ else if(CLOSURE_IS_BSDF_BSSRDF(sc->type))
+ num_samples = 1;
+ else if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
+ num_samples = kernel_data.integrator.glossy_samples;
+ else
+ num_samples = kernel_data.integrator.transmission_samples;
+
+ num_samples = ceil_to_int(num_samples_adjust*num_samples);
+
+ float num_samples_inv = num_samples_adjust/num_samples;
+
+ for(int j = branched_state->next_sample; j < num_samples; j++) {
+ if(reset_path_state) {
+ *ps = branched_state->path_state;
+ }
+
+ ps->rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);
+
+ ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
+ *tp = throughput;
+
+ ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index];
+
+ if(!kernel_branched_path_surface_bounce(kg,
+ sd,
+ sc,
+ j,
+ num_samples,
+ tp,
+ ps,
+ &L->state,
+ bsdf_ray,
+ sum_sample_weight))
+ {
+ continue;
+ }
+
+ ps->rng_hash = branched_state->path_state.rng_hash;
+
+ /* update state for next iteration */
+ branched_state->next_closure = i;
+ branched_state->next_sample = j+1;
+
+ /* start the indirect path */
+ *tp *= num_samples_inv;
+
+ if(kernel_split_branched_indirect_start_shared(kg, ray_index)) {
+ continue;
+ }
+
+ return true;
+ }
+
+ branched_state->next_sample = 0;
+ }
+
+ branched_state->next_closure = sd->num_closure;
+
+ if(wait_for_shared) {
+ branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+ if(branched_state->waiting_on_shared_samples) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+#endif /* __BRANCHED_PATH__ */
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
new file mode 100644
index 00000000000..511334e0550
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* This kernel takes care of rays that hit the background (sceneintersect
+ * kernel), and for the rays of state RAY_UPDATE_BUFFER it updates the ray's
+ * accumulated radiance in the output buffer. This kernel also takes care of
+ * rays that have been determined to-be-regenerated.
+ *
+ * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel.
+ *
+ * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER
+ * will be eventually set to RAY_TO_REGENERATE state in this kernel.
+ * Finally all rays of ray_state RAY_TO_REGENERATE will be regenerated and put
+ * in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * State of queues when this kernel is called:
+ * At entry,
+ * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ * RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays.
+ * At exit,
+ * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and
+ * RAY_REGENERATED rays.
+ * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ */
+ccl_device void kernel_buffer_update(KernelGlobals *kg,
+ ccl_local_param unsigned int *local_queue_atomics)
+{
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if(ray_index == 0) {
+ /* We will empty this queue in this kernel. */
+ kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+ }
+ char enqueue_flag = 0;
+ ray_index = get_ray_index(kg, ray_index,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+ /* If we are executing on a GPU device, we exit all threads that are not
+ * required.
+ *
+ * If we are executing on a CPU device, then we need to keep all threads
+ * active since we have barrier() calls later in the kernel. CPU devices,
+ * expect all threads to execute barrier statement.
+ */
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+ if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+
+ ccl_global char *ray_state = kernel_split_state.ray_state;
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+ ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+
+ if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+ uint sample = state->sample;
+ uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+ ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+
+ /* accumulate result in output buffer */
+ kernel_write_result(kg, buffer, sample, L);
+
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+ }
+
+ if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+ /* We have completed current work; So get next work */
+ ccl_global uint *work_pools = kernel_split_params.work_pools;
+ uint total_work_size = kernel_split_params.total_work_size;
+ uint work_index;
+
+ if(!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
+ /* If work is invalid, this means no more work is available and the thread may exit */
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+ }
+
+ if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+ ccl_global WorkTile *tile = &kernel_split_params.tile;
+ uint x, y, sample;
+ get_work_pixel(tile, work_index, &x, &y, &sample);
+
+ /* Store buffer offset for writing to passes. */
+ uint buffer_offset = (tile->offset + x + y*tile->stride) * kernel_data.film.pass_stride;
+ kernel_split_state.buffer_offset[ray_index] = buffer_offset;
+
+ /* Initialize random numbers and ray. */
+ uint rng_hash;
+ kernel_path_trace_setup(kg, sample, x, y, &rng_hash, ray);
+
+ if(ray->t != 0.0f) {
+ /* Initialize throughput, path radiance, Ray, PathState;
+ * These rays proceed with path-iteration.
+ */
+ *throughput = make_float3(1.0f, 1.0f, 1.0f);
+ path_radiance_init(L, kernel_data.film.use_light_pass);
+ path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, rng_hash, sample, ray);
+#ifdef __SUBSURFACE__
+ kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
+#endif
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ enqueue_flag = 1;
+ }
+ else {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+ }
+ }
+ }
+
+#ifndef __COMPUTE_DEVICE_GPU__
+ }
+#endif
+
+ /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+ * These rays will be made active during next SceneIntersectkernel.
+ */
+ enqueue_ray_index_local(ray_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ enqueue_flag,
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index 6e158d53d23..77fb61b80a8 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -14,221 +14,96 @@
* limitations under the License.
*/
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
-/* Note on kernel_data_initialization kernel
- * This kernel Initializes structures needed in path-iteration kernels.
- * This is the first kernel in ray-tracing logic.
+/* This kernel Initializes structures needed in path-iteration kernels.
*
- * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE
- *
- * Its input and output are as follows,
- *
- * Un-initialized rng---------------|--- kernel_data_initialization ---|--- Initialized rng
- * Un-initialized throughput -------| |--- Initialized throughput
- * Un-initialized L_transparent ----| |--- Initialized L_transparent
- * Un-initialized PathRadiance -----| |--- Initialized PathRadiance
- * Un-initialized Ray --------------| |--- Initialized Ray
- * Un-initialized PathState --------| |--- Initialized PathState
- * Un-initialized QueueData --------| |--- Initialized QueueData (to QUEUE_EMPTY_SLOT)
- * Un-initialized QueueIndex -------| |--- Initialized QueueIndex (to 0)
- * Un-initialized use_queues_flag---| |--- Initialized use_queues_flag (to false)
- * Un-initialized ray_state --------| |--- Initialized ray_state
- * parallel_samples --------------- | |--- Initialized per_sample_output_buffers
- * rng_state -----------------------| |--- Initialized work_array
- * data ----------------------------| |--- Initialized work_pool_wgs
- * start_sample --------------------| |
- * sx ------------------------------| |
- * sy ------------------------------| |
- * sw ------------------------------| |
- * sh ------------------------------| |
- * stride --------------------------| |
- * queuesize -----------------------| |
- * num_samples ---------------------| |
- *
- * Note on Queues :
+ * Note on Queues:
* All slots in queues are initialized to queue empty slot;
* The number of elements in the queues is initialized to 0;
*/
+
+#ifndef __KERNEL_CPU__
ccl_device void kernel_data_init(
+#else
+void KERNEL_FUNCTION_FULL_NAME(data_init)(
+#endif
KernelGlobals *kg,
- ShaderData *sd_DL_shadow,
ccl_constant KernelData *data,
- ccl_global float *per_sample_output_buffers,
- ccl_global uint *rng_state,
- ccl_global uint *rng_coop, /* rng array to store rng values for all rays */
- ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */
- ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */
- PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */
- ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */
- ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */
- Intersection *Intersection_coop_shadow,
- ccl_global char *ray_state, /* Stores information on current state of a ray */
-
-#define KERNEL_TEX(type, ttype, name) \
- ccl_global type *name,
-#include "../kernel_textures.h"
-
- int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
- int rng_state_offset_x,
- int rng_state_offset_y,
- int rng_state_stride,
- ccl_global int *Queue_data, /* Memory for queues */
+ ccl_global void *split_data_buffer,
+ int num_elements,
+ ccl_global char *ray_state,
+
+#ifdef __KERNEL_OPENCL__
+ KERNEL_BUFFER_PARAMS,
+#endif
+
+ int start_sample,
+ int end_sample,
+ int sx, int sy, int sw, int sh, int offset, int stride,
ccl_global int *Queue_index, /* Tracks the number of elements in queues */
int queuesize, /* size (capacity) of the queue */
ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
- ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */
-#ifdef __WORK_STEALING__
- ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */
- unsigned int num_samples, /* Total number of samples per pixel */
-#endif
-#ifdef __KERNEL_DEBUG__
- DebugData *debugdata_coop,
-#endif
- int parallel_samples) /* Number of samples to be processed in parallel */
+ ccl_global unsigned int *work_pools, /* Work pool for each work group */
+ unsigned int num_samples,
+ ccl_global float *buffer)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, data_init);
+#else
+
+#ifdef __KERNEL_OPENCL__
kg->data = data;
- kg->sd_input = sd_DL_shadow;
- kg->isect_shadow = Intersection_coop_shadow;
-#define KERNEL_TEX(type, ttype, name) \
- kg->name = name;
-#include "../kernel_textures.h"
-
- int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-
-#ifdef __WORK_STEALING__
- int lid = get_local_id(1) * get_local_size(0) + get_local_id(0);
- /* Initialize work_pool_wgs */
- if(lid == 0) {
- int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0);
- work_pool_wgs[group_index] = 0;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-#endif /* __WORK_STEALING__ */
+#endif
- /* Initialize queue data and queue index. */
- if(thread_index < queuesize) {
- /* Initialize active ray queue. */
- Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- /* Initialize background and buffer update queue. */
- Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- /* Initialize shadow ray cast of AO queue. */
- Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- /* Initialize shadow ray cast of direct lighting queue. */
- Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- }
+ kernel_split_params.tile.x = sx;
+ kernel_split_params.tile.y = sy;
+ kernel_split_params.tile.w = sw;
+ kernel_split_params.tile.h = sh;
- if(thread_index == 0) {
- Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
- Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
- Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
- Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
- /* The scene-intersect kernel should not use the queues very first time.
- * since the queue would be empty.
- */
- use_queues_flag[0] = 0;
- }
+ kernel_split_params.tile.start_sample = start_sample;
+ kernel_split_params.tile.num_samples = num_samples;
- int x = get_global_id(0);
- int y = get_global_id(1);
+ kernel_split_params.tile.offset = offset;
+ kernel_split_params.tile.stride = stride;
- if(x < (sw * parallel_samples) && y < sh) {
- int ray_index = x + y * (sw * parallel_samples);
+ kernel_split_params.tile.buffer = buffer;
- /* This is the first assignment to ray_state;
- * So we dont use ASSIGN_RAY_STATE macro.
- */
- ray_state[ray_index] = RAY_ACTIVE;
-
- unsigned int my_sample;
- unsigned int pixel_x;
- unsigned int pixel_y;
- unsigned int tile_x;
- unsigned int tile_y;
- unsigned int my_sample_tile;
-
-#ifdef __WORK_STEALING__
- unsigned int my_work = 0;
- /* Get work. */
- get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
- /* Get the sample associated with the work. */
- my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-
- my_sample_tile = 0;
-
- /* Get pixel and tile position associated with the work. */
- get_pixel_tile_position(&pixel_x, &pixel_y,
- &tile_x, &tile_y,
- my_work,
- sw, sh, sx, sy,
- parallel_samples,
- ray_index);
- work_array[ray_index] = my_work;
-#else /* __WORK_STEALING__ */
- unsigned int tile_index = ray_index / parallel_samples;
- tile_x = tile_index % sw;
- tile_y = tile_index / sw;
- my_sample_tile = ray_index - (tile_index * parallel_samples);
- my_sample = my_sample_tile + start_sample;
-
- /* Initialize work array. */
- work_array[ray_index] = my_sample ;
-
- /* Calculate pixel position of this ray. */
- pixel_x = sx + tile_x;
- pixel_y = sy + tile_y;
-#endif /* __WORK_STEALING__ */
-
- rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
-
- /* Initialise per_sample_output_buffers to all zeros. */
- per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride;
- int per_sample_output_buffers_iterator = 0;
- for(per_sample_output_buffers_iterator = 0;
- per_sample_output_buffers_iterator < kernel_data.film.pass_stride;
- per_sample_output_buffers_iterator++)
- {
- per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f;
- }
+ kernel_split_params.total_work_size = sw * sh * num_samples;
+
+ kernel_split_params.work_pools = work_pools;
- /* Initialize random numbers and ray. */
- kernel_path_trace_setup(kg,
- rng_state,
- my_sample,
- pixel_x, pixel_y,
- &rng_coop[ray_index],
- &Ray_coop[ray_index]);
-
- if(Ray_coop[ray_index].t != 0.0f) {
- /* Initialize throughput, L_transparent, Ray, PathState;
- * These rays proceed with path-iteration.
- */
- throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
- L_transparent_coop[ray_index] = 0.0f;
- path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass);
- path_state_init(kg,
- kg->sd_input,
- &PathState_coop[ray_index],
- &rng_coop[ray_index],
- my_sample,
- &Ray_coop[ray_index]);
-#ifdef __KERNEL_DEBUG__
- debug_data_init(&debugdata_coop[ray_index]);
+ kernel_split_params.queue_index = Queue_index;
+ kernel_split_params.queue_size = queuesize;
+ kernel_split_params.use_queues_flag = use_queues_flag;
+
+ split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state);
+
+#ifdef __KERNEL_OPENCL__
+ kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+ kernel_set_buffer_info(kg);
#endif
- }
- else {
- /* These rays do not participate in path-iteration. */
- float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- /* Accumulate result in output buffer. */
- kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad);
- path_rng_end(kg, rng_state, rng_coop[ray_index]);
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+
+ int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+
+ /* Initialize queue data and queue index. */
+ if(thread_index < queuesize) {
+ for(int i = 0; i < NUM_QUEUES; i++) {
+ kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
}
}
- /* Mark rest of the ray-state indices as RAY_INACTIVE. */
- if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) {
- /* First assignment, hence we dont use ASSIGN_RAY_STATE macro */
- ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE;
+ if(thread_index == 0) {
+ for(int i = 0; i < NUM_QUEUES; i++) {
+ Queue_index[i] = 0;
+ }
+
+ /* The scene-intersect kernel should not use the queues very first time.
+ * since the queue would be empty.
+ */
+ *use_queues_flag = 0;
}
+#endif /* KERENL_STUB */
}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
index 82ca18829d3..2aac66ecb84 100644
--- a/intern/cycles/kernel/split/kernel_direct_lighting.h
+++ b/intern/cycles/kernel/split/kernel_direct_lighting.h
@@ -14,95 +14,136 @@
* limitations under the License.
*/
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
-/* Note on kernel_direct_lighting kernel.
- * This is the eighth kernel in the ray tracing logic. This is the seventh
- * of the path iteration kernels. This kernel takes care of direct lighting
- * logic. However, the "shadow ray cast" part of direct lighting is handled
+/* This kernel takes care of direct lighting logic.
+ * However, the "shadow ray cast" part of direct lighting is handled
* in the next kernel.
*
- * This kernels determines the rays for which a shadow_blocked() function associated with direct lighting should be executed.
- * Those rays for which a shadow_blocked() function for direct-lighting must be executed, are marked with flag RAY_SHADOW_RAY_CAST_DL and
- * enqueued into the queue QUEUE_SHADOW_RAY_CAST_DL_RAYS
+ * This kernels determines the rays for which a shadow_blocked() function
+ * associated with direct lighting should be executed. Those rays for which
+ * a shadow_blocked() function for direct-lighting must be executed, are
+ * marked with flag RAY_SHADOW_RAY_CAST_DL and enqueued into the queue
+ * QUEUE_SHADOW_RAY_CAST_DL_RAYS
*
- * The input and output are as follows,
+ * Note on Queues:
+ * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue
+ * and processes only the rays of state RAY_ACTIVE; If a ray needs to execute
+ * the corresponding shadow_blocked part, after direct lighting, the ray is
+ * marked with RAY_SHADOW_RAY_CAST_DL flag.
*
- * rng_coop -----------------------------------------|--- kernel_direct_lighting --|--- BSDFEval_coop
- * PathState_coop -----------------------------------| |--- ISLamp_coop
- * sd -----------------------------------------------| |--- LightRay_coop
- * ray_state ----------------------------------------| |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---| |
- * kg (globals) -------------------------------------| |
- * queuesize ----------------------------------------| |
- *
- * Note on Queues :
- * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
- * only the rays of state RAY_ACTIVE; If a ray needs to execute the corresponding shadow_blocked
- * part, after direct lighting, the ray is marked with RAY_SHADOW_RAY_CAST_DL flag.
- *
- * State of queues when this kernel is called :
- * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
- * before and after this kernel call.
- * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this
- * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
+ * State of queues when this kernel is called:
+ * - State of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same before and after this
+ * kernel call.
+ * - QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a
+ * shadow_blocked function must be executed, after this kernel call
+ * Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
*/
-ccl_device char kernel_direct_lighting(
- KernelGlobals *kg,
- ShaderData *sd, /* Required for direct lighting */
- ccl_global uint *rng_coop, /* Required for direct lighting */
- ccl_global PathState *PathState_coop, /* Required for direct lighting */
- ccl_global int *ISLamp_coop, /* Required for direct lighting */
- ccl_global Ray *LightRay_coop, /* Required for direct lighting */
- ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */
- ccl_global char *ray_state, /* Denotes the state of each ray */
- int ray_index)
+ccl_device void kernel_direct_lighting(KernelGlobals *kg,
+ ccl_local_param unsigned int *local_queue_atomics)
{
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
char enqueue_flag = 0;
- if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global PathState *state = &PathState_coop[ray_index];
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ ray_index = get_ray_index(kg, ray_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 0);
+
+ if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
/* direct lighting */
#ifdef __EMISSION__
- if((kernel_data.integrator.use_direct_light &&
- (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)))
- {
+ bool flag = (kernel_data.integrator.use_direct_light &&
+ (sd->flag & SD_BSDF_HAS_EVAL));
+
+# ifdef __BRANCHED_PATH__
+ if(flag && kernel_data.integrator.branched) {
+ flag = false;
+ enqueue_flag = 1;
+ }
+# endif /* __BRANCHED_PATH__ */
+
+# ifdef __SHADOW_TRICKS__
+ if(flag && state->flag & PATH_RAY_SHADOW_CATCHER) {
+ flag = false;
+ enqueue_flag = 1;
+ }
+# endif /* __SHADOW_TRICKS__ */
+
+ if(flag) {
/* Sample illumination from lights to find path contribution. */
- ccl_global RNG* rng = &rng_coop[ray_index];
- float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
float light_u, light_v;
- path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
- float terminate = path_state_rng_light_termination(kg, rng, state);
+ path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
+ float terminate = path_state_rng_light_termination(kg, state);
LightSample ls;
if(light_sample(kg,
- light_t, light_u, light_v,
- ccl_fetch(sd, time),
- ccl_fetch(sd, P),
+ light_u, light_v,
+ sd->time,
+ sd->P,
state->bounce,
&ls)) {
Ray light_ray;
-#ifdef __OBJECT_MOTION__
- light_ray.time = ccl_fetch(sd, time);
-#endif
+ light_ray.time = sd->time;
BsdfEval L_light;
bool is_lamp;
- if(direct_emission(kg, sd, kg->sd_input, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
+ if(direct_emission(kg, sd, &kernel_split_state.sd_DL_shadow[ray_index], &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
/* Write intermediate data to global memory to access from
* the next kernel.
*/
- LightRay_coop[ray_index] = light_ray;
- BSDFEval_coop[ray_index] = L_light;
- ISLamp_coop[ray_index] = is_lamp;
+ kernel_split_state.light_ray[ray_index] = light_ray;
+ kernel_split_state.bsdf_eval[ray_index] = L_light;
+ kernel_split_state.is_lamp[ray_index] = is_lamp;
/* Mark ray state for next shadow kernel. */
- ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
enqueue_flag = 1;
}
}
}
#endif /* __EMISSION__ */
}
- return enqueue_flag;
+
+#ifdef __EMISSION__
+ /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
+ enqueue_ray_index_local(ray_index,
+ QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+ enqueue_flag,
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+#endif
+
+#ifdef __BRANCHED_PATH__
+ /* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays
+ * this is the last kernel before next_iteration_setup that uses local atomics so we do this here
+ */
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ enqueue_ray_index_local(ray_index,
+ QUEUE_LIGHT_INDIRECT_ITER,
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER),
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+
+#endif /* __BRANCHED_PATH__ */
}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h
new file mode 100644
index 00000000000..491487f1230
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_do_volume.h
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#if defined(__BRANCHED_PATH__) && defined(__VOLUME__)
+
+ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg, int ray_index)
+{
+ kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+ ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT);
+}
+
+ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg, int ray_index)
+{
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+ /* GPU: no decoupled ray marching, scatter probalistically */
+ int num_samples = kernel_data.integrator.volume_samples;
+ float num_samples_inv = 1.0f/num_samples;
+
+ Ray volume_ray = branched_state->ray;
+ volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ? branched_state->isect.t : FLT_MAX;
+
+ bool heterogeneous = volume_stack_is_heterogeneous(kg, branched_state->path_state.volume_stack);
+
+ for(int j = branched_state->next_sample; j < num_samples; j++) {
+ ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
+ *ps = branched_state->path_state;
+
+ ccl_global Ray *pray = &kernel_split_state.ray[ray_index];
+ *pray = branched_state->ray;
+
+ ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
+ *tp = branched_state->throughput * num_samples_inv;
+
+ /* branch RNG state */
+ path_state_branch(ps, j, num_samples);
+
+ /* integrate along volume segment with distance sampling */
+ VolumeIntegrateResult result = kernel_volume_integrate(
+ kg, ps, sd, &volume_ray, L, tp, heterogeneous);
+
+# ifdef __VOLUME_SCATTER__
+ if(result == VOLUME_PATH_SCATTERED) {
+ /* direct lighting */
+ kernel_path_volume_connect_light(kg, sd, emission_sd, *tp, &branched_state->path_state, L);
+
+ /* indirect light bounce */
+ if(!kernel_path_volume_bounce(kg, sd, tp, ps, &L->state, pray)) {
+ continue;
+ }
+
+ /* start the indirect path */
+ branched_state->next_closure = 0;
+ branched_state->next_sample = j+1;
+
+ /* Attempting to share too many samples is slow for volumes as it causes us to
+ * loop here more and have many calls to kernel_volume_integrate which evaluates
+ * shaders. The many expensive shader evaluations cause the work load to become
+ * unbalanced and many threads to become idle in this kernel. Limiting the
+ * number of shared samples here helps quite a lot.
+ */
+ if(branched_state->shared_sample_count < 2) {
+ if(kernel_split_branched_indirect_start_shared(kg, ray_index)) {
+ continue;
+ }
+ }
+
+ return true;
+ }
+# endif
+ }
+
+ branched_state->next_sample = num_samples;
+
+ branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+ if(branched_state->waiting_on_shared_samples) {
+ return true;
+ }
+
+ kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+ /* todo: avoid this calculation using decoupled ray marching */
+ float3 throughput = kernel_split_state.throughput[ray_index];
+ kernel_volume_shadow(kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput);
+ kernel_split_state.throughput[ray_index] = throughput;
+
+ return false;
+}
+
+#endif /* __BRANCHED_PATH__ && __VOLUME__ */
+
+ccl_device void kernel_do_volume(KernelGlobals *kg)
+{
+#ifdef __VOLUME__
+ /* We will empty this queue in this kernel. */
+ if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+ kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+# ifdef __BRANCHED_PATH__
+ kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0;
+# endif /* __BRANCHED_PATH__ */
+ }
+
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+
+ if(*kernel_split_params.use_queues_flag) {
+ ray_index = get_ray_index(kg, ray_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+ }
+
+ ccl_global char *ray_state = kernel_split_state.ray_state;
+
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
+ IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+ ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+ ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+ bool hit = ! IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
+
+ /* Sanitize volume stack. */
+ if(!hit) {
+ kernel_volume_clean_stack(kg, state->volume_stack);
+ }
+ /* volume attenuation, emission, scatter */
+ if(state->volume_stack[0].shader != SHADER_NONE) {
+ Ray volume_ray = *ray;
+ volume_ray.t = (hit)? isect->t: FLT_MAX;
+
+# ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+# endif /* __BRANCHED_PATH__ */
+ bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+
+ {
+ /* integrate along volume segment with distance sampling */
+ VolumeIntegrateResult result = kernel_volume_integrate(
+ kg, state, sd, &volume_ray, L, throughput, heterogeneous);
+
+# ifdef __VOLUME_SCATTER__
+ if(result == VOLUME_PATH_SCATTERED) {
+ /* direct lighting */
+ kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);
+
+ /* indirect light bounce */
+ if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ else {
+ kernel_split_path_end(kg, ray_index);
+ }
+ }
+# endif /* __VOLUME_SCATTER__ */
+ }
+
+# ifdef __BRANCHED_PATH__
+ }
+ else {
+ kernel_split_branched_path_volume_indirect_light_init(kg, ray_index);
+
+ if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ }
+# endif /* __BRANCHED_PATH__ */
+ }
+ }
+
+# ifdef __BRANCHED_PATH__
+ /* iter loop */
+ ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+ QUEUE_VOLUME_INDIRECT_ITER,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+
+ if(IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) {
+ /* for render passes, sum and reset indirect light pass variables
+ * for the next samples */
+ path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
+ path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
+
+ if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ }
+# endif /* __BRANCHED_PATH__ */
+
+#endif /* __VOLUME__ */
+}
+
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_enqueue_inactive.h b/intern/cycles/kernel/split/kernel_enqueue_inactive.h
new file mode 100644
index 00000000000..496355bbc3a
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_enqueue_inactive.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_enqueue_inactive(KernelGlobals *kg,
+ ccl_local_param unsigned int *local_queue_atomics)
+{
+#ifdef __BRANCHED_PATH__
+ /* Enqeueue RAY_INACTIVE rays into QUEUE_INACTIVE_RAYS queue. */
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+
+ char enqueue_flag = 0;
+ if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_INACTIVE)) {
+ enqueue_flag = 1;
+ }
+
+ enqueue_ray_index_local(ray_index,
+ QUEUE_INACTIVE_RAYS,
+ enqueue_flag,
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+#endif /* __BRANCHED_PATH__ */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 435d1171d5c..906bad8ceb6 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -14,247 +14,161 @@
* limitations under the License.
*/
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
-/* Note on kernel_holdout_emission_blurring_pathtermination_ao kernel.
- * This is the sixth kernel in the ray tracing logic. This is the fifth
- * of the path iteration kernels. This kernel takes care of the logic to process
- * "material of type holdout", indirect primitive emission, bsdf blurring,
- * probabilistic path termination and AO.
+/* This kernel takes care of the logic to process "material of type holdout",
+ * indirect primitive emission, bsdf blurring, probabilistic path termination
+ * and AO.
*
- * This kernels determines the rays for which a shadow_blocked() function associated with AO should be executed.
- * Those rays for which a shadow_blocked() function for AO must be executed are marked with flag RAY_SHADOW_RAY_CAST_ao and
- * enqueued into the queue QUEUE_SHADOW_RAY_CAST_AO_RAYS
+ * This kernels determines the rays for which a shadow_blocked() function
+ * associated with AO should be executed. Those rays for which a
+ * shadow_blocked() function for AO must be executed are marked with flag
+ * RAY_SHADOW_RAY_CAST_ao and enqueued into the queue
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS
*
* Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
*
- * The input and output are as follows,
+ * Note on Queues:
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS
+ * and processes only the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and
+ * reach RAY_UPDATE_BUFFER state. These rays are enqueued into
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present
+ * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has
+ * been changed to RAY_UPDATE_BUFFER, there is no problem.
*
- * rng_coop ---------------------------------------------|--- kernel_holdout_emission_blurring_pathtermination_ao ---|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * throughput_coop --------------------------------------| |--- PathState_coop
- * PathRadiance_coop ------------------------------------| |--- throughput_coop
- * Intersection_coop ------------------------------------| |--- L_transparent_coop
- * PathState_coop ---------------------------------------| |--- per_sample_output_buffers
- * L_transparent_coop -----------------------------------| |--- PathRadiance_coop
- * sd ---------------------------------------------------| |--- ShaderData
- * ray_state --------------------------------------------| |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| |--- AOAlpha_coop
- * kg (globals) -----------------------------------------| |--- AOBSDF_coop
- * parallel_samples -------------------------------------| |--- AOLightRay_coop
- * per_sample_output_buffers ----------------------------| |
- * sw ---------------------------------------------------| |
- * sh ---------------------------------------------------| |
- * sx ---------------------------------------------------| |
- * sy ---------------------------------------------------| |
- * stride -----------------------------------------------| |
- * work_array -------------------------------------------| |
- * queuesize --------------------------------------------| |
- * start_sample -----------------------------------------| |
- *
- * Note on Queues :
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
- * the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER
- * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
- * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
- * changed to RAY_UPDATE_BUFFER, there is no problem.
- *
- * State of queues when this kernel is called :
+ * State of queues when this kernel is called:
* At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
+ * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and
+ * RAY_REGENERATED rays
+ * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ * RAY_TO_REGENERATE rays.
+ * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
* At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO
+ * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
+ * RAY_REGENERATED and RAY_UPDATE_BUFFER rays.
+ * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ * RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
+ * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with
+ * flag RAY_SHADOW_RAY_CAST_AO
*/
+
ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
KernelGlobals *kg,
- ShaderData *sd, /* Required throughout the kernel except probabilistic path termination and AO */
- ccl_global float *per_sample_output_buffers,
- ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */
- ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */
- ccl_global float *L_transparent_coop, /* Required for handling holdout material */
- PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */
- ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */
- Intersection *Intersection_coop, /* Required for indirect primitive emission */
- ccl_global float3 *AOAlpha_coop, /* Required for AO */
- ccl_global float3 *AOBSDF_coop, /* Required for AO */
- ccl_global Ray *AOLightRay_coop, /* Required for AO */
- int sw, int sh, int sx, int sy, int stride,
- ccl_global char *ray_state, /* Denotes the state of each ray */
- ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */
-#ifdef __WORK_STEALING__
- unsigned int start_sample,
-#endif
- int parallel_samples, /* Number of samples to be processed in parallel */
- int ray_index,
- char *enqueue_flag,
- char *enqueue_flag_AO_SHADOW_RAY_CAST)
+ ccl_local_param BackgroundAOLocals *locals)
{
-#ifdef __WORK_STEALING__
- unsigned int my_work;
- unsigned int pixel_x;
- unsigned int pixel_y;
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ locals->queue_atomics_bg = 0;
+ locals->queue_atomics_ao = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+#ifdef __AO__
+ char enqueue_flag = 0;
+#endif
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ ray_index = get_ray_index(kg, ray_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+ /* If we are executing on a GPU device, we exit all threads that are not
+ * required.
+ *
+ * If we are executing on a CPU device, then we need to keep all threads
+ * active since we have barrier() calls later in the kernel. CPU devices,
+ * expect all threads to execute barrier statement.
+ */
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+#endif /* __COMPUTE_DEVICE_GPU__ */
+
+#ifndef __COMPUTE_DEVICE_GPU__
+ if(ray_index != QUEUE_EMPTY_SLOT) {
#endif
- unsigned int tile_x;
- unsigned int tile_y;
- int my_sample_tile;
- unsigned int sample;
- ccl_global RNG *rng = 0x0;
ccl_global PathState *state = 0x0;
float3 throughput;
- if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+ ccl_global char *ray_state = kernel_split_state.ray_state;
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
- throughput = throughput_coop[ray_index];
- state = &PathState_coop[ray_index];
- rng = &rng_coop[ray_index];
-#ifdef __WORK_STEALING__
- my_work = work_array[ray_index];
- sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
- get_pixel_tile_position(&pixel_x, &pixel_y,
- &tile_x, &tile_y,
- my_work,
- sw, sh, sx, sy,
- parallel_samples,
- ray_index);
- my_sample_tile = 0;
-#else /* __WORK_STEALING__ */
- sample = work_array[ray_index];
- /* Buffer's stride is "stride"; Find x and y using ray_index. */
- int tile_index = ray_index / parallel_samples;
- tile_x = tile_index % sw;
- tile_y = tile_index / sw;
- my_sample_tile = ray_index - (tile_index * parallel_samples);
-#endif /* __WORK_STEALING__ */
- per_sample_output_buffers +=
- (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) *
- kernel_data.film.pass_stride;
-
- /* holdout */
-#ifdef __HOLDOUT__
- if((ccl_fetch(sd, flag) & (SD_HOLDOUT|SD_HOLDOUT_MASK)) &&
- (state->flag & PATH_RAY_CAMERA))
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+ uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+ ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+
+ throughput = kernel_split_state.throughput[ray_index];
+ state = &kernel_split_state.path_state[ray_index];
+
+ if(!kernel_path_shader_apply(kg,
+ sd,
+ state,
+ ray,
+ throughput,
+ emission_sd,
+ L,
+ buffer))
{
- if(kernel_data.background.transparent) {
- float3 holdout_weight;
-
- if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK)
- holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
- else
- holdout_weight = shader_holdout_eval(kg, sd);
-
- /* any throughput is ok, should all be identical here */
- L_transparent_coop[ray_index] += average(holdout_weight*throughput);
- }
-
- if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- *enqueue_flag = 1;
- }
+ kernel_split_path_end(kg, ray_index);
}
-#endif /* __HOLDOUT__ */
}
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- PathRadiance *L = &PathRadiance_coop[ray_index];
- /* Holdout mask objects do not write data passes. */
- kernel_write_data_passes(kg,
- per_sample_output_buffers,
- L,
- sd,
- sample,
- state,
- throughput);
- /* Blurring of bsdf after bounces, for rays that have a small likelihood
- * of following this particular path (diffuse, rough glossy.
- */
- if(kernel_data.integrator.filter_glossy != FLT_MAX) {
- float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
- if(blur_pdf < 1.0f) {
- float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
- shader_bsdf_blur(kg, sd, blur_roughness);
- }
- }
-
-#ifdef __EMISSION__
- /* emission */
- if(ccl_fetch(sd, flag) & SD_EMISSION) {
- /* TODO(sergey): is isect.t wrong here for transparent surfaces? */
- float3 emission = indirect_primitive_emission(
- kg,
- sd,
- Intersection_coop[ray_index].t,
- state->flag,
- state->ray_pdf);
- path_radiance_accum_emission(L, throughput, emission, state->bounce);
- }
-#endif /* __EMISSION__ */
-
/* Path termination. this is a strange place to put the termination, it's
* mainly due to the mixed in MIS that we use. gives too many unneeded
* shader evaluations, only need emission if we are going to terminate.
*/
- float probability = path_state_terminate_probability(kg, state, throughput);
+ float probability = path_state_continuation_probability(kg, state, throughput);
if(probability == 0.0f) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- *enqueue_flag = 1;
+ kernel_split_path_end(kg, ray_index);
+ }
+ else if(probability < 1.0f) {
+ float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
+ if(terminate >= probability) {
+ kernel_split_path_end(kg, ray_index);
+ }
+ else {
+ kernel_split_state.throughput[ray_index] = throughput/probability;
+ }
}
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- if(probability != 1.0f) {
- float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
- if(terminate >= probability) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- *enqueue_flag = 1;
- }
- else {
- throughput_coop[ray_index] = throughput/probability;
- }
- }
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ kernel_update_denoising_features(kg, sd, state, L);
}
}
#ifdef __AO__
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
/* ambient occlusion */
- if(kernel_data.integrator.use_ambient_occlusion ||
- (ccl_fetch(sd, flag) & SD_AO))
- {
- /* todo: solve correlation */
- float bsdf_u, bsdf_v;
- path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
- float ao_factor = kernel_data.background.ao_factor;
- float3 ao_N;
- AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
- AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd);
-
- float3 ao_D;
- float ao_pdf;
- sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
- if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
- Ray _ray;
- _ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
- _ray.D = ao_D;
- _ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
- _ray.time = ccl_fetch(sd, time);
-#endif
- _ray.dP = ccl_fetch(sd, dP);
- _ray.dD = differential3_zero();
- AOLightRay_coop[ray_index] = _ray;
-
- ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
- *enqueue_flag_AO_SHADOW_RAY_CAST = 1;
- }
+ if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
+ enqueue_flag = 1;
}
}
#endif /* __AO__ */
+
+#ifndef __COMPUTE_DEVICE_GPU__
+ }
+#endif
+
+#ifdef __AO__
+ /* Enqueue to-shadow-ray-cast rays. */
+ enqueue_ray_index_local(ray_index,
+ QUEUE_SHADOW_RAY_CAST_AO_RAYS,
+ enqueue_flag,
+ kernel_split_params.queue_size,
+ &locals->queue_atomics_ao,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+#endif
}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h
new file mode 100644
index 00000000000..437043a5971
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_indirect_background.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_indirect_background(KernelGlobals *kg)
+{
+ ccl_global char *ray_state = kernel_split_state.ray_state;
+
+ int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ int ray_index;
+
+ if(kernel_data.integrator.ao_bounces != INT_MAX) {
+ ray_index = get_ray_index(kg, thread_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 0);
+
+ if(ray_index != QUEUE_EMPTY_SLOT) {
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ if(path_state_ao_bounce(kg, state)) {
+ kernel_split_path_end(kg, ray_index);
+ }
+ }
+ }
+ }
+
+ ray_index = get_ray_index(kg, thread_index,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 0);
+
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+
+ if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+ float3 throughput = kernel_split_state.throughput[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+ kernel_path_background(kg, state, ray, throughput, emission_sd, L);
+ kernel_split_path_end(kg, ray_index);
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
new file mode 100644
index 00000000000..e9fe5552e8c
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_indirect_subsurface(KernelGlobals *kg)
+{
+ int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if(thread_index == 0) {
+ /* We will empty both queues in this kernel. */
+ kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+ kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+ }
+
+ int ray_index;
+ get_ray_index(kg, thread_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+ ray_index = get_ray_index(kg, thread_index,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+
+#ifdef __SUBSURFACE__
+
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+
+ ccl_global char *ray_state = kernel_split_state.ray_state;
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+ ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+
+#ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched) {
+#endif
+ if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+ ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
+
+ /* Trace indirect subsurface rays by restarting the loop. this uses less
+ * stack memory than invoking kernel_path_indirect.
+ */
+ if(ss_indirect->num_rays) {
+ kernel_path_subsurface_setup_indirect(kg,
+ ss_indirect,
+ state,
+ ray,
+ L,
+ throughput);
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ }
+#ifdef __BRANCHED_PATH__
+ }
+#endif
+
+#endif /* __SUBSURFACE__ */
+
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h
index 3bd0e361078..448456d167d 100644
--- a/intern/cycles/kernel/split/kernel_lamp_emission.h
+++ b/intern/cycles/kernel/split/kernel_lamp_emission.h
@@ -14,70 +14,55 @@
* limitations under the License.
*/
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
-/* Note on kernel_lamp_emission
- * This is the 3rd kernel in the ray-tracing logic. This is the second of the
- * path-iteration kernels. This kernel takes care of the indirect lamp emission logic.
- * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE
- * and RAY_HIT_BACKGROUND.
+/* This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ * It processes rays of state RAY_ACTIVE and RAY_HIT_BACKGROUND.
* We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel.
- * The input/output of the kernel is as follows,
- * Throughput_coop ------------------------------------|--- kernel_lamp_emission --|--- PathRadiance_coop
- * Ray_coop -------------------------------------------| |--- Queue_data(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * PathState_coop -------------------------------------| |--- Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * kg (globals) ---------------------------------------| |
- * Intersection_coop ----------------------------------| |
- * ray_state ------------------------------------------| |
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -----| |
- * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ----| |
- * queuesize ------------------------------------------| |
- * use_queues_flag ------------------------------------| |
- * sw -------------------------------------------------| |
- * sh -------------------------------------------------| |
*/
-ccl_device void kernel_lamp_emission(
- KernelGlobals *kg,
- ccl_global float3 *throughput_coop, /* Required for lamp emission */
- PathRadiance *PathRadiance_coop, /* Required for lamp emission */
- ccl_global Ray *Ray_coop, /* Required for lamp emission */
- ccl_global PathState *PathState_coop, /* Required for lamp emission */
- Intersection *Intersection_coop, /* Required for lamp emission */
- ccl_global char *ray_state, /* Denotes the state of each ray */
- int sw, int sh,
- ccl_global char *use_queues_flag, /* Used to decide if this kernel should use
- * queues to fetch ray index
- */
- int ray_index)
+ccl_device void kernel_lamp_emission(KernelGlobals *kg)
{
- if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
- IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND))
- {
- PathRadiance *L = &PathRadiance_coop[ray_index];
- ccl_global PathState *state = &PathState_coop[ray_index];
+#ifndef __VOLUME__
+ /* We will empty this queue in this kernel. */
+ if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+ kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+ }
+#endif
+ /* Fetch use_queues_flag. */
+ char local_use_queues_flag = *kernel_split_params.use_queues_flag;
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
- float3 throughput = throughput_coop[ray_index];
- Ray ray = Ray_coop[ray_index];
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if(local_use_queues_flag) {
+ ray_index = get_ray_index(kg, ray_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+#ifndef __VOLUME__
+ 1
+#else
+ 0
+#endif
+ );
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+ }
-#ifdef __LAMP_MIS__
- if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
- /* ray starting from previous non-transparent bounce */
- Ray light_ray;
+ if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND))
+ {
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- light_ray.P = ray.P - state->ray_t*ray.D;
- state->ray_t += Intersection_coop[ray_index].t;
- light_ray.D = ray.D;
- light_ray.t = state->ray_t;
- light_ray.time = ray.time;
- light_ray.dD = ray.dD;
- light_ray.dP = ray.dP;
- /* intersect with lamp */
- float3 emission;
+ float3 throughput = kernel_split_state.throughput[ray_index];
+ Ray ray = kernel_split_state.ray[ray_index];
+ ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
- if(indirect_lamp_emission(kg, kg->sd_input, state, &light_ray, &emission)) {
- path_radiance_accum_emission(L, throughput, emission, state->bounce);
- }
- }
-#endif /* __LAMP_MIS__ */
+ kernel_path_lamp_emission(kg, state, &ray, throughput, isect, emission_sd, L);
}
}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
index 816f3a6fbff..c3373174582 100644
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -14,128 +14,230 @@
* limitations under the License.
*/
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
-/* Note on kernel_setup_next_iteration kernel.
- * This is the tenth kernel in the ray tracing logic. This is the ninth
- * of the path iteration kernels. This kernel takes care of setting up
- * Ray for the next iteration of path-iteration and accumulating radiance
- * corresponding to AO and direct-lighting
+/*This kernel takes care of setting up ray for the next iteration of
+ * path-iteration and accumulating radiance corresponding to AO and
+ * direct-lighting
*
- * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
+ * Ray state of rays that are terminated in this kernel are changed
+ * to RAY_UPDATE_BUFFER.
*
- * The input and output are as follows,
+ * Note on queues:
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS
+ * and processes only the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and
+ * reach RAY_UPDATE_BUFF state. These rays are enqueued into
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present
+ * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has
+ * been changed to RAY_UPDATE_BUFF, there is no problem.
*
- * rng_coop ---------------------------------------------|--- kernel_next_iteration_setup -|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * throughput_coop --------------------------------------| |--- Queue_data (QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
- * PathRadiance_coop ------------------------------------| |--- throughput_coop
- * PathState_coop ---------------------------------------| |--- PathRadiance_coop
- * sd ---------------------------------------------------| |--- PathState_coop
- * ray_state --------------------------------------------| |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS) --------| |--- Ray_coop
- * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| |--- use_queues_flag
- * Ray_coop ---------------------------------------------| |
- * kg (globals) -----------------------------------------| |
- * LightRay_dl_coop -------------------------------------|
- * ISLamp_coop ------------------------------------------|
- * BSDFEval_coop ----------------------------------------|
- * LightRay_ao_coop -------------------------------------|
- * AOBSDF_coop ------------------------------------------|
- * AOAlpha_coop -----------------------------------------|
- *
- * Note on queues,
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
- * the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFF
- * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
- * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
- * changed to RAY_UPDATE_BUFF, there is no problem.
- *
- * State of queues when this kernel is called :
+ * State of queues when this kernel is called:
* At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
+ * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
+ * RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
+ * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ * RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
* At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays
+ * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
+ * RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
+ * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ * RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays.
*/
-ccl_device char kernel_next_iteration_setup(
- KernelGlobals *kg,
- ShaderData *sd, /* Required for setting up ray for next iteration */
- ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */
- ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */
- PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */
- ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */
- ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
- ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */
- ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */
- ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */
- ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */
- ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */
- ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */
- ccl_global char *ray_state, /* Denotes the state of each ray */
- ccl_global char *use_queues_flag, /* flag to decide if scene_intersect kernel should
- * use queues to fetch ray index */
- int ray_index)
+
+#ifdef __BRANCHED_PATH__
+ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index)
{
- char enqueue_flag = 0;
-
- /* Load ShaderData structure. */
- PathRadiance *L = NULL;
- ccl_global PathState *state = NULL;
-
- /* Path radiance update for AO/Direct_lighting's shadow blocked. */
- if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
- IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
- {
- state = &PathState_coop[ray_index];
- L = &PathRadiance_coop[ray_index];
- float3 _throughput = throughput_coop[ray_index];
-
- if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
- float3 shadow = LightRay_ao_coop[ray_index].P;
- char update_path_radiance = LightRay_ao_coop[ray_index].t;
- if(update_path_radiance) {
- path_radiance_accum_ao(L,
- _throughput,
- AOAlpha_coop[ray_index],
- AOBSDF_coop[ray_index],
- shadow,
- state->bounce);
+ kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+ ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT);
+}
+
+ccl_device void kernel_split_branched_indirect_light_end(KernelGlobals *kg, int ray_index)
+{
+ kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+ ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+
+ /* continue in case of transparency */
+ *throughput *= shader_bsdf_transparency(kg, sd);
+
+ if(is_zero(*throughput)) {
+ kernel_split_path_end(kg, ray_index);
+ }
+ else {
+ /* Update Path State */
+ state->flag |= PATH_RAY_TRANSPARENT;
+ state->transparent_bounce++;
+
+ ray->P = ray_offset(sd->P, -sd->Ng);
+ ray->t -= sd->ray_length; /* clipping works through transparent */
+
+# ifdef __RAY_DIFFERENTIALS__
+ ray->dP = sd->dP;
+ ray->dD.dx = -sd->dI.dx;
+ ray->dD.dy = -sd->dI.dy;
+# endif /* __RAY_DIFFERENTIALS__ */
+
+# ifdef __VOLUME__
+ /* enter/exit volume */
+ kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
+# endif /* __VOLUME__ */
+ }
+}
+#endif /* __BRANCHED_PATH__ */
+
+ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
+ ccl_local_param unsigned int *local_queue_atomics)
+{
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+ /* If we are here, then it means that scene-intersect kernel
+ * has already been executed atleast once. From the next time,
+ * scene-intersect kernel may operate on queues to fetch ray index
+ */
+ *kernel_split_params.use_queues_flag = 1;
+
+ /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and
+ * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the
+ * previous kernel.
+ */
+ kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+ kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+ }
+
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ ray_index = get_ray_index(kg, ray_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 0);
+
+ ccl_global char *ray_state = kernel_split_state.ray_state;
+
+ bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE);
+ if(active) {
+ ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+
+#ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+#endif
+ /* Compute direct lighting and next bounce. */
+ if(!kernel_path_surface_bounce(kg, sd, throughput, state, &L->state, ray)) {
+ kernel_split_path_end(kg, ray_index);
}
- REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+#ifdef __BRANCHED_PATH__
}
+ else {
+ kernel_split_branched_indirect_light_init(kg, ray_index);
- if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
- float3 shadow = LightRay_dl_coop[ray_index].P;
- char update_path_radiance = LightRay_dl_coop[ray_index].t;
- if(update_path_radiance) {
- BsdfEval L_light = BSDFEval_coop[ray_index];
- path_radiance_accum_light(L,
- _throughput,
- &L_light,
- shadow,
- 1.0f,
- state->bounce,
- ISLamp_coop[ray_index]);
+ if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+ ray_index,
+ 1.0f,
+ &kernel_split_state.branched_state[ray_index].sd,
+ true,
+ true))
+ {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ else {
+ kernel_split_branched_indirect_light_end(kg, ray_index);
}
- REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
}
+#endif /* __BRANCHED_PATH__ */
}
- if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global float3 *throughput = &throughput_coop[ray_index];
- ccl_global Ray *ray = &Ray_coop[ray_index];
- ccl_global RNG *rng = &rng_coop[ray_index];
- state = &PathState_coop[ray_index];
- L = &PathRadiance_coop[ray_index];
-
- /* Compute direct lighting and next bounce. */
- if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- enqueue_flag = 1;
+ /* Enqueue RAY_UPDATE_BUFFER rays. */
+ enqueue_ray_index_local(ray_index,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active,
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+
+#ifdef __BRANCHED_PATH__
+ /* iter loop */
+ if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+ kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0;
+ }
+
+ ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+ QUEUE_LIGHT_INDIRECT_ITER,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+
+ if(IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) {
+ /* for render passes, sum and reset indirect light pass variables
+ * for the next samples */
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+
+ path_radiance_sum_indirect(L);
+ path_radiance_reset_indirect(L);
+
+ if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+ ray_index,
+ 1.0f,
+ &kernel_split_state.branched_state[ray_index].sd,
+ true,
+ true))
+ {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ else {
+ kernel_split_branched_indirect_light_end(kg, ray_index);
}
}
- return enqueue_flag;
+# ifdef __VOLUME__
+ /* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ enqueue_ray_index_local(ray_index,
+ QUEUE_VOLUME_INDIRECT_ITER,
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER),
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+
+# endif /* __VOLUME__ */
+
+# ifdef __SUBSURFACE__
+ /* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ enqueue_ray_index_local(ray_index,
+ QUEUE_SUBSURFACE_INDIRECT_ITER,
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER),
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+# endif /* __SUBSURFACE__ */
+#endif /* __BRANCHED_PATH__ */
}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h
new file mode 100644
index 00000000000..5ad62b585fe
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_path_init.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* This kernel initializes structures needed in path-iteration kernels.
+ * This is the first kernel in ray-tracing logic.
+ *
+ * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE
+ */
+ccl_device void kernel_path_init(KernelGlobals *kg) {
+ int ray_index = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);
+
+ /* This is the first assignment to ray_state;
+ * So we dont use ASSIGN_RAY_STATE macro.
+ */
+ kernel_split_state.ray_state[ray_index] = RAY_ACTIVE;
+
+ /* Get work. */
+ ccl_global uint *work_pools = kernel_split_params.work_pools;
+ uint total_work_size = kernel_split_params.total_work_size;
+ uint work_index;
+
+ if(!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
+ /* No more work, mark ray as inactive */
+ kernel_split_state.ray_state[ray_index] = RAY_INACTIVE;
+
+ return;
+ }
+
+ ccl_global WorkTile *tile = &kernel_split_params.tile;
+ uint x, y, sample;
+ get_work_pixel(tile, work_index, &x, &y, &sample);
+
+ /* Store buffer offset for writing to passes. */
+ uint buffer_offset = (tile->offset + x + y*tile->stride) * kernel_data.film.pass_stride;
+ kernel_split_state.buffer_offset[ray_index] = buffer_offset;
+
+ /* Initialize random numbers and ray. */
+ uint rng_hash;
+ kernel_path_trace_setup(kg,
+ sample,
+ x, y,
+ &rng_hash,
+ &kernel_split_state.ray[ray_index]);
+
+ if(kernel_split_state.ray[ray_index].t != 0.0f) {
+ /* Initialize throughput, path radiance, Ray, PathState;
+ * These rays proceed with path-iteration.
+ */
+ kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
+ path_radiance_init(&kernel_split_state.path_radiance[ray_index], kernel_data.film.use_light_pass);
+ path_state_init(kg,
+ &kernel_split_state.sd_DL_shadow[ray_index],
+ &kernel_split_state.path_state[ray_index],
+ rng_hash,
+ sample,
+ &kernel_split_state.ray[ray_index]);
+#ifdef __SUBSURFACE__
+ kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
+#endif
+ }
+ else {
+ ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE);
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h
new file mode 100644
index 00000000000..66ce2dfb6f1
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* This kernel enqueues rays of different ray state into their
+ * appropriate queues:
+ *
+ * 1. Rays that have been determined to hit the background from the
+ * "kernel_scene_intersect" kernel are enqueued in
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+ * 2. Rays that have been determined to be actively participating in pat
+ * -iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * State of queue during other times this kernel is called:
+ * At entry,
+ * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
+ * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE
+ * and RAY_UPDATE_BUFFER rays.
+ * At exit,
+ * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ * RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
+ */
+ccl_device void kernel_queue_enqueue(KernelGlobals *kg,
+ ccl_local_param QueueEnqueueLocals *locals)
+{
+ /* We have only 2 cases (Hit/Not-Hit) */
+ int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+
+ if(lidx == 0) {
+ locals->queue_atomics[0] = 0;
+ locals->queue_atomics[1] = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ int queue_number = -1;
+
+ if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) ||
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER) ||
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) {
+ queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+ }
+ else if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) {
+ queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+ }
+
+ unsigned int my_lqidx;
+ if(queue_number != -1) {
+ my_lqidx = get_local_queue_index(queue_number, locals->queue_atomics);
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ if(lidx == 0) {
+ locals->queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] =
+ get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ locals->queue_atomics,
+ kernel_split_params.queue_index);
+ locals->queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] =
+ get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ locals->queue_atomics,
+ kernel_split_params.queue_index);
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ unsigned int my_gqidx;
+ if(queue_number != -1) {
+ my_gqidx = get_global_queue_index(queue_number,
+ kernel_split_params.queue_size,
+ my_lqidx,
+ locals->queue_atomics);
+ kernel_split_state.queue_data[my_gqidx] = ray_index;
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
index fc4b4ee38e5..f5378bc172b 100644
--- a/intern/cycles/kernel/split/kernel_scene_intersect.h
+++ b/intern/cycles/kernel/split/kernel_scene_intersect.h
@@ -14,119 +14,66 @@
* limitations under the License.
*/
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
-/* Note on kernel_scene_intersect kernel.
- * This is the second kernel in the ray tracing logic. This is the first
- * of the path iteration kernels. This kernel takes care of scene_intersect function.
+/* This kernel takes care of scene_intersect function.
*
* This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE.
* This kernel processes rays of ray state RAY_ACTIVE
- * This kernel determines the rays that have hit the background and changes their ray state to RAY_HIT_BACKGROUND.
- *
- * The input and output are as follows,
- *
- * Ray_coop ---------------------------------------|--------- kernel_scene_intersect----------|--- PathState
- * PathState_coop ---------------------------------| |--- Intersection
- * ray_state --------------------------------------| |--- ray_state
- * use_queues_flag --------------------------------| |
- * QueueData(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---| |
- * kg (globals) -----------------------------------| |
- * rng_coop ---------------------------------------| |
- * sw ---------------------------------------------| |
- * sh ---------------------------------------------| |
- * queuesize --------------------------------------| |
- *
- * Note on Queues :
- * Ideally we would want kernel_scene_intersect to work on queues.
- * But during the very first time, the queues will be empty and hence we perform a direct mapping
- * between ray-index and thread-index; From the next time onward, the queue will be filled and
- * we may start operating on queues.
- *
- * State of queue during the first time this kernel is called :
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.before and after this kernel
- *
- * State of queues during other times this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will have a mix of RAY_ACTIVE, RAY_UPDATE_BUFFER and RAY_REGENERATED rays;
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays ;
- * (The rays that are in the state RAY_UPDATE_BUFFER in both the queues are actually the same rays; These
- * are the rays that were in RAY_ACTIVE state during the initial enqueue but on further processing
- * , by different kernels, have turned into RAY_UPDATE_BUFFER rays. Since all kernel, even after fetching from
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS, proceed further based on ray state information, RAY_UPDATE_BUFFER rays
- * being present in QUEUE_ACTIVE_AND_REGENERATED_RAYS does not cause any logical issues)
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS - All RAY_REGENERATED rays will have been converted to RAY_ACTIVE and
- * Some rays in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue will move to state RAY_HIT_BACKGROUND
- * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS - no change
+ * This kernel determines the rays that have hit the background and changes
+ * their ray state to RAY_HIT_BACKGROUND.
*/
-
-ccl_device void kernel_scene_intersect(
- KernelGlobals *kg,
- ccl_global uint *rng_coop,
- ccl_global Ray *Ray_coop, /* Required for scene_intersect */
- ccl_global PathState *PathState_coop, /* Required for scene_intersect */
- Intersection *Intersection_coop, /* Required for scene_intersect */
- ccl_global char *ray_state, /* Denotes the state of each ray */
- int sw, int sh,
- ccl_global char *use_queues_flag, /* used to decide if this kernel should use
- * queues to fetch ray index */
-#ifdef __KERNEL_DEBUG__
- DebugData *debugdata_coop,
-#endif
- int ray_index)
+ccl_device void kernel_scene_intersect(KernelGlobals *kg)
{
- /* All regenerated rays become active here */
- if(IS_STATE(ray_state, ray_index, RAY_REGENERATED))
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE);
-
- if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE))
- return;
+ /* Fetch use_queues_flag */
+ char local_use_queues_flag = *kernel_split_params.use_queues_flag;
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
-#ifdef __KERNEL_DEBUG__
- DebugData *debug_data = &debugdata_coop[ray_index];
-#endif
- Intersection *isect = &Intersection_coop[ray_index];
- PathState state = PathState_coop[ray_index];
- Ray ray = Ray_coop[ray_index];
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if(local_use_queues_flag) {
+ ray_index = get_ray_index(kg, ray_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 0);
- /* intersect scene */
- uint visibility = path_state_ray_visibility(kg, &state);
-
-#ifdef __HAIR__
- float difl = 0.0f, extmax = 0.0f;
- uint lcg_state = 0;
- RNG rng = rng_coop[ray_index];
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+ }
- if(kernel_data.bvh.have_curves) {
- if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {
- float3 pixdiff = ray.dD.dx + ray.dD.dy;
- /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
- difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
+ /* All regenerated rays become active here */
+ if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) {
+#ifdef __BRANCHED_PATH__
+ if(kernel_split_state.branched_state[ray_index].waiting_on_shared_samples) {
+ kernel_split_path_end(kg, ray_index);
+ }
+ else
+#endif /* __BRANCHED_PATH__ */
+ {
+ ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
}
+ }
- extmax = kernel_data.curve.maximum_width;
- lcg_state = lcg_state_init(&rng, &state, 0x51633e2d);
+ if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+ return;
}
- bool hit = scene_intersect(kg, ray, visibility, isect, &lcg_state, difl, extmax);
-#else
- bool hit = scene_intersect(kg, ray, visibility, isect, NULL, 0.0f, 0.0f);
-#endif
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ Ray ray = kernel_split_state.ray[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-#ifdef __KERNEL_DEBUG__
- if(state.flag & PATH_RAY_CAMERA) {
- debug_data->num_bvh_traversal_steps += isect->num_traversal_steps;
- debug_data->num_bvh_traversed_instances += isect->num_traversed_instances;
- }
- debug_data->num_ray_bounces++;
-#endif
+ Intersection isect;
+ bool hit = kernel_path_scene_intersect(kg, state, &ray, &isect, L);
+ kernel_split_state.isect[ray_index] = isect;
if(!hit) {
/* Change the state of rays that hit the background;
* These rays undergo special processing in the
* background_bufferUpdate kernel.
*/
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
+ ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND);
}
}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
index cef64bf5f36..7032461b04a 100644
--- a/intern/cycles/kernel/split/kernel_shader_eval.h
+++ b/intern/cycles/kernel/split/kernel_shader_eval.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2015 Blender Foundation
+ * Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,57 +14,53 @@
* limitations under the License.
*/
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
-/* Note on kernel_shader_eval kernel
- * This kernel is the 5th kernel in the ray tracing logic. This is
- * the 4rd kernel in path iteration. This kernel sets up the ShaderData
- * structure from the values computed by the previous kernels. It also identifies
- * the rays of state RAY_TO_REGENERATE and enqueues them in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
- *
- * The input and output of the kernel is as follows,
- * rng_coop -------------------------------------------|--- kernel_shader_eval --|--- sd
- * Ray_coop -------------------------------------------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * PathState_coop -------------------------------------| |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Intersection_coop ----------------------------------| |
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS)-------| |
- * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)---| |
- * ray_state ------------------------------------------| |
- * kg (globals) ---------------------------------------| |
- * queuesize ------------------------------------------| |
- *
- * Note on Queues :
- * This kernel reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
- * only the rays of state RAY_ACTIVE;
- * State of queues when this kernel is called,
- * at entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
- * at exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays
+/* This kernel evaluates ShaderData structure from the values computed
+ * by the previous kernels.
*/
-ccl_device void kernel_shader_eval(
- KernelGlobals *kg,
- ShaderData *sd, /* Output ShaderData structure to be filled */
- ccl_global uint *rng_coop, /* Required for rbsdf calculation */
- ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */
- ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */
- Intersection *Intersection_coop, /* Required for setting up shader from ray */
- ccl_global char *ray_state, /* Denotes the state of each ray */
- int ray_index)
+ccl_device void kernel_shader_eval(KernelGlobals *kg)
{
+
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ /* Sorting on cuda split is not implemented */
+#ifdef __KERNEL_CUDA__
+ int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
+#else
+ int queue_index = kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS];
+#endif
+ if(ray_index >= queue_index) {
+ return;
+ }
+ ray_index = get_ray_index(kg, ray_index,
+#ifdef __KERNEL_CUDA__
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+#else
+ QUEUE_SHADER_SORTED_RAYS,
+#endif
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 0);
+
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+
+ ccl_global char *ray_state = kernel_split_state.ray_state;
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- Intersection *isect = &Intersection_coop[ray_index];
- ccl_global uint *rng = &rng_coop[ray_index];
- ccl_global PathState *state = &PathState_coop[ray_index];
- Ray ray = Ray_coop[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- shader_setup_from_ray(kg,
- sd,
- isect,
- &ray);
- float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF);
- shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
+ shader_eval_surface(kg, &kernel_split_state.sd[ray_index], state, state->flag);
+#ifdef __BRANCHED_PATH__
+ if(kernel_data.integrator.branched) {
+ shader_merge_closures(&kernel_split_state.sd[ray_index]);
+ }
+ else
+#endif
+ {
+ shader_prepare_closures(&kernel_split_state.sd[ray_index], state);
+ }
}
}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_setup.h b/intern/cycles/kernel/split/kernel_shader_setup.h
new file mode 100644
index 00000000000..0432689d9fa
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shader_setup.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* This kernel sets up the ShaderData structure from the values computed
+ * by the previous kernels.
+ *
+ * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them
+ * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+ */
+ccl_device void kernel_shader_setup(KernelGlobals *kg,
+ ccl_local_param unsigned int *local_queue_atomics)
+{
+ /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
+ if(ray_index >= queue_index) {
+ return;
+ }
+ ray_index = get_ray_index(kg, ray_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 0);
+
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+
+ char enqueue_flag = (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
+ enqueue_ray_index_local(ray_index,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ enqueue_flag,
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+
+ /* Continue on with shader evaluation. */
+ if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+ Intersection isect = kernel_split_state.isect[ray_index];
+ Ray ray = kernel_split_state.ray[ray_index];
+
+ shader_setup_from_ray(kg,
+ &kernel_split_state.sd[ray_index],
+ &isect,
+ &ray);
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h
new file mode 100644
index 00000000000..5a55b680695
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shader_sort.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+
+ccl_device void kernel_shader_sort(KernelGlobals *kg,
+ ccl_local_param ShaderSortLocals *locals)
+{
+#ifndef __KERNEL_CUDA__
+ int tid = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ uint qsize = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
+ if(tid == 0) {
+ kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS] = qsize;
+ }
+
+ uint offset = (tid/SHADER_SORT_LOCAL_SIZE)*SHADER_SORT_BLOCK_SIZE;
+ if(offset >= qsize) {
+ return;
+ }
+
+ int lid = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
+ uint input = QUEUE_ACTIVE_AND_REGENERATED_RAYS * (kernel_split_params.queue_size);
+ uint output = QUEUE_SHADER_SORTED_RAYS * (kernel_split_params.queue_size);
+ ccl_local uint *local_value = &locals->local_value[0];
+ ccl_local ushort *local_index = &locals->local_index[0];
+
+ /* copy to local memory */
+ for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
+ uint idx = offset + i + lid;
+ uint add = input + idx;
+ uint value = (~0);
+ if(idx < qsize) {
+ int ray_index = kernel_split_state.queue_data[add];
+ bool valid = (ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
+ if(valid) {
+ value = kernel_split_state.sd[ray_index].shader & SHADER_MASK;
+ }
+ }
+ local_value[i + lid] = value;
+ local_index[i + lid] = i + lid;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ /* skip sorting for cpu split kernel */
+# ifdef __KERNEL_OPENCL__
+
+ /* bitonic sort */
+ for(uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) {
+ for(uint inc = length; inc > 0; inc >>= 1) {
+ for(uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) {
+ uint i = lid + ii;
+ bool direction = ((i & (length << 1)) != 0);
+ uint j = i ^ inc;
+ ushort ioff = local_index[i];
+ ushort joff = local_index[j];
+ uint iKey = local_value[ioff];
+ uint jKey = local_value[joff];
+ bool smaller = (jKey < iKey) || (jKey == iKey && j < i);
+ bool swap = smaller ^ (j < i) ^ direction;
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ local_index[i] = (swap) ? joff : ioff;
+ local_index[j] = (swap) ? ioff : joff;
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ }
+ }
+ }
+# endif /* __KERNEL_OPENCL__ */
+
+ /* copy to destination */
+ for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
+ uint idx = offset + i + lid;
+ uint lidx = local_index[i + lid];
+ uint outi = output + idx;
+ uint ini = input + offset + lidx;
+ uint value = local_value[lidx];
+ if(idx < qsize) {
+ kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT : kernel_split_state.queue_data[ini];
+ }
+ }
+#endif /* __KERNEL_CUDA__ */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked.h b/intern/cycles/kernel/split/kernel_shadow_blocked.h
deleted file mode 100644
index 6153af47f96..00000000000
--- a/intern/cycles/kernel/split/kernel_shadow_blocked.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_split_common.h"
-
-/* Note on kernel_shadow_blocked kernel.
- * This is the ninth kernel in the ray tracing logic. This is the eighth
- * of the path iteration kernels. This kernel takes care of "shadow ray cast"
- * logic of the direct lighting and AO part of ray tracing.
- *
- * The input and output are as follows,
- *
- * PathState_coop ----------------------------------|--- kernel_shadow_blocked --|
- * LightRay_dl_coop --------------------------------| |--- LightRay_dl_coop
- * LightRay_ao_coop --------------------------------| |--- LightRay_ao_coop
- * ray_state ---------------------------------------| |--- ray_state
- * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS & | |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_AO_RAYS)
- QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| |
- * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS&
- QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| |
- * kg (globals) ------------------------------------| |
- * queuesize ---------------------------------------| |
- *
- * Note on sd_shadow : sd_shadow is neither input nor output to this kernel. sd_shadow is filled and consumed in this kernel itself.
- * Note on queues :
- * The kernel fetches from QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS queues. We will empty
- * these queues this kernel.
- * State of queues when this kernel is called :
- * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
- * before and after this kernel call.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS will be filled with rays marked with flags RAY_SHADOW_RAY_CAST_AO
- * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit.
- */
-ccl_device void kernel_shadow_blocked(
- KernelGlobals *kg,
- ccl_global PathState *PathState_coop, /* Required for shadow blocked */
- ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */
- ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */
- ccl_global char *ray_state,
- char shadow_blocked_type,
- int ray_index)
-{
- /* Flag determining if we need to update L. */
- char update_path_radiance = 0;
-
- if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
- IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
- {
- ccl_global PathState *state = &PathState_coop[ray_index];
- ccl_global Ray *light_ray_dl_global = &LightRay_dl_coop[ray_index];
- ccl_global Ray *light_ray_ao_global = &LightRay_ao_coop[ray_index];
-
- ccl_global Ray *light_ray_global =
- shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO
- ? light_ray_ao_global
- : light_ray_dl_global;
-
- float3 shadow;
- update_path_radiance = !(shadow_blocked(kg,
- kg->sd_input,
- state,
- light_ray_global,
- &shadow));
-
- /* We use light_ray_global's P and t to store shadow and
- * update_path_radiance.
- */
- light_ray_global->P = shadow;
- light_ray_global->t = update_path_radiance;
- }
-}
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
new file mode 100644
index 00000000000..79aa2c9435b
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Shadow ray cast for AO. */
+ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
+{
+ unsigned int ao_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ int ray_index = QUEUE_EMPTY_SLOT;
+ int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if(thread_index < ao_queue_length) {
+ ray_index = get_ray_index(kg, thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS,
+ kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
+ }
+
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ float3 throughput = kernel_split_state.throughput[ray_index];
+
+#ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched || IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+#endif
+ kernel_path_ao(kg, sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, sd));
+#ifdef __BRANCHED_PATH__
+ }
+ else {
+ kernel_branched_path_ao(kg, sd, emission_sd, L, state, throughput);
+ }
+#endif
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
new file mode 100644
index 00000000000..b52f9a5eb81
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Shadow ray cast for direct visible light. */
+ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
+{
+ unsigned int dl_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ int ray_index = QUEUE_EMPTY_SLOT;
+ int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if(thread_index < dl_queue_length) {
+ ray_index = get_ray_index(kg, thread_index, QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+ kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
+ }
+
+#ifdef __BRANCHED_PATH__
+ /* TODO(mai): move this somewhere else? */
+ if(thread_index == 0) {
+ /* Clear QUEUE_INACTIVE_RAYS before next kernel. */
+ kernel_split_params.queue_index[QUEUE_INACTIVE_RAYS] = 0;
+ }
+#endif /* __BRANCHED_PATH__ */
+
+ if(ray_index == QUEUE_EMPTY_SLOT)
+ return;
+
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ Ray ray = kernel_split_state.light_ray[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ float3 throughput = kernel_split_state.throughput[ray_index];
+
+ BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+ bool is_lamp = kernel_split_state.is_lamp[ray_index];
+
+# if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)
+ bool use_branched = false;
+ int all = 0;
+
+ if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+ use_branched = true;
+ all = 1;
+ }
+# if defined(__BRANCHED_PATH__)
+ else if(kernel_data.integrator.branched) {
+ use_branched = true;
+
+ if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+ all = (kernel_data.integrator.sample_all_lights_indirect);
+ }
+ else
+ {
+ all = (kernel_data.integrator.sample_all_lights_direct);
+ }
+ }
+# endif /* __BRANCHED_PATH__ */
+
+ if(use_branched) {
+ kernel_branched_path_surface_connect_light(kg,
+ sd,
+ emission_sd,
+ state,
+ throughput,
+ 1.0f,
+ L,
+ all);
+ }
+ else
+# endif /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)*/
+ {
+ /* trace shadow ray */
+ float3 shadow;
+
+ if(!shadow_blocked(kg,
+ sd,
+ emission_sd,
+ state,
+ &ray,
+ &shadow))
+ {
+ /* accumulate */
+ path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
+ }
+ else {
+ path_radiance_accum_total_light(L, state, throughput, &L_light);
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
index 2135ee22b2e..21886ee62ee 100644
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -17,48 +17,78 @@
#ifndef __KERNEL_SPLIT_H__
#define __KERNEL_SPLIT_H__
-#include "kernel_compat_opencl.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_image_opencl.h"
-
-#include "util_atomic.h"
-
-#include "kernel_random.h"
-#include "kernel_projection.h"
-#include "kernel_montecarlo.h"
-#include "kernel_differential.h"
-#include "kernel_camera.h"
-
-#include "geom/geom.h"
-#include "bvh/bvh.h"
-
-#include "kernel_accumulate.h"
-#include "kernel_shader.h"
-#include "kernel_light.h"
-#include "kernel_passes.h"
-
-#ifdef __SUBSURFACE__
-#include "kernel_subsurface.h"
+#include "kernel/kernel_math.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/split/kernel_split_data.h"
+
+#include "kernel/kernel_globals.h"
+
+#ifdef __OSL__
+# include "kernel/osl/osl_shader.h"
+#endif
+
+#ifdef __KERNEL_OPENCL__
+# include "kernel/kernels/opencl/kernel_opencl_image.h"
+#endif
+#ifdef __KERNEL_CUDA__
+# include "kernel/kernels/cuda/kernel_cuda_image.h"
+#endif
+#ifdef __KERNEL_CPU__
+# include "kernel/kernels/cpu/kernel_cpu_image.h"
+#endif
+
+#include "util/util_atomic.h"
+
+#include "kernel/kernel_path.h"
+#ifdef __BRANCHED_PATH__
+# include "kernel/kernel_path_branched.h"
#endif
-#ifdef __VOLUME__
-#include "kernel_volume.h"
+#include "kernel/kernel_queues.h"
+#include "kernel/kernel_work_stealing.h"
+
+#ifdef __BRANCHED_PATH__
+# include "kernel/split/kernel_branched.h"
#endif
-#include "kernel_path_state.h"
-#include "kernel_shadow.h"
-#include "kernel_emission.h"
-#include "kernel_path_common.h"
-#include "kernel_path_surface.h"
-#include "kernel_path_volume.h"
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index)
+{
+ ccl_global char *ray_state = kernel_split_state.ray_state;
+
+#ifdef __BRANCHED_PATH__
+ if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT_SHARED)) {
+ int orig_ray = kernel_split_state.branched_state[ray_index].original_ray;
+
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray];
+
+ path_radiance_sum_indirect(L);
+ path_radiance_accum_sample(orig_ray_L, L);
+
+ atomic_fetch_and_dec_uint32((ccl_global uint*)&kernel_split_state.branched_state[orig_ray].shared_sample_count);
-#ifdef __KERNEL_DEBUG__
-#include "kernel_debug.h"
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+ }
+ else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER);
+ }
+ else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER);
+ }
+ else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER);
+ }
+ else {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ }
+#else
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
#endif
+}
-#include "kernel_queues.h"
-#include "kernel_work_stealing.h"
+CCL_NAMESPACE_END
#endif /* __KERNEL_SPLIT_H__ */
diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h
new file mode 100644
index 00000000000..eac22050a38
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_split_data.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_SPLIT_DATA_H__
+#define __KERNEL_SPLIT_DATA_H__
+
+#include "kernel/split/kernel_split_data_types.h"
+#include "kernel/kernel_globals.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements)
+{
+ (void)kg; /* Unused on CPU. */
+
+ uint64_t size = 0;
+#define SPLIT_DATA_ENTRY(type, name, num) + align_up(num_elements * num * sizeof(type), 16)
+ size = size SPLIT_DATA_ENTRIES;
+#undef SPLIT_DATA_ENTRY
+
+ return size;
+}
+
+ccl_device_inline void split_data_init(KernelGlobals *kg,
+ ccl_global SplitData *split_data,
+ size_t num_elements,
+ ccl_global void *data,
+ ccl_global char *ray_state)
+{
+ (void)kg; /* Unused on CPU. */
+
+ ccl_global char *p = (ccl_global char*)data;
+
+#define SPLIT_DATA_ENTRY(type, name, num) \
+ split_data->name = (type*)p; p += align_up(num_elements * num * sizeof(type), 16);
+ SPLIT_DATA_ENTRIES;
+#undef SPLIT_DATA_ENTRY
+
+ split_data->ray_state = ray_state;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __KERNEL_SPLIT_DATA_H__ */
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
new file mode 100644
index 00000000000..b0e6e5f5250
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_split_data_types.h
@@ -0,0 +1,175 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_SPLIT_DATA_TYPES_H__
+#define __KERNEL_SPLIT_DATA_TYPES_H__
+
+CCL_NAMESPACE_BEGIN
+
+/* parameters used by the split kernels, we use a single struct to avoid passing these to each kernel */
+
+typedef struct SplitParams {
+ WorkTile tile;
+ uint total_work_size;
+
+ ccl_global unsigned int *work_pools;
+
+ ccl_global int *queue_index;
+ int queue_size;
+ ccl_global char *use_queues_flag;
+
+ /* Place for storing sd->flag. AMD GPU OpenCL compiler workaround */
+ int dummy_sd_flag;
+} SplitParams;
+
+/* Global memory variables [porting]; These memory is used for
+ * co-operation between different kernels; Data written by one
+ * kernel will be available to another kernel via this global
+ * memory.
+ */
+
+/* SPLIT_DATA_ENTRY(type, name, num) */
+
+#ifdef __BRANCHED_PATH__
+
+typedef ccl_global struct SplitBranchedState {
+ /* various state that must be kept and restored after an indirect loop */
+ PathState path_state;
+ float3 throughput;
+ Ray ray;
+
+ struct ShaderData sd;
+ Intersection isect;
+
+ char ray_state;
+
+ /* indirect loop state */
+ int next_closure;
+ int next_sample;
+
+#ifdef __SUBSURFACE__
+ int ss_next_closure;
+ int ss_next_sample;
+ int next_hit;
+ int num_hits;
+
+ uint lcg_state;
+ SubsurfaceIntersection ss_isect;
+
+# ifdef __VOLUME__
+ VolumeStack volume_stack[VOLUME_STACK_SIZE];
+# endif /* __VOLUME__ */
+#endif /*__SUBSURFACE__ */
+
+ int shared_sample_count; /* number of branched samples shared with other threads */
+ int original_ray; /* index of original ray when sharing branched samples */
+ bool waiting_on_shared_samples;
+} SplitBranchedState;
+
+#define SPLIT_DATA_BRANCHED_ENTRIES \
+ SPLIT_DATA_ENTRY( SplitBranchedState, branched_state, 1)
+#else
+#define SPLIT_DATA_BRANCHED_ENTRIES
+#endif /* __BRANCHED_PATH__ */
+
+#ifdef __SUBSURFACE__
+# define SPLIT_DATA_SUBSURFACE_ENTRIES \
+ SPLIT_DATA_ENTRY(ccl_global SubsurfaceIndirectRays, ss_rays, 1)
+#else
+# define SPLIT_DATA_SUBSURFACE_ENTRIES
+#endif /* __SUBSURFACE__ */
+
+#ifdef __VOLUME__
+# define SPLIT_DATA_VOLUME_ENTRIES \
+ SPLIT_DATA_ENTRY(ccl_global PathState, state_shadow, 1)
+#else
+# define SPLIT_DATA_VOLUME_ENTRIES
+#endif /* __VOLUME__ */
+
+#define SPLIT_DATA_ENTRIES \
+ SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
+ SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
+ SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
+ SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
+ SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
+ SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
+ SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
+ SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
+ SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \
+ SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \
+ SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
+ SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
+ SPLIT_DATA_SUBSURFACE_ENTRIES \
+ SPLIT_DATA_VOLUME_ENTRIES \
+ SPLIT_DATA_BRANCHED_ENTRIES \
+
+/* entries to be copied to inactive rays when sharing branched samples (TODO: which are actually needed?) */
+#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \
+ SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
+ SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
+ SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
+ SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
+ SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
+ SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
+ SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
+ SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
+ SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
+ SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
+ SPLIT_DATA_SUBSURFACE_ENTRIES \
+ SPLIT_DATA_VOLUME_ENTRIES \
+ SPLIT_DATA_BRANCHED_ENTRIES \
+
+/* struct that holds pointers to data in the shared state buffer */
+typedef struct SplitData {
+#define SPLIT_DATA_ENTRY(type, name, num) type *name;
+ SPLIT_DATA_ENTRIES
+#undef SPLIT_DATA_ENTRY
+
+ /* this is actually in a separate buffer from the rest of the split state data (so it can be read back from
+ * the host easily) but is still used the same as the other data so we have it here in this struct as well
+ */
+ ccl_global char *ray_state;
+} SplitData;
+
+#ifndef __KERNEL_CUDA__
+# define kernel_split_state (kg->split_data)
+# define kernel_split_params (kg->split_param_data)
+#else
+__device__ SplitData __split_data;
+# define kernel_split_state (__split_data)
+__device__ SplitParams __split_param_data;
+# define kernel_split_params (__split_param_data)
+#endif /* __KERNEL_CUDA__ */
+
+/* Local storage for queue_enqueue kernel. */
+typedef struct QueueEnqueueLocals {
+ uint queue_atomics[2];
+} QueueEnqueueLocals;
+
+/* Local storage for holdout_emission_blurring_pathtermination_ao kernel. */
+typedef struct BackgroundAOLocals {
+ uint queue_atomics_bg;
+ uint queue_atomics_ao;
+} BackgroundAOLocals;
+
+typedef struct ShaderSortLocals {
+ uint local_value[SHADER_SORT_BLOCK_SIZE];
+ ushort local_index[SHADER_SORT_BLOCK_SIZE];
+} ShaderSortLocals;
+
+CCL_NAMESPACE_END
+
+#endif /* __KERNEL_SPLIT_DATA_TYPES_H__ */
diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
new file mode 100644
index 00000000000..3b957856aea
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
@@ -0,0 +1,313 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#if defined(__BRANCHED_PATH__) && defined(__SUBSURFACE__)
+
+ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg, int ray_index)
+{
+ kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ branched_state->ss_next_closure = 0;
+ branched_state->ss_next_sample = 0;
+
+ branched_state->num_hits = 0;
+ branched_state->next_hit = 0;
+
+ ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT);
+}
+
+ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(KernelGlobals *kg, int ray_index)
+{
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ ShaderData *sd = &branched_state->sd;
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+ for(int i = branched_state->ss_next_closure; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
+
+ if(!CLOSURE_IS_BSSRDF(sc->type))
+ continue;
+
+ /* set up random number generator */
+ if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 &&
+ branched_state->next_closure == 0 && branched_state->next_sample == 0)
+ {
+ branched_state->lcg_state = lcg_state_init_addrspace(&branched_state->path_state,
+ 0x68bc21eb);
+ }
+ int num_samples = kernel_data.integrator.subsurface_samples;
+ float num_samples_inv = 1.0f/num_samples;
+ uint bssrdf_rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);
+
+ /* do subsurface scatter step with copy of shader data, this will
+ * replace the BSSRDF with a diffuse BSDF closure */
+ for(int j = branched_state->ss_next_sample; j < num_samples; j++) {
+ ccl_global SubsurfaceIntersection *ss_isect = &branched_state->ss_isect;
+ float bssrdf_u, bssrdf_v;
+ path_branched_rng_2D(kg,
+ bssrdf_rng_hash,
+ &branched_state->path_state,
+ j,
+ num_samples,
+ PRNG_BSDF_U,
+ &bssrdf_u,
+ &bssrdf_v);
+
+ /* intersection is expensive so avoid doing multiple times for the same input */
+ if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+ uint lcg_state = branched_state->lcg_state;
+ SubsurfaceIntersection ss_isect_private;
+
+ branched_state->num_hits = subsurface_scatter_multi_intersect(kg,
+ &ss_isect_private,
+ sd,
+ sc,
+ &lcg_state,
+ bssrdf_u, bssrdf_v,
+ true);
+
+ branched_state->lcg_state = lcg_state;
+ *ss_isect = ss_isect_private;
+ }
+
+#ifdef __VOLUME__
+ Ray volume_ray = branched_state->ray;
+ bool need_update_volume_stack =
+ kernel_data.integrator.use_volumes &&
+ sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
+#endif /* __VOLUME__ */
+
+ /* compute lighting with the BSDF closure */
+ for(int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) {
+ ShaderData *bssrdf_sd = &kernel_split_state.sd[ray_index];
+ *bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is
+ * important as the indirect path will write into bssrdf_sd */
+
+ SubsurfaceIntersection ss_isect_private = *ss_isect;
+ subsurface_scatter_multi_setup(kg,
+ &ss_isect_private,
+ hit,
+ bssrdf_sd,
+ &branched_state->path_state,
+ branched_state->path_state.flag,
+ sc,
+ true);
+ *ss_isect = ss_isect_private;
+
+ ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index];
+ *hit_state = branched_state->path_state;
+
+ path_state_branch(hit_state, j, num_samples);
+
+#ifdef __VOLUME__
+ if(need_update_volume_stack) {
+ /* Setup ray from previous surface point to the new one. */
+ float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng);
+ volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t);
+
+ /* this next part is expensive as it does scene intersection so only do once */
+ if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+ for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
+ branched_state->volume_stack[k] = hit_state->volume_stack[k];
+ }
+
+ kernel_volume_stack_update_for_subsurface(kg,
+ emission_sd,
+ &volume_ray,
+ branched_state->volume_stack);
+ }
+
+ for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
+ hit_state->volume_stack[k] = branched_state->volume_stack[k];
+ }
+ }
+#endif /* __VOLUME__ */
+
+#ifdef __EMISSION__
+ if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+ /* direct light */
+ if(kernel_data.integrator.use_direct_light) {
+ int all = (kernel_data.integrator.sample_all_lights_direct) ||
+ (branched_state->path_state.flag & PATH_RAY_SHADOW_CATCHER);
+ kernel_branched_path_surface_connect_light(kg,
+ bssrdf_sd,
+ emission_sd,
+ hit_state,
+ branched_state->throughput,
+ num_samples_inv,
+ L,
+ all);
+ }
+ }
+#endif /* __EMISSION__ */
+
+ /* indirect light */
+ if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+ ray_index,
+ num_samples_inv,
+ bssrdf_sd,
+ false,
+ false))
+ {
+ branched_state->ss_next_closure = i;
+ branched_state->ss_next_sample = j;
+ branched_state->next_hit = hit;
+
+ return true;
+ }
+
+ branched_state->next_closure = 0;
+ }
+
+ branched_state->next_hit = 0;
+ }
+
+ branched_state->ss_next_sample = 0;
+ }
+
+ branched_state->ss_next_closure = sd->num_closure;
+
+ branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+ if(branched_state->waiting_on_shared_samples) {
+ return true;
+ }
+
+ kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+ return false;
+}
+
+#endif /* __BRANCHED_PATH__ && __SUBSURFACE__ */
+
+ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
+{
+ int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if(thread_index == 0) {
+ /* We will empty both queues in this kernel. */
+ kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+ kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+ }
+
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ ray_index = get_ray_index(kg, ray_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+ get_ray_index(kg, thread_index,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+
+#ifdef __SUBSURFACE__
+ ccl_global char *ray_state = kernel_split_state.ray_state;
+
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+ ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+ ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+ if(sd->flag & SD_BSSRDF) {
+
+#ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched) {
+#endif
+ if(kernel_path_subsurface_scatter(kg,
+ sd,
+ emission_sd,
+ L,
+ state,
+ ray,
+ throughput,
+ ss_indirect))
+ {
+ kernel_split_path_end(kg, ray_index);
+ }
+#ifdef __BRANCHED_PATH__
+ }
+ else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+ float bssrdf_u, bssrdf_v;
+ path_state_rng_2D(kg,
+ state,
+ PRNG_BSDF_U,
+ &bssrdf_u, &bssrdf_v);
+
+ const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u);
+
+ /* do bssrdf scatter step if we picked a bssrdf closure */
+ if(sc) {
+ uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb);
+ subsurface_scatter_step(kg,
+ sd,
+ state,
+ state->flag,
+ sc,
+ &lcg_state,
+ bssrdf_u, bssrdf_v,
+ false);
+ }
+ }
+ else {
+ kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index);
+
+ if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ }
+#endif
+ }
+ }
+
+# ifdef __BRANCHED_PATH__
+ if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+ kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0;
+ }
+
+ /* iter loop */
+ ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+ QUEUE_SUBSURFACE_INDIRECT_ITER,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+
+ if(IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) {
+ /* for render passes, sum and reset indirect light pass variables
+ * for the next samples */
+ path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
+ path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
+
+ if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ }
+# endif /* __BRANCHED_PATH__ */
+
+#endif /* __SUBSURFACE__ */
+
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h
deleted file mode 100644
index a21e9b6a0b1..00000000000
--- a/intern/cycles/kernel/split/kernel_sum_all_radiance.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../kernel_compat_opencl.h"
-#include "../kernel_math.h"
-#include "../kernel_types.h"
-#include "../kernel_globals.h"
-
-/* Since we process various samples in parallel; The output radiance of different samples
- * are stored in different locations; This kernel combines the output radiance contributed
- * by all different samples and stores them in the RenderTile's output buffer.
- */
-ccl_device void kernel_sum_all_radiance(
- ccl_constant KernelData *data, /* To get pass_stride to offet into buffer */
- ccl_global float *buffer, /* Output buffer of RenderTile */
- ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */
- int parallel_samples, int sw, int sh, int stride,
- int buffer_offset_x,
- int buffer_offset_y,
- int buffer_stride,
- int start_sample)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if(x < sw && y < sh) {
- buffer += ((buffer_offset_x + x) + (buffer_offset_y + y) * buffer_stride) * (data->film.pass_stride);
- per_sample_output_buffer += ((x + y * stride) * parallel_samples) * (data->film.pass_stride);
-
- int sample_stride = (data->film.pass_stride);
-
- int sample_iterator = 0;
- int pass_stride_iterator = 0;
- int num_floats = data->film.pass_stride;
-
- for(sample_iterator = 0; sample_iterator < parallel_samples; sample_iterator++) {
- for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) {
- *(buffer + pass_stride_iterator) =
- (start_sample == 0 && sample_iterator == 0)
- ? *(per_sample_output_buffer + pass_stride_iterator)
- : *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator);
- }
- per_sample_output_buffer += sample_stride;
- }
- }
-}