diff options
Diffstat (limited to 'intern/cycles/kernel/split')
-rw-r--r-- | intern/cycles/kernel/split/kernel_background_buffer_update.h | 282 | ||||
-rw-r--r-- | intern/cycles/kernel/split/kernel_data_init.h | 400 | ||||
-rw-r--r-- | intern/cycles/kernel/split/kernel_direct_lighting.h | 138 | ||||
-rw-r--r-- | intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h | 283 | ||||
-rw-r--r-- | intern/cycles/kernel/split/kernel_lamp_emission.h | 209 | ||||
-rw-r--r-- | intern/cycles/kernel/split/kernel_next_iteration_setup.h | 176 | ||||
-rw-r--r-- | intern/cycles/kernel/split/kernel_queue_enqueue.h | 98 | ||||
-rw-r--r-- | intern/cycles/kernel/split/kernel_scene_intersect.h | 164 | ||||
-rw-r--r-- | intern/cycles/kernel/split/kernel_shader_eval.h | 93 | ||||
-rw-r--r-- | intern/cycles/kernel/split/kernel_shadow_blocked.h | 126 | ||||
-rw-r--r-- | intern/cycles/kernel/split/kernel_split_common.h | 62 | ||||
-rw-r--r-- | intern/cycles/kernel/split/kernel_sum_all_radiance.h | 59 |
12 files changed, 2090 insertions, 0 deletions
diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h new file mode 100644 index 00000000000..95de1a4b2a9 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_background_buffer_update.h @@ -0,0 +1,282 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split_common.h" + +/* + * Note on kernel_background_buffer_update kernel. + * This is the fourth kernel in the ray tracing logic, and the third + * of the path iteration kernels. This kernel takes care of rays that hit + * the background (sceneintersect kernel), and for the rays of + * state RAY_UPDATE_BUFFER it updates the ray's accumulated radiance in + * the output buffer. This kernel also takes care of rays that have been determined + * to-be-regenerated. + * + * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel + * + * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER + * will be eventually set to RAY_TO_REGENERATE state in this kernel. Finally all rays of ray_state + * RAY_TO_REGENERATE will be regenerated and put in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS. + * + * The input and output are as follows, + * + * rng_coop ---------------------------------------------|--- kernel_background_buffer_update --|--- PathRadiance_coop + * throughput_coop --------------------------------------| |--- L_transparent_coop + * per_sample_output_buffers ----------------------------| |--- per_sample_output_buffers + * Ray_coop ---------------------------------------------| |--- ray_state + * PathState_coop ---------------------------------------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * L_transparent_coop -----------------------------------| |--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) + * ray_state --------------------------------------------| |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ----| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) + * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- work_array + * parallel_samples -------------------------------------| |--- PathState_coop + * end_sample -------------------------------------------| |--- throughput_coop + * kg (globals + data) ----------------------------------| |--- rng_coop + * rng_state --------------------------------------------| |--- Ray + * PathRadiance_coop ------------------------------------| | + * sw ---------------------------------------------------| | + * sh ---------------------------------------------------| | + * sx ---------------------------------------------------| | + * sy ---------------------------------------------------| | + * stride -----------------------------------------------| | + * work_array -------------------------------------------| |--- work_array + * queuesize --------------------------------------------| | + * start_sample -----------------------------------------| |--- work_pool_wgs + * work_pool_wgs ----------------------------------------| | + * num_samples ------------------------------------------| | + * + * note on shader_data : shader_data argument is neither an input nor an output for this kernel. It is just filled and consumed here itself. + * Note on Queues : + * This kernel fetches rays from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. + * + * State of queues when this kernel is called : + * At entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty + */ +ccl_device void kernel_background_buffer_update( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, + ccl_global float *per_sample_output_buffers, + ccl_global uint *rng_state, + ccl_global uint *rng_coop, /* Required for buffer Update */ + ccl_global float3 *throughput_coop, /* Required for background hit processing */ + PathRadiance *PathRadiance_coop, /* Required for background hit processing and buffer Update */ + ccl_global Ray *Ray_coop, /* Required for background hit processing */ + ccl_global PathState *PathState_coop, /* Required for background hit processing */ + ccl_global float *L_transparent_coop, /* Required for background hit processing and buffer Update */ + ccl_global char *ray_state, /* Stores information on the current state of a ray */ + int sw, int sh, int sx, int sy, int stride, + int rng_state_offset_x, + int rng_state_offset_y, + int rng_state_stride, + ccl_global unsigned int *work_array, /* Denotes work of each ray */ + ccl_global int *Queue_data, /* Queues memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + int queuesize, /* Size (capacity) of each queue */ + int end_sample, + int start_sample, +#ifdef __WORK_STEALING__ + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, +#endif +#ifdef __KERNEL_DEBUG__ + DebugData *debugdata_coop, +#endif + int parallel_samples /* Number of samples to be processed in parallel */ + ) +{ + ccl_local unsigned int local_queue_atomics; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_queue_atomics = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + if(ray_index == 0) { + /* We will empty this queue in this kernel */ + Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; + } + char enqueue_flag = 0; + ray_index = get_ray_index(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, Queue_data, queuesize, 1); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not required. + * If we are executing on a CPU device, then we need to keep all threads active + * since we have barrier() calls later in the kernel. CPU devices + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) + return; +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + /* Load kernel globals structure and ShaderData strucuture */ + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; + +#ifdef __KERNEL_DEBUG__ + DebugData *debug_data = &debugdata_coop[ray_index]; +#endif + ccl_global PathState *state = &PathState_coop[ray_index]; + PathRadiance *L = L = &PathRadiance_coop[ray_index]; + ccl_global Ray *ray = &Ray_coop[ray_index]; + ccl_global float3 *throughput = &throughput_coop[ray_index]; + ccl_global float *L_transparent = &L_transparent_coop[ray_index]; + ccl_global uint *rng = &rng_coop[ray_index]; + +#ifdef __WORK_STEALING__ + unsigned int my_work; + ccl_global float *initial_per_sample_output_buffers; + ccl_global uint *initial_rng; +#endif + unsigned int sample; + unsigned int tile_x; + unsigned int tile_y; + unsigned int pixel_x; + unsigned int pixel_y; + unsigned int my_sample_tile; + +#ifdef __WORK_STEALING__ + my_work = work_array[ray_index]; + sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; + get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); + my_sample_tile = 0; + initial_per_sample_output_buffers = per_sample_output_buffers; + initial_rng = rng_state; +#else // __WORK_STEALING__ + sample = work_array[ray_index]; + int tile_index = ray_index / parallel_samples; + /* buffer and rng_state's stride is "stride". Find x and y using ray_index */ + tile_x = tile_index % sw; + tile_y = tile_index / sw; + my_sample_tile = ray_index - (tile_index * parallel_samples); +#endif + rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride; + per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; + + if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { + /* eval background shader if nothing hit */ + if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) { + *L_transparent = (*L_transparent) + average((*throughput)); +#ifdef __PASSES__ + if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) +#endif + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + } + + if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) + { +#ifdef __BACKGROUND__ + /* sample background shader */ + float3 L_background = indirect_background(kg, state, ray, sd); + path_radiance_accum_background(L, (*throughput), L_background, state->bounce); +#endif + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + } + } + + if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { + float3 L_sum = path_radiance_clamp_and_sum(kg, L); + kernel_write_light_passes(kg, per_sample_output_buffers, L, sample); +#ifdef __KERNEL_DEBUG__ + kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample); +#endif + float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent)); + + /* accumulate result in output buffer */ + kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); + path_rng_end(kg, rng_state, *rng); + + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); + } + + if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { +#ifdef __WORK_STEALING__ + /* We have completed current work; So get next work */ + int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index); + if(!valid_work) { + /* If work is invalid, this means no more work is available and the thread may exit */ + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); + } +#else + if((sample + parallel_samples) >= end_sample) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); + } +#endif + if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { +#ifdef __WORK_STEALING__ + work_array[ray_index] = my_work; + /* Get the sample associated with the current work */ + sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; + /* Get pixel and tile position associated with current work */ + get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); + my_sample_tile = 0; + + /* Remap rng_state according to the current work */ + rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride); + /* Remap per_sample_output_buffers according to the current work */ + per_sample_output_buffers = initial_per_sample_output_buffers + + (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; +#else + work_array[ray_index] = sample + parallel_samples; + sample = work_array[ray_index]; + + /* Get ray position from ray index */ + pixel_x = sx + ((ray_index / parallel_samples) % sw); + pixel_y = sy + ((ray_index / parallel_samples) / sw); +#endif + + /* initialize random numbers and ray */ + kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray); + + if(ray->t != 0.0f) { + /* Initialize throughput, L_transparent, Ray, PathState; These rays proceed with path-iteration*/ + *throughput = make_float3(1.0f, 1.0f, 1.0f); + *L_transparent = 0.0f; + path_radiance_init(L, kernel_data.film.use_light_pass); + path_state_init(kg, state, rng, sample, ray); +#ifdef __KERNEL_DEBUG__ + debug_data_init(debug_data); +#endif + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + enqueue_flag = 1; + } else { + /*These rays do not participate in path-iteration */ + float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + /* accumulate result in output buffer */ + kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); + path_rng_end(kg, rng_state, *rng); + + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); + } + } + } +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; These rays + * will be made active during next SceneIntersectkernel + */ + enqueue_ray_index_local(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index); +} diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h new file mode 100644 index 00000000000..b7a4d847d03 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_data_init.h @@ -0,0 +1,400 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split_common.h" + +/* + * Note on kernel_data_initialization kernel + * This kernel Initializes structures needed in path-iteration kernels. + * This is the first kernel in ray-tracing logic. + * + * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE + * + * Its input and output are as follows, + * + * Un-initialized rng---------------|--- kernel_data_initialization ---|--- Initialized rng + * Un-initialized throughput -------| |--- Initialized throughput + * Un-initialized L_transparent ----| |--- Initialized L_transparent + * Un-initialized PathRadiance -----| |--- Initialized PathRadiance + * Un-initialized Ray --------------| |--- Initialized Ray + * Un-initialized PathState --------| |--- Initialized PathState + * Un-initialized QueueData --------| |--- Initialized QueueData (to QUEUE_EMPTY_SLOT) + * Un-initilaized QueueIndex -------| |--- Initialized QueueIndex (to 0) + * Un-initialized use_queues_flag---| |--- Initialized use_queues_flag (to false) + * Un-initialized ray_state --------| |--- Initialized ray_state + * parallel_samples --------------- | |--- Initialized per_sample_output_buffers + * rng_state -----------------------| |--- Initialized work_array + * data ----------------------------| |--- Initialized work_pool_wgs + * start_sample --------------------| | + * sx ------------------------------| | + * sy ------------------------------| | + * sw ------------------------------| | + * sh ------------------------------| | + * stride --------------------------| | + * queuesize -----------------------| | + * num_samples ---------------------| | + * + * Note on Queues : + * All slots in queues are initialized to queue empty slot; + * The number of elements in the queues is initialized to 0; + */ +ccl_device void kernel_data_init( + ccl_global char *globals, + ccl_global char *shader_data_sd, /* Arguments related to ShaderData */ + ccl_global char *shader_data_sd_DL_shadow, /* Arguments related to ShaderData */ + + ccl_global float3 *P_sd, + ccl_global float3 *P_sd_DL_shadow, + + ccl_global float3 *N_sd, + ccl_global float3 *N_sd_DL_shadow, + + ccl_global float3 *Ng_sd, + ccl_global float3 *Ng_sd_DL_shadow, + + ccl_global float3 *I_sd, + ccl_global float3 *I_sd_DL_shadow, + + ccl_global int *shader_sd, + ccl_global int *shader_sd_DL_shadow, + + ccl_global int *flag_sd, + ccl_global int *flag_sd_DL_shadow, + + ccl_global int *prim_sd, + ccl_global int *prim_sd_DL_shadow, + + ccl_global int *type_sd, + ccl_global int *type_sd_DL_shadow, + + ccl_global float *u_sd, + ccl_global float *u_sd_DL_shadow, + + ccl_global float *v_sd, + ccl_global float *v_sd_DL_shadow, + + ccl_global int *object_sd, + ccl_global int *object_sd_DL_shadow, + + ccl_global float *time_sd, + ccl_global float *time_sd_DL_shadow, + + ccl_global float *ray_length_sd, + ccl_global float *ray_length_sd_DL_shadow, + + ccl_global int *ray_depth_sd, + ccl_global int *ray_depth_sd_DL_shadow, + + ccl_global int *transparent_depth_sd, + ccl_global int *transparent_depth_sd_DL_shadow, + + /* Ray differentials. */ + ccl_global differential3 *dP_sd, + ccl_global differential3 *dP_sd_DL_shadow, + + ccl_global differential3 *dI_sd, + ccl_global differential3 *dI_sd_DL_shadow, + + ccl_global differential *du_sd, + ccl_global differential *du_sd_DL_shadow, + + ccl_global differential *dv_sd, + ccl_global differential *dv_sd_DL_shadow, + + /* Dp/Du */ + ccl_global float3 *dPdu_sd, + ccl_global float3 *dPdu_sd_DL_shadow, + + ccl_global float3 *dPdv_sd, + ccl_global float3 *dPdv_sd_DL_shadow, + + /* Object motion. */ + ccl_global Transform *ob_tfm_sd, + ccl_global Transform *ob_tfm_sd_DL_shadow, + + ccl_global Transform *ob_itfm_sd, + ccl_global Transform *ob_itfm_sd_DL_shadow, + + ShaderClosure *closure_sd, + ShaderClosure *closure_sd_DL_shadow, + + ccl_global int *num_closure_sd, + ccl_global int *num_closure_sd_DL_shadow, + + ccl_global float *randb_closure_sd, + ccl_global float *randb_closure_sd_DL_shadow, + + ccl_global float3 *ray_P_sd, + ccl_global float3 *ray_P_sd_DL_shadow, + + ccl_global differential3 *ray_dP_sd, + ccl_global differential3 *ray_dP_sd_DL_shadow, + + ccl_constant KernelData *data, + ccl_global float *per_sample_output_buffers, + ccl_global uint *rng_state, + ccl_global uint *rng_coop, /* rng array to store rng values for all rays */ + ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */ + ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */ + PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */ + ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */ + ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */ + ccl_global char *ray_state, /* Stores information on current state of a ray */ + +#define KERNEL_TEX(type, ttype, name) \ + ccl_global type *name, +#include "../kernel_textures.h" + + int start_sample, int sx, int sy, int sw, int sh, int offset, int stride, + int rng_state_offset_x, + int rng_state_offset_y, + int rng_state_stride, + ccl_global int *Queue_data, /* Memory for queues */ + ccl_global int *Queue_index, /* Tracks the number of elements in queues */ + int queuesize, /* size (capacity) of the queue */ + ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */ + ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */ +#ifdef __WORK_STEALING__ + ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */ + unsigned int num_samples, /* Total number of samples per pixel */ +#endif +#ifdef __KERNEL_DEBUG__ + DebugData *debugdata_coop, +#endif + int parallel_samples /* Number of samples to be processed in parallel */ + ) +{ + + /* Load kernel globals structure */ + KernelGlobals *kg = (KernelGlobals *)globals; + + kg->data = data; +#define KERNEL_TEX(type, ttype, name) \ + kg->name = name; +#include "../kernel_textures.h" + + /* Load ShaderData structure */ + ShaderData *sd = (ShaderData *)shader_data_sd; + ShaderData *sd_DL_shadow = (ShaderData *)shader_data_sd_DL_shadow; + + sd->P = P_sd; + sd_DL_shadow->P = P_sd_DL_shadow; + + sd->N = N_sd; + sd_DL_shadow->N = N_sd_DL_shadow; + + sd->Ng = Ng_sd; + sd_DL_shadow->Ng = Ng_sd_DL_shadow; + + sd->I = I_sd; + sd_DL_shadow->I = I_sd_DL_shadow; + + sd->shader = shader_sd; + sd_DL_shadow->shader = shader_sd_DL_shadow; + + sd->flag = flag_sd; + sd_DL_shadow->flag = flag_sd_DL_shadow; + + sd->prim = prim_sd; + sd_DL_shadow->prim = prim_sd_DL_shadow; + + sd->type = type_sd; + sd_DL_shadow->type = type_sd_DL_shadow; + + sd->u = u_sd; + sd_DL_shadow->u = u_sd_DL_shadow; + + sd->v = v_sd; + sd_DL_shadow->v = v_sd_DL_shadow; + + sd->object = object_sd; + sd_DL_shadow->object = object_sd_DL_shadow; + + sd->time = time_sd; + sd_DL_shadow->time = time_sd_DL_shadow; + + sd->ray_length = ray_length_sd; + sd_DL_shadow->ray_length = ray_length_sd_DL_shadow; + + sd->ray_depth = ray_depth_sd; + sd_DL_shadow->ray_depth = ray_depth_sd_DL_shadow; + + sd->transparent_depth = transparent_depth_sd; + sd_DL_shadow->transparent_depth = transparent_depth_sd_DL_shadow; + +#ifdef __RAY_DIFFERENTIALS__ + sd->dP = dP_sd; + sd_DL_shadow->dP = dP_sd_DL_shadow; + + sd->dI = dI_sd; + sd_DL_shadow->dI = dI_sd_DL_shadow; + + sd->du = du_sd; + sd_DL_shadow->du = du_sd_DL_shadow; + + sd->dv = dv_sd; + sd_DL_shadow->dv = dv_sd_DL_shadow; +#ifdef __DPDU__ + sd->dPdu = dPdu_sd; + sd_DL_shadow->dPdu = dPdu_sd_DL_shadow; + + sd->dPdv = dPdv_sd; + sd_DL_shadow->dPdv = dPdv_sd_DL_shadow; +#endif +#endif + +#ifdef __OBJECT_MOTION__ + sd->ob_tfm = ob_tfm_sd; + sd_DL_shadow->ob_tfm = ob_tfm_sd_DL_shadow; + + sd->ob_itfm = ob_itfm_sd; + sd_DL_shadow->ob_itfm = ob_itfm_sd_DL_shadow; +#endif + + sd->closure = closure_sd; + sd_DL_shadow->closure = closure_sd_DL_shadow; + + sd->num_closure = num_closure_sd; + sd_DL_shadow->num_closure = num_closure_sd_DL_shadow; + + sd->randb_closure = randb_closure_sd; + sd_DL_shadow->randb_closure = randb_closure_sd_DL_shadow; + + sd->ray_P = ray_P_sd; + sd_DL_shadow->ray_P = ray_P_sd_DL_shadow; + + sd->ray_dP = ray_dP_sd; + sd_DL_shadow->ray_dP = ray_dP_sd_DL_shadow; + + int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + +#ifdef __WORK_STEALING__ + int lid = get_local_id(1) * get_local_size(0) + get_local_id(0); + /* Initialize work_pool_wgs */ + if(lid == 0) { + int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0); + work_pool_wgs[group_index] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif // __WORK_STEALING__ + + /* Initialize queue data and queue index */ + if(thread_index < queuesize) { + /* Initialize active ray queue */ + Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + /* Initialize background and buffer update queue */ + Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + /* Initialize shadow ray cast of AO queue */ + Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + /* Initialize shadow ray cast of direct lighting queue */ + Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + } + + if(thread_index == 0) { + Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; + Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; + Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; + /* The scene-intersect kernel should not use the queues very first time. + * since the queue would be empty. + */ + use_queues_flag[0] = 0; + } + + int x = get_global_id(0); + int y = get_global_id(1); + + if(x < (sw * parallel_samples) && y < sh) { + + int ray_index = x + y * (sw * parallel_samples); + + /* This is the first assignment to ray_state; So we dont use ASSIGN_RAY_STATE macro */ + ray_state[ray_index] = RAY_ACTIVE; + + unsigned int my_sample; + unsigned int pixel_x; + unsigned int pixel_y; + unsigned int tile_x; + unsigned int tile_y; + unsigned int my_sample_tile; + +#ifdef __WORK_STEALING__ + unsigned int my_work = 0; + /* get work */ + get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index); + /* Get the sample associated with the work */ + my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; + + my_sample_tile = 0; + + /* Get pixel and tile position associated with the work */ + get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); + work_array[ray_index] = my_work; +#else // __WORK_STEALING__ + + unsigned int tile_index = ray_index / parallel_samples; + tile_x = tile_index % sw; + tile_y = tile_index / sw; + my_sample_tile = ray_index - (tile_index * parallel_samples); + my_sample = my_sample_tile + start_sample; + + /* Initialize work array */ + work_array[ray_index] = my_sample ; + + /* Calculate pixel position of this ray */ + pixel_x = sx + tile_x; + pixel_y = sy + tile_y; +#endif // __WORK_STEALING__ + + rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride; + + /* Initialise per_sample_output_buffers to all zeros */ + per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride; + int per_sample_output_buffers_iterator = 0; + for(per_sample_output_buffers_iterator = 0; per_sample_output_buffers_iterator < kernel_data.film.pass_stride; per_sample_output_buffers_iterator++) { + per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f; + } + + /* initialize random numbers and ray */ + kernel_path_trace_setup(kg, rng_state, my_sample, pixel_x, pixel_y, &rng_coop[ray_index], &Ray_coop[ray_index]); + + if(Ray_coop[ray_index].t != 0.0f) { + /* Initialize throuput, L_transparent, Ray, PathState; These rays proceed with path-iteration*/ + throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f); + L_transparent_coop[ray_index] = 0.0f; + path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass); + path_state_init(kg, &PathState_coop[ray_index], &rng_coop[ray_index], my_sample, &Ray_coop[ray_index]); +#ifdef __KERNEL_DEBUG__ + debug_data_init(&debugdata_coop[ray_index]); +#endif + } else { + /*These rays do not participate in path-iteration */ + + float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + /* accumulate result in output buffer */ + kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad); + path_rng_end(kg, rng_state, rng_coop[ray_index]); + + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); + } + } + + /* Mark rest of the ray-state indices as RAY_INACTIVE */ + if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) { + /* First assignment, hence we dont use ASSIGN_RAY_STATE macro */ + ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE; + } +} diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h new file mode 100644 index 00000000000..6b83d892057 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_direct_lighting.h @@ -0,0 +1,138 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split_common.h" + +/* + * Note on kernel_direct_lighting kernel. + * This is the eighth kernel in the ray tracing logic. This is the seventh + * of the path iteration kernels. This kernel takes care of direct lighting + * logic. However, the "shadow ray cast" part of direct lighting is handled + * in the next kernel. + * + * This kernels determines the rays for which a shadow_blocked() function associated with direct lighting should be executed. + * Those rays for which a shadow_blocked() function for direct-lighting must be executed, are marked with flag RAY_SHADOW_RAY_CAST_DL and + * enqueued into the queue QUEUE_SHADOW_RAY_CAST_DL_RAYS + * + * The input and output are as follows, + * + * rng_coop -----------------------------------------|--- kernel_direct_lighting --|--- BSDFEval_coop + * PathState_coop -----------------------------------| |--- ISLamp_coop + * shader_data --------------------------------------| |--- LightRay_coop + * ray_state ----------------------------------------| |--- ray_state + * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---| | + * kg (globals + data) ------------------------------| | + * queuesize ----------------------------------------| | + * + * note on shader_DL : shader_DL is neither input nor output to this kernel; shader_DL is filled and consumed in this kernel itself. + * Note on Queues : + * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes + * only the rays of state RAY_ACTIVE; If a ray needs to execute the corresponding shadow_blocked + * part, after direct lighting, the ray is marked with RAY_SHADOW_RAY_CAST_DL flag. + * + * State of queues when this kernel is called : + * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same + * before and after this kernel call. + * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this + * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty. + */ +ccl_device void kernel_direct_lighting( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required for direct lighting */ + ccl_global char *shader_DL, /* Required for direct lighting */ + ccl_global uint *rng_coop, /* Required for direct lighting */ + ccl_global PathState *PathState_coop, /* Required for direct lighting */ + ccl_global int *ISLamp_coop, /* Required for direct lighting */ + ccl_global Ray *LightRay_coop, /* Required for direct lighting */ + ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + ccl_global int *Queue_data, /* Queue memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + int queuesize /* Size (capacity) of each queue */ + ) +{ + ccl_local unsigned int local_queue_atomics; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_queue_atomics = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + char enqueue_flag = 0; + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not required + * If we are executing on a CPU device, then we need to keep all threads active + * since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) + return; +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + /* Load kernel globals structure and ShaderData structure */ + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; + ShaderData *sd_DL = (ShaderData *)shader_DL; + + ccl_global PathState *state = &PathState_coop[ray_index]; + + /* direct lighting */ +#ifdef __EMISSION__ + if((kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) { + /* sample illumination from lights to find path contribution */ + ccl_global RNG* rng = &rng_coop[ray_index]; + float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); + float light_u, light_v; + path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + + LightSample ls; + light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls); + + Ray light_ray; +#ifdef __OBJECT_MOTION__ + light_ray.time = ccl_fetch(sd, time); +#endif + + BsdfEval L_light; + bool is_lamp; + if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce, sd_DL)) { + /* write intermediate data to global memory to access from the next kernel */ + LightRay_coop[ray_index] = light_ray; + BSDFEval_coop[ray_index] = L_light; + ISLamp_coop[ray_index] = is_lamp; + /// mark ray state for next shadow kernel + ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); + enqueue_flag = 1; + } + } +#endif + } +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + +#ifdef __EMISSION__ + /* Enqueue RAY_SHADOW_RAY_CAST_DL rays */ + enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_DL_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index); +#endif +} diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h new file mode 100644 index 00000000000..393ea4bcadc --- /dev/null +++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h @@ -0,0 +1,283 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split_common.h" + +/* + * Note on kernel_holdout_emission_blurring_pathtermination_ao kernel. + * This is the sixth kernel in the ray tracing logic. This is the fifth + * of the path iteration kernels. This kernel takes care of the logic to process + * "material of type holdout", indirect primitive emission, bsdf blurring, + * probabilistic path termination and AO. + * + * This kernels determines the rays for which a shadow_blocked() function associated with AO should be executed. + * Those rays for which a shadow_blocked() function for AO must be executed are marked with flag RAY_SHADOW_RAY_CAST_ao and + * enqueued into the queue QUEUE_SHADOW_RAY_CAST_AO_RAYS + * + * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER + * + * The input and output are as follows, + * + * rng_coop ---------------------------------------------|--- kernel_holdout_emission_blurring_pathtermination_ao ---|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * throughput_coop --------------------------------------| |--- PathState_coop + * PathRadiance_coop ------------------------------------| |--- throughput_coop + * Intersection_coop ------------------------------------| |--- L_transparent_coop + * PathState_coop ---------------------------------------| |--- per_sample_output_buffers + * L_transparent_coop -----------------------------------| |--- PathRadiance_coop + * shader_data ------------------------------------------| |--- ShaderData + * ray_state --------------------------------------------| |--- ray_state + * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| |--- AOAlpha_coop + * kg (globals + data) ----------------------------------| |--- AOBSDF_coop + * parallel_samples -------------------------------------| |--- AOLightRay_coop + * per_sample_output_buffers ----------------------------| | + * sw ---------------------------------------------------| | + * sh ---------------------------------------------------| | + * sx ---------------------------------------------------| | + * sy ---------------------------------------------------| | + * stride -----------------------------------------------| | + * work_array -------------------------------------------| | + * queuesize --------------------------------------------| | + * start_sample -----------------------------------------| | + * + * Note on Queues : + * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only + * the rays of state RAY_ACTIVE. + * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER + * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will + * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been + * changed to RAY_UPDATE_BUFFER, there is no problem. + * + * State of queues when this kernel is called : + * At entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays. + * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty. + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays + * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO + */ + +ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required throughout the kernel except probabilistic path termination and AO */ + ccl_global float *per_sample_output_buffers, + ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */ + ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */ + ccl_global float *L_transparent_coop, /* Required for handling holdout material */ + PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */ + ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */ + Intersection *Intersection_coop, /* Required for indirect primitive emission */ + ccl_global float3 *AOAlpha_coop, /* Required for AO */ + ccl_global float3 *AOBSDF_coop, /* Required for AO */ + ccl_global Ray *AOLightRay_coop, /* Required for AO */ + int sw, int sh, int sx, int sy, int stride, + ccl_global char *ray_state, /* Denotes the state of each ray */ + ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */ + ccl_global int *Queue_data, /* Queue memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + int queuesize, /* Size (capacity) of each queue */ +#ifdef __WORK_STEALING__ + unsigned int start_sample, +#endif + int parallel_samples /* Number of samples to be processed in parallel */ + ) +{ + ccl_local unsigned int local_queue_atomics_bg; + ccl_local unsigned int local_queue_atomics_ao; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_queue_atomics_bg = 0; + local_queue_atomics_ao = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + char enqueue_flag = 0; + char enqueue_flag_AO_SHADOW_RAY_CAST = 0; + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not required + * If we are executing on a CPU device, then we need to keep all threads active + * since we have barrier() calls later in the kernel. CPU devices + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) + return; +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + /* Load kernel globals structure and ShaderData structure */ + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; + +#ifdef __WORK_STEALING__ + unsigned int my_work; + unsigned int pixel_x; + unsigned int pixel_y; +#endif + unsigned int tile_x; + unsigned int tile_y; + int my_sample_tile; + unsigned int sample; + + ccl_global RNG *rng = 0x0; + ccl_global PathState *state = 0x0; + float3 throughput; + + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + + throughput = throughput_coop[ray_index]; + state = &PathState_coop[ray_index]; + rng = &rng_coop[ray_index]; +#ifdef __WORK_STEALING__ + my_work = work_array[ray_index]; + sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; + get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); + my_sample_tile = 0; +#else // __WORK_STEALING__ + sample = work_array[ray_index]; + /* buffer's stride is "stride"; Find x and y using ray_index */ + int tile_index = ray_index / parallel_samples; + tile_x = tile_index % sw; + tile_y = tile_index / sw; + my_sample_tile = ray_index - (tile_index * parallel_samples); +#endif // __WORK_STEALING__ + per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; + + /* holdout */ +#ifdef __HOLDOUT__ + if((ccl_fetch(sd, flag) & (SD_HOLDOUT|SD_HOLDOUT_MASK)) && (state->flag & PATH_RAY_CAMERA)) { + if(kernel_data.background.transparent) { + float3 holdout_weight; + + if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) + holdout_weight = make_float3(1.0f, 1.0f, 1.0f); + else + holdout_weight = shader_holdout_eval(kg, sd); + + /* any throughput is ok, should all be identical here */ + L_transparent_coop[ray_index] += average(holdout_weight*throughput); + } + + if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + enqueue_flag = 1; + } + } +#endif + } + + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + + PathRadiance *L = &PathRadiance_coop[ray_index]; + /* holdout mask objects do not write data passes */ + kernel_write_data_passes(kg, per_sample_output_buffers, L, sd, sample, state, throughput); + + /* blurring of bsdf after bounces, for rays that have a small likelihood + * of following this particular path (diffuse, rough glossy) */ + if(kernel_data.integrator.filter_glossy != FLT_MAX) { + float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf; + + if(blur_pdf < 1.0f) { + float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; + shader_bsdf_blur(kg, sd, blur_roughness); + } + } + +#ifdef __EMISSION__ + /* emission */ + if(ccl_fetch(sd, flag) & SD_EMISSION) { + /* todo: is isect.t wrong here for transparent surfaces? */ + float3 emission = indirect_primitive_emission(kg, sd, Intersection_coop[ray_index].t, state->flag, state->ray_pdf); + path_radiance_accum_emission(L, throughput, emission, state->bounce); + } +#endif + + /* path termination. this is a strange place to put the termination, it's + * mainly due to the mixed in MIS that we use. gives too many unneeded + * shader evaluations, only need emission if we are going to terminate */ + float probability = path_state_terminate_probability(kg, state, throughput); + + if(probability == 0.0f) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + enqueue_flag = 1; + } + + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + if(probability != 1.0f) { + float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE); + + if(terminate >= probability) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + enqueue_flag = 1; + } else { + throughput_coop[ray_index] = throughput/probability; + } + } + } + } + +#ifdef __AO__ + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + /* ambient occlusion */ + if(kernel_data.integrator.use_ambient_occlusion || (ccl_fetch(sd, flag) & SD_AO)) { + /* todo: solve correlation */ + float bsdf_u, bsdf_v; + path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + + float ao_factor = kernel_data.background.ao_factor; + float3 ao_N; + AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); + AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd); + + float3 ao_D; + float ao_pdf; + sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); + + if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { + Ray _ray; + _ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); + _ray.D = ao_D; + _ray.t = kernel_data.background.ao_distance; +#ifdef __OBJECT_MOTION__ + _ray.time = ccl_fetch(sd, time); +#endif + _ray.dP = ccl_fetch(sd, dP); + _ray.dD = differential3_zero(); + AOLightRay_coop[ray_index] = _ray; + + ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); + enqueue_flag_AO_SHADOW_RAY_CAST = 1; + } + } + } +#endif +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_UPDATE_BUFFER rays */ + enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics_bg, Queue_data, Queue_index); +#ifdef __AO__ + /* Enqueue to-shadow-ray-cast rays */ + enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, enqueue_flag_AO_SHADOW_RAY_CAST, queuesize, &local_queue_atomics_ao, Queue_data, Queue_index); +#endif +} diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h new file mode 100644 index 00000000000..f400a99e229 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_lamp_emission.h @@ -0,0 +1,209 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split_common.h" + +/* + * Note on kernel_lamp_emission + * This is the 3rd kernel in the ray-tracing logic. This is the second of the + * path-iteration kernels. This kernel takes care of the indirect lamp emission logic. + * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE + * and RAY_HIT_BACKGROUND. + * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel. + * The input/output of the kernel is as follows, + * Throughput_coop ------------------------------------|--- kernel_lamp_emission --|--- PathRadiance_coop + * Ray_coop -------------------------------------------| |--- Queue_data(QUEUE_ACTIVE_AND_REGENERATED_RAYS) + * PathState_coop -------------------------------------| |--- Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) + * kg (globals + data) --------------------------------| | + * Intersection_coop ----------------------------------| | + * ray_state ------------------------------------------| | + * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -----| | + * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ----| | + * queuesize ------------------------------------------| | + * use_queues_flag ------------------------------------| | + * sw -------------------------------------------------| | + * sh -------------------------------------------------| | + * parallel_samples -----------------------------------| | + * + * note : shader_data is neither input nor output. Its just filled and consumed in the same, kernel_lamp_emission, kernel. + */ +ccl_device void kernel_lamp_emission( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required for lamp emission */ + ccl_global float3 *throughput_coop, /* Required for lamp emission */ + PathRadiance *PathRadiance_coop, /* Required for lamp emission */ + ccl_global Ray *Ray_coop, /* Required for lamp emission */ + ccl_global PathState *PathState_coop, /* Required for lamp emission */ + Intersection *Intersection_coop, /* Required for lamp emission */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + int sw, int sh, + ccl_global int *Queue_data, /* Memory for queues */ + ccl_global int *Queue_index, /* Tracks the number of elements in queues */ + int queuesize, /* Size (capacity) of queues */ + ccl_global char *use_queues_flag, /* used to decide if this kernel should use queues to fetch ray index */ + int parallel_samples /* Number of samples to be processed in parallel */ + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + /* We will empty this queue in this kernel */ + if(get_global_id(0) == 0 && get_global_id(1) == 0) { + Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + } + + /* Fetch use_queues_flag */ + ccl_local char local_use_queues_flag; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_use_queues_flag = use_queues_flag[0]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int ray_index; + if(local_use_queues_flag) { + int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 1); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + } else { + if(x < (sw * parallel_samples) && y < sh){ + ray_index = x + y * (sw * parallel_samples); + } else { + return; + } + } + + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; + PathRadiance *L = &PathRadiance_coop[ray_index]; + + float3 throughput = throughput_coop[ray_index]; + Ray ray = Ray_coop[ray_index]; + PathState state = PathState_coop[ray_index]; + +#ifdef __LAMP_MIS__ + if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) { + /* ray starting from previous non-transparent bounce */ + Ray light_ray; + + light_ray.P = ray.P - state.ray_t*ray.D; + state.ray_t += Intersection_coop[ray_index].t; + light_ray.D = ray.D; + light_ray.t = state.ray_t; + light_ray.time = ray.time; + light_ray.dD = ray.dD; + light_ray.dP = ray.dP; + /* intersect with lamp */ + float3 emission; + + if(indirect_lamp_emission(kg, &state, &light_ray, &emission, sd)) { + path_radiance_accum_emission(L, throughput, emission, state.bounce); + } + } +#endif + /* __VOLUME__ feature is disabled */ +#if 0 +#ifdef __VOLUME__ + /* volume attenuation, emission, scatter */ + if(state.volume_stack[0].shader != SHADER_NONE) { + Ray volume_ray = ray; + volume_ray.t = (hit)? isect.t: FLT_MAX; + + bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); + +#ifdef __VOLUME_DECOUPLED__ + int sampling_method = volume_stack_sampling_method(kg, state.volume_stack); + bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method); + + if(decoupled) { + /* cache steps along volume for repeated sampling */ + VolumeSegment volume_segment; + ShaderData volume_sd; + + shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce); + kernel_volume_decoupled_record(kg, &state, + &volume_ray, &volume_sd, &volume_segment, heterogeneous); + + volume_segment.sampling_method = sampling_method; + + /* emission */ + if(volume_segment.closure_flag & SD_EMISSION) + path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); + + /* scattering */ + VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; + + if(volume_segment.closure_flag & SD_SCATTER) { + bool all = false; + + /* direct light sampling */ + kernel_branched_path_volume_connect_light(kg, rng, &volume_sd, + throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment); + + /* indirect sample. if we use distance sampling and take just + * one sample for direct and indirect light, we could share + * this computation, but makes code a bit complex */ + float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE); + + result = kernel_volume_decoupled_scatter(kg, + &state, &volume_ray, &volume_sd, &throughput, + rphase, rscatter, &volume_segment, NULL, true); + } + + if(result != VOLUME_PATH_SCATTERED) + throughput *= volume_segment.accum_transmittance; + + /* free cached steps */ + kernel_volume_decoupled_free(kg, &volume_segment); + + if(result == VOLUME_PATH_SCATTERED) { + if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray)) + continue; + else + break; + } + } + else +#endif + { + /* integrate along volume segment with distance sampling */ + ShaderData volume_sd; + VolumeIntegrateResult result = kernel_volume_integrate( + kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng, heterogeneous); + +#ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L); + + /* indirect light bounce */ + if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray)) + continue; + else + break; + } +#endif + } + } +#endif +#endif + } +} diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h new file mode 100644 index 00000000000..343dbb06e99 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h @@ -0,0 +1,176 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split_common.h" + +/* + * Note on kernel_setup_next_iteration kernel. + * This is the tenth kernel in the ray tracing logic. This is the ninth + * of the path iteration kernels. This kernel takes care of setting up + * Ray for the next iteration of path-iteration and accumulating radiance + * corresponding to AO and direct-lighting + * + * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER + * + * The input and output are as follows, + * + * rng_coop ---------------------------------------------|--- kernel_next_iteration_setup -|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * throughput_coop --------------------------------------| |--- Queue_data (QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) + * PathRadiance_coop ------------------------------------| |--- throughput_coop + * PathState_coop ---------------------------------------| |--- PathRadiance_coop + * shader_data ------------------------------------------| |--- PathState_coop + * ray_state --------------------------------------------| |--- ray_state + * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS) --------| |--- Ray_coop + * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| |--- use_queues_flag + * Ray_coop ---------------------------------------------| | + * kg (globals + data) ----------------------------------| | + * LightRay_dl_coop -------------------------------------| + * ISLamp_coop ------------------------------------------| + * BSDFEval_coop ----------------------------------------| + * LightRay_ao_coop -------------------------------------| + * AOBSDF_coop ------------------------------------------| + * AOAlpha_coop -----------------------------------------| + * + * Note on queues, + * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only + * the rays of state RAY_ACTIVE. + * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFF + * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will + * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been + * changed to RAY_UPDATE_BUFF, there is no problem. + * + * State of queues when this kernel is called : + * At entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED, RAY_UPDATE_BUFFER rays. + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays. + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays + */ + +ccl_device void kernel_next_iteration_setup( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required for setting up ray for next iteration */ + ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */ + ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */ + PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */ + ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */ + ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */ + ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */ + ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */ + ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */ + ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */ + ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */ + ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + ccl_global int *Queue_data, /* Queue memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + int queuesize, /* Size (capacity) of each queue */ + ccl_global char *use_queues_flag /* flag to decide if scene_intersect kernel should use queues to fetch ray index */ + ) +{ + + ccl_local unsigned int local_queue_atomics; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_queue_atomics = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(get_global_id(0) == 0 && get_global_id(1) == 0) { + /* If we are here, then it means that scene-intersect kernel + * has already been executed atleast once. From the next time, + * scene-intersect kernel may operate on queues to fetch ray index + */ + use_queues_flag[0] = 1; + + /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS + * queues that were made empty during the previous kernel + */ + Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; + Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; + } + + char enqueue_flag = 0; + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not required + * If we are executing on a CPU device, then we need to keep all threads active + * since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) + return; +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + /* Load kernel globals structure and ShaderData structure */ + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; + PathRadiance *L = 0x0; + ccl_global PathState *state = 0x0; + + /* Path radiance update for AO/Direct_lighting's shadow blocked */ + if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { + state = &PathState_coop[ray_index]; + L = &PathRadiance_coop[ray_index]; + float3 _throughput = throughput_coop[ray_index]; + + if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { + float3 shadow = LightRay_ao_coop[ray_index].P; + char update_path_radiance = LightRay_ao_coop[ray_index].t; + if(update_path_radiance) { + path_radiance_accum_ao(L, _throughput, AOAlpha_coop[ray_index], AOBSDF_coop[ray_index], shadow, state->bounce); + } + REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); + } + + if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) { + float3 shadow = LightRay_dl_coop[ray_index].P; + char update_path_radiance = LightRay_dl_coop[ray_index].t; + if(update_path_radiance) { + BsdfEval L_light = BSDFEval_coop[ray_index]; + path_radiance_accum_light(L, _throughput, &L_light, shadow, 1.0f, state->bounce, ISLamp_coop[ray_index]); + } + REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); + } + } + + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + + ccl_global float3 *throughput = &throughput_coop[ray_index]; + ccl_global Ray *ray = &Ray_coop[ray_index]; + ccl_global RNG* rng = &rng_coop[ray_index]; + state = &PathState_coop[ray_index]; + L = &PathRadiance_coop[ray_index]; + + /* compute direct lighting and next bounce */ + if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + enqueue_flag = 1; + } + } +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_UPDATE_BUFFER rays */ + enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index); +} diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h new file mode 100644 index 00000000000..9bcf8f540b4 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h @@ -0,0 +1,98 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../kernel_compat_opencl.h" +#include "../kernel_math.h" +#include "../kernel_types.h" +#include "../kernel_globals.h" +#include "../kernel_queues.h" + +/* + * The kernel "kernel_queue_enqueue" enqueues rays of + * different ray state into their appropriate Queues; + * 1. Rays that have been determined to hit the background from the + * "kernel_scene_intersect" kernel + * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; + * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS. + * + * The input and output of the kernel is as follows, + * + * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) + * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) + * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| | + * queuesize -------------------------------------------| | + * + * Note on Queues : + * State of queues during the first time this kernel is called : + * At entry, + * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays + * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays. + * + * State of queue during other times this kernel is called : + * At entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty. + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays. + */ + +ccl_device void kernel_queue_enqueue( + ccl_global int *Queue_data, /* Queue memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + int queuesize /* Size (capacity) of each queue */ + ) +{ + /* We have only 2 cases (Hit/Not-Hit) */ + ccl_local unsigned int local_queue_atomics[2]; + + int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + + if(lidx < 2 ) { + local_queue_atomics[lidx] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int queue_number = -1; + + if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { + queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; + } else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS; + } + + unsigned int my_lqidx; + if(queue_number != -1) { + my_lqidx = get_local_queue_index(queue_number, local_queue_atomics); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(lidx == 0) { + local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS, local_queue_atomics, Queue_index); + local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, local_queue_atomics, Queue_index); + } + barrier(CLK_LOCAL_MEM_FENCE); + + unsigned int my_gqidx; + if(queue_number != -1) { + my_gqidx = get_global_queue_index(queue_number, queuesize, my_lqidx, local_queue_atomics); + Queue_data[my_gqidx] = ray_index; + } +} diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h new file mode 100644 index 00000000000..01e0b1fd19e --- /dev/null +++ b/intern/cycles/kernel/split/kernel_scene_intersect.h @@ -0,0 +1,164 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split_common.h" + +/* + * Note on kernel_scene_intersect kernel. + * This is the second kernel in the ray tracing logic. This is the first + * of the path iteration kernels. This kernel takes care of scene_intersect function. + * + * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE. + * This kernel processes rays of ray state RAY_ACTIVE + * This kernel determines the rays that have hit the background and changes their ray state to RAY_HIT_BACKGROUND. + * + * The input and output are as follows, + * + * Ray_coop ---------------------------------------|--------- kernel_scene_intersect----------|--- PathState + * PathState_coop ---------------------------------| |--- Intersection + * ray_state --------------------------------------| |--- ray_state + * use_queues_flag --------------------------------| | + * parallel_samples -------------------------------| | + * QueueData(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---| | + * kg (data + globals) ----------------------------| | + * rng_coop ---------------------------------------| | + * sw ---------------------------------------------| | + * sh ---------------------------------------------| | + * queuesize --------------------------------------| | + * + * Note on Queues : + * Ideally we would want kernel_scene_intersect to work on queues. + * But during the very first time, the queues wil be empty and hence we perform a direct mapping + * between ray-index and thread-index; From the next time onward, the queue will be filled and + * we may start operating on queues. + * + * State of queue during the first time this kernel is called : + * QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.before and after this kernel + * + * State of queues during other times this kernel is called : + * At entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will have a mix of RAY_ACTIVE, RAY_UPDATE_BUFFER and RAY_REGENERATED rays; + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays ; + * (The rays that are in the state RAY_UPDATE_BUFFER in both the queues are actually the same rays; These + * are the rays that were in RAY_ACTIVE state during the initial enqueue but on further processing + * , by different kernels, have turned into RAY_UPDATE_BUFFER rays. Since all kernel, even after fetching from + * QUEUE_ACTIVE_AND_REGENERATED_RAYS, proceed further based on ray state information, RAY_UPDATE_BUFFER rays + * being present in QUEUE_ACTIVE_AND_REGENERATED_RAYS does not cause any logical issues) + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS - All RAY_REGENERATED rays will have been converted to RAY_ACTIVE and + * Some rays in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue will move to state RAY_HIT_BACKGROUND + * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS - no change + */ + +ccl_device void kernel_scene_intersect( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global uint *rng_coop, + ccl_global Ray *Ray_coop, /* Required for scene_intersect */ + ccl_global PathState *PathState_coop, /* Required for scene_intersect */ + Intersection *Intersection_coop, /* Required for scene_intersect */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + int sw, int sh, + ccl_global int *Queue_data, /* Memory for queues */ + ccl_global int *Queue_index, /* Tracks the number of elements in queues */ + int queuesize, /* Size (capacity) of queues */ + ccl_global char *use_queues_flag, /* used to decide if this kernel should use queues to fetch ray index */ +#ifdef __KERNEL_DEBUG__ + DebugData *debugdata_coop, +#endif + int parallel_samples /* Number of samples to be processed in parallel */ + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + /* Fetch use_queues_flag */ + ccl_local char local_use_queues_flag; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_use_queues_flag = use_queues_flag[0]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int ray_index; + if(local_use_queues_flag) { + int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + } else { + if(x < (sw * parallel_samples) && y < sh){ + ray_index = x + y * (sw * parallel_samples); + } else { + return; + } + } + + /* All regenerated rays become active here */ + if(IS_STATE(ray_state, ray_index, RAY_REGENERATED)) + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE); + + if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE)) + return; + + /* Load kernel globals structure */ + KernelGlobals *kg = (KernelGlobals *)globals; + +#ifdef __KERNEL_DEBUG__ + DebugData *debug_data = &debugdata_coop[ray_index]; +#endif + Intersection *isect = &Intersection_coop[ray_index]; + PathState state = PathState_coop[ray_index]; + Ray ray = Ray_coop[ray_index]; + + /* intersect scene */ + uint visibility = path_state_ray_visibility(kg, &state); + +#ifdef __HAIR__ + float difl = 0.0f, extmax = 0.0f; + uint lcg_state = 0; + RNG rng = rng_coop[ray_index]; + + if(kernel_data.bvh.have_curves) { + if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) { + float3 pixdiff = ray.dD.dx + ray.dD.dy; + /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/ + difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f; + } + + extmax = kernel_data.curve.maximum_width; + lcg_state = lcg_state_init(&rng, &state, 0x51633e2d); + } + + bool hit = scene_intersect(kg, &ray, visibility, isect, &lcg_state, difl, extmax); +#else + bool hit = scene_intersect(kg, &ray, visibility, isect, NULL, 0.0f, 0.0f); +#endif + +#ifdef __KERNEL_DEBUG__ + if(state.flag & PATH_RAY_CAMERA) { + debug_data->num_bvh_traversal_steps += isect->num_traversal_steps; + } +#endif + + if(!hit) { + /* Change the state of rays that hit the background; + * These rays undergo special processing in the + * background_bufferUpdate kernel*/ + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND); + } +} diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h new file mode 100644 index 00000000000..0a8d77f52b0 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shader_eval.h @@ -0,0 +1,93 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split_common.h" + +/* + * Note on kernel_shader_eval kernel + * This kernel is the 5th kernel in the ray tracing logic. This is + * the 4rd kernel in path iteration. This kernel sets up the ShaderData + * structure from the values computed by the previous kernels. It also identifies + * the rays of state RAY_TO_REGENERATE and enqueues them in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. + * + * The input and output of the kernel is as follows, + * rng_coop -------------------------------------------|--- kernel_shader_eval --|--- shader_data + * Ray_coop -------------------------------------------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * PathState_coop -------------------------------------| |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * Intersection_coop ----------------------------------| | + * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS)-------| | + * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)---| | + * ray_state ------------------------------------------| | + * kg (globals + data) --------------------------------| | + * queuesize ------------------------------------------| | + * + * Note on Queues : + * This kernel reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes + * only the rays of state RAY_ACTIVE; + * State of queues when this kernel is called, + * at entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. + * at exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays + */ + +ccl_device void kernel_shader_eval( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Output ShaderData structure to be filled */ + ccl_global uint *rng_coop, /* Required for rbsdf calculation */ + ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */ + ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */ + Intersection *Intersection_coop, /* Required for setting up shader from ray */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + ccl_global int *Queue_data, /* queue memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + int queuesize /* Size (capacity) of each queue */ + ) +{ + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue */ + ccl_local unsigned int local_queue_atomics; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_queue_atomics = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0; + + enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index); + + ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0); + + if(ray_index == QUEUE_EMPTY_SLOT) + return; + + /* Continue on with shader evaluation */ + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; + Intersection *isect = &Intersection_coop[ray_index]; + ccl_global uint *rng = &rng_coop[ray_index]; + ccl_global PathState *state = &PathState_coop[ray_index]; + Ray ray = Ray_coop[ray_index]; + + shader_setup_from_ray(kg, sd, isect, &ray, state->bounce, state->transparent_bounce); + float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF); + shader_eval_surface(kg, sd, rbsdf, state->flag, SHADER_CONTEXT_MAIN); + } +} diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked.h b/intern/cycles/kernel/split/kernel_shadow_blocked.h new file mode 100644 index 00000000000..71fab19518c --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shadow_blocked.h @@ -0,0 +1,126 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split_common.h" + +/* + * Note on kernel_shadow_blocked kernel. + * This is the ninth kernel in the ray tracing logic. This is the eighth + * of the path iteration kernels. This kernel takes care of "shadow ray cast" + * logic of the direct lighting and AO part of ray tracing. + * + * The input and output are as follows, + * + * PathState_coop ----------------------------------|--- kernel_shadow_blocked --| + * LightRay_dl_coop --------------------------------| |--- LightRay_dl_coop + * LightRay_ao_coop --------------------------------| |--- LightRay_ao_coop + * ray_state ---------------------------------------| |--- ray_state + * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS & | |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_AO_RAYS) + QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| | + * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS& + QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| | + * kg (globals + data) -----------------------------| | + * queuesize ---------------------------------------| | + * + * Note on shader_shadow : shader_shadow is neither input nor output to this kernel. shader_shadow is filled and consumed in this kernel itself. + * Note on queues : + * The kernel fetches from QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS queues. We will empty + * these queues this kernel. + * State of queues when this kernel is called : + * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same + * before and after this kernel call. + * QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS will be filled with rays marked with flags RAY_SHADOW_RAY_CAST_AO + * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry. + * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit. + */ + +ccl_device void kernel_shadow_blocked( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_shadow, /* Required for shadow blocked */ + ccl_global PathState *PathState_coop, /* Required for shadow blocked */ + ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */ + ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */ + Intersection *Intersection_coop_AO, + Intersection *Intersection_coop_DL, + ccl_global char *ray_state, + ccl_global int *Queue_data, /* Queue memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + int queuesize, /* Size (capacity) of each queue */ + int total_num_rays + ) +{ +#if 0 + /* we will make the Queue_index entries '0' in the next kernel */ + if(get_global_id(0) == 0 && get_global_id(1) == 0) { + /* We empty this queue here */ + Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; + Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; + } +#endif + + int lidx = get_local_id(1) * get_local_id(0) + get_local_id(0); + + ccl_local unsigned int ao_queue_length; + ccl_local unsigned int dl_queue_length; + if(lidx == 0) { + ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS]; + dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + /* flag determining if the current ray is to process shadow ray for AO or DL */ + char shadow_blocked_type = -1; + /* flag determining if we need to update L */ + char update_path_radiance = 0; + + int ray_index = QUEUE_EMPTY_SLOT; + int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + if(thread_index < ao_queue_length + dl_queue_length) { + if(thread_index < ao_queue_length) { + ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1); + shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO; + } else { + ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1); + shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL; + } + } + + if(ray_index == QUEUE_EMPTY_SLOT) + return; + + if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { + /* Load kernel global structure */ + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd_shadow = (ShaderData *)shader_shadow; + + ccl_global PathState *state = &PathState_coop[ray_index]; + ccl_global Ray *light_ray_dl_global = &LightRay_dl_coop[ray_index]; + ccl_global Ray *light_ray_ao_global = &LightRay_ao_coop[ray_index]; + Intersection *isect_ao_global = &Intersection_coop_AO[ray_index]; + Intersection *isect_dl_global = &Intersection_coop_DL[ray_index]; + + ccl_global Ray *light_ray_global = shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO ? light_ray_ao_global : light_ray_dl_global; + Intersection *isect_global = RAY_SHADOW_RAY_CAST_AO ? isect_ao_global : isect_dl_global; + + float3 shadow; + update_path_radiance = !(shadow_blocked(kg, state, light_ray_global, &shadow, sd_shadow, isect_global)); + + /* We use light_ray_global's P and t to store shadow and update_path_radiance */ + light_ray_global->P = shadow; + light_ray_global->t = update_path_radiance; + } +} diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h new file mode 100644 index 00000000000..f5830be6bb1 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_split_common.h @@ -0,0 +1,62 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _KERNEL_SPLIT_H_ +#define _KERNEL_SPLIT_H_ + +#include "../kernel_compat_opencl.h" +#include "../kernel_math.h" +#include "../kernel_types.h" +#include "../kernel_globals.h" + +#include "../util_atomic.h" + +#include "../kernel_random.h" +#include "../kernel_projection.h" +#include "../kernel_montecarlo.h" +#include "../kernel_differential.h" +#include "../kernel_camera.h" + +#include "../geom/geom.h" + +#include "../kernel_accumulate.h" +#include "../kernel_shader.h" +#include "../kernel_light.h" +#include "../kernel_passes.h" + +#ifdef __SUBSURFACE__ +#include "../kernel_subsurface.h" +#endif + +#ifdef __VOLUME__ +#include "../kernel_volume.h" +#endif + +#include "../kernel_path_state.h" +#include "../kernel_shadow.h" +#include "../kernel_emission.h" +#include "../kernel_path_common.h" +#include "../kernel_path_surface.h" +#include "../kernel_path_volume.h" + +#ifdef __KERNEL_DEBUG__ +#include "../kernel_debug.h" +#endif + +#include "../kernel_queues.h" +#include "../kernel_work_stealing.h" + +#endif diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h new file mode 100644 index 00000000000..eeb7da76e73 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_sum_all_radiance.h @@ -0,0 +1,59 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../kernel_compat_opencl.h" +#include "../kernel_math.h" +#include "../kernel_types.h" +#include "../kernel_globals.h" + +/* +* Since we process various samples in parallel; The output radiance of different samples +* are stored in different locations; This kernel combines the output radiance contributed +* by all different samples and stores them in the RenderTile's output buffer. +*/ + +ccl_device void kernel_sum_all_radiance( + ccl_constant KernelData *data, /* To get pass_stride to offet into buffer */ + ccl_global float *buffer, /* Output buffer of RenderTile */ + ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */ + int parallel_samples, int sw, int sh, int stride, + int buffer_offset_x, + int buffer_offset_y, + int buffer_stride, + int start_sample) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if(x < sw && y < sh) { + buffer += ((buffer_offset_x + x) + (buffer_offset_y + y) * buffer_stride) * (data->film.pass_stride); + per_sample_output_buffer += ((x + y * stride) * parallel_samples) * (data->film.pass_stride); + + int sample_stride = (data->film.pass_stride); + + int sample_iterator = 0; + int pass_stride_iterator = 0; + int num_floats = data->film.pass_stride; + + for(sample_iterator = 0; sample_iterator < parallel_samples; sample_iterator++) { + for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) { + *(buffer + pass_stride_iterator) = (start_sample == 0 && sample_iterator == 0) ? *(per_sample_output_buffer + pass_stride_iterator) + : *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator); + } + per_sample_output_buffer += sample_stride; + } + } +} |