diff options
22 files changed, 1140 insertions, 1105 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 89dd3542ef6..fa8f36bad9a 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -169,7 +169,6 @@ set(SRC_SPLIT_HEADERS split/kernel_holdout_emission_blurring_pathtermination_ao.h split/kernel_lamp_emission.h split/kernel_next_iteration_setup.h - split/kernel_queue_enqueue.h split/kernel_scene_intersect.h split/kernel_shader_eval.h split/kernel_shadow_blocked.h diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl index 2d1944d01e6..eff77b89a0a 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl @@ -48,34 +48,81 @@ __kernel void kernel_ocl_path_trace_background_buffer_update( #endif int parallel_samples) /* Number of samples to be processed in parallel */ { - kernel_background_buffer_update(globals, - data, - shader_data, - per_sample_output_buffers, - rng_state, - rng_coop, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - L_transparent_coop, - ray_state, - sw, sh, sx, sy, stride, - rng_state_offset_x, - rng_state_offset_y, - rng_state_stride, - work_array, - Queue_data, - Queue_index, - queuesize, - end_sample, - start_sample, + ccl_local unsigned int local_queue_atomics; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_queue_atomics = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + if(ray_index == 0) { + /* We will empty this queue in this kernel. */ + Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; + } + char enqueue_flag = 0; + ray_index = get_ray_index(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + Queue_data, + queuesize, + 1); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + enqueue_flag = + kernel_background_buffer_update(globals, + data, + shader_data, + per_sample_output_buffers, + rng_state, + rng_coop, + throughput_coop, + PathRadiance_coop, + Ray_coop, + PathState_coop, + L_transparent_coop, + ray_state, + sw, sh, sx, sy, stride, + rng_state_offset_x, + rng_state_offset_y, + rng_state_stride, + work_array, + end_sample, + start_sample, #ifdef __WORK_STEALING__ - work_pool_wgs, - num_samples, + work_pool_wgs, + num_samples, #endif #ifdef __KERNEL_DEBUG__ - debugdata_coop, + debugdata_coop, +#endif + parallel_samples, + ray_index); +#ifndef __COMPUTE_DEVICE_GPU__ + } #endif - parallel_samples); + + /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; + * These rays will be made active during next SceneIntersectkernel. + */ + enqueue_ray_index_local(ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + enqueue_flag, + queuesize, + &local_queue_atomics, + Queue_data, + Queue_index); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl index 015f0872413..c3277676029 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl @@ -17,130 +17,129 @@ #include "split/kernel_data_init.h" __kernel void kernel_ocl_path_trace_data_init( - ccl_global char *globals, - ccl_global char *shader_data_sd, /* Arguments related to ShaderData */ - ccl_global char *shader_data_sd_DL_shadow, /* Arguments related to ShaderData */ + ccl_global char *globals, + ccl_global char *shader_data_sd, /* Arguments related to ShaderData */ + ccl_global char *shader_data_sd_DL_shadow, /* Arguments related to ShaderData */ - ccl_global float3 *P_sd, - ccl_global float3 *P_sd_DL_shadow, + ccl_global float3 *P_sd, + ccl_global float3 *P_sd_DL_shadow, - ccl_global float3 *N_sd, - ccl_global float3 *N_sd_DL_shadow, + ccl_global float3 *N_sd, + ccl_global float3 *N_sd_DL_shadow, - ccl_global float3 *Ng_sd, - ccl_global float3 *Ng_sd_DL_shadow, + ccl_global float3 *Ng_sd, + ccl_global float3 *Ng_sd_DL_shadow, - ccl_global float3 *I_sd, - ccl_global float3 *I_sd_DL_shadow, + ccl_global float3 *I_sd, + ccl_global float3 *I_sd_DL_shadow, - ccl_global int *shader_sd, - ccl_global int *shader_sd_DL_shadow, + ccl_global int *shader_sd, + ccl_global int *shader_sd_DL_shadow, - ccl_global int *flag_sd, - ccl_global int *flag_sd_DL_shadow, + ccl_global int *flag_sd, + ccl_global int *flag_sd_DL_shadow, - ccl_global int *prim_sd, - ccl_global int *prim_sd_DL_shadow, + ccl_global int *prim_sd, + ccl_global int *prim_sd_DL_shadow, - ccl_global int *type_sd, - ccl_global int *type_sd_DL_shadow, + ccl_global int *type_sd, + ccl_global int *type_sd_DL_shadow, - ccl_global float *u_sd, - ccl_global float *u_sd_DL_shadow, + ccl_global float *u_sd, + ccl_global float *u_sd_DL_shadow, - ccl_global float *v_sd, - ccl_global float *v_sd_DL_shadow, + ccl_global float *v_sd, + ccl_global float *v_sd_DL_shadow, - ccl_global int *object_sd, - ccl_global int *object_sd_DL_shadow, + ccl_global int *object_sd, + ccl_global int *object_sd_DL_shadow, - ccl_global float *time_sd, - ccl_global float *time_sd_DL_shadow, + ccl_global float *time_sd, + ccl_global float *time_sd_DL_shadow, - ccl_global float *ray_length_sd, - ccl_global float *ray_length_sd_DL_shadow, + ccl_global float *ray_length_sd, + ccl_global float *ray_length_sd_DL_shadow, - ccl_global int *ray_depth_sd, - ccl_global int *ray_depth_sd_DL_shadow, + ccl_global int *ray_depth_sd, + ccl_global int *ray_depth_sd_DL_shadow, - ccl_global int *transparent_depth_sd, - ccl_global int *transparent_depth_sd_DL_shadow, + ccl_global int *transparent_depth_sd, + ccl_global int *transparent_depth_sd_DL_shadow, - /* Ray differentials. */ - ccl_global differential3 *dP_sd, - ccl_global differential3 *dP_sd_DL_shadow, + /* Ray differentials. */ + ccl_global differential3 *dP_sd, + ccl_global differential3 *dP_sd_DL_shadow, - ccl_global differential3 *dI_sd, - ccl_global differential3 *dI_sd_DL_shadow, + ccl_global differential3 *dI_sd, + ccl_global differential3 *dI_sd_DL_shadow, - ccl_global differential *du_sd, - ccl_global differential *du_sd_DL_shadow, + ccl_global differential *du_sd, + ccl_global differential *du_sd_DL_shadow, - ccl_global differential *dv_sd, - ccl_global differential *dv_sd_DL_shadow, + ccl_global differential *dv_sd, + ccl_global differential *dv_sd_DL_shadow, - /* Dp/Du */ - ccl_global float3 *dPdu_sd, - ccl_global float3 *dPdu_sd_DL_shadow, + /* Dp/Du */ + ccl_global float3 *dPdu_sd, + ccl_global float3 *dPdu_sd_DL_shadow, - ccl_global float3 *dPdv_sd, - ccl_global float3 *dPdv_sd_DL_shadow, + ccl_global float3 *dPdv_sd, + ccl_global float3 *dPdv_sd_DL_shadow, - /* Object motion. */ - ccl_global Transform *ob_tfm_sd, - ccl_global Transform *ob_tfm_sd_DL_shadow, + /* Object motion. */ + ccl_global Transform *ob_tfm_sd, + ccl_global Transform *ob_tfm_sd_DL_shadow, - ccl_global Transform *ob_itfm_sd, - ccl_global Transform *ob_itfm_sd_DL_shadow, + ccl_global Transform *ob_itfm_sd, + ccl_global Transform *ob_itfm_sd_DL_shadow, - ShaderClosure *closure_sd, - ShaderClosure *closure_sd_DL_shadow, + ShaderClosure *closure_sd, + ShaderClosure *closure_sd_DL_shadow, - ccl_global int *num_closure_sd, - ccl_global int *num_closure_sd_DL_shadow, + ccl_global int *num_closure_sd, + ccl_global int *num_closure_sd_DL_shadow, - ccl_global float *randb_closure_sd, - ccl_global float *randb_closure_sd_DL_shadow, + ccl_global float *randb_closure_sd, + ccl_global float *randb_closure_sd_DL_shadow, - ccl_global float3 *ray_P_sd, - ccl_global float3 *ray_P_sd_DL_shadow, + ccl_global float3 *ray_P_sd, + ccl_global float3 *ray_P_sd_DL_shadow, - ccl_global differential3 *ray_dP_sd, - ccl_global differential3 *ray_dP_sd_DL_shadow, + ccl_global differential3 *ray_dP_sd, + ccl_global differential3 *ray_dP_sd_DL_shadow, - ccl_constant KernelData *data, - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* rng array to store rng values for all rays */ - ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */ - ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */ - PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */ - ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */ - ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */ - ccl_global char *ray_state, /* Stores information on current state of a ray */ + ccl_constant KernelData *data, + ccl_global float *per_sample_output_buffers, + ccl_global uint *rng_state, + ccl_global uint *rng_coop, /* rng array to store rng values for all rays */ + ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */ + ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */ + PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */ + ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */ + ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */ + ccl_global char *ray_state, /* Stores information on current state of a ray */ -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, +#define KERNEL_TEX(type, ttype, name) \ + ccl_global type *name, #include "../../kernel_textures.h" - int start_sample, int sx, int sy, int sw, int sh, int offset, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global int *Queue_data, /* Memory for queues */ - ccl_global int *Queue_index, /* Tracks the number of elements in queues */ - int queuesize, /* size (capacity) of the queue */ - ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */ - ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */ + int start_sample, int sx, int sy, int sw, int sh, int offset, int stride, + int rng_state_offset_x, + int rng_state_offset_y, + int rng_state_stride, + ccl_global int *Queue_data, /* Memory for queues */ + ccl_global int *Queue_index, /* Tracks the number of elements in queues */ + int queuesize, /* size (capacity) of the queue */ + ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */ + ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */ #ifdef __WORK_STEALING__ - ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */ - unsigned int num_samples, /* Total number of samples per pixel */ + ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */ + unsigned int num_samples, /* Total number of samples per pixel */ #endif #ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, + DebugData *debugdata_coop, #endif - int parallel_samples /* Number of samples to be processed in parallel */ - ) + int parallel_samples) /* Number of samples to be processed in parallel */ { kernel_data_init(globals, shader_data_sd, diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl index 0b22c6d0864..6ec75013b3a 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl @@ -31,17 +31,60 @@ __kernel void kernel_ocl_path_trace_direct_lighting( ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ int queuesize) /* Size (capacity) of each queue */ { - kernel_direct_lighting(globals, - data, - shader_data, - shader_DL, - rng_coop, - PathState_coop, - ISLamp_coop, - LightRay_coop, - BSDFEval_coop, - ray_state, - Queue_data, - Queue_index, - queuesize); + ccl_local unsigned int local_queue_atomics; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_queue_atomics = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + char enqueue_flag = 0; + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + Queue_data, + queuesize, + 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + enqueue_flag = kernel_direct_lighting(globals, + data, + shader_data, + shader_DL, + rng_coop, + PathState_coop, + ISLamp_coop, + LightRay_coop, + BSDFEval_coop, + ray_state, + ray_index); + +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + +#ifdef __EMISSION__ + /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_SHADOW_RAY_CAST_DL_RAYS, + enqueue_flag, + queuesize, + &local_queue_atomics, + Queue_data, + Queue_index); +#endif } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl index 502f10a7a59..ae5f5cd1b3b 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl @@ -41,27 +41,84 @@ __kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao #endif int parallel_samples) /* Number of samples to be processed in parallel */ { - kernel_holdout_emission_blurring_pathtermination_ao(globals, - data, - shader_data, - per_sample_output_buffers, - rng_coop, - throughput_coop, - L_transparent_coop, - PathRadiance_coop, - PathState_coop, - Intersection_coop, - AOAlpha_coop, - AOBSDF_coop, - AOLightRay_coop, - sw, sh, sx, sy, stride, - ray_state, - work_array, - Queue_data, - Queue_index, - queuesize, + ccl_local unsigned int local_queue_atomics_bg; + ccl_local unsigned int local_queue_atomics_ao; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_queue_atomics_bg = 0; + local_queue_atomics_ao = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + char enqueue_flag = 0; + char enqueue_flag_AO_SHADOW_RAY_CAST = 0; + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + Queue_data, + queuesize, + 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif /* __COMPUTE_DEVICE_GPU__ */ + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + kernel_holdout_emission_blurring_pathtermination_ao( + globals, + data, + shader_data, + per_sample_output_buffers, + rng_coop, + throughput_coop, + L_transparent_coop, + PathRadiance_coop, + PathState_coop, + Intersection_coop, + AOAlpha_coop, + AOBSDF_coop, + AOLightRay_coop, + sw, sh, sx, sy, stride, + ray_state, + work_array, #ifdef __WORK_STEALING__ - start_sample, + start_sample, +#endif + parallel_samples, + ray_index, + &enqueue_flag, + &enqueue_flag_AO_SHADOW_RAY_CAST); +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_UPDATE_BUFFER rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + queuesize, + &local_queue_atomics_bg, + Queue_data, + Queue_index); + +#ifdef __AO__ + /* Enqueue to-shadow-ray-cast rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_SHADOW_RAY_CAST_AO_RAYS, + enqueue_flag_AO_SHADOW_RAY_CAST, + queuesize, + &local_queue_atomics_ao, + Queue_data, + Queue_index); #endif - parallel_samples); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl index af83e68b53e..1bc7808d834 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl @@ -17,23 +17,57 @@ #include "split/kernel_lamp_emission.h" __kernel void kernel_ocl_path_trace_lamp_emission( - ccl_global char *globals, - ccl_constant KernelData *data, - ccl_global char *shader_data, /* Required for lamp emission */ - ccl_global float3 *throughput_coop, /* Required for lamp emission */ - PathRadiance *PathRadiance_coop, /* Required for lamp emission */ - ccl_global Ray *Ray_coop, /* Required for lamp emission */ - ccl_global PathState *PathState_coop, /* Required for lamp emission */ - Intersection *Intersection_coop, /* Required for lamp emission */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global int *Queue_data, /* Memory for queues */ - ccl_global int *Queue_index, /* Tracks the number of elements in queues */ - int queuesize, /* Size (capacity) of queues */ - ccl_global char *use_queues_flag, /* used to decide if this kernel should use queues to fetch ray index */ - int parallel_samples /* Number of samples to be processed in parallel */ - ) + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required for lamp emission */ + ccl_global float3 *throughput_coop, /* Required for lamp emission */ + PathRadiance *PathRadiance_coop, /* Required for lamp emission */ + ccl_global Ray *Ray_coop, /* Required for lamp emission */ + ccl_global PathState *PathState_coop, /* Required for lamp emission */ + Intersection *Intersection_coop, /* Required for lamp emission */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + int sw, int sh, + ccl_global int *Queue_data, /* Memory for queues */ + ccl_global int *Queue_index, /* Tracks the number of elements in queues */ + int queuesize, /* Size (capacity) of queues */ + ccl_global char *use_queues_flag, /* Used to decide if this kernel should use + * queues to fetch ray index + */ + int parallel_samples) /* Number of samples to be processed in parallel */ { + int x = get_global_id(0); + int y = get_global_id(1); + + /* We will empty this queue in this kernel. */ + if(get_global_id(0) == 0 && get_global_id(1) == 0) { + Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + } + /* Fetch use_queues_flag. */ + ccl_local char local_use_queues_flag; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_use_queues_flag = use_queues_flag[0]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int ray_index; + if(local_use_queues_flag) { + int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(thread_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + Queue_data, + queuesize, + 1); + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + } else { + if(x < (sw * parallel_samples) && y < sh){ + ray_index = x + y * (sw * parallel_samples); + } else { + return; + } + } + kernel_lamp_emission(globals, data, shader_data, @@ -44,9 +78,7 @@ __kernel void kernel_ocl_path_trace_lamp_emission( Intersection_coop, ray_state, sw, sh, - Queue_data, - Queue_index, - queuesize, use_queues_flag, - parallel_samples); + parallel_samples, + ray_index); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl index 4acd991f0b4..dcf4db40411 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl @@ -35,25 +35,81 @@ __kernel void kernel_ocl_path_trace_next_iteration_setup( ccl_global int *Queue_data, /* Queue memory */ ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ int queuesize, /* Size (capacity) of each queue */ - ccl_global char *use_queues_flag) /* flag to decide if scene_intersect kernel should use queues to fetch ray index */ + ccl_global char *use_queues_flag) /* flag to decide if scene_intersect kernel should + * use queues to fetch ray index */ { - kernel_next_iteration_setup(globals, - data, - shader_data, - rng_coop, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - LightRay_dl_coop, - ISLamp_coop, - BSDFEval_coop, - LightRay_ao_coop, - AOBSDF_coop, - AOAlpha_coop, - ray_state, - Queue_data, - Queue_index, - queuesize, - use_queues_flag); + ccl_local unsigned int local_queue_atomics; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_queue_atomics = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(get_global_id(0) == 0 && get_global_id(1) == 0) { + /* If we are here, then it means that scene-intersect kernel + * has already been executed atleast once. From the next time, + * scene-intersect kernel may operate on queues to fetch ray index + */ + use_queues_flag[0] = 1; + + /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and + * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the + * previous kernel. + */ + Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; + Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; + } + + char enqueue_flag = 0; + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + Queue_data, + queuesize, + 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + enqueue_flag = kernel_next_iteration_setup(globals, + data, + shader_data, + rng_coop, + throughput_coop, + PathRadiance_coop, + Ray_coop, + PathState_coop, + LightRay_dl_coop, + ISLamp_coop, + BSDFEval_coop, + LightRay_ao_coop, + AOBSDF_coop, + AOAlpha_coop, + ray_state, + use_queues_flag, + ray_index); +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_UPDATE_BUFFER rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + queuesize, + &local_queue_atomics, + Queue_data, + Queue_index); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl index 62cf08c387d..3156dc255fb 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl @@ -14,16 +14,93 @@ * limitations under the License. */ -#include "split/kernel_queue_enqueue.h" +#include "../../kernel_compat_opencl.h" +#include "../../kernel_math.h" +#include "../../kernel_types.h" +#include "../../kernel_globals.h" +#include "../../kernel_queues.h" +/* + * The kernel "kernel_queue_enqueue" enqueues rays of + * different ray state into their appropriate Queues; + * 1. Rays that have been determined to hit the background from the + * "kernel_scene_intersect" kernel + * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; + * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS. + * + * The input and output of the kernel is as follows, + * + * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) + * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) + * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| | + * queuesize -------------------------------------------| | + * + * Note on Queues : + * State of queues during the first time this kernel is called : + * At entry, + * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays + * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays. + * + * State of queue during other times this kernel is called : + * At entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty. + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays. + */ __kernel void kernel_ocl_path_trace_queue_enqueue( ccl_global int *Queue_data, /* Queue memory */ ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ ccl_global char *ray_state, /* Denotes the state of each ray */ int queuesize) /* Size (capacity) of each queue */ { - kernel_queue_enqueue(Queue_data, - Queue_index, - ray_state, - queuesize); + /* We have only 2 cases (Hit/Not-Hit) */ + ccl_local unsigned int local_queue_atomics[2]; + + int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + + if(lidx < 2 ) { + local_queue_atomics[lidx] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int queue_number = -1; + + if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { + queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; + } + else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS; + } + + unsigned int my_lqidx; + if(queue_number != -1) { + my_lqidx = get_local_queue_index(queue_number, local_queue_atomics); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(lidx == 0) { + local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = + get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS, + local_queue_atomics, + Queue_index); + local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = + get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + local_queue_atomics, + Queue_index); + } + barrier(CLK_LOCAL_MEM_FENCE); + + unsigned int my_gqidx; + if(queue_number != -1) { + my_gqidx = get_global_queue_index(queue_number, + queuesize, + my_lqidx, + local_queue_atomics); + Queue_data[my_gqidx] = ray_index; + } } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl index d219874d391..e5fad7bce50 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl @@ -28,12 +28,43 @@ __kernel void kernel_ocl_path_trace_scene_intersect( ccl_global int *Queue_data, /* Memory for queues */ ccl_global int *Queue_index, /* Tracks the number of elements in queues */ int queuesize, /* Size (capacity) of queues */ - ccl_global char *use_queues_flag, /* used to decide if this kernel should use queues to fetch ray index */ + ccl_global char *use_queues_flag, /* used to decide if this kernel should use + * queues to fetch ray index */ #ifdef __KERNEL_DEBUG__ DebugData *debugdata_coop, #endif int parallel_samples) /* Number of samples to be processed in parallel */ { + int x = get_global_id(0); + int y = get_global_id(1); + + /* Fetch use_queues_flag */ + ccl_local char local_use_queues_flag; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_use_queues_flag = use_queues_flag[0]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int ray_index; + if(local_use_queues_flag) { + int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(thread_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + Queue_data, + queuesize, + 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + } else { + if(x < (sw * parallel_samples) && y < sh){ + ray_index = x + y * (sw * parallel_samples); + } else { + return; + } + } + kernel_scene_intersect(globals, data, rng_coop, @@ -42,12 +73,10 @@ __kernel void kernel_ocl_path_trace_scene_intersect( Intersection_coop, ray_state, sw, sh, - Queue_data, - Queue_index, - queuesize, use_queues_flag, #ifdef __KERNEL_DEBUG__ debugdata_coop, #endif - parallel_samples); + parallel_samples, + ray_index); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl index 04769d7d792..b9f616e6bdf 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl @@ -29,6 +29,34 @@ __kernel void kernel_ocl_path_trace_shader_eval( ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ int queuesize) /* Size (capacity) of each queue */ { + /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */ + ccl_local unsigned int local_queue_atomics; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_queue_atomics = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + Queue_data, + queuesize, + 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + + char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0; + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + queuesize, + &local_queue_atomics, + Queue_data, + Queue_index); + + /* Continue on with shader evaluation. */ kernel_shader_eval(globals, data, shader_data, @@ -37,7 +65,5 @@ __kernel void kernel_ocl_path_trace_shader_eval( PathState_coop, Intersection_coop, ray_state, - Queue_data, - Queue_index, - queuesize); + ray_index); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl index 9d57364c8d6..03886c0a030 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl @@ -31,6 +31,43 @@ __kernel void kernel_ocl_path_trace_shadow_blocked( int queuesize, /* Size (capacity) of each queue */ int total_num_rays) { +#if 0 + /* We will make the Queue_index entries '0' in the next kernel. */ + if(get_global_id(0) == 0 && get_global_id(1) == 0) { + /* We empty this queue here */ + Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; + Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; + } +#endif + + int lidx = get_local_id(1) * get_local_id(0) + get_local_id(0); + + ccl_local unsigned int ao_queue_length; + ccl_local unsigned int dl_queue_length; + if(lidx == 0) { + ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS]; + dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + /* flag determining if the current ray is to process shadow ray for AO or DL */ + char shadow_blocked_type = -1; + + int ray_index = QUEUE_EMPTY_SLOT; + int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + if(thread_index < ao_queue_length + dl_queue_length) { + if(thread_index < ao_queue_length) { + ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1); + shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO; + } else { + ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1); + shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL; + } + } + + if(ray_index == QUEUE_EMPTY_SLOT) + return; + kernel_shadow_blocked(globals, data, shader_shadow, @@ -40,8 +77,7 @@ __kernel void kernel_ocl_path_trace_shadow_blocked( Intersection_coop_AO, Intersection_coop_DL, ray_state, - Queue_data, - Queue_index, - queuesize, - total_num_rays); + total_num_rays, + shadow_blocked_type, + ray_index); } diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h index 32c7e6f8c0a..87ea0348175 100644 --- a/intern/cycles/kernel/split/kernel_background_buffer_update.h +++ b/intern/cycles/kernel/split/kernel_background_buffer_update.h @@ -16,8 +16,7 @@ #include "kernel_split_common.h" -/* - * Note on kernel_background_buffer_update kernel. +/* Note on kernel_background_buffer_update kernel. * This is the fourth kernel in the ray tracing logic, and the third * of the path iteration kernels. This kernel takes care of rays that hit * the background (sceneintersect kernel), and for the rays of @@ -70,121 +69,93 @@ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty */ -__kernel void kernel_background_buffer_update( - ccl_global char *globals, - ccl_constant KernelData *data, - ccl_global char *shader_data, - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* Required for buffer Update */ - ccl_global float3 *throughput_coop, /* Required for background hit processing */ - PathRadiance *PathRadiance_coop, /* Required for background hit processing and buffer Update */ - ccl_global Ray *Ray_coop, /* Required for background hit processing */ - ccl_global PathState *PathState_coop, /* Required for background hit processing */ - ccl_global float *L_transparent_coop, /* Required for background hit processing and buffer Update */ - ccl_global char *ray_state, /* Stores information on the current state of a ray */ - int sw, int sh, int sx, int sy, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global unsigned int *work_array, /* Denotes work of each ray */ - ccl_global int *Queue_data, /* Queues memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize, /* Size (capacity) of each queue */ - int end_sample, - int start_sample, +ccl_device char kernel_background_buffer_update( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, + ccl_global float *per_sample_output_buffers, + ccl_global uint *rng_state, + ccl_global uint *rng_coop, /* Required for buffer Update */ + ccl_global float3 *throughput_coop, /* Required for background hit processing */ + PathRadiance *PathRadiance_coop, /* Required for background hit processing and buffer Update */ + ccl_global Ray *Ray_coop, /* Required for background hit processing */ + ccl_global PathState *PathState_coop, /* Required for background hit processing */ + ccl_global float *L_transparent_coop, /* Required for background hit processing and buffer Update */ + ccl_global char *ray_state, /* Stores information on the current state of a ray */ + int sw, int sh, int sx, int sy, int stride, + int rng_state_offset_x, + int rng_state_offset_y, + int rng_state_stride, + ccl_global unsigned int *work_array, /* Denotes work of each ray */ + int end_sample, + int start_sample, #ifdef __WORK_STEALING__ - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, #endif #ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, + DebugData *debugdata_coop, #endif - int parallel_samples /* Number of samples to be processed in parallel */ - ) + int parallel_samples, /* Number of samples to be processed in parallel */ + int ray_index) { - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - if(ray_index == 0) { - /* We will empty this queue in this kernel */ - Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; - } char enqueue_flag = 0; - ray_index = get_ray_index(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, Queue_data, queuesize, 1); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not required. - * If we are executing on a CPU device, then we need to keep all threads active - * since we have barrier() calls later in the kernel. CPU devices - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) - return; -#endif -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - /* Load kernel globals structure and ShaderData strucuture */ - KernelGlobals *kg = (KernelGlobals *)globals; - ShaderData *sd = (ShaderData *)shader_data; + /* Load kernel globals structure and ShaderData strucuture */ + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; #ifdef __KERNEL_DEBUG__ - DebugData *debug_data = &debugdata_coop[ray_index]; + DebugData *debug_data = &debugdata_coop[ray_index]; #endif - ccl_global PathState *state = &PathState_coop[ray_index]; - PathRadiance *L = L = &PathRadiance_coop[ray_index]; - ccl_global Ray *ray = &Ray_coop[ray_index]; - ccl_global float3 *throughput = &throughput_coop[ray_index]; - ccl_global float *L_transparent = &L_transparent_coop[ray_index]; - ccl_global uint *rng = &rng_coop[ray_index]; + ccl_global PathState *state = &PathState_coop[ray_index]; + PathRadiance *L = L = &PathRadiance_coop[ray_index]; + ccl_global Ray *ray = &Ray_coop[ray_index]; + ccl_global float3 *throughput = &throughput_coop[ray_index]; + ccl_global float *L_transparent = &L_transparent_coop[ray_index]; + ccl_global uint *rng = &rng_coop[ray_index]; #ifdef __WORK_STEALING__ - unsigned int my_work; - ccl_global float *initial_per_sample_output_buffers; - ccl_global uint *initial_rng; + unsigned int my_work; + ccl_global float *initial_per_sample_output_buffers; + ccl_global uint *initial_rng; #endif - unsigned int sample; - unsigned int tile_x; - unsigned int tile_y; - unsigned int pixel_x; - unsigned int pixel_y; - unsigned int my_sample_tile; + unsigned int sample; + unsigned int tile_x; + unsigned int tile_y; + unsigned int pixel_x; + unsigned int pixel_y; + unsigned int my_sample_tile; #ifdef __WORK_STEALING__ - my_work = work_array[ray_index]; - sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); - my_sample_tile = 0; - initial_per_sample_output_buffers = per_sample_output_buffers; - initial_rng = rng_state; + my_work = work_array[ray_index]; + sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; + get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); + my_sample_tile = 0; + initial_per_sample_output_buffers = per_sample_output_buffers; + initial_rng = rng_state; #else // __WORK_STEALING__ - sample = work_array[ray_index]; - int tile_index = ray_index / parallel_samples; - /* buffer and rng_state's stride is "stride". Find x and y using ray_index */ - tile_x = tile_index % sw; - tile_y = tile_index / sw; - my_sample_tile = ray_index - (tile_index * parallel_samples); + sample = work_array[ray_index]; + int tile_index = ray_index / parallel_samples; + /* buffer and rng_state's stride is "stride". Find x and y using ray_index */ + tile_x = tile_index % sw; + tile_y = tile_index / sw; + my_sample_tile = ray_index - (tile_index * parallel_samples); #endif - rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride; - per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; + rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride; + per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; - if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { - /* eval background shader if nothing hit */ - if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) { - *L_transparent = (*L_transparent) + average((*throughput)); + if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { + /* eval background shader if nothing hit */ + if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) { + *L_transparent = (*L_transparent) + average((*throughput)); #ifdef __PASSES__ if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) #endif ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - } + } - if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) + if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { #ifdef __BACKGROUND__ /* sample background shader */ @@ -193,90 +164,83 @@ __kernel void kernel_background_buffer_update( #endif ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); } - } + } - if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { - float3 L_sum = path_radiance_clamp_and_sum(kg, L); - kernel_write_light_passes(kg, per_sample_output_buffers, L, sample); + if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { + float3 L_sum = path_radiance_clamp_and_sum(kg, L); + kernel_write_light_passes(kg, per_sample_output_buffers, L, sample); #ifdef __KERNEL_DEBUG__ - kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample); + kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample); #endif - float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent)); + float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent)); - /* accumulate result in output buffer */ - kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); - path_rng_end(kg, rng_state, *rng); + /* accumulate result in output buffer */ + kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); + path_rng_end(kg, rng_state, *rng); - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); - } + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); + } - if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { + if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { #ifdef __WORK_STEALING__ - /* We have completed current work; So get next work */ - int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index); - if(!valid_work) { - /* If work is invalid, this means no more work is available and the thread may exit */ - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); - } + /* We have completed current work; So get next work */ + int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index); + if(!valid_work) { + /* If work is invalid, this means no more work is available and the thread may exit */ + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); + } #else - if((sample + parallel_samples) >= end_sample) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); - } + if((sample + parallel_samples) >= end_sample) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); + } #endif - if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { + if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { #ifdef __WORK_STEALING__ - work_array[ray_index] = my_work; - /* Get the sample associated with the current work */ - sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - /* Get pixel and tile position associated with current work */ - get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); - my_sample_tile = 0; - - /* Remap rng_state according to the current work */ - rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride); - /* Remap per_sample_output_buffers according to the current work */ - per_sample_output_buffers = initial_per_sample_output_buffers - + (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; + work_array[ray_index] = my_work; + /* Get the sample associated with the current work */ + sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; + /* Get pixel and tile position associated with current work */ + get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); + my_sample_tile = 0; + + /* Remap rng_state according to the current work */ + rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride); + /* Remap per_sample_output_buffers according to the current work */ + per_sample_output_buffers = initial_per_sample_output_buffers + + (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; #else - work_array[ray_index] = sample + parallel_samples; - sample = work_array[ray_index]; + work_array[ray_index] = sample + parallel_samples; + sample = work_array[ray_index]; - /* Get ray position from ray index */ - pixel_x = sx + ((ray_index / parallel_samples) % sw); - pixel_y = sy + ((ray_index / parallel_samples) / sw); + /* Get ray position from ray index */ + pixel_x = sx + ((ray_index / parallel_samples) % sw); + pixel_y = sy + ((ray_index / parallel_samples) / sw); #endif - /* initialize random numbers and ray */ - kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray); + /* initialize random numbers and ray */ + kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray); - if(ray->t != 0.0f) { - /* Initialize throughput, L_transparent, Ray, PathState; These rays proceed with path-iteration*/ - *throughput = make_float3(1.0f, 1.0f, 1.0f); - *L_transparent = 0.0f; - path_radiance_init(L, kernel_data.film.use_light_pass); - path_state_init(kg, state, rng, sample, ray); + if(ray->t != 0.0f) { + /* Initialize throughput, L_transparent, Ray, PathState; These rays proceed with path-iteration*/ + *throughput = make_float3(1.0f, 1.0f, 1.0f); + *L_transparent = 0.0f; + path_radiance_init(L, kernel_data.film.use_light_pass); + path_state_init(kg, state, rng, sample, ray); #ifdef __KERNEL_DEBUG__ - debug_data_init(debug_data); + debug_data_init(debug_data); #endif - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - enqueue_flag = 1; - } else { - /*These rays do not participate in path-iteration */ - float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - /* accumulate result in output buffer */ - kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); - path_rng_end(kg, rng_state, *rng); - - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); - } + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + enqueue_flag = 1; + } else { + /*These rays do not participate in path-iteration */ + float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + /* accumulate result in output buffer */ + kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); + path_rng_end(kg, rng_state, *rng); + + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); } } -#ifndef __COMPUTE_DEVICE_GPU__ } -#endif - - /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; These rays - * will be made active during next SceneIntersectkernel - */ - enqueue_ray_index_local(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index); + return enqueue_flag; } diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h index 006f2c8e4df..5e054bdab32 100644 --- a/intern/cycles/kernel/split/kernel_data_init.h +++ b/intern/cycles/kernel/split/kernel_data_init.h @@ -16,8 +16,7 @@ #include "kernel_split_common.h" -/* - * Note on kernel_data_initialization kernel +/* Note on kernel_data_initialization kernel * This kernel Initializes structures needed in path-iteration kernels. * This is the first kernel in ray-tracing logic. * @@ -51,131 +50,130 @@ * All slots in queues are initialized to queue empty slot; * The number of elements in the queues is initialized to 0; */ -__kernel void kernel_data_init( - ccl_global char *globals, - ccl_global char *shader_data_sd, /* Arguments related to ShaderData */ - ccl_global char *shader_data_sd_DL_shadow, /* Arguments related to ShaderData */ +ccl_device void kernel_data_init( + ccl_global char *globals, + ccl_global char *shader_data_sd, /* Arguments related to ShaderData */ + ccl_global char *shader_data_sd_DL_shadow, /* Arguments related to ShaderData */ - ccl_global float3 *P_sd, - ccl_global float3 *P_sd_DL_shadow, + ccl_global float3 *P_sd, + ccl_global float3 *P_sd_DL_shadow, - ccl_global float3 *N_sd, - ccl_global float3 *N_sd_DL_shadow, + ccl_global float3 *N_sd, + ccl_global float3 *N_sd_DL_shadow, - ccl_global float3 *Ng_sd, - ccl_global float3 *Ng_sd_DL_shadow, + ccl_global float3 *Ng_sd, + ccl_global float3 *Ng_sd_DL_shadow, - ccl_global float3 *I_sd, - ccl_global float3 *I_sd_DL_shadow, + ccl_global float3 *I_sd, + ccl_global float3 *I_sd_DL_shadow, - ccl_global int *shader_sd, - ccl_global int *shader_sd_DL_shadow, + ccl_global int *shader_sd, + ccl_global int *shader_sd_DL_shadow, - ccl_global int *flag_sd, - ccl_global int *flag_sd_DL_shadow, + ccl_global int *flag_sd, + ccl_global int *flag_sd_DL_shadow, - ccl_global int *prim_sd, - ccl_global int *prim_sd_DL_shadow, + ccl_global int *prim_sd, + ccl_global int *prim_sd_DL_shadow, - ccl_global int *type_sd, - ccl_global int *type_sd_DL_shadow, + ccl_global int *type_sd, + ccl_global int *type_sd_DL_shadow, - ccl_global float *u_sd, - ccl_global float *u_sd_DL_shadow, + ccl_global float *u_sd, + ccl_global float *u_sd_DL_shadow, - ccl_global float *v_sd, - ccl_global float *v_sd_DL_shadow, + ccl_global float *v_sd, + ccl_global float *v_sd_DL_shadow, - ccl_global int *object_sd, - ccl_global int *object_sd_DL_shadow, + ccl_global int *object_sd, + ccl_global int *object_sd_DL_shadow, - ccl_global float *time_sd, - ccl_global float *time_sd_DL_shadow, + ccl_global float *time_sd, + ccl_global float *time_sd_DL_shadow, - ccl_global float *ray_length_sd, - ccl_global float *ray_length_sd_DL_shadow, + ccl_global float *ray_length_sd, + ccl_global float *ray_length_sd_DL_shadow, - ccl_global int *ray_depth_sd, - ccl_global int *ray_depth_sd_DL_shadow, + ccl_global int *ray_depth_sd, + ccl_global int *ray_depth_sd_DL_shadow, - ccl_global int *transparent_depth_sd, - ccl_global int *transparent_depth_sd_DL_shadow, + ccl_global int *transparent_depth_sd, + ccl_global int *transparent_depth_sd_DL_shadow, - /* Ray differentials. */ - ccl_global differential3 *dP_sd, - ccl_global differential3 *dP_sd_DL_shadow, + /* Ray differentials. */ + ccl_global differential3 *dP_sd, + ccl_global differential3 *dP_sd_DL_shadow, - ccl_global differential3 *dI_sd, - ccl_global differential3 *dI_sd_DL_shadow, + ccl_global differential3 *dI_sd, + ccl_global differential3 *dI_sd_DL_shadow, - ccl_global differential *du_sd, - ccl_global differential *du_sd_DL_shadow, + ccl_global differential *du_sd, + ccl_global differential *du_sd_DL_shadow, - ccl_global differential *dv_sd, - ccl_global differential *dv_sd_DL_shadow, + ccl_global differential *dv_sd, + ccl_global differential *dv_sd_DL_shadow, - /* Dp/Du */ - ccl_global float3 *dPdu_sd, - ccl_global float3 *dPdu_sd_DL_shadow, + /* Dp/Du */ + ccl_global float3 *dPdu_sd, + ccl_global float3 *dPdu_sd_DL_shadow, - ccl_global float3 *dPdv_sd, - ccl_global float3 *dPdv_sd_DL_shadow, + ccl_global float3 *dPdv_sd, + ccl_global float3 *dPdv_sd_DL_shadow, - /* Object motion. */ - ccl_global Transform *ob_tfm_sd, - ccl_global Transform *ob_tfm_sd_DL_shadow, + /* Object motion. */ + ccl_global Transform *ob_tfm_sd, + ccl_global Transform *ob_tfm_sd_DL_shadow, - ccl_global Transform *ob_itfm_sd, - ccl_global Transform *ob_itfm_sd_DL_shadow, + ccl_global Transform *ob_itfm_sd, + ccl_global Transform *ob_itfm_sd_DL_shadow, - ShaderClosure *closure_sd, - ShaderClosure *closure_sd_DL_shadow, + ShaderClosure *closure_sd, + ShaderClosure *closure_sd_DL_shadow, - ccl_global int *num_closure_sd, - ccl_global int *num_closure_sd_DL_shadow, + ccl_global int *num_closure_sd, + ccl_global int *num_closure_sd_DL_shadow, - ccl_global float *randb_closure_sd, - ccl_global float *randb_closure_sd_DL_shadow, + ccl_global float *randb_closure_sd, + ccl_global float *randb_closure_sd_DL_shadow, - ccl_global float3 *ray_P_sd, - ccl_global float3 *ray_P_sd_DL_shadow, + ccl_global float3 *ray_P_sd, + ccl_global float3 *ray_P_sd_DL_shadow, - ccl_global differential3 *ray_dP_sd, - ccl_global differential3 *ray_dP_sd_DL_shadow, + ccl_global differential3 *ray_dP_sd, + ccl_global differential3 *ray_dP_sd_DL_shadow, - ccl_constant KernelData *data, - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* rng array to store rng values for all rays */ - ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */ - ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */ - PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */ - ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */ - ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */ - ccl_global char *ray_state, /* Stores information on current state of a ray */ + ccl_constant KernelData *data, + ccl_global float *per_sample_output_buffers, + ccl_global uint *rng_state, + ccl_global uint *rng_coop, /* rng array to store rng values for all rays */ + ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */ + ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */ + PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */ + ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */ + ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */ + ccl_global char *ray_state, /* Stores information on current state of a ray */ -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, +#define KERNEL_TEX(type, ttype, name) \ + ccl_global type *name, #include "../kernel_textures.h" - int start_sample, int sx, int sy, int sw, int sh, int offset, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global int *Queue_data, /* Memory for queues */ - ccl_global int *Queue_index, /* Tracks the number of elements in queues */ - int queuesize, /* size (capacity) of the queue */ - ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */ - ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */ + int start_sample, int sx, int sy, int sw, int sh, int offset, int stride, + int rng_state_offset_x, + int rng_state_offset_y, + int rng_state_stride, + ccl_global int *Queue_data, /* Memory for queues */ + ccl_global int *Queue_index, /* Tracks the number of elements in queues */ + int queuesize, /* size (capacity) of the queue */ + ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */ + ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */ #ifdef __WORK_STEALING__ - ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */ - unsigned int num_samples, /* Total number of samples per pixel */ + ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */ + unsigned int num_samples, /* Total number of samples per pixel */ #endif #ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, + DebugData *debugdata_coop, #endif - int parallel_samples /* Number of samples to be processed in parallel */ - ) + int parallel_samples) /* Number of samples to be processed in parallel */ { /* Load kernel globals structure */ diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h index 91c3ef11682..a39e47b9b96 100644 --- a/intern/cycles/kernel/split/kernel_direct_lighting.h +++ b/intern/cycles/kernel/split/kernel_direct_lighting.h @@ -16,8 +16,7 @@ #include "kernel_split_common.h" -/* - * Note on kernel_direct_lighting kernel. +/* Note on kernel_direct_lighting kernel. * This is the eighth kernel in the ray tracing logic. This is the seventh * of the path iteration kernels. This kernel takes care of direct lighting * logic. However, the "shadow ray cast" part of direct lighting is handled @@ -49,90 +48,58 @@ * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty. */ -__kernel void kernel_direct_lighting( - ccl_global char *globals, - ccl_constant KernelData *data, - ccl_global char *shader_data, /* Required for direct lighting */ - ccl_global char *shader_DL, /* Required for direct lighting */ - ccl_global uint *rng_coop, /* Required for direct lighting */ - ccl_global PathState *PathState_coop, /* Required for direct lighting */ - ccl_global int *ISLamp_coop, /* Required for direct lighting */ - ccl_global Ray *LightRay_coop, /* Required for direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize /* Size (capacity) of each queue */ - ) +ccl_device char kernel_direct_lighting( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required for direct lighting */ + ccl_global char *shader_DL, /* Required for direct lighting */ + ccl_global uint *rng_coop, /* Required for direct lighting */ + ccl_global PathState *PathState_coop, /* Required for direct lighting */ + ccl_global int *ISLamp_coop, /* Required for direct lighting */ + ccl_global Ray *LightRay_coop, /* Required for direct lighting */ + ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + int ray_index) { - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - char enqueue_flag = 0; - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not required - * If we are executing on a CPU device, then we need to keep all threads active - * since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) - return; -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - /* Load kernel globals structure and ShaderData structure */ - KernelGlobals *kg = (KernelGlobals *)globals; - ShaderData *sd = (ShaderData *)shader_data; - ShaderData *sd_DL = (ShaderData *)shader_DL; + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + /* Load kernel globals structure and ShaderData structure */ + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; + ShaderData *sd_DL = (ShaderData *)shader_DL; - ccl_global PathState *state = &PathState_coop[ray_index]; + ccl_global PathState *state = &PathState_coop[ray_index]; - /* direct lighting */ + /* direct lighting */ #ifdef __EMISSION__ - if((kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) { - /* sample illumination from lights to find path contribution */ - ccl_global RNG* rng = &rng_coop[ray_index]; - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); - float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + if((kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) { + /* sample illumination from lights to find path contribution */ + ccl_global RNG* rng = &rng_coop[ray_index]; + float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); + float light_u, light_v; + path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); - LightSample ls; - light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls); + LightSample ls; + light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls); - Ray light_ray; + Ray light_ray; #ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); + light_ray.time = ccl_fetch(sd, time); #endif - BsdfEval L_light; - bool is_lamp; - if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce, sd_DL)) { - /* write intermediate data to global memory to access from the next kernel */ - LightRay_coop[ray_index] = light_ray; - BSDFEval_coop[ray_index] = L_light; - ISLamp_coop[ray_index] = is_lamp; - /// mark ray state for next shadow kernel - ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); - enqueue_flag = 1; - } + BsdfEval L_light; + bool is_lamp; + if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce, sd_DL)) { + /* write intermediate data to global memory to access from the next kernel */ + LightRay_coop[ray_index] = light_ray; + BSDFEval_coop[ray_index] = L_light; + ISLamp_coop[ray_index] = is_lamp; + /// mark ray state for next shadow kernel + ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); + enqueue_flag = 1; } -#endif } -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - -#ifdef __EMISSION__ - /* Enqueue RAY_SHADOW_RAY_CAST_DL rays */ - enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_DL_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index); #endif + } + return enqueue_flag; } diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h index 174070ad5bb..8a7c4e11dcf 100644 --- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h +++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h @@ -16,8 +16,7 @@ #include "kernel_split_common.h" -/* - * Note on kernel_holdout_emission_blurring_pathtermination_ao kernel. +/* Note on kernel_holdout_emission_blurring_pathtermination_ao kernel. * This is the sixth kernel in the ray tracing logic. This is the fifth * of the path iteration kernels. This kernel takes care of the logic to process * "material of type holdout", indirect primitive emission, bsdf blurring, @@ -71,213 +70,175 @@ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO */ - -__kernel void kernel_holdout_emission_blurring_pathtermination_ao( - ccl_global char *globals, - ccl_constant KernelData *data, - ccl_global char *shader_data, /* Required throughout the kernel except probabilistic path termination and AO */ - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */ - ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */ - ccl_global float *L_transparent_coop, /* Required for handling holdout material */ - PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */ - ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */ - Intersection *Intersection_coop, /* Required for indirect primitive emission */ - ccl_global float3 *AOAlpha_coop, /* Required for AO */ - ccl_global float3 *AOBSDF_coop, /* Required for AO */ - ccl_global Ray *AOLightRay_coop, /* Required for AO */ - int sw, int sh, int sx, int sy, int stride, - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */ - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize, /* Size (capacity) of each queue */ +ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required throughout the kernel except probabilistic path termination and AO */ + ccl_global float *per_sample_output_buffers, + ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */ + ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */ + ccl_global float *L_transparent_coop, /* Required for handling holdout material */ + PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */ + ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */ + Intersection *Intersection_coop, /* Required for indirect primitive emission */ + ccl_global float3 *AOAlpha_coop, /* Required for AO */ + ccl_global float3 *AOBSDF_coop, /* Required for AO */ + ccl_global Ray *AOLightRay_coop, /* Required for AO */ + int sw, int sh, int sx, int sy, int stride, + ccl_global char *ray_state, /* Denotes the state of each ray */ + ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */ #ifdef __WORK_STEALING__ - unsigned int start_sample, + unsigned int start_sample, #endif - int parallel_samples /* Number of samples to be processed in parallel */ - ) + int parallel_samples, /* Number of samples to be processed in parallel */ + int ray_index, + char *enqueue_flag, + char *enqueue_flag_AO_SHADOW_RAY_CAST) { - ccl_local unsigned int local_queue_atomics_bg; - ccl_local unsigned int local_queue_atomics_ao; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics_bg = 0; - local_queue_atomics_ao = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - char enqueue_flag = 0; - char enqueue_flag_AO_SHADOW_RAY_CAST = 0; - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not required - * If we are executing on a CPU device, then we need to keep all threads active - * since we have barrier() calls later in the kernel. CPU devices - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) - return; -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - /* Load kernel globals structure and ShaderData structure */ - KernelGlobals *kg = (KernelGlobals *)globals; - ShaderData *sd = (ShaderData *)shader_data; + /* Load kernel globals structure and ShaderData structure */ + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; #ifdef __WORK_STEALING__ - unsigned int my_work; - unsigned int pixel_x; - unsigned int pixel_y; + unsigned int my_work; + unsigned int pixel_x; + unsigned int pixel_y; #endif - unsigned int tile_x; - unsigned int tile_y; - int my_sample_tile; - unsigned int sample; + unsigned int tile_x; + unsigned int tile_y; + int my_sample_tile; + unsigned int sample; - ccl_global RNG *rng = 0x0; - ccl_global PathState *state = 0x0; - float3 throughput; + ccl_global RNG *rng = 0x0; + ccl_global PathState *state = 0x0; + float3 throughput; - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - throughput = throughput_coop[ray_index]; - state = &PathState_coop[ray_index]; - rng = &rng_coop[ray_index]; + throughput = throughput_coop[ray_index]; + state = &PathState_coop[ray_index]; + rng = &rng_coop[ray_index]; #ifdef __WORK_STEALING__ - my_work = work_array[ray_index]; - sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); - my_sample_tile = 0; + my_work = work_array[ray_index]; + sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; + get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); + my_sample_tile = 0; #else // __WORK_STEALING__ - sample = work_array[ray_index]; - /* buffer's stride is "stride"; Find x and y using ray_index */ - int tile_index = ray_index / parallel_samples; - tile_x = tile_index % sw; - tile_y = tile_index / sw; - my_sample_tile = ray_index - (tile_index * parallel_samples); + sample = work_array[ray_index]; + /* buffer's stride is "stride"; Find x and y using ray_index */ + int tile_index = ray_index / parallel_samples; + tile_x = tile_index % sw; + tile_y = tile_index / sw; + my_sample_tile = ray_index - (tile_index * parallel_samples); #endif // __WORK_STEALING__ - per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; + per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; - /* holdout */ + /* holdout */ #ifdef __HOLDOUT__ - if((ccl_fetch(sd, flag) & (SD_HOLDOUT|SD_HOLDOUT_MASK)) && (state->flag & PATH_RAY_CAMERA)) { - if(kernel_data.background.transparent) { - float3 holdout_weight; + if((ccl_fetch(sd, flag) & (SD_HOLDOUT|SD_HOLDOUT_MASK)) && (state->flag & PATH_RAY_CAMERA)) { + if(kernel_data.background.transparent) { + float3 holdout_weight; - if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) - holdout_weight = make_float3(1.0f, 1.0f, 1.0f); - else - holdout_weight = shader_holdout_eval(kg, sd); + if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) + holdout_weight = make_float3(1.0f, 1.0f, 1.0f); + else + holdout_weight = shader_holdout_eval(kg, sd); - /* any throughput is ok, should all be identical here */ - L_transparent_coop[ray_index] += average(holdout_weight*throughput); - } + /* any throughput is ok, should all be identical here */ + L_transparent_coop[ray_index] += average(holdout_weight*throughput); + } - if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; - } + if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + *enqueue_flag = 1; } -#endif } +#endif + } - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - PathRadiance *L = &PathRadiance_coop[ray_index]; - /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, per_sample_output_buffers, L, sd, sample, state, throughput); + PathRadiance *L = &PathRadiance_coop[ray_index]; + /* holdout mask objects do not write data passes */ + kernel_write_data_passes(kg, per_sample_output_buffers, L, sd, sample, state, throughput); - /* blurring of bsdf after bounces, for rays that have a small likelihood - * of following this particular path (diffuse, rough glossy) */ - if(kernel_data.integrator.filter_glossy != FLT_MAX) { - float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf; + /* blurring of bsdf after bounces, for rays that have a small likelihood + * of following this particular path (diffuse, rough glossy) */ + if(kernel_data.integrator.filter_glossy != FLT_MAX) { + float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf; - if(blur_pdf < 1.0f) { - float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; - shader_bsdf_blur(kg, sd, blur_roughness); - } + if(blur_pdf < 1.0f) { + float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; + shader_bsdf_blur(kg, sd, blur_roughness); } + } #ifdef __EMISSION__ - /* emission */ - if(ccl_fetch(sd, flag) & SD_EMISSION) { - /* todo: is isect.t wrong here for transparent surfaces? */ - float3 emission = indirect_primitive_emission(kg, sd, Intersection_coop[ray_index].t, state->flag, state->ray_pdf); - path_radiance_accum_emission(L, throughput, emission, state->bounce); - } + /* emission */ + if(ccl_fetch(sd, flag) & SD_EMISSION) { + /* todo: is isect.t wrong here for transparent surfaces? */ + float3 emission = indirect_primitive_emission(kg, sd, Intersection_coop[ray_index].t, state->flag, state->ray_pdf); + path_radiance_accum_emission(L, throughput, emission, state->bounce); + } #endif - /* path termination. this is a strange place to put the termination, it's - * mainly due to the mixed in MIS that we use. gives too many unneeded - * shader evaluations, only need emission if we are going to terminate */ - float probability = path_state_terminate_probability(kg, state, throughput); + /* path termination. this is a strange place to put the termination, it's + * mainly due to the mixed in MIS that we use. gives too many unneeded + * shader evaluations, only need emission if we are going to terminate */ + float probability = path_state_terminate_probability(kg, state, throughput); - if(probability == 0.0f) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; - } + if(probability == 0.0f) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + *enqueue_flag = 1; + } - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE); + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + if(probability != 1.0f) { + float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE); - if(terminate >= probability) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; - } else { - throughput_coop[ray_index] = throughput/probability; - } + if(terminate >= probability) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + *enqueue_flag = 1; + } else { + throughput_coop[ray_index] = throughput/probability; } } } + } #ifdef __AO__ - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - /* ambient occlusion */ - if(kernel_data.integrator.use_ambient_occlusion || (ccl_fetch(sd, flag) & SD_AO)) { - /* todo: solve correlation */ - float bsdf_u, bsdf_v; - path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - - float ao_factor = kernel_data.background.ao_factor; - float3 ao_N; - AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); - AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd); - - float3 ao_D; - float ao_pdf; - sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - - if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { - Ray _ray; - _ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); - _ray.D = ao_D; - _ray.t = kernel_data.background.ao_distance; + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + /* ambient occlusion */ + if(kernel_data.integrator.use_ambient_occlusion || (ccl_fetch(sd, flag) & SD_AO)) { + /* todo: solve correlation */ + float bsdf_u, bsdf_v; + path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + + float ao_factor = kernel_data.background.ao_factor; + float3 ao_N; + AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); + AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd); + + float3 ao_D; + float ao_pdf; + sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); + + if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { + Ray _ray; + _ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); + _ray.D = ao_D; + _ray.t = kernel_data.background.ao_distance; #ifdef __OBJECT_MOTION__ - _ray.time = ccl_fetch(sd, time); + _ray.time = ccl_fetch(sd, time); #endif - _ray.dP = ccl_fetch(sd, dP); - _ray.dD = differential3_zero(); - AOLightRay_coop[ray_index] = _ray; + _ray.dP = ccl_fetch(sd, dP); + _ray.dD = differential3_zero(); + AOLightRay_coop[ray_index] = _ray; - ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); - enqueue_flag_AO_SHADOW_RAY_CAST = 1; - } + ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); + *enqueue_flag_AO_SHADOW_RAY_CAST = 1; } } -#endif -#ifndef __COMPUTE_DEVICE_GPU__ } #endif - - /* Enqueue RAY_UPDATE_BUFFER rays */ - enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics_bg, Queue_data, Queue_index); -#ifdef __AO__ - /* Enqueue to-shadow-ray-cast rays */ - enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, enqueue_flag_AO_SHADOW_RAY_CAST, queuesize, &local_queue_atomics_ao, Queue_data, Queue_index); -#endif } diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h index b804bfc8630..e5fdb637a50 100644 --- a/intern/cycles/kernel/split/kernel_lamp_emission.h +++ b/intern/cycles/kernel/split/kernel_lamp_emission.h @@ -16,8 +16,7 @@ #include "kernel_split_common.h" -/* - * Note on kernel_lamp_emission +/* Note on kernel_lamp_emission * This is the 3rd kernel in the ray-tracing logic. This is the second of the * path-iteration kernels. This kernel takes care of the indirect lamp emission logic. * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE @@ -40,55 +39,23 @@ * * note : shader_data is neither input nor output. Its just filled and consumed in the same, kernel_lamp_emission, kernel. */ -__kernel void kernel_lamp_emission( - ccl_global char *globals, - ccl_constant KernelData *data, - ccl_global char *shader_data, /* Required for lamp emission */ - ccl_global float3 *throughput_coop, /* Required for lamp emission */ - PathRadiance *PathRadiance_coop, /* Required for lamp emission */ - ccl_global Ray *Ray_coop, /* Required for lamp emission */ - ccl_global PathState *PathState_coop, /* Required for lamp emission */ - Intersection *Intersection_coop, /* Required for lamp emission */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global int *Queue_data, /* Memory for queues */ - ccl_global int *Queue_index, /* Tracks the number of elements in queues */ - int queuesize, /* Size (capacity) of queues */ - ccl_global char *use_queues_flag, /* used to decide if this kernel should use queues to fetch ray index */ - int parallel_samples /* Number of samples to be processed in parallel */ - ) +ccl_device void kernel_lamp_emission( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required for lamp emission */ + ccl_global float3 *throughput_coop, /* Required for lamp emission */ + PathRadiance *PathRadiance_coop, /* Required for lamp emission */ + ccl_global Ray *Ray_coop, /* Required for lamp emission */ + ccl_global PathState *PathState_coop, /* Required for lamp emission */ + Intersection *Intersection_coop, /* Required for lamp emission */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + int sw, int sh, + ccl_global char *use_queues_flag, /* Used to decide if this kernel should use + * queues to fetch ray index + */ + int parallel_samples, /* Number of samples to be processed in parallel */ + int ray_index) { - int x = get_global_id(0); - int y = get_global_id(1); - - /* We will empty this queue in this kernel */ - if(get_global_id(0) == 0 && get_global_id(1) == 0) { - Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; - } - - /* Fetch use_queues_flag */ - ccl_local char local_use_queues_flag; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_use_queues_flag = use_queues_flag[0]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ray_index; - if(local_use_queues_flag) { - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 1); - - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } - } else { - if(x < (sw * parallel_samples) && y < sh){ - ray_index = x + y * (sw * parallel_samples); - } else { - return; - } - } - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { KernelGlobals *kg = (KernelGlobals *)globals; ShaderData *sd = (ShaderData *)shader_data; diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h index 6ce56e45733..ea07a5f3447 100644 --- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h +++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h @@ -16,8 +16,7 @@ #include "kernel_split_common.h" -/* - * Note on kernel_setup_next_iteration kernel. +/* Note on kernel_setup_next_iteration kernel. * This is the tenth kernel in the ray tracing logic. This is the ninth * of the path iteration kernels. This kernel takes care of setting up * Ray for the next iteration of path-iteration and accumulating radiance @@ -60,117 +59,74 @@ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays. * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays */ - -__kernel void kernel_next_iteration_setup( - ccl_global char *globals, - ccl_constant KernelData *data, - ccl_global char *shader_data, /* Required for setting up ray for next iteration */ - ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */ - ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */ - PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */ - ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */ - ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */ - ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize, /* Size (capacity) of each queue */ - ccl_global char *use_queues_flag /* flag to decide if scene_intersect kernel should use queues to fetch ray index */ - ) +ccl_device char kernel_next_iteration_setup( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required for setting up ray for next iteration */ + ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */ + ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */ + PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */ + ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */ + ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */ + ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */ + ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */ + ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */ + ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */ + ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */ + ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + ccl_global char *use_queues_flag, /* flag to decide if scene_intersect kernel should + * use queues to fetch ray index */ + int ray_index) { - - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if(get_global_id(0) == 0 && get_global_id(1) == 0) { - /* If we are here, then it means that scene-intersect kernel - * has already been executed atleast once. From the next time, - * scene-intersect kernel may operate on queues to fetch ray index - */ - use_queues_flag[0] = 1; - - /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS - * queues that were made empty during the previous kernel - */ - Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; - Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; - } - char enqueue_flag = 0; - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not required - * If we are executing on a CPU device, then we need to keep all threads active - * since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) - return; -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - /* Load kernel globals structure and ShaderData structure */ - KernelGlobals *kg = (KernelGlobals *)globals; - ShaderData *sd = (ShaderData *)shader_data; - PathRadiance *L = 0x0; - ccl_global PathState *state = 0x0; - - /* Path radiance update for AO/Direct_lighting's shadow blocked */ - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { - state = &PathState_coop[ray_index]; - L = &PathRadiance_coop[ray_index]; - float3 _throughput = throughput_coop[ray_index]; - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { - float3 shadow = LightRay_ao_coop[ray_index].P; - char update_path_radiance = LightRay_ao_coop[ray_index].t; - if(update_path_radiance) { - path_radiance_accum_ao(L, _throughput, AOAlpha_coop[ray_index], AOBSDF_coop[ray_index], shadow, state->bounce); - } - REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); + /* Load kernel globals structure and ShaderData structure */ + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; + PathRadiance *L = 0x0; + ccl_global PathState *state = 0x0; + + /* Path radiance update for AO/Direct_lighting's shadow blocked */ + if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { + state = &PathState_coop[ray_index]; + L = &PathRadiance_coop[ray_index]; + float3 _throughput = throughput_coop[ray_index]; + + if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { + float3 shadow = LightRay_ao_coop[ray_index].P; + char update_path_radiance = LightRay_ao_coop[ray_index].t; + if(update_path_radiance) { + path_radiance_accum_ao(L, _throughput, AOAlpha_coop[ray_index], AOBSDF_coop[ray_index], shadow, state->bounce); } + REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); + } - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) { - float3 shadow = LightRay_dl_coop[ray_index].P; - char update_path_radiance = LightRay_dl_coop[ray_index].t; - if(update_path_radiance) { - BsdfEval L_light = BSDFEval_coop[ray_index]; - path_radiance_accum_light(L, _throughput, &L_light, shadow, 1.0f, state->bounce, ISLamp_coop[ray_index]); - } - REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); + if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) { + float3 shadow = LightRay_dl_coop[ray_index].P; + char update_path_radiance = LightRay_dl_coop[ray_index].t; + if(update_path_radiance) { + BsdfEval L_light = BSDFEval_coop[ray_index]; + path_radiance_accum_light(L, _throughput, &L_light, shadow, 1.0f, state->bounce, ISLamp_coop[ray_index]); } + REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); } + } - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global float3 *throughput = &throughput_coop[ray_index]; - ccl_global Ray *ray = &Ray_coop[ray_index]; - ccl_global RNG* rng = &rng_coop[ray_index]; - state = &PathState_coop[ray_index]; - L = &PathRadiance_coop[ray_index]; + ccl_global float3 *throughput = &throughput_coop[ray_index]; + ccl_global Ray *ray = &Ray_coop[ray_index]; + ccl_global RNG* rng = &rng_coop[ray_index]; + state = &PathState_coop[ray_index]; + L = &PathRadiance_coop[ray_index]; - /* compute direct lighting and next bounce */ - if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; - } + /* compute direct lighting and next bounce */ + if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + enqueue_flag = 1; } -#ifndef __COMPUTE_DEVICE_GPU__ } -#endif - /* Enqueue RAY_UPDATE_BUFFER rays */ - enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index); + return enqueue_flag; } diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h deleted file mode 100644 index 5a9838f44d7..00000000000 --- a/intern/cycles/kernel/split/kernel_queue_enqueue.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "../kernel_compat_opencl.h" -#include "../kernel_math.h" -#include "../kernel_types.h" -#include "../kernel_globals.h" -#include "../kernel_queues.h" - -/* - * The kernel "kernel_queue_enqueue" enqueues rays of - * different ray state into their appropriate Queues; - * 1. Rays that have been determined to hit the background from the - * "kernel_scene_intersect" kernel - * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; - * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS. - * - * The input and output of the kernel is as follows, - * - * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) - * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) - * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| | - * queuesize -------------------------------------------| | - * - * Note on Queues : - * State of queues during the first time this kernel is called : - * At entry, - * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. - * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays - * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays. - * - * State of queue during other times this kernel is called : - * At entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty. - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. - * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays. - */ - -__kernel void kernel_queue_enqueue( - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int queuesize /* Size (capacity) of each queue */ - ) -{ - /* We have only 2 cases (Hit/Not-Hit) */ - ccl_local unsigned int local_queue_atomics[2]; - - int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - - if(lidx < 2 ) { - local_queue_atomics[lidx] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int queue_number = -1; - - if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { - queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; - } else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS; - } - - unsigned int my_lqidx; - if(queue_number != -1) { - my_lqidx = get_local_queue_index(queue_number, local_queue_atomics); - } - barrier(CLK_LOCAL_MEM_FENCE); - - if(lidx == 0) { - local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS, local_queue_atomics, Queue_index); - local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, local_queue_atomics, Queue_index); - } - barrier(CLK_LOCAL_MEM_FENCE); - - unsigned int my_gqidx; - if(queue_number != -1) { - my_gqidx = get_global_queue_index(queue_number, queuesize, my_lqidx, local_queue_atomics); - Queue_data[my_gqidx] = ray_index; - } -} diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h index 9f4754e8852..09e3e5ddd7e 100644 --- a/intern/cycles/kernel/split/kernel_scene_intersect.h +++ b/intern/cycles/kernel/split/kernel_scene_intersect.h @@ -16,8 +16,7 @@ #include "kernel_split_common.h" -/* - * Note on kernel_scene_intersect kernel. +/* Note on kernel_scene_intersect kernel. * This is the second kernel in the ray tracing logic. This is the first * of the path iteration kernels. This kernel takes care of scene_intersect function. * @@ -63,51 +62,23 @@ * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS - no change */ -__kernel void kernel_scene_intersect( - ccl_global char *globals, - ccl_constant KernelData *data, - ccl_global uint *rng_coop, - ccl_global Ray *Ray_coop, /* Required for scene_intersect */ - ccl_global PathState *PathState_coop, /* Required for scene_intersect */ - Intersection *Intersection_coop, /* Required for scene_intersect */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global int *Queue_data, /* Memory for queues */ - ccl_global int *Queue_index, /* Tracks the number of elements in queues */ - int queuesize, /* Size (capacity) of queues */ - ccl_global char *use_queues_flag, /* used to decide if this kernel should use queues to fetch ray index */ +ccl_device void kernel_scene_intersect( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global uint *rng_coop, + ccl_global Ray *Ray_coop, /* Required for scene_intersect */ + ccl_global PathState *PathState_coop, /* Required for scene_intersect */ + Intersection *Intersection_coop, /* Required for scene_intersect */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + int sw, int sh, + ccl_global char *use_queues_flag, /* used to decide if this kernel should use + * queues to fetch ray index */ #ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, + DebugData *debugdata_coop, #endif - int parallel_samples /* Number of samples to be processed in parallel */ - ) + int parallel_samples, /* Number of samples to be processed in parallel */ + int ray_index) { - int x = get_global_id(0); - int y = get_global_id(1); - - /* Fetch use_queues_flag */ - ccl_local char local_use_queues_flag; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_use_queues_flag = use_queues_flag[0]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ray_index; - if(local_use_queues_flag) { - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0); - - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } - } else { - if(x < (sw * parallel_samples) && y < sh){ - ray_index = x + y * (sw * parallel_samples); - } else { - return; - } - } - /* All regenerated rays become active here */ if(IS_STATE(ray_state, ray_index, RAY_REGENERATED)) ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE); diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h index 924c7fab2a3..92813c20832 100644 --- a/intern/cycles/kernel/split/kernel_shader_eval.h +++ b/intern/cycles/kernel/split/kernel_shader_eval.h @@ -16,8 +16,7 @@ #include "kernel_split_common.h" -/* - * Note on kernel_shader_eval kernel +/* Note on kernel_shader_eval kernel * This kernel is the 5th kernel in the ray tracing logic. This is * the 4rd kernel in path iteration. This kernel sets up the ShaderData * structure from the values computed by the previous kernels. It also identifies @@ -45,39 +44,17 @@ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays */ - -__kernel void kernel_shader_eval( - ccl_global char *globals, - ccl_constant KernelData *data, - ccl_global char *shader_data, /* Output ShaderData structure to be filled */ - ccl_global uint *rng_coop, /* Required for rbsdf calculation */ - ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */ - ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */ - Intersection *Intersection_coop, /* Required for setting up shader from ray */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global int *Queue_data, /* queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize /* Size (capacity) of each queue */ - ) +ccl_device void kernel_shader_eval( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Output ShaderData structure to be filled */ + ccl_global uint *rng_coop, /* Required for rbsdf calculation */ + ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */ + ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */ + Intersection *Intersection_coop, /* Required for setting up shader from ray */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + int ray_index) { - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue */ - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0; - - enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index); - - ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0); - - if(ray_index == QUEUE_EMPTY_SLOT) - return; - - /* Continue on with shader evaluation */ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { KernelGlobals *kg = (KernelGlobals *)globals; ShaderData *sd = (ShaderData *)shader_data; diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked.h b/intern/cycles/kernel/split/kernel_shadow_blocked.h index 52bd9eb3bbc..154ec53ffbb 100644 --- a/intern/cycles/kernel/split/kernel_shadow_blocked.h +++ b/intern/cycles/kernel/split/kernel_shadow_blocked.h @@ -16,8 +16,7 @@ #include "kernel_split_common.h" -/* - * Note on kernel_shadow_blocked kernel. +/* Note on kernel_shadow_blocked kernel. * This is the ninth kernel in the ray tracing logic. This is the eighth * of the path iteration kernels. This kernel takes care of "shadow ray cast" * logic of the direct lighting and AO part of ray tracing. @@ -29,9 +28,9 @@ * LightRay_ao_coop --------------------------------| |--- LightRay_ao_coop * ray_state ---------------------------------------| |--- ray_state * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS & | |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_AO_RAYS) - QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| | + QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| | * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS& - QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| | + QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| | * kg (globals + data) -----------------------------| | * queuesize ---------------------------------------| | * @@ -46,63 +45,26 @@ * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry. * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit. */ - -__kernel void kernel_shadow_blocked( - ccl_global char *globals, - ccl_constant KernelData *data, - ccl_global char *shader_shadow, /* Required for shadow blocked */ - ccl_global PathState *PathState_coop, /* Required for shadow blocked */ - ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */ - ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */ - Intersection *Intersection_coop_AO, - Intersection *Intersection_coop_DL, - ccl_global char *ray_state, - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize, /* Size (capacity) of each queue */ - int total_num_rays - ) +ccl_device void kernel_shadow_blocked( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_shadow, /* Required for shadow blocked */ + ccl_global PathState *PathState_coop, /* Required for shadow blocked */ + ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */ + ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */ + Intersection *Intersection_coop_AO, + Intersection *Intersection_coop_DL, + ccl_global char *ray_state, + int total_num_rays, + char shadow_blocked_type, + int ray_index) { -#if 0 - /* we will make the Queue_index entries '0' in the next kernel */ - if(get_global_id(0) == 0 && get_global_id(1) == 0) { - /* We empty this queue here */ - Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; - Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; - } -#endif - - int lidx = get_local_id(1) * get_local_id(0) + get_local_id(0); - - ccl_local unsigned int ao_queue_length; - ccl_local unsigned int dl_queue_length; - if(lidx == 0) { - ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS]; - dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - /* flag determining if the current ray is to process shadow ray for AO or DL */ - char shadow_blocked_type = -1; - /* flag determining if we need to update L */ + /* Flag determining if we need to update L. */ char update_path_radiance = 0; - int ray_index = QUEUE_EMPTY_SLOT; - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - if(thread_index < ao_queue_length + dl_queue_length) { - if(thread_index < ao_queue_length) { - ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1); - shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO; - } else { - ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1); - shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL; - } - } - - if(ray_index == QUEUE_EMPTY_SLOT) - return; - - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { + if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || + IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) + { /* Load kernel global structure */ KernelGlobals *kg = (KernelGlobals *)globals; ShaderData *sd_shadow = (ShaderData *)shader_shadow; @@ -113,13 +75,24 @@ __kernel void kernel_shadow_blocked( Intersection *isect_ao_global = &Intersection_coop_AO[ray_index]; Intersection *isect_dl_global = &Intersection_coop_DL[ray_index]; - ccl_global Ray *light_ray_global = shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO ? light_ray_ao_global : light_ray_dl_global; - Intersection *isect_global = RAY_SHADOW_RAY_CAST_AO ? isect_ao_global : isect_dl_global; + ccl_global Ray *light_ray_global = + shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO + ? light_ray_ao_global + : light_ray_dl_global; + Intersection *isect_global = + RAY_SHADOW_RAY_CAST_AO ? isect_ao_global : isect_dl_global; float3 shadow; - update_path_radiance = !(shadow_blocked(kg, state, light_ray_global, &shadow, sd_shadow, isect_global)); + update_path_radiance = !(shadow_blocked(kg, + state, + light_ray_global, + &shadow, + sd_shadow, + isect_global)); - /* We use light_ray_global's P and t to store shadow and update_path_radiance */ + /* We use light_ray_global's P and t to store shadow and + * update_path_radiance. + */ light_ray_global->P = shadow; light_ray_global->t = update_path_radiance; } diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h index faa4162b46f..54d1c5983e8 100644 --- a/intern/cycles/kernel/split/kernel_sum_all_radiance.h +++ b/intern/cycles/kernel/split/kernel_sum_all_radiance.h @@ -19,21 +19,19 @@ #include "../kernel_types.h" #include "../kernel_globals.h" -/* -* Since we process various samples in parallel; The output radiance of different samples -* are stored in different locations; This kernel combines the output radiance contributed -* by all different samples and stores them in the RenderTile's output buffer. -*/ - -__kernel void kernel_sum_all_radiance( - ccl_constant KernelData *data, /* To get pass_stride to offet into buffer */ - ccl_global float *buffer, /* Output buffer of RenderTile */ - ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */ - int parallel_samples, int sw, int sh, int stride, - int buffer_offset_x, - int buffer_offset_y, - int buffer_stride, - int start_sample) +/* Since we process various samples in parallel; The output radiance of different samples + * are stored in different locations; This kernel combines the output radiance contributed + * by all different samples and stores them in the RenderTile's output buffer. + */ +ccl_device void kernel_sum_all_radiance( + ccl_constant KernelData *data, /* To get pass_stride to offet into buffer */ + ccl_global float *buffer, /* Output buffer of RenderTile */ + ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */ + int parallel_samples, int sw, int sh, int stride, + int buffer_offset_x, + int buffer_offset_y, + int buffer_stride, + int start_sample) { int x = get_global_id(0); int y = get_global_id(1); |