Fix T44833: Can't use ccl_local space in non-kernel functions

This commit re-shuffles code in split kernel once again and makes it so common parts which is in the headers is only responsible to making all the work needed for specified ray index. Getting ray index, checking for it's validity and enqueuing tasks are now happening in the device specified part of the kernel. This actually makes sense because enqueuing is indeed device-specified and i.e. with CUDA we'll want to enqueue kernels from kernel and avoid CPU roundtrip. TODO: - Kernel comments are still placed in the common header files, but since queue related stuff is not passed to those functions those comments might need to be split as well. Just currently read them considering that they're also covering the way how all devices are invoking the common code path. - Arguments might need to be wrapped into KernelGlobals, so we don't ened to pass all them around as function arguments.
author: Sergey Sharybin <sergey.vfx@gmail.com> 2015-05-26 17:12:49 +0300
committer: Sergey Sharybin <sergey.vfx@gmail.com> 2015-05-26 20:54:02 +0300
commit: 84ad20acef4c0db91c9a850e81c7dc0a57aef42a (patch)
tree: c789ed8b455b6870ea12b87a2dc7ed3c28d77102 /intern/cycles/kernel/kernels
parent: 4ffcc6ff56b60d1cc69e12a80c9c2cacd604688f (diff)
10 files changed, 607 insertions, 205 deletions
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
index 2d1944d01e6..eff77b89a0a 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
@@ -48,34 +48,81 @@ __kernel void kernel_ocl_path_trace_background_buffer_update(
 #endif
         int parallel_samples)                  /* Number of samples to be processed in parallel */
 {
-	kernel_background_buffer_update(globals,
-	                                data,
-	                                shader_data,
-	                                per_sample_output_buffers,
-	                                rng_state,
-	                                rng_coop,
-	                                throughput_coop,
-	                                PathRadiance_coop,
-	                                Ray_coop,
-	                                PathState_coop,
-	                                L_transparent_coop,
-	                                ray_state,
-	                                sw, sh, sx, sy, stride,
-	                                rng_state_offset_x,
-	                                rng_state_offset_y,
-	                                rng_state_stride,
-	                                work_array,
-	                                Queue_data,
-	                                Queue_index,
-	                                queuesize,
-	                                end_sample,
-	                                start_sample,
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	if(ray_index == 0) {
+		/* We will empty this queue in this kernel. */
+		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+	}
+	char enqueue_flag = 0;
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          1);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices,
+	 * expect all threads to execute barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		enqueue_flag =
+			kernel_background_buffer_update(globals,
+			                                data,
+			                                shader_data,
+			                                per_sample_output_buffers,
+			                                rng_state,
+			                                rng_coop,
+			                                throughput_coop,
+			                                PathRadiance_coop,
+			                                Ray_coop,
+			                                PathState_coop,
+			                                L_transparent_coop,
+			                                ray_state,
+			                                sw, sh, sx, sy, stride,
+			                                rng_state_offset_x,
+			                                rng_state_offset_y,
+			                                rng_state_stride,
+			                                work_array,
+			                                end_sample,
+			                                start_sample,
 #ifdef __WORK_STEALING__
-	                                work_pool_wgs,
-	                                num_samples,
+			                                work_pool_wgs,
+			                                num_samples,
 #endif
 #ifdef __KERNEL_DEBUG__
-	                                debugdata_coop,
+			                                debugdata_coop,
+#endif
+			                                parallel_samples,
+			                                ray_index);
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
 #endif
-	                                parallel_samples);
+
+	/* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	 * These rays will be made active during next SceneIntersectkernel.
+	 */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
index 015f0872413..c3277676029 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
@@ -17,130 +17,129 @@
 #include "split/kernel_data_init.h"
 
 __kernel void kernel_ocl_path_trace_data_init(
-	ccl_global char *globals,
-	ccl_global char *shader_data_sd,                  /* Arguments related to ShaderData */
-	ccl_global char *shader_data_sd_DL_shadow,     /* Arguments related to ShaderData */
+        ccl_global char *globals,
+        ccl_global char *shader_data_sd,                  /* Arguments related to ShaderData */
+        ccl_global char *shader_data_sd_DL_shadow,        /* Arguments related to ShaderData */
 
-	ccl_global float3 *P_sd,
-	ccl_global float3 *P_sd_DL_shadow,
+        ccl_global float3 *P_sd,
+        ccl_global float3 *P_sd_DL_shadow,
 
-	ccl_global float3 *N_sd,
-	ccl_global float3 *N_sd_DL_shadow,
+        ccl_global float3 *N_sd,
+        ccl_global float3 *N_sd_DL_shadow,
 
-	ccl_global float3 *Ng_sd,
-	ccl_global float3 *Ng_sd_DL_shadow,
+        ccl_global float3 *Ng_sd,
+        ccl_global float3 *Ng_sd_DL_shadow,
 
-	ccl_global float3 *I_sd,
-	ccl_global float3 *I_sd_DL_shadow,
+        ccl_global float3 *I_sd,
+        ccl_global float3 *I_sd_DL_shadow,
 
-	ccl_global int *shader_sd,
-	ccl_global int *shader_sd_DL_shadow,
+        ccl_global int *shader_sd,
+        ccl_global int *shader_sd_DL_shadow,
 
-	ccl_global int *flag_sd,
-	ccl_global int *flag_sd_DL_shadow,
+        ccl_global int *flag_sd,
+        ccl_global int *flag_sd_DL_shadow,
 
-	ccl_global int *prim_sd,
-	ccl_global int *prim_sd_DL_shadow,
+        ccl_global int *prim_sd,
+        ccl_global int *prim_sd_DL_shadow,
 
-	ccl_global int *type_sd,
-	ccl_global int *type_sd_DL_shadow,
+        ccl_global int *type_sd,
+        ccl_global int *type_sd_DL_shadow,
 
-	ccl_global float *u_sd,
-	ccl_global float *u_sd_DL_shadow,
+        ccl_global float *u_sd,
+        ccl_global float *u_sd_DL_shadow,
 
-	ccl_global float *v_sd,
-	ccl_global float *v_sd_DL_shadow,
+        ccl_global float *v_sd,
+        ccl_global float *v_sd_DL_shadow,
 
-	ccl_global int *object_sd,
-	ccl_global int *object_sd_DL_shadow,
+        ccl_global int *object_sd,
+        ccl_global int *object_sd_DL_shadow,
 
-	ccl_global float *time_sd,
-	ccl_global float *time_sd_DL_shadow,
+        ccl_global float *time_sd,
+        ccl_global float *time_sd_DL_shadow,
 
-	ccl_global float *ray_length_sd,
-	ccl_global float *ray_length_sd_DL_shadow,
+        ccl_global float *ray_length_sd,
+        ccl_global float *ray_length_sd_DL_shadow,
 
-	ccl_global int *ray_depth_sd,
-	ccl_global int *ray_depth_sd_DL_shadow,
+        ccl_global int *ray_depth_sd,
+        ccl_global int *ray_depth_sd_DL_shadow,
 
-	ccl_global int *transparent_depth_sd,
-	ccl_global int *transparent_depth_sd_DL_shadow,
+        ccl_global int *transparent_depth_sd,
+        ccl_global int *transparent_depth_sd_DL_shadow,
 
-	/* Ray differentials. */
-	ccl_global differential3 *dP_sd,
-	ccl_global differential3 *dP_sd_DL_shadow,
+        /* Ray differentials. */
+        ccl_global differential3 *dP_sd,
+        ccl_global differential3 *dP_sd_DL_shadow,
 
-	ccl_global differential3 *dI_sd,
-	ccl_global differential3 *dI_sd_DL_shadow,
+        ccl_global differential3 *dI_sd,
+        ccl_global differential3 *dI_sd_DL_shadow,
 
-	ccl_global differential *du_sd,
-	ccl_global differential *du_sd_DL_shadow,
+        ccl_global differential *du_sd,
+        ccl_global differential *du_sd_DL_shadow,
 
-	ccl_global differential *dv_sd,
-	ccl_global differential *dv_sd_DL_shadow,
+        ccl_global differential *dv_sd,
+        ccl_global differential *dv_sd_DL_shadow,
 
-	/* Dp/Du */
-	ccl_global float3 *dPdu_sd,
-	ccl_global float3 *dPdu_sd_DL_shadow,
+        /* Dp/Du */
+        ccl_global float3 *dPdu_sd,
+        ccl_global float3 *dPdu_sd_DL_shadow,
 
-	ccl_global float3 *dPdv_sd,
-	ccl_global float3 *dPdv_sd_DL_shadow,
+        ccl_global float3 *dPdv_sd,
+        ccl_global float3 *dPdv_sd_DL_shadow,
 
-	/* Object motion. */
-	ccl_global Transform *ob_tfm_sd,
-	ccl_global Transform *ob_tfm_sd_DL_shadow,
+        /* Object motion. */
+        ccl_global Transform *ob_tfm_sd,
+        ccl_global Transform *ob_tfm_sd_DL_shadow,
 
-	ccl_global Transform *ob_itfm_sd,
-	ccl_global Transform *ob_itfm_sd_DL_shadow,
+        ccl_global Transform *ob_itfm_sd,
+        ccl_global Transform *ob_itfm_sd_DL_shadow,
 
-	ShaderClosure *closure_sd,
-	ShaderClosure *closure_sd_DL_shadow,
+        ShaderClosure *closure_sd,
+        ShaderClosure *closure_sd_DL_shadow,
 
-	ccl_global int *num_closure_sd,
-	ccl_global int *num_closure_sd_DL_shadow,
+        ccl_global int *num_closure_sd,
+        ccl_global int *num_closure_sd_DL_shadow,
 
-	ccl_global float *randb_closure_sd,
-	ccl_global float *randb_closure_sd_DL_shadow,
+        ccl_global float *randb_closure_sd,
+        ccl_global float *randb_closure_sd_DL_shadow,
 
-	ccl_global float3 *ray_P_sd,
-	ccl_global float3 *ray_P_sd_DL_shadow,
+        ccl_global float3 *ray_P_sd,
+        ccl_global float3 *ray_P_sd_DL_shadow,
 
-	ccl_global differential3 *ray_dP_sd,
-	ccl_global differential3 *ray_dP_sd_DL_shadow,
+        ccl_global differential3 *ray_dP_sd,
+        ccl_global differential3 *ray_dP_sd_DL_shadow,
 
-	ccl_constant KernelData *data,
-	ccl_global float *per_sample_output_buffers,
-	ccl_global uint *rng_state,
-	ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
-	ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
-	ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
-	PathRadiance *PathRadiance_coop,  /* PathRadiance array to store PathRadiance values for all rays */
-	ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
-	ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
-	ccl_global char *ray_state,                  /* Stores information on current state of a ray */
+        ccl_constant KernelData *data,
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_state,
+        ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
+        ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
+        ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
+        PathRadiance *PathRadiance_coop,             /* PathRadiance array to store PathRadiance values for all rays */
+        ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
+        ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
+        ccl_global char *ray_state,                  /* Stores information on current state of a ray */
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
+#define KERNEL_TEX(type, ttype, name)                                   \
+        ccl_global type *name,
 #include "../../kernel_textures.h"
 
-	int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
-	int rng_state_offset_x,
-	int rng_state_offset_y,
-	int rng_state_stride,
-	ccl_global int *Queue_data,                  /* Memory for queues */
-	ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
-	int queuesize,                               /* size (capacity) of the queue */
-	ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
-	ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
+        int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
+        int rng_state_offset_x,
+        int rng_state_offset_y,
+        int rng_state_stride,
+        ccl_global int *Queue_data,                  /* Memory for queues */
+        ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
+        int queuesize,                               /* size (capacity) of the queue */
+        ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
+        ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
 #ifdef __WORK_STEALING__
-	ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
-	unsigned int num_samples,                    /* Total number of samples per pixel */
+        ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
+        unsigned int num_samples,                    /* Total number of samples per pixel */
 #endif
 #ifdef __KERNEL_DEBUG__
-	DebugData *debugdata_coop,
+        DebugData *debugdata_coop,
 #endif
-	int parallel_samples                         /* Number of samples to be processed in parallel */
-	)
+        int parallel_samples)                        /* Number of samples to be processed in parallel */
 {
 	kernel_data_init(globals,
 	                 shader_data_sd,
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
index 0b22c6d0864..6ec75013b3a 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
@@ -31,17 +31,60 @@ __kernel void kernel_ocl_path_trace_direct_lighting(
         ccl_global int *Queue_index,            /* Tracks the number of elements in each queue */
         int queuesize)                          /* Size (capacity) of each queue */
 {
-	kernel_direct_lighting(globals,
-	                       data,
-	                       shader_data,
-	                       shader_DL,
-	                       rng_coop,
-	                       PathState_coop,
-	                       ISLamp_coop,
-	                       LightRay_coop,
-	                       BSDFEval_coop,
-	                       ray_state,
-	                       Queue_data,
-	                       Queue_index,
-	                       queuesize);
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	char enqueue_flag = 0;
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices,
+	 * expect all threads to execute barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		enqueue_flag = kernel_direct_lighting(globals,
+		                                      data,
+		                                      shader_data,
+		                                      shader_DL,
+		                                      rng_coop,
+		                                      PathState_coop,
+		                                      ISLamp_coop,
+		                                      LightRay_coop,
+		                                      BSDFEval_coop,
+		                                      ray_state,
+		                                      ray_index);
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+#ifdef __EMISSION__
+	/* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
+#endif
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
index 502f10a7a59..ae5f5cd1b3b 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
@@ -41,27 +41,84 @@ __kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao
 #endif
         int parallel_samples)                  /* Number of samples to be processed in parallel */
 {
-	kernel_holdout_emission_blurring_pathtermination_ao(globals,
-	                                                    data,
-	                                                    shader_data,
-	                                                    per_sample_output_buffers,
-	                                                    rng_coop,
-	                                                    throughput_coop,
-	                                                    L_transparent_coop,
-	                                                    PathRadiance_coop,
-	                                                    PathState_coop,
-	                                                    Intersection_coop,
-	                                                    AOAlpha_coop,
-	                                                    AOBSDF_coop,
-	                                                    AOLightRay_coop,
-	                                                    sw, sh, sx, sy, stride,
-	                                                    ray_state,
-	                                                    work_array,
-	                                                    Queue_data,
-	                                                    Queue_index,
-	                                                    queuesize,
+	ccl_local unsigned int local_queue_atomics_bg;
+	ccl_local unsigned int local_queue_atomics_ao;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics_bg = 0;
+		local_queue_atomics_ao = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	char enqueue_flag = 0;
+	char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices,
+	 * expect all threads to execute barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif  /* __COMPUTE_DEVICE_GPU__ */
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		kernel_holdout_emission_blurring_pathtermination_ao(
+		        globals,
+		        data,
+		        shader_data,
+		        per_sample_output_buffers,
+		        rng_coop,
+		        throughput_coop,
+		        L_transparent_coop,
+		        PathRadiance_coop,
+		        PathState_coop,
+		        Intersection_coop,
+		        AOAlpha_coop,
+		        AOBSDF_coop,
+		        AOLightRay_coop,
+		        sw, sh, sx, sy, stride,
+		        ray_state,
+		        work_array,
 #ifdef __WORK_STEALING__
-	                                                    start_sample,
+		        start_sample,
+#endif
+		        parallel_samples,
+		        ray_index,
+		        &enqueue_flag,
+		        &enqueue_flag_AO_SHADOW_RAY_CAST);
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_UPDATE_BUFFER rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics_bg,
+	                        Queue_data,
+	                        Queue_index);
+
+#ifdef __AO__
+	/* Enqueue to-shadow-ray-cast rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_SHADOW_RAY_CAST_AO_RAYS,
+	                        enqueue_flag_AO_SHADOW_RAY_CAST,
+	                        queuesize,
+	                        &local_queue_atomics_ao,
+	                        Queue_data,
+	                        Queue_index);
 #endif
-	                                                    parallel_samples);
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
index af83e68b53e..1bc7808d834 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
@@ -17,23 +17,57 @@
 #include "split/kernel_lamp_emission.h"
 
 __kernel void kernel_ocl_path_trace_lamp_emission(
-	ccl_global char *globals,
-	ccl_constant KernelData *data,
-	ccl_global char *shader_data,               /* Required for lamp emission */
-	ccl_global float3 *throughput_coop,         /* Required for lamp emission */
-	PathRadiance *PathRadiance_coop, /* Required for lamp emission */
-	ccl_global Ray *Ray_coop,                   /* Required for lamp emission */
-	ccl_global PathState *PathState_coop,       /* Required for lamp emission */
-	Intersection *Intersection_coop, /* Required for lamp emission */
-	ccl_global char *ray_state,                 /* Denotes the state of each ray */
-	int sw, int sh,
-	ccl_global int *Queue_data,                 /* Memory for queues */
-	ccl_global int *Queue_index,                /* Tracks the number of elements in queues */
-	int queuesize,                              /* Size (capacity) of queues */
-	ccl_global char *use_queues_flag,           /* used to decide if this kernel should use queues to fetch ray index */
-	int parallel_samples                        /* Number of samples to be processed in parallel */
-	)
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Required for lamp emission */
+        ccl_global float3 *throughput_coop,    /* Required for lamp emission */
+        PathRadiance *PathRadiance_coop,       /* Required for lamp emission */
+        ccl_global Ray *Ray_coop,              /* Required for lamp emission */
+        ccl_global PathState *PathState_coop,  /* Required for lamp emission */
+        Intersection *Intersection_coop,       /* Required for lamp emission */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        int sw, int sh,
+        ccl_global int *Queue_data,            /* Memory for queues */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in queues */
+        int queuesize,                         /* Size (capacity) of queues */
+        ccl_global char *use_queues_flag,      /* Used to decide if this kernel should use
+                                                * queues to fetch ray index
+                                                */
+        int parallel_samples)                  /* Number of samples to be processed in parallel */
 {
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+
+	/* We will empty this queue in this kernel. */
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+	}
+	/* Fetch use_queues_flag. */
+	ccl_local char local_use_queues_flag;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_use_queues_flag = use_queues_flag[0];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index;
+	if(local_use_queues_flag) {
+		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+		ray_index = get_ray_index(thread_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          Queue_data,
+		                          queuesize,
+		                          1);
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	} else {
+		if(x < (sw * parallel_samples) && y < sh){
+			ray_index = x + y * (sw * parallel_samples);
+		} else {
+			return;
+		}
+	}
+
 	kernel_lamp_emission(globals,
 	                     data,
 	                     shader_data,
@@ -44,9 +78,7 @@ __kernel void kernel_ocl_path_trace_lamp_emission(
 	                     Intersection_coop,
 	                     ray_state,
 	                     sw, sh,
-	                     Queue_data,
-	                     Queue_index,
-	                     queuesize,
 	                     use_queues_flag,
-	                     parallel_samples);
+	                     parallel_samples,
+	                     ray_index);
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
index 4acd991f0b4..dcf4db40411 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
@@ -35,25 +35,81 @@ __kernel void kernel_ocl_path_trace_next_iteration_setup(
         ccl_global int *Queue_data,           /* Queue memory */
         ccl_global int *Queue_index,          /* Tracks the number of elements in each queue */
         int queuesize,                        /* Size (capacity) of each queue */
-        ccl_global char *use_queues_flag)      /* flag to decide if scene_intersect kernel should use queues to fetch ray index */
+        ccl_global char *use_queues_flag)     /* flag to decide if scene_intersect kernel should
+                                               * use queues to fetch ray index */
 {
-	kernel_next_iteration_setup(globals,
-	                            data,
-	                            shader_data,
-	                            rng_coop,
-	                            throughput_coop,
-	                            PathRadiance_coop,
-	                            Ray_coop,
-	                            PathState_coop,
-	                            LightRay_dl_coop,
-	                            ISLamp_coop,
-	                            BSDFEval_coop,
-	                            LightRay_ao_coop,
-	                            AOBSDF_coop,
-	                            AOAlpha_coop,
-	                            ray_state,
-	                            Queue_data,
-	                            Queue_index,
-	                            queuesize,
-	                            use_queues_flag);
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		/* If we are here, then it means that scene-intersect kernel
+		* has already been executed atleast once. From the next time,
+		* scene-intersect kernel may operate on queues to fetch ray index
+		*/
+		use_queues_flag[0] = 1;
+
+		/* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and
+		 * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the
+		 * previous kernel.
+		 */
+		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+	}
+
+	char enqueue_flag = 0;
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices,
+	 * expect all threads to execute barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		enqueue_flag = kernel_next_iteration_setup(globals,
+		                                           data,
+		                                           shader_data,
+		                                           rng_coop,
+		                                           throughput_coop,
+		                                           PathRadiance_coop,
+		                                           Ray_coop,
+		                                           PathState_coop,
+		                                           LightRay_dl_coop,
+		                                           ISLamp_coop,
+		                                           BSDFEval_coop,
+		                                           LightRay_ao_coop,
+		                                           AOBSDF_coop,
+		                                           AOAlpha_coop,
+		                                           ray_state,
+		                                           use_queues_flag,
+		                                           ray_index);
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_UPDATE_BUFFER rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
index 62cf08c387d..3156dc255fb 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
@@ -14,16 +14,93 @@
  * limitations under the License.
  */
 
-#include "split/kernel_queue_enqueue.h"
+#include "../../kernel_compat_opencl.h"
+#include "../../kernel_math.h"
+#include "../../kernel_types.h"
+#include "../../kernel_globals.h"
+#include "../../kernel_queues.h"
 
+/*
+ * The kernel "kernel_queue_enqueue" enqueues rays of
+ * different ray state into their appropriate Queues;
+ * 1. Rays that have been determined to hit the background from the
+ * "kernel_scene_intersect" kernel
+ * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+ * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * The input and output of the kernel is as follows,
+ *
+ * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                           |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                           |
+ * queuesize -------------------------------------------|                           |
+ *
+ * Note on Queues :
+ * State of queues during the first time this kernel is called :
+ * At entry,
+ * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
+ * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays.
+ *
+ * State of queue during other times this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
+ */
 __kernel void kernel_ocl_path_trace_queue_enqueue(
         ccl_global int *Queue_data,   /* Queue memory */
         ccl_global int *Queue_index,  /* Tracks the number of elements in each queue */
         ccl_global char *ray_state,   /* Denotes the state of each ray */
         int queuesize)                /* Size (capacity) of each queue */
 {
-	kernel_queue_enqueue(Queue_data,
-	                     Queue_index,
-	                     ray_state,
-	                     queuesize);
+	/* We have only 2 cases (Hit/Not-Hit) */
+	ccl_local unsigned int local_queue_atomics[2];
+
+	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+
+	if(lidx < 2 ) {
+		local_queue_atomics[lidx] = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int queue_number = -1;
+
+	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+		queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+	}
+	else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	}
+
+	unsigned int my_lqidx;
+	if(queue_number != -1) {
+		my_lqidx = get_local_queue_index(queue_number, local_queue_atomics);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(lidx == 0) {
+		local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] =
+		        get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                                    local_queue_atomics,
+		                                    Queue_index);
+		local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] =
+		        get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+		                                    local_queue_atomics,
+		                                    Queue_index);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	unsigned int my_gqidx;
+	if(queue_number != -1) {
+		my_gqidx = get_global_queue_index(queue_number,
+		                                  queuesize,
+		                                  my_lqidx,
+		                                  local_queue_atomics);
+		Queue_data[my_gqidx] = ray_index;
+	}
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
index d219874d391..e5fad7bce50 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
@@ -28,12 +28,43 @@ __kernel void kernel_ocl_path_trace_scene_intersect(
         ccl_global int *Queue_data,            /* Memory for queues */
         ccl_global int *Queue_index,           /* Tracks the number of elements in queues */
         int queuesize,                         /* Size (capacity) of queues */
-        ccl_global char *use_queues_flag,      /* used to decide if this kernel should use queues to fetch ray index */
+        ccl_global char *use_queues_flag,      /* used to decide if this kernel should use
+                                                * queues to fetch ray index */
 #ifdef __KERNEL_DEBUG__
         DebugData *debugdata_coop,
 #endif
         int parallel_samples)                  /* Number of samples to be processed in parallel */
 {
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+
+	/* Fetch use_queues_flag */
+	ccl_local char local_use_queues_flag;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_use_queues_flag = use_queues_flag[0];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index;
+	if(local_use_queues_flag) {
+		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+		ray_index = get_ray_index(thread_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          Queue_data,
+		                          queuesize,
+		                          0);
+
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	} else {
+		if(x < (sw * parallel_samples) && y < sh){
+			ray_index = x + y * (sw * parallel_samples);
+		} else {
+			return;
+		}
+	}
+
 	kernel_scene_intersect(globals,
 	                       data,
 	                       rng_coop,
@@ -42,12 +73,10 @@ __kernel void kernel_ocl_path_trace_scene_intersect(
 	                       Intersection_coop,
 	                       ray_state,
 	                       sw, sh,
-	                       Queue_data,
-	                       Queue_index,
-	                       queuesize,
 	                       use_queues_flag,
 #ifdef __KERNEL_DEBUG__
 	                       debugdata_coop,
 #endif
-	                       parallel_samples);
+	                       parallel_samples,
+	                       ray_index);
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
index 04769d7d792..b9f616e6bdf 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
@@ -29,6 +29,34 @@ __kernel void kernel_ocl_path_trace_shader_eval(
         ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
         int queuesize)                         /* Size (capacity) of each queue */
 {
+	/* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          0);
+
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+
+	char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
+
+	/* Continue on with shader evaluation. */
 	kernel_shader_eval(globals,
 	                   data,
 	                   shader_data,
@@ -37,7 +65,5 @@ __kernel void kernel_ocl_path_trace_shader_eval(
 	                   PathState_coop,
 	                   Intersection_coop,
 	                   ray_state,
-	                   Queue_data,
-	                   Queue_index,
-	                   queuesize);
+	                   ray_index);
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
index 9d57364c8d6..03886c0a030 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
@@ -31,6 +31,43 @@ __kernel void kernel_ocl_path_trace_shadow_blocked(
         int queuesize,                         /* Size (capacity) of each queue */
         int total_num_rays)
 {
+#if 0
+	/* We will make the Queue_index entries '0' in the next kernel. */
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		/* We empty this queue here */
+		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+	}
+#endif
+
+	int lidx = get_local_id(1) * get_local_id(0) + get_local_id(0);
+
+	ccl_local unsigned int ao_queue_length;
+	ccl_local unsigned int dl_queue_length;
+	if(lidx == 0) {
+		ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
+		dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* flag determining if the current ray is to process shadow ray for AO or DL */
+	char shadow_blocked_type = -1;
+
+	int ray_index = QUEUE_EMPTY_SLOT;
+	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	if(thread_index < ao_queue_length + dl_queue_length) {
+		if(thread_index < ao_queue_length) {
+			ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1);
+			shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO;
+		} else {
+			ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1);
+			shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL;
+		}
+	}
+
+	if(ray_index == QUEUE_EMPTY_SLOT)
+		return;
+
 	kernel_shadow_blocked(globals,
 	                      data,
 	                      shader_shadow,
@@ -40,8 +77,7 @@ __kernel void kernel_ocl_path_trace_shadow_blocked(
 	                      Intersection_coop_AO,
 	                      Intersection_coop_DL,
 	                      ray_state,
-	                      Queue_data,
-	                      Queue_index,
-	                      queuesize,
-	                      total_num_rays);
+	                      total_num_rays,
+	                      shadow_blocked_type,
+	                      ray_index);
 }
author	Sergey Sharybin <sergey.vfx@gmail.com>	2015-05-26 17:12:49 +0300
committer	Sergey Sharybin <sergey.vfx@gmail.com>	2015-05-26 20:54:02 +0300
commit	84ad20acef4c0db91c9a850e81c7dc0a57aef42a (patch)
tree	c789ed8b455b6870ea12b87a2dc7ed3c28d77102 /intern/cycles/kernel/kernels
parent	4ffcc6ff56b60d1cc69e12a80c9c2cacd604688f (diff)