git.blender.org/blender.git

author     Sergey Sharybin <sergey.vfx@gmail.com>  2015-05-26 17:12:49 +0300
committer  Sergey Sharybin <sergey.vfx@gmail.com>  2015-05-26 20:54:02 +0300
commit     84ad20acef4c0db91c9a850e81c7dc0a57aef42a (patch)
tree       c789ed8b455b6870ea12b87a2dc7ed3c28d77102 /intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
parent     4ffcc6ff56b60d1cc69e12a80c9c2cacd604688f (diff)
Fix T44833: Can't use ccl_local space in non-kernel functions
This commit re-shuffles code in the split kernel once again, so that the common parts living in the headers are only responsible for doing the work needed for a specified ray index. Getting the ray index, checking its validity, and enqueuing tasks now happen in the device-specific part of the kernel. This makes sense because enqueuing is indeed device-specific; e.g. with CUDA we'll want to enqueue kernels from within the kernel and avoid a CPU roundtrip.

TODO:

- Kernel comments are still placed in the common header files, but since queue-related stuff is no longer passed to those functions, those comments might need to be split as well. For now, read them keeping in mind that they also cover the way all devices invoke the common code path.

- Arguments might need to be wrapped into KernelGlobals, so we don't need to pass them all around as function arguments.
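To make the new division of labour concrete, here is a hedged sketch (not part of this commit) of what a device-specific OpenCL wrapper could look like after this change: the wrapper owns the ccl_local queue counters, computes and validates the ray index, calls the common ccl_device function from the header for that one ray, and then performs the enqueueing itself. The wrapper name and the elided argument lists are illustrative assumptions; the helpers and queue constants (get_ray_index, enqueue_ray_index_local, QUEUE_ACTIVE_AND_REGENERATED_RAYS, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, QUEUE_SHADOW_RAY_CAST_AO_RAYS) are the ones visible in the hunk removed below.

/* Illustrative sketch only: the wrapper name and the elided arguments are
 * assumptions, not part of this diff. */
__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
        ccl_global char *globals,
        ccl_constant KernelData *data,
        /* ... all other arguments of the common function ... */
        ccl_global int *Queue_data,   /* Queue memory */
        ccl_global int *Queue_index,  /* Tracks the number of elements in each queue */
        int queuesize)                /* Size (capacity) of each queue */
{
        /* ccl_local state now lives in the kernel proper, which is what
         * fixes T44833 (ccl_local in non-kernel functions). */
        ccl_local unsigned int local_queue_atomics_bg;
        ccl_local unsigned int local_queue_atomics_ao;
        if(get_local_id(0) == 0 && get_local_id(1) == 0) {
                local_queue_atomics_bg = 0;
                local_queue_atomics_ao = 0;
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        char enqueue_flag = 0;
        char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
        int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
        ray_index = get_ray_index(ray_index,
                                  QUEUE_ACTIVE_AND_REGENERATED_RAYS,
                                  Queue_data,
                                  queuesize,
                                  0);

        if(ray_index != QUEUE_EMPTY_SLOT) {
                /* Device-agnostic work for this single ray index. */
                kernel_holdout_emission_blurring_pathtermination_ao(
                        globals, data,
                        /* ... remaining shared arguments ... */
                        ray_index,
                        &enqueue_flag,
                        &enqueue_flag_AO_SHADOW_RAY_CAST);
        }

        /* Enqueueing is now a device-specific responsibility. */
        enqueue_ray_index_local(ray_index,
                                QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
                                enqueue_flag,
                                queuesize,
                                &local_queue_atomics_bg,
                                Queue_data,
                                Queue_index);
#ifdef __AO__
        enqueue_ray_index_local(ray_index,
                                QUEUE_SHADOW_RAY_CAST_AO_RAYS,
                                enqueue_flag_AO_SHADOW_RAY_CAST,
                                queuesize,
                                &local_queue_atomics_ao,
                                Queue_data,
                                Queue_index);
#endif
}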
Diffstat (limited to 'intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h')
-rw-r--r--  intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h | 297
1 file changed, 129 insertions(+), 168 deletions(-)
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 174070ad5bb..8a7c4e11dcf 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -16,8 +16,7 @@
#include "kernel_split_common.h"
-/*
- * Note on kernel_holdout_emission_blurring_pathtermination_ao kernel.
+/* Note on kernel_holdout_emission_blurring_pathtermination_ao kernel.
* This is the sixth kernel in the ray tracing logic. This is the fifth
* of the path iteration kernels. This kernel takes care of the logic to process
* "material of type holdout", indirect primitive emission, bsdf blurring,
@@ -71,213 +70,175 @@
* QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
* QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO
*/
-
-__kernel void kernel_holdout_emission_blurring_pathtermination_ao(
- ccl_global char *globals,
- ccl_constant KernelData *data,
- ccl_global char *shader_data, /* Required throughout the kernel except probabilistic path termination and AO */
- ccl_global float *per_sample_output_buffers,
- ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */
- ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */
- ccl_global float *L_transparent_coop, /* Required for handling holdout material */
- PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */
- ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */
- Intersection *Intersection_coop, /* Required for indirect primitive emission */
- ccl_global float3 *AOAlpha_coop, /* Required for AO */
- ccl_global float3 *AOBSDF_coop, /* Required for AO */
- ccl_global Ray *AOLightRay_coop, /* Required for AO */
- int sw, int sh, int sx, int sy, int stride,
- ccl_global char *ray_state, /* Denotes the state of each ray */
- ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */
- ccl_global int *Queue_data, /* Queue memory */
- ccl_global int *Queue_index, /* Tracks the number of elements in each queue */
- int queuesize, /* Size (capacity) of each queue */
+ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
+ ccl_global char *globals,
+ ccl_constant KernelData *data,
+ ccl_global char *shader_data, /* Required throughout the kernel except probabilistic path termination and AO */
+ ccl_global float *per_sample_output_buffers,
+ ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */
+ ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */
+ ccl_global float *L_transparent_coop, /* Required for handling holdout material */
+ PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */
+ ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */
+ Intersection *Intersection_coop, /* Required for indirect primitive emission */
+ ccl_global float3 *AOAlpha_coop, /* Required for AO */
+ ccl_global float3 *AOBSDF_coop, /* Required for AO */
+ ccl_global Ray *AOLightRay_coop, /* Required for AO */
+ int sw, int sh, int sx, int sy, int stride,
+ ccl_global char *ray_state, /* Denotes the state of each ray */
+ ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */
#ifdef __WORK_STEALING__
- unsigned int start_sample,
+ unsigned int start_sample,
#endif
- int parallel_samples /* Number of samples to be processed in parallel */
- )
+ int parallel_samples, /* Number of samples to be processed in parallel */
+ int ray_index,
+ char *enqueue_flag,
+ char *enqueue_flag_AO_SHADOW_RAY_CAST)
{
- ccl_local unsigned int local_queue_atomics_bg;
- ccl_local unsigned int local_queue_atomics_ao;
- if(get_local_id(0) == 0 && get_local_id(1) == 0) {
- local_queue_atomics_bg = 0;
- local_queue_atomics_ao = 0;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- char enqueue_flag = 0;
- char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
- int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
- ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
- /* If we are executing on a GPU device, we exit all threads that are not required
- * If we are executing on a CPU device, then we need to keep all threads active
- * since we have barrier() calls later in the kernel. CPU devices
- * expect all threads to execute barrier statement.
- */
- if(ray_index == QUEUE_EMPTY_SLOT)
- return;
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
- if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
- /* Load kernel globals structure and ShaderData structure */
- KernelGlobals *kg = (KernelGlobals *)globals;
- ShaderData *sd = (ShaderData *)shader_data;
+ /* Load kernel globals structure and ShaderData structure */
+ KernelGlobals *kg = (KernelGlobals *)globals;
+ ShaderData *sd = (ShaderData *)shader_data;
#ifdef __WORK_STEALING__
- unsigned int my_work;
- unsigned int pixel_x;
- unsigned int pixel_y;
+ unsigned int my_work;
+ unsigned int pixel_x;
+ unsigned int pixel_y;
#endif
- unsigned int tile_x;
- unsigned int tile_y;
- int my_sample_tile;
- unsigned int sample;
+ unsigned int tile_x;
+ unsigned int tile_y;
+ int my_sample_tile;
+ unsigned int sample;
- ccl_global RNG *rng = 0x0;
- ccl_global PathState *state = 0x0;
- float3 throughput;
+ ccl_global RNG *rng = 0x0;
+ ccl_global PathState *state = 0x0;
+ float3 throughput;
- if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- throughput = throughput_coop[ray_index];
- state = &PathState_coop[ray_index];
- rng = &rng_coop[ray_index];
+ throughput = throughput_coop[ray_index];
+ state = &PathState_coop[ray_index];
+ rng = &rng_coop[ray_index];
#ifdef __WORK_STEALING__
- my_work = work_array[ray_index];
- sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
- get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
- my_sample_tile = 0;
+ my_work = work_array[ray_index];
+ sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+ get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
+ my_sample_tile = 0;
#else // __WORK_STEALING__
- sample = work_array[ray_index];
- /* buffer's stride is "stride"; Find x and y using ray_index */
- int tile_index = ray_index / parallel_samples;
- tile_x = tile_index % sw;
- tile_y = tile_index / sw;
- my_sample_tile = ray_index - (tile_index * parallel_samples);
+ sample = work_array[ray_index];
+ /* buffer's stride is "stride"; Find x and y using ray_index */
+ int tile_index = ray_index / parallel_samples;
+ tile_x = tile_index % sw;
+ tile_y = tile_index / sw;
+ my_sample_tile = ray_index - (tile_index * parallel_samples);
#endif // __WORK_STEALING__
- per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
+ per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
- /* holdout */
+ /* holdout */
#ifdef __HOLDOUT__
- if((ccl_fetch(sd, flag) & (SD_HOLDOUT|SD_HOLDOUT_MASK)) && (state->flag & PATH_RAY_CAMERA)) {
- if(kernel_data.background.transparent) {
- float3 holdout_weight;
+ if((ccl_fetch(sd, flag) & (SD_HOLDOUT|SD_HOLDOUT_MASK)) && (state->flag & PATH_RAY_CAMERA)) {
+ if(kernel_data.background.transparent) {
+ float3 holdout_weight;
- if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK)
- holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
- else
- holdout_weight = shader_holdout_eval(kg, sd);
+ if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK)
+ holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
+ else
+ holdout_weight = shader_holdout_eval(kg, sd);
- /* any throughput is ok, should all be identical here */
- L_transparent_coop[ray_index] += average(holdout_weight*throughput);
- }
+ /* any throughput is ok, should all be identical here */
+ L_transparent_coop[ray_index] += average(holdout_weight*throughput);
+ }
- if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- enqueue_flag = 1;
- }
+ if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ *enqueue_flag = 1;
}
-#endif
}
+#endif
+ }
- if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- PathRadiance *L = &PathRadiance_coop[ray_index];
- /* holdout mask objects do not write data passes */
- kernel_write_data_passes(kg, per_sample_output_buffers, L, sd, sample, state, throughput);
+ PathRadiance *L = &PathRadiance_coop[ray_index];
+ /* holdout mask objects do not write data passes */
+ kernel_write_data_passes(kg, per_sample_output_buffers, L, sd, sample, state, throughput);
- /* blurring of bsdf after bounces, for rays that have a small likelihood
- * of following this particular path (diffuse, rough glossy) */
- if(kernel_data.integrator.filter_glossy != FLT_MAX) {
- float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
+ /* blurring of bsdf after bounces, for rays that have a small likelihood
+ * of following this particular path (diffuse, rough glossy) */
+ if(kernel_data.integrator.filter_glossy != FLT_MAX) {
+ float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
- if(blur_pdf < 1.0f) {
- float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
- shader_bsdf_blur(kg, sd, blur_roughness);
- }
+ if(blur_pdf < 1.0f) {
+ float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
+ shader_bsdf_blur(kg, sd, blur_roughness);
}
+ }
#ifdef __EMISSION__
- /* emission */
- if(ccl_fetch(sd, flag) & SD_EMISSION) {
- /* todo: is isect.t wrong here for transparent surfaces? */
- float3 emission = indirect_primitive_emission(kg, sd, Intersection_coop[ray_index].t, state->flag, state->ray_pdf);
- path_radiance_accum_emission(L, throughput, emission, state->bounce);
- }
+ /* emission */
+ if(ccl_fetch(sd, flag) & SD_EMISSION) {
+ /* todo: is isect.t wrong here for transparent surfaces? */
+ float3 emission = indirect_primitive_emission(kg, sd, Intersection_coop[ray_index].t, state->flag, state->ray_pdf);
+ path_radiance_accum_emission(L, throughput, emission, state->bounce);
+ }
#endif
- /* path termination. this is a strange place to put the termination, it's
- * mainly due to the mixed in MIS that we use. gives too many unneeded
- * shader evaluations, only need emission if we are going to terminate */
- float probability = path_state_terminate_probability(kg, state, throughput);
+ /* path termination. this is a strange place to put the termination, it's
+ * mainly due to the mixed in MIS that we use. gives too many unneeded
+ * shader evaluations, only need emission if we are going to terminate */
+ float probability = path_state_terminate_probability(kg, state, throughput);
- if(probability == 0.0f) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- enqueue_flag = 1;
- }
+ if(probability == 0.0f) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ *enqueue_flag = 1;
+ }
- if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- if(probability != 1.0f) {
- float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+ if(probability != 1.0f) {
+ float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
- if(terminate >= probability) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- enqueue_flag = 1;
- } else {
- throughput_coop[ray_index] = throughput/probability;
- }
+ if(terminate >= probability) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ *enqueue_flag = 1;
+ } else {
+ throughput_coop[ray_index] = throughput/probability;
}
}
}
+ }
#ifdef __AO__
- if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- /* ambient occlusion */
- if(kernel_data.integrator.use_ambient_occlusion || (ccl_fetch(sd, flag) & SD_AO)) {
- /* todo: solve correlation */
- float bsdf_u, bsdf_v;
- path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
- float ao_factor = kernel_data.background.ao_factor;
- float3 ao_N;
- AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
- AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd);
-
- float3 ao_D;
- float ao_pdf;
- sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
- if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
- Ray _ray;
- _ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
- _ray.D = ao_D;
- _ray.t = kernel_data.background.ao_distance;
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+ /* ambient occlusion */
+ if(kernel_data.integrator.use_ambient_occlusion || (ccl_fetch(sd, flag) & SD_AO)) {
+ /* todo: solve correlation */
+ float bsdf_u, bsdf_v;
+ path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+
+ float ao_factor = kernel_data.background.ao_factor;
+ float3 ao_N;
+ AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
+ AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd);
+
+ float3 ao_D;
+ float ao_pdf;
+ sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
+
+ if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+ Ray _ray;
+ _ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+ _ray.D = ao_D;
+ _ray.t = kernel_data.background.ao_distance;
#ifdef __OBJECT_MOTION__
- _ray.time = ccl_fetch(sd, time);
+ _ray.time = ccl_fetch(sd, time);
#endif
- _ray.dP = ccl_fetch(sd, dP);
- _ray.dD = differential3_zero();
- AOLightRay_coop[ray_index] = _ray;
+ _ray.dP = ccl_fetch(sd, dP);
+ _ray.dD = differential3_zero();
+ AOLightRay_coop[ray_index] = _ray;
- ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
- enqueue_flag_AO_SHADOW_RAY_CAST = 1;
- }
+ ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+ *enqueue_flag_AO_SHADOW_RAY_CAST = 1;
}
}
-#endif
-#ifndef __COMPUTE_DEVICE_GPU__
}
#endif
-
- /* Enqueue RAY_UPDATE_BUFFER rays */
- enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics_bg, Queue_data, Queue_index);
-#ifdef __AO__
- /* Enqueue to-shadow-ray-cast rays */
- enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, enqueue_flag_AO_SHADOW_RAY_CAST, queuesize, &local_queue_atomics_ao, Queue_data, Queue_index);
-#endif
}