/*
 * Copyright 2011-2017 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

CCL_NAMESPACE_BEGIN

#if defined(__BRANCHED_PATH__) && defined(__VOLUME__)

/* Prepare the ray at `ray_index` for branched volume indirect-light sampling.
 *
 * Runs the shared branched indirect loop initialisation for the ray and then
 * adds the RAY_BRANCHED_VOLUME_INDIRECT flag to its entry in the ray state
 * array.
 */
ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg, int ray_index)
{
	kernel_split_branched_path_indirect_loop_init(kg, ray_index);

	/* Tag the ray only after the common loop initialisation has run. */
	ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT);
}

/* Run (or resume) the branched volume sampling loop for the ray at `ray_index`.
 *
 * Starting at branched_state->next_sample, every volume sample restores the
 * path state, ray and throughput saved in the branched state, scales the
 * throughput by 1/volume_samples and integrates along the volume segment.
 *
 * Returns true when a scattered sample produced an indirect bounce that was
 * not handed over as a shared sample, or when all samples are consumed but
 * shared samples are still outstanding. Returns false after the loop has been
 * ended with kernel_split_branched_path_indirect_loop_end() and
 * kernel_volume_shadow() has been applied to the ray's throughput.
 */
ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg, int ray_index)
{
	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];

	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
	ShaderData *sd = &kernel_split_state.sd[ray_index];
	ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];

	/* Working slots of this ray; each sample overwrites them from the
	 * values saved in the branched state. */
	ccl_global PathState *cur_state = &kernel_split_state.path_state[ray_index];
	ccl_global Ray *cur_ray = &kernel_split_state.ray[ray_index];
	ccl_global float3 *cur_throughput = &kernel_split_state.throughput[ray_index];

	/* GPU: no decoupled ray marching, scatter probabilistically */
	const int num_samples = kernel_data.integrator.volume_samples;
	const float num_samples_inv = 1.0f/num_samples;

	/* The segment ends at the saved intersection distance, unless the saved
	 * ray state is RAY_HIT_BACKGROUND, in which case it is unbounded. */
	Ray volume_ray = branched_state->ray;
	if(IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) {
		volume_ray.t = FLT_MAX;
	}
	else {
		volume_ray.t = branched_state->isect.t;
	}

	bool heterogeneous = volume_stack_is_heterogeneous(kg, branched_state->path_state.volume_stack);

	for(int sample_index = branched_state->next_sample; sample_index < num_samples; sample_index++) {
		/* Restart from the state saved before branching. */
		*cur_state = branched_state->path_state;
		*cur_ray = branched_state->ray;
		*cur_throughput = branched_state->throughput * num_samples_inv;

		/* branch RNG state */
		path_state_branch(cur_state, sample_index, num_samples);

		/* integrate along volume segment with distance sampling */
		VolumeIntegrateResult result = kernel_volume_integrate(
			kg, cur_state, sd, &volume_ray, L, cur_throughput, heterogeneous);

#  ifdef __VOLUME_SCATTER__
		/* Only scattered samples need any further work. */
		if(result != VOLUME_PATH_SCATTERED) {
			continue;
		}

		/* direct lighting, evaluated with the saved (unbranched) path state */
		kernel_path_volume_connect_light(kg, sd, emission_sd, *cur_throughput, &branched_state->path_state, L);

		/* indirect light bounce; move on to the next sample if there is none */
		if(!kernel_path_volume_bounce(kg, sd, cur_throughput, cur_state, &L->state, cur_ray)) {
			continue;
		}

		/* start the indirect path, remembering where to resume afterwards */
		branched_state->next_closure = 0;
		branched_state->next_sample = sample_index + 1;

		/* Attempting to share too many samples is slow for volumes as it causes us to
		 * loop here more and have many calls to kernel_volume_integrate which evaluates
		 * shaders. The many expensive shader evaluations cause the work load to become
		 * unbalanced and many threads to become idle in this kernel. Limiting the
		 * number of shared samples here helps quite a lot.
		 */
		if(branched_state->shared_sample_count < 2 &&
		   kernel_split_branched_indirect_start_shared(kg, ray_index))
		{
			continue;
		}

		return true;
#  endif
	}

	/* Every sample has been taken. */
	branched_state->next_sample = num_samples;

	/* Keep the ray alive while shared samples are still outstanding. */
	branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
	if(branched_state->waiting_on_shared_samples) {
		return true;
	}

	kernel_split_branched_path_indirect_loop_end(kg, ray_index);

	/* todo: avoid this calculation using decoupled ray marching */
	/* The throughput is updated through a local copy and written back. */
	float3 segment_throughput = *cur_throughput;
	kernel_volume_shadow(kg, emission_sd, cur_state, &volume_ray, &segment_throughput);
	*cur_throughput = segment_throughput;

	return false;
}

#endif  /* __BRANCHED_PATH__ && __VOLUME__ */

/* Split-kernel volume stage, executed once per work-item.
 *
 * For a ray in state RAY_ACTIVE or RAY_HIT_BACKGROUND whose volume stack is
 * not empty, this integrates the volume along the ray segment. When a scatter
 * event yields a bounce the ray is switched to RAY_REGENERATED; when it does
 * not, the path is ended with kernel_split_path_end().
 *
 * With __BRANCHED_PATH__ enabled, rays of the branched integrator that are not
 * yet flagged RAY_BRANCHED_INDIRECT go through the branched volume sampling
 * loop instead, and a second pass resumes that loop for rays queued in
 * QUEUE_VOLUME_INDIRECT_ITER.
 *
 * The whole body is compiled out when __VOLUME__ is not defined.
 */
ccl_device void kernel_do_volume(KernelGlobals *kg)
{
#ifdef __VOLUME__
	/* We will empty this queue in this kernel. */
	/* Only the work-item with global id (0, 0) resets the queue counters. */
	if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
		kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
#  ifdef __BRANCHED_PATH__
		kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0;
#  endif  /* __BRANCHED_PATH__ */
	}

	/* Flatten the 2D global id into a linear index. */
	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);

	/* When queues are in use, the linear index selects a queue slot and the
	 * actual ray index is read from it.
	 * NOTE(review): the trailing 1 presumably asks get_ray_index() to clear
	 * the slot it reads - confirm against its definition. */
	if(*kernel_split_params.use_queues_flag) {
		ray_index = get_ray_index(kg, ray_index,
		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
		                          kernel_split_state.queue_data,
		                          kernel_split_params.queue_size,
		                          1);
	}

	ccl_global char *ray_state = kernel_split_state.ray_state;

	/* Only addresses are formed here; both are dereferenced solely inside the
	 * state-checked branch below. */
	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];

	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
	   IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
		ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
		ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
		ShaderData *sd = &kernel_split_state.sd[ray_index];
		ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];

		/* Of the two states accepted above, only RAY_ACTIVE counts as a hit. */
		bool hit = ! IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);

		/* Sanitize volume stack. */
		if(!hit) {
			kernel_volume_clean_stack(kg, state->volume_stack);
		}
		/* volume attenuation, emission, scatter */
		/* Skipped entirely when the first volume stack entry holds no shader. */
		if(state->volume_stack[0].shader != SHADER_NONE) {
			/* Bound the segment by the intersection distance on a hit,
			 * otherwise leave it unbounded. */
			Ray volume_ray = *ray;
			volume_ray.t = (hit)? isect->t: FLT_MAX;

#  ifdef __BRANCHED_PATH__
			/* Single-sample path: the integrator is not branched, or this
			 * ray already carries the RAY_BRANCHED_INDIRECT flag. */
			if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
#  endif  /* __BRANCHED_PATH__ */
				bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);

				{
					/* integrate along volume segment with distance sampling */
					VolumeIntegrateResult result = kernel_volume_integrate(
						kg, state, sd, &volume_ray, L, throughput, heterogeneous);

#  ifdef __VOLUME_SCATTER__
					if(result == VOLUME_PATH_SCATTERED) {
						/* direct lighting */
						kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);

						/* indirect light bounce */
						/* A successful bounce marks the ray RAY_REGENERATED;
						 * otherwise the path is ended. */
						if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) {
							ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
						}
						else {
							kernel_split_path_end(kg, ray_index);
						}
					}
#  endif  /* __VOLUME_SCATTER__ */
				}

#  ifdef __BRANCHED_PATH__
			}
			else {
				/* Branched path: set up the volume sampling loop and run
				 * its first iteration. */
				kernel_split_branched_path_volume_indirect_light_init(kg, ray_index);

				/* A true result switches the ray to RAY_REGENERATED. */
				if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
				}
			}
#  endif  /* __BRANCHED_PATH__ */
		}
	}

#  ifdef __BRANCHED_PATH__
	/* iter loop */
	/* Second pass: ray_index is reassigned from the QUEUE_VOLUME_INDIRECT_ITER
	 * slot for this work-item, so it may refer to a different ray than above. */
	ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
	                          QUEUE_VOLUME_INDIRECT_ITER,
	                          kernel_split_state.queue_data,
	                          kernel_split_params.queue_size,
	                          1);

	if(IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) {
		/* for render passes, sum and reset indirect light pass variables
		 * for the next samples */
		path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
		path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);

		/* Resume the branched volume sampling loop where it left off. */
		if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
		}
	}
#  endif  /* __BRANCHED_PATH__ */

#endif  /* __VOLUME__ */
}


CCL_NAMESPACE_END