1 files changed, 340 insertions, 167 deletions
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index faaa68e3309..ce20f20e75a 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -116,6 +116,36 @@ ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, VolumeStack *st
 	return false;
 }
 
+ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stack)
+{
+	if(kernel_data.integrator.num_all_lights == 0)
+		return 0;
+
+	int method = -1;
+
+	for(int i = 0; stack[i].shader != SHADER_NONE; i++) {
+		int shader_flag = kernel_tex_fetch(__shader_flag, (stack[i].shader & SHADER_MASK)*2);
+
+		if(shader_flag & SD_VOLUME_MIS) {
+			return SD_VOLUME_MIS;
+		}
+		else if(shader_flag & SD_VOLUME_EQUIANGULAR) {
+			if(method == 0)
+				return SD_VOLUME_MIS;
+
+			method = SD_VOLUME_EQUIANGULAR;
+		}
+		else {
+			if(method == SD_VOLUME_EQUIANGULAR)
+				return SD_VOLUME_MIS;
+
+			method = 0;
+		}
+	}
+
+	return method;
+}
+
 /* Volume Shadows
  *
  * These functions are used to attenuate shadow rays to lights. Both absorption
@@ -136,7 +166,7 @@ ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *s
 ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput)
 {
 	float3 tp = *throughput;
-	const float tp_eps = 1e-10f; /* todo: this is likely not the right value */
+	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
 
 	/* prepare for stepping */
 	int max_steps = kernel_data.integrator.volume_max_steps;
@@ -146,6 +176,8 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 	/* compute extinction at the start */
 	float t = 0.0f;
 
+	float3 sum = make_float3(0.0f, 0.0f, 0.0f);
+
 	for(int i = 0; i < max_steps; i++) {
 		/* advance to new position */
 		float new_t = min(ray->t, (i+1) * step);
@@ -160,20 +192,26 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 
 		/* compute attenuation over segment */
 		if(volume_shader_extinction_sample(kg, sd, state, new_P, &sigma_t)) {
-			/* todo: we could avoid computing expf() for each step by summing,
-			 * because exp(a)*exp(b) = exp(a+b), but we still want a quick
-			 * tp_eps check too */
-			tp *= volume_color_transmittance(sigma_t, new_t - t);
-
-			/* stop if nearly all light blocked */
-			if(tp.x < tp_eps && tp.y < tp_eps && tp.z < tp_eps)
-				break;
+			/* Compute expf() only for every Nth step, to save some calculations
+			 * because exp(a)*exp(b) = exp(a+b), also do a quick tp_eps check then. */
+
+			sum += (-sigma_t * (new_t - t));
+			if((i & 0x07) == 0) { /* ToDo: Other interval? */
+				tp = *throughput * make_float3(expf(sum.x), expf(sum.y), expf(sum.z));
+
+				/* stop if nearly all light is blocked */
+				if(tp.x < tp_eps && tp.y < tp_eps && tp.z < tp_eps)
+					break;
+			}
 		}
 
 		/* stop if at the end of the volume */
 		t = new_t;
-		if(t == ray->t)
+		if(t == ray->t) {
+			/* Update throughput in case we haven't done it above */
+			tp = *throughput * make_float3(expf(sum.x), expf(sum.y), expf(sum.z));
 			break;
+		}
 	}
 
 	*throughput = tp;
@@ -226,33 +264,6 @@ ccl_device float kernel_volume_equiangular_pdf(Ray *ray, float3 light_P, float s
 	return pdf;
 }
 
-ccl_device bool kernel_volume_equiangular_light_position(KernelGlobals *kg, PathState *state, Ray *ray, RNG *rng, float3 *light_P)
-{
-	/* light RNGs */
-	float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
-	float light_u, light_v;
-	path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-
-	/* light sample */
-	LightSample ls;
-	light_sample(kg, light_t, light_u, light_v, ray->time, ray->P, &ls);
-	if(ls.pdf == 0.0f)
-		return false;
-	
-	*light_P = ls.P;
-	return true;
-}
-
-ccl_device float kernel_volume_decoupled_equiangular_pdf(KernelGlobals *kg, PathState *state, Ray *ray, RNG *rng, float sample_t)
-{
-	float3 light_P;
-
-	if(!kernel_volume_equiangular_light_position(kg, state, ray, rng, &light_P))
-		return 0.0f;
-
-	return kernel_volume_equiangular_pdf(ray, light_P, sample_t);
-}
-
 /* Distance sampling */
 
 ccl_device float kernel_volume_distance_sample(float max_t, float3 sigma_t, int channel, float xi, float3 *transmittance, float3 *pdf)
@@ -312,7 +323,7 @@ ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coe
  * the volume shading coefficient for the entire line segment */
 ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGlobals *kg,
 	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput,
-	RNG *rng)
+	RNG *rng, bool probalistic_scatter)
 {
 	VolumeShaderCoefficients coeff;
 
@@ -323,6 +334,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 	float t = ray->t;
 	float3 new_tp;
 
+#ifdef __VOLUME_SCATTER__
 	/* randomly scatter, and if we do t is shortened */
 	if(closure_flag & SD_SCATTER) {
 		/* extinction coefficient */
@@ -330,43 +342,41 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 
 		/* pick random color channel, we use the Veach one-sample
 		 * model with balance heuristic for the channels */
-		float rphase = path_state_rng_1D(kg, rng, state, PRNG_PHASE);
+		float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
 		int channel = (int)(rphase*3.0f);
 		sd->randb_closure = rphase*3.0f - channel;
 
-		float xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE);
-
 		/* decide if we will hit or miss */
-		float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
-		float sample_transmittance = expf(-sample_sigma_t * t);
+		bool scatter = true;
+		float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+
+		if(probalistic_scatter) {
+			float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
+			float sample_transmittance = expf(-sample_sigma_t * t);
+
+			if(1.0f - xi >= sample_transmittance) {
+				scatter = true;
+
+				/* rescale random number so we can reuse it */
+				xi = 1.0f - (1.0f - xi - sample_transmittance)/(1.0f - sample_transmittance);
 
-		if(xi >= sample_transmittance) {
+			}
+			else
+				scatter = false;
+		}
+
+		if(scatter) {
 			/* scattering */
 			float3 pdf;
 			float3 transmittance;
 			float sample_t;
 
-			/* rescale random number so we can reuse it */
-			xi = (xi - sample_transmittance)/(1.0f - sample_transmittance);
-
-			if(kernel_data.integrator.volume_homogeneous_sampling == 0 || !kernel_data.integrator.num_all_lights) { 
-				/* distance sampling */
-				sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf);
-			}
-			else {
-				/* equiangular sampling */
-				float3 light_P;
-				float equi_pdf;
-				if(!kernel_volume_equiangular_light_position(kg, state, ray, rng, &light_P))
-					return VOLUME_PATH_MISSED;
-
-				sample_t = kernel_volume_equiangular_sample(ray, light_P, xi, &equi_pdf);
-				transmittance = volume_color_transmittance(sigma_t, sample_t);
-				pdf = make_float3(equi_pdf, equi_pdf, equi_pdf);
-			}
+			/* distance sampling */
+			sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf);
 
 			/* modifiy pdf for hit/miss decision */
-			pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(sigma_t, t);
+			if(probalistic_scatter)
+				pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(sigma_t, t);
 
 			new_tp = *throughput * coeff.sigma_s * transmittance / average(pdf);
 			t = sample_t;
@@ -378,14 +388,16 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 			new_tp = *throughput * transmittance / pdf;
 		}
 	}
-	else if(closure_flag & SD_ABSORPTION) {
+	else 
+#endif
+	if(closure_flag & SD_ABSORPTION) {
 		/* absorption only, no sampling needed */
 		float3 transmittance = volume_color_transmittance(coeff.sigma_a, t);
 		new_tp = *throughput * transmittance;
 	}
 
 	/* integrate emission attenuated by extinction */
-	if(closure_flag & SD_EMISSION) {
+	if(L && (closure_flag & SD_EMISSION)) {
 		float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
 		float3 transmittance = volume_color_transmittance(sigma_t, ray->t);
 		float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, ray->t);
@@ -408,13 +420,15 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 	return VOLUME_PATH_ATTENUATED;
 }
 
-/* heterogeneous volume: integrate stepping through the volume until we
- * reach the end, get absorbed entirely, or run out of iterations */
-ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlobals *kg,
+/* heterogeneous volume distance sampling: integrate stepping through the
+ * volume until we reach the end, get absorbed entirely, or run out of
+ * iterations. this does probalistically scatter or get transmitted through
+ * for path tracing where we don't want to branch. */
+ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
 	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, RNG *rng)
 {
 	float3 tp = *throughput;
-	const float tp_eps = 1e-10f; /* todo: this is likely not the right value */
+	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
 
 	/* prepare for stepping */
 	int max_steps = kernel_data.integrator.volume_max_steps;
@@ -425,9 +439,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 	float t = 0.0f;
 	float3 accum_transmittance = make_float3(1.0f, 1.0f, 1.0f);
 
-	/* cache some constant variables */
-	float xi;
-	int channel = -1;
+	/* pick random color channel, we use the Veach one-sample
+	 * model with balance heuristic for the channels */
+	float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+	float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
+	int channel = (int)(rphase*3.0f);
+	sd->randb_closure = rphase*3.0f - channel;
 	bool has_scatter = false;
 
 	for(int i = 0; i < max_steps; i++) {
@@ -449,25 +466,14 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 			float3 transmittance;
 			bool scatter = false;
 
-			/* randomly scatter, and if we do dt and new_t are shortened */
+			/* distance sampling */
+#ifdef __VOLUME_SCATTER__
 			if((closure_flag & SD_SCATTER) || (has_scatter && (closure_flag & SD_ABSORPTION))) {
 				has_scatter = true;
 
-				/* average sigma_t and sigma_s over segment */
 				float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
 				float3 sigma_s = coeff.sigma_s;
 
-				/* lazily set up variables for sampling */
-				if(channel == -1) {
-					/* pick random color channel, we use the Veach one-sample
-					 * model with balance heuristic for the channels */
-					xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE);
-
-					float rphase = path_state_rng_1D(kg, rng, state, PRNG_PHASE);
-					channel = (int)(rphase*3.0f);
-					sd->randb_closure = rphase*3.0f - channel;
-				}
-
 				/* compute transmittance over full step */
 				transmittance = volume_color_transmittance(sigma_t, dt);
 
@@ -480,10 +486,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 					float new_dt = -logf(1.0f - xi)/sample_sigma_t;
 					new_t = t + new_dt;
 
-					/* transmittance, throughput */
+					/* transmittance and pdf */
 					float3 new_transmittance = volume_color_transmittance(sigma_t, new_dt);
-					float pdf = average(sigma_t * new_transmittance);
-					new_tp = tp * sigma_s * new_transmittance / pdf;
+					float3 pdf = sigma_t * new_transmittance;
+
+					/* throughput */
+					new_tp = tp * sigma_s * new_transmittance / average(pdf);
 					scatter = true;
 				}
 				else {
@@ -495,7 +503,9 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 					xi = 1.0f - (1.0f - xi)/sample_transmittance;
 				}
 			}
-			else if(closure_flag & SD_ABSORPTION) {
+			else 
+#endif
+			if(closure_flag & SD_ABSORPTION) {
 				/* absorption only, no sampling needed */
 				float3 sigma_a = coeff.sigma_a;
 
@@ -504,7 +514,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 			}
 
 			/* integrate emission attenuated by absorption */
-			if(closure_flag & SD_EMISSION) {
+			if(L && (closure_flag & SD_EMISSION)) {
 				float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, dt);
 				path_radiance_accum_emission(L, tp, emission, state->bounce);
 			}
@@ -518,19 +528,19 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 					tp = make_float3(0.0f, 0.0f, 0.0f);
 					break;
 				}
+			}
 
-				/* prepare to scatter to new direction */
-				if(scatter) {
-					/* adjust throughput and move to new location */
-					sd->P = ray->P + new_t*ray->D;
-					*throughput = tp;
+			/* prepare to scatter to new direction */
+			if(scatter) {
+				/* adjust throughput and move to new location */
+				sd->P = ray->P + new_t*ray->D;
+				*throughput = tp;
 
-					return VOLUME_PATH_SCATTERED;
-				}
-				else {
-					/* accumulate transmittance */
-					accum_transmittance *= transmittance;
-				}
+				return VOLUME_PATH_SCATTERED;
+			}
+			else {
+				/* accumulate transmittance */
+				accum_transmittance *= transmittance;
 			}
 		}
 
@@ -545,14 +555,34 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 	return VOLUME_PATH_ATTENUATED;
 }
 
+/* get the volume attenuation and emission over line segment defined by
+ * ray, with the assumption that there are no surfaces blocking light
+ * between the endpoints. distance sampling is used to decide if we will
+ * scatter or not. */
+ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg,
+	PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng, bool heterogeneous)
+{
+	/* workaround to fix correlation bug in T38710, can find better solution
+	 * in random number generator later, for now this is done here to not impact
+	 * performance of rendering without volumes */
+	RNG tmp_rng = cmj_hash(*rng, state->rng_offset);
+
+	shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce);
+
+	if(heterogeneous)
+		return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput, &tmp_rng);
+	else
+		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, &tmp_rng, true);
+}
+
 /* Decoupled Volume Sampling
  *
  * VolumeSegment is list of coefficients and transmittance stored at all steps
  * through a volume. This can then latter be used for decoupled sampling as in:
- * "Importance Sampling Techniques for Path Tracing in Participating Media" */
-
-/* CPU only because of malloc/free */
-#ifdef __KERNEL_CPU__
+ * "Importance Sampling Techniques for Path Tracing in Participating Media"
+ *
+ * On the GPU this is only supported for homogeneous volumes (1 step), due to
+ * no support for malloc/free and too much stack usage with a fix size array. */
 
 typedef struct VolumeStep {
 	float3 sigma_s;				/* scatter coefficient */
@@ -571,6 +601,8 @@ typedef struct VolumeSegment {
 
 	float3 accum_emission;		/* accumulated emission at end of segment */
 	float3 accum_transmittance;	/* accumulated transmittance at end of segment */
+
+	int sampling_method;		/* volume sampling method */
 } VolumeSegment;
 
 /* record volume steps to the end of the volume.
@@ -578,10 +610,12 @@ typedef struct VolumeSegment {
  * it would be nice if we could only record up to the point that we need to scatter,
  * but the entire segment is needed to do always scattering, rather than probalistically
  * hitting or missing the volume. if we don't know the transmittance at the end of the
- * volume we can't generate stratitied distance samples up to that transmittance */
+ * volume we can't generate stratified distance samples up to that transmittance */
 ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state,
 	Ray *ray, ShaderData *sd, VolumeSegment *segment, bool heterogeneous)
 {
+	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
+
 	/* prepare for volume stepping */
 	int max_steps;
 	float step_size, random_jitter_offset;
@@ -608,6 +642,7 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 
 	segment->closure_flag = 0;
 	segment->numsteps = 0;
+
 	segment->steps = (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps);
 
 	VolumeStep *step = segment->steps;
@@ -669,6 +704,10 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 		t = new_t;
 		if(t == ray->t)
 			break;
+
+		/* stop if nearly all light blocked */
+		if(accum_transmittance.x < tp_eps && accum_transmittance.y < tp_eps && accum_transmittance.z < tp_eps)
+			break;
 	}
 
 	/* store total emission and transmittance */
@@ -698,35 +737,70 @@ ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *s
  * scattering, they always scatter if there is any non-zero scattering
  * coefficient.
  *
- * these also do not do emission or modify throughput. */
+ * these also do not do emission or modify throughput. 
+ * 
+ * function is expected to return VOLUME_PATH_SCATTERED when probalistic_scatter is false */
 ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 	KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd,
-	float3 *throughput, RNG *rng, VolumeSegment *segment)
+	float3 *throughput, float rphase, float rscatter,
+	const VolumeSegment *segment, const float3 *light_P, bool probalistic_scatter)
 {
-	int closure_flag = segment->closure_flag;
-
-	if(!(closure_flag & SD_SCATTER))
-		return VOLUME_PATH_MISSED;
+	kernel_assert(segment->closure_flag & SD_SCATTER);
 
 	/* pick random color channel, we use the Veach one-sample
 	 * model with balance heuristic for the channels */
-	float rphase = path_state_rng_1D(kg, rng, state, PRNG_PHASE);
 	int channel = (int)(rphase*3.0f);
 	sd->randb_closure = rphase*3.0f - channel;
+	float xi = rscatter;
 
-	float xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE);
+	/* probalistic scattering decision based on transmittance */
+	if(probalistic_scatter) {
+		float sample_transmittance = kernel_volume_channel_get(segment->accum_transmittance, channel);
+
+		if(1.0f - xi >= sample_transmittance) {
+			/* rescale random number so we can reuse it */
+			xi = 1.0f - (1.0f - xi - sample_transmittance)/(1.0f - sample_transmittance);
+		}
+		else {
+			*throughput /= sample_transmittance;
+			return VOLUME_PATH_MISSED;
+		}
+	}
 
 	VolumeStep *step;
 	float3 transmittance;
 	float pdf, sample_t;
+	float mis_weight = 1.0f;
+	bool distance_sample = true;
+	bool use_mis = false;
+
+	if(segment->sampling_method && light_P) {
+		if(segment->sampling_method == SD_VOLUME_MIS) {
+			/* multiple importance sample: randomly pick between
+			 * equiangular and distance sampling strategy */
+			if(xi < 0.5f) {
+				xi *= 2.0f;
+			}
+			else {
+				xi = (xi - 0.5f)*2.0f;
+				distance_sample = false;
+			}
+
+			use_mis = true;
+		}
+		else {
+			/* only equiangular sampling */
+			distance_sample = false;
+		}
+	}
 
 	/* distance sampling */
-	if(kernel_data.integrator.volume_homogeneous_sampling == 0 || !kernel_data.integrator.num_all_lights) { 
+	if(distance_sample) {
 		/* find step in cdf */
 		step = segment->steps;
 
 		float prev_t = 0.0f;
-		float3 step_pdf = make_float3(1.0f, 1.0f, 1.0f);
+		float3 step_pdf_distance = make_float3(1.0f, 1.0f, 1.0f);
 
 		if(segment->numsteps > 1) {
 			float prev_cdf = 0.0f;
@@ -749,7 +823,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 			xi = (xi - prev_cdf)/(step_cdf - prev_cdf);
 
 			/* pdf for picking step */
-			step_pdf = step->cdf_distance - prev_cdf_distance;
+			step_pdf_distance = step->cdf_distance - prev_cdf_distance;
 		}
 
 		/* determine range in which we will sample */
@@ -758,35 +832,77 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 		/* sample distance and compute transmittance */
 		float3 distance_pdf;
 		sample_t = prev_t + kernel_volume_distance_sample(step_t, step->sigma_t, channel, xi, &transmittance, &distance_pdf);
-		pdf = average(distance_pdf * step_pdf);
+
+		/* modifiy pdf for hit/miss decision */
+		if(probalistic_scatter)
+			distance_pdf *= make_float3(1.0f, 1.0f, 1.0f) - segment->accum_transmittance;
+
+		pdf = average(distance_pdf * step_pdf_distance);
+
+		/* multiple importance sampling */
+		if(use_mis) {
+			float equi_pdf = kernel_volume_equiangular_pdf(ray, *light_P, sample_t);
+			mis_weight = 2.0f*power_heuristic(pdf, equi_pdf);
+		}
 	}
 	/* equi-angular sampling */
 	else {
-		/* pick position on light */
-		float3 light_P;
-		if(!kernel_volume_equiangular_light_position(kg, state, ray, rng, &light_P))
-			return VOLUME_PATH_MISSED;
-
 		/* sample distance */
-		sample_t = kernel_volume_equiangular_sample(ray, light_P, xi, &pdf);
+		sample_t = kernel_volume_equiangular_sample(ray, *light_P, xi, &pdf);
 
 		/* find step in which sampled distance is located */
 		step = segment->steps;
 
 		float prev_t = 0.0f;
+		float3 step_pdf_distance = make_float3(1.0f, 1.0f, 1.0f);
 
 		if(segment->numsteps > 1) {
-			/* todo: optimize using binary search */
-			for(int i = 0; i < segment->numsteps-1; i++, step++) {
-				if(sample_t < step->t)
+			float3 prev_cdf_distance = make_float3(0.0f, 0.0f, 0.0f);
+
+			int numsteps = segment->numsteps;
+			int high = numsteps - 1;
+			int low = 0;
+			int mid;
+
+			while(low < high) {
+				mid = (low + high) >> 1;
+
+				if(sample_t < step[mid].t)
+					high = mid;
+				else if(sample_t >= step[mid + 1].t)
+					low = mid + 1;
+				else {
+					/* found our interval in step[mid] .. step[mid+1] */
+					prev_t = step[mid].t;
+					prev_cdf_distance = step[mid].cdf_distance;
+					step += mid+1;
 					break;
+				}
+			}
 
-				prev_t = step->t;
+			if(low >= numsteps - 1) {
+				prev_t = step[numsteps - 1].t;
+				prev_cdf_distance = step[numsteps-1].cdf_distance;
+				step += numsteps - 1;
 			}
+
+			/* pdf for picking step with distance sampling */
+			step_pdf_distance = step->cdf_distance - prev_cdf_distance;
 		}
-		
+
+		/* determine range in which we will sample */
+		float step_t = step->t - prev_t;
+		float step_sample_t = sample_t - prev_t;
+
 		/* compute transmittance */
-		transmittance = volume_color_transmittance(step->sigma_t, sample_t - prev_t);
+		transmittance = volume_color_transmittance(step->sigma_t, step_sample_t);
+
+		/* multiple importance sampling */
+		if(use_mis) {
+			float3 distance_pdf3 = kernel_volume_distance_pdf(step_t, step->sigma_t, step_sample_t);
+			float distance_pdf = average(distance_pdf3 * step_pdf_distance);
+			mis_weight = 2.0f*power_heuristic(pdf, distance_pdf);
+		}
 	}
 
 	/* compute transmittance up to this step */
@@ -794,7 +910,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 		transmittance *= (step-1)->accum_transmittance;
 
 	/* modify throughput */
-	*throughput *= step->sigma_s * transmittance / pdf;
+	*throughput *= step->sigma_s * transmittance * (mis_weight / pdf);
 
 	/* evaluate shader to create closures at shading point */
 	if(segment->numsteps > 1) {
@@ -810,40 +926,27 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 	return VOLUME_PATH_SCATTERED;
 }
 
-#endif
-
-/* get the volume attenuation and emission over line segment defined by
- * ray, with the assumption that there are no surfaces blocking light
- * between the endpoints */
-ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg,
-	PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng)
+/* decide if we need to use decoupled or not */
+ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneous, bool direct, int sampling_method)
 {
-	/* workaround to fix correlation bug in T38710, can find better solution
-	 * in random number generator later, for now this is done here to not impact
-	 * performance of rendering without volumes */
-	RNG tmp_rng = cmj_hash(*rng, state->rng_offset);
-	bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
-
-#if 0
-	/* debugging code to compare decoupled ray marching */
-	VolumeSegment segment;
-
-	shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce);
-	kernel_volume_decoupled_record(kg, state, ray, sd, &segment, heterogeneous);
-
-	VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, throughput, &tmp_rng, &segment);
-
-	kernel_volume_decoupled_free(kg, &segment);
+	/* decoupled ray marching for heterogenous volumes not supported on the GPU,
+	 * which also means equiangular and multiple importance sampling is not
+	 * support for that case */
+#ifdef __KERNEL_GPU__
+	if(heterogeneous)
+		return false;
+#endif
 
-	return result;
-#else
-	shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce);
+	/* equiangular and multiple importance sampling only implemented for decoupled */
+	if(sampling_method != 0)
+		return true;
 
-	if(heterogeneous)
-		return kernel_volume_integrate_heterogeneous(kg, state, ray, sd, L, throughput, &tmp_rng);
+	/* for all light sampling use decoupled, reusing shader evaluations is
+	 * typically faster in that case */
+	if(direct)
+		return kernel_data.integrator.sample_all_lights_direct;
 	else
-		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, &tmp_rng);
-#endif
+		return kernel_data.integrator.sample_all_lights_indirect;
 }
 
 /* Volume Stack
@@ -851,17 +954,88 @@ ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals
  * This is an array of object/shared ID's that the current segment of the path
  * is inside of. */
 
-ccl_device void kernel_volume_stack_init(KernelGlobals *kg, VolumeStack *stack)
+ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
+                                         Ray *ray,
+                                         VolumeStack *stack)
 {
-	/* todo: this assumes camera is always in air, need to detect when it isn't */
-	if(kernel_data.background.volume_shader == SHADER_NONE) {
-		stack[0].shader = SHADER_NONE;
+	/* NULL ray happens in the baker, does it need proper initialization of
+	 * camera in volume?
+	 */
+	if(!kernel_data.cam.is_inside_volume || ray == NULL) {
+		/* Camera is guaranteed to be in the air, only take background volume
+		 * into account in this case.
+		 */
+		if(kernel_data.background.volume_shader != SHADER_NONE) {
+			stack[0].shader = kernel_data.background.volume_shader;
+			stack[0].object = PRIM_NONE;
+			stack[1].shader = SHADER_NONE;
+		}
+		else {
+			stack[0].shader = SHADER_NONE;
+		}
+		return;
 	}
-	else {
+
+	Ray volume_ray = *ray;
+	volume_ray.t = FLT_MAX;
+
+	int stack_index = 0, enclosed_index = 0;
+	int enclosed_volumes[VOLUME_STACK_SIZE];
+
+	while(stack_index < VOLUME_STACK_SIZE - 1 &&
+	      enclosed_index < VOLUME_STACK_SIZE - 1)
+	{
+		Intersection isect;
+		if(!scene_intersect_volume(kg, &volume_ray, &isect)) {
+			break;
+		}
+
+		ShaderData sd;
+		shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0);
+		if(sd.flag & SD_HAS_VOLUME) {
+			if(sd.flag & SD_BACKFACING) {
+				/* If ray exited the volume and never entered to that volume
+				 * it means that camera is inside such a volume.
+				 */
+				bool is_enclosed = false;
+				for(int i = 0; i < enclosed_index; ++i) {
+					if(enclosed_volumes[i] == sd.object) {
+						is_enclosed = true;
+						break;
+					}
+				}
+				if(is_enclosed == false) {
+					stack[stack_index].object = sd.object;
+					stack[stack_index].shader = sd.shader;
+					++stack_index;
+				}
+			}
+			else {
+				/* If ray from camera enters the volume, this volume shouldn't
+				 * be added to the stak on exit.
+				 */
+				enclosed_volumes[enclosed_index++] = sd.object;
+			}
+		}
+
+		/* Move ray forward. */
+		volume_ray.P = ray_offset(sd.P, -sd.Ng);
+	}
+	/* stack_index of 0 means quick checks outside of the kernel gave false
+	 * positive, nothing to worry about, just we've wasted quite a few of
+	 * ticks just to come into conclusion that camera is in the air.
+	 *
+	 * In this case we're doing the same above -- check whether background has
+	 * volume.
+	 */
+	if(stack_index == 0 && kernel_data.background.volume_shader == SHADER_NONE) {
 		stack[0].shader = kernel_data.background.volume_shader;
 		stack[0].object = PRIM_NONE;
 		stack[1].shader = SHADER_NONE;
 	}
+	else {
+		stack[stack_index].shader = SHADER_NONE;
+	}
 }
 
 ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd, VolumeStack *stack)
@@ -910,4 +1084,3 @@ ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd
 }
 
 CCL_NAMESPACE_END
-