10 files changed, 676 insertions, 288 deletions
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 7205a272395..12babd95ed8 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -111,6 +111,7 @@ enum_integrator = (
 enum_volume_homogeneous_sampling = (
     ('DISTANCE', "Distance", "Use Distance Sampling"),
     ('EQUI_ANGULAR', "Equi-angular", "Use Equi-angular Sampling"),
+    ('MULTIPLE_IMPORTANCE', "Multiple Importance", "Combine distance and equi-angular sampling"),
     )
 
 
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index b382e2c833b..bda98b84da8 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -194,6 +194,17 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st
 
 		float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time, state->bounce, state->transparent_bounce);
 
+#ifdef __VOLUME__
+		if(state->volume_stack[0].shader != SHADER_NONE) {
+			/* shadow attenuation */
+			Ray volume_ray = *ray;
+			volume_ray.t = ls.t;
+			float3 volume_tp = make_float3(1.0f, 1.0f, 1.0f);
+			kernel_volume_shadow(kg, state, &volume_ray, &volume_tp);
+			L *= volume_tp;
+		}
+#endif
+
 		if(!(state->flag & PATH_RAY_MIS_SKIP)) {
 			/* multiple importance sampling, get regular light pdf,
 			 * and compute weight with respect to BSDF pdf */
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index ac432d3fe04..0adf9ed4666 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -208,8 +208,8 @@ ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3
 	return t*t/cos_pi;
 }
 
-ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
-	float randu, float randv, float3 P, LightSample *ls)
+ccl_device bool lamp_light_sample(KernelGlobals *kg, int lamp,
+	float randu, float randv, float3 P, LightSample *ls, bool for_volume)
 {
 	float4 data0 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 0);
 	float4 data1 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 1);
@@ -224,6 +224,11 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
 	ls->v = randv;
 
 	if(type == LIGHT_DISTANT) {
+#ifdef __VOLUME__
+		if(for_volume)
+			return false;
+#endif
+
 		/* distant light */
 		float3 lightD = make_float3(data0.y, data0.z, data0.w);
 		float3 D = lightD;
@@ -244,6 +249,11 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
 	}
 #ifdef __BACKGROUND_MIS__
 	else if(type == LIGHT_BACKGROUND) {
+#ifdef __VOLUME__
+		if(for_volume)
+			return false;
+#endif
+
 		/* infinite area light (e.g. light dome or env light) */
 		float3 D = background_light_sample(kg, randu, randv, &ls->pdf);
 
@@ -299,6 +309,8 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
 		ls->eval_fac *= kernel_data.integrator.inv_pdf_lights;
 		ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 	}
+
+	return true;
 }
 
 ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, float t, LightSample *ls)
@@ -514,7 +526,7 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float randt)
 
 /* Generic Light */
 
-ccl_device void light_sample(KernelGlobals *kg, float randt, float randu, float randv, float time, float3 P, LightSample *ls)
+ccl_device bool light_sample(KernelGlobals *kg, float randt, float randu, float randv, float time, float3 P, LightSample *ls, bool for_volume)
 {
 	/* sample index */
 	int index = light_distribution_sample(kg, randt);
@@ -533,10 +545,12 @@ ccl_device void light_sample(KernelGlobals *kg, float randt, float randu, float
 		ls->D = normalize_len(ls->P - P, &ls->t);
 		ls->pdf = triangle_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 		ls->shader |= shader_flag;
+
+		return true;
 	}
 	else {
 		int lamp = -prim-1;
-		lamp_light_sample(kg, lamp, randu, randv, P, ls);
+		return lamp_light_sample(kg, lamp, randu, randv, P, ls, for_volume);
 	}
 }
 
@@ -546,9 +560,9 @@ ccl_device int light_select_num_samples(KernelGlobals *kg, int index)
 	return __float_as_int(data3.x);
 }
 
-ccl_device void light_select(KernelGlobals *kg, int index, float randu, float randv, float3 P, LightSample *ls)
+ccl_device bool light_select(KernelGlobals *kg, int index, float randu, float randv, float3 P, LightSample *ls, bool for_volume)
 {
-	lamp_light_sample(kg, index, randu, randv, P, ls);
+	return lamp_light_sample(kg, index, randu, randv, P, ls, for_volume);
 }
 
 ccl_device int lamp_light_eval_sample(KernelGlobals *kg, float randt)
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 9a5a85abae1..22024483bfc 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -29,7 +29,6 @@
 #include "kernel_accumulate.h"
 #include "kernel_shader.h"
 #include "kernel_light.h"
-#include "kernel_emission.h"
 #include "kernel_passes.h"
 
 #ifdef __SUBSURFACE__
@@ -42,6 +41,7 @@
 
 #include "kernel_path_state.h"
 #include "kernel_shadow.h"
+#include "kernel_emission.h"
 #include "kernel_path_surface.h"
 #include "kernel_path_volume.h"
 
@@ -88,17 +88,73 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 			Ray volume_ray = ray;
 			volume_ray.t = (hit)? isect.t: FLT_MAX;
 
-			ShaderData volume_sd;
-			VolumeIntegrateResult result = kernel_volume_integrate(kg, &state,
-				&volume_sd, &volume_ray, L, &throughput, rng);
+			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
+			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, false);
 
-			if(result == VOLUME_PATH_SCATTERED) {
-				kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, L, 1.0f);
+			if(decoupled) {
+				/* cache steps along volume for repeated sampling */
+				VolumeSegment volume_segment;
+				ShaderData volume_sd;
 
-				if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray, 1.0f))
-					continue;
-				else
-					break;
+				shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
+				kernel_volume_decoupled_record(kg, &state,
+					&volume_ray, &volume_sd, &volume_segment, heterogeneous);
+
+				/* emission */
+				if(volume_segment.closure_flag & SD_EMISSION)
+					path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
+
+				/* scattering */
+				VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
+				bool scatter = false;
+
+				if(volume_segment.closure_flag & SD_SCATTER) {
+					bool all = kernel_data.integrator.sample_all_lights_indirect;
+
+					/* direct light sampling */
+					kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
+						throughput, &state, L, 1.0f, all, &volume_ray, &volume_segment);
+
+					/* indirect sample. if we use distance sampling and take just
+					 * one sample for direct and indirect light, we could share
+					 * this computation, but that would make the code more complex */
+					float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
+					float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
+
+					result = kernel_volume_decoupled_scatter(kg,
+						&state, &volume_ray, &volume_sd, &throughput,
+						rphase, rscatter, &volume_segment, NULL, true);
+
+					if(result == VOLUME_PATH_SCATTERED)
+						scatter = kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray, 1.0f);
+				}
+
+				/* free cached steps */
+				kernel_volume_decoupled_free(kg, &volume_segment);
+
+				if(result == VOLUME_PATH_SCATTERED) {
+					if(scatter)
+						continue;
+					else
+						break;
+				}
+			}
+			else {
+				/* integrate along volume segment with distance sampling */
+				ShaderData volume_sd;
+				VolumeIntegrateResult result = kernel_volume_integrate(
+					kg, &state, &volume_sd, &volume_ray, L, &throughput, rng);
+
+				if(result == VOLUME_PATH_SCATTERED) {
+					/* direct lighting */
+					kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, L, 1.0f);
+
+					/* indirect light bounce */
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray, 1.0f))
+						continue;
+					else
+						break;
+				}
 			}
 		}
 #endif
@@ -411,17 +467,73 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 			Ray volume_ray = ray;
 			volume_ray.t = (hit)? isect.t: FLT_MAX;
 
-			ShaderData volume_sd;
-			VolumeIntegrateResult result = kernel_volume_integrate(kg, &state,
-				&volume_sd, &volume_ray, &L, &throughput, rng);
+			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
+			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true);
+
+			if(decoupled) {
+				/* cache steps along volume for repeated sampling */
+				VolumeSegment volume_segment;
+				ShaderData volume_sd;
 
-			if(result == VOLUME_PATH_SCATTERED) {
-				kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L, 1.0f);
+				shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
+				kernel_volume_decoupled_record(kg, &state,
+					&volume_ray, &volume_sd, &volume_segment, heterogeneous);
 
-				if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray, 1.0f))
-					continue;
-				else
-					break;
+				/* emission */
+				if(volume_segment.closure_flag & SD_EMISSION)
+					path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+
+				/* scattering */
+				VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
+				bool scatter = false;
+
+				if(volume_segment.closure_flag & SD_SCATTER) {
+					bool all = false;
+
+					/* direct light sampling */
+					kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
+						throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment);
+
+					/* indirect sample. if we use distance sampling and take just
+					 * one sample for direct and indirect light, we could share
+					 * this computation, but that would make the code more complex */
+					float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
+					float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
+
+					result = kernel_volume_decoupled_scatter(kg,
+						&state, &volume_ray, &volume_sd, &throughput,
+						rphase, rscatter, &volume_segment, NULL, true);
+
+					if(result == VOLUME_PATH_SCATTERED)
+						scatter = kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray, 1.0f);
+				}
+
+				/* free cached steps */
+				kernel_volume_decoupled_free(kg, &volume_segment);
+
+				if(result == VOLUME_PATH_SCATTERED) {
+					if(scatter)
+						continue;
+					else
+						break;
+				}
+			}
+			else {
+				/* integrate along volume segment with distance sampling */
+				ShaderData volume_sd;
+				VolumeIntegrateResult result = kernel_volume_integrate(
+					kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng);
+
+				if(result == VOLUME_PATH_SCATTERED) {
+					/* direct lighting */
+					kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L, 1.0f);
+
+					/* indirect light bounce */
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray, 1.0f))
+						continue;
+					else
+						break;
+				}
 			}
 		}
 #endif
@@ -700,37 +812,47 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			kernel_volume_decoupled_record(kg, &state,
 				&volume_ray, &volume_sd, &volume_segment, heterogeneous);
 
-			/* sample scattering */
-			int num_samples = kernel_data.integrator.volume_samples;
-			float num_samples_inv = 1.0f/num_samples;
-
-			for(int j = 0; j < num_samples; j++) {
-				/* workaround to fix correlation bug in T38710, can find better solution
-				 * in random number generator later, for now this is done here to not impact
-				 * performance of rendering without volumes */
-				RNG tmp_rng = cmj_hash(*rng, state.rng_offset);
-
-				PathState ps = state;
-				Ray pray = ray;
-				float3 tp = throughput;
-
-				/* branch RNG state */
-				path_state_branch(&ps, j, num_samples);
-
-				VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
-					&ps, &volume_ray, &volume_sd, &tp, &tmp_rng, &volume_segment);
-				
-				if(result == VOLUME_PATH_SCATTERED) {
-					/* todo: use all-light sampling */
-					kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L, 1.0f);
-
-					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) {
-						kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
-
-						/* for render passes, sum and reset indirect light pass variables
-						 * for the next samples */
-						path_radiance_sum_indirect(&L);
-						path_radiance_reset_indirect(&L);
+			/* direct light sampling */
+			if(volume_segment.closure_flag & SD_SCATTER) {
+				bool all = kernel_data.integrator.sample_all_lights_direct;
+				kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
+					throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment);
+
+				/* indirect light sampling */
+				int num_samples = kernel_data.integrator.volume_samples;
+				float num_samples_inv = 1.0f/num_samples;
+
+				for(int j = 0; j < num_samples; j++) {
+					/* workaround to fix correlation bug in T38710, can find better solution
+					 * in random number generator later, for now this is done here to not impact
+					 * performance of rendering without volumes */
+					RNG tmp_rng = cmj_hash(*rng, state.rng_offset);
+
+					PathState ps = state;
+					Ray pray = ray;
+					float3 tp = throughput;
+
+					/* branch RNG state */
+					path_state_branch(&ps, j, num_samples);
+
+					/* scatter sample. if we use distance sampling and take just one
+					 * sample for direct and indirect light, we could share this
+					 * computation, but that would make the code more complex */
+					float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE);
+					float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE);
+
+					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+						&ps, &pray, &volume_sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
+
+					if(result == VOLUME_PATH_SCATTERED) {
+						if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) {
+							kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
+
+							/* for render passes, sum and reset indirect light pass variables
+							 * for the next samples */
+							path_radiance_sum_indirect(&L);
+							path_radiance_reset_indirect(&L);
+						}
 					}
 				}
 			}
@@ -759,12 +881,15 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 				/* branch RNG state */
 				path_state_branch(&ps, j, num_samples);
 
-				VolumeIntegrateResult result = kernel_volume_integrate(kg, &ps,
-					&volume_sd, &volume_ray, &L, &tp, rng);
+				VolumeIntegrateResult result = kernel_volume_integrate(
+					kg, &ps, &volume_sd, &volume_ray, &L, &tp, rng);
 				
 				if(result == VOLUME_PATH_SCATTERED) {
-					/* todo: use all-light sampling */
-					if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) {
+					/* todo: support equiangular, MIS and all light sampling.
+					 * alternatively get decoupled ray marching working on the GPU */
+					kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L, num_samples_inv);
+
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) {
 						kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
 
 						/* for render passes, sum and reset indirect light pass variables
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index 85bdebf195e..11fadcc6bcf 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -22,97 +22,101 @@ CCL_NAMESPACE_BEGIN
 ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RNG *rng,
 	ShaderData *sd, PathState *state, float3 throughput, float num_samples_adjust, PathRadiance *L, bool sample_all_lights)
 {
+#ifdef __EMISSION__
 	/* sample illumination from lights to find path contribution */
-	if(sd->flag & SD_BSDF_HAS_EVAL) {
-		Ray light_ray;
-		BsdfEval L_light;
-		bool is_lamp;
+	if(!(sd->flag & SD_BSDF_HAS_EVAL))
+		return;
+
+	Ray light_ray;
+	BsdfEval L_light;
+	bool is_lamp;
 
 #ifdef __OBJECT_MOTION__
-		light_ray.time = sd->time;
+	light_ray.time = sd->time;
 #endif
 
-		if(sample_all_lights) {
-			/* lamp sampling */
-			for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
-				int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
-				float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
-				RNG lamp_rng = cmj_hash(*rng, i);
+	if(sample_all_lights) {
+		/* lamp sampling */
+		for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
+			int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
+			float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
+			RNG lamp_rng = cmj_hash(*rng, i);
 
-				if(kernel_data.integrator.pdf_triangles != 0.0f)
-					num_samples_inv *= 0.5f;
+			if(kernel_data.integrator.pdf_triangles != 0.0f)
+				num_samples_inv *= 0.5f;
 
-				for(int j = 0; j < num_samples; j++) {
-					float light_u, light_v;
-					path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+			for(int j = 0; j < num_samples; j++) {
+				float light_u, light_v;
+				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
 
-					LightSample ls;
-					light_select(kg, i, light_u, light_v, sd->P, &ls);
+				LightSample ls;
+				light_select(kg, i, light_u, light_v, sd->P, &ls, false);
 
-					if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
-						/* trace shadow ray */
-						float3 shadow;
+				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+					/* trace shadow ray */
+					float3 shadow;
 
-						if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
-							/* accumulate */
-							path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
-						}
+					if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+						/* accumulate */
+						path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
 					}
 				}
 			}
+		}
 
-			/* mesh light sampling */
-			if(kernel_data.integrator.pdf_triangles != 0.0f) {
-				int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples);
-				float num_samples_inv = num_samples_adjust/num_samples;
+		/* mesh light sampling */
+		if(kernel_data.integrator.pdf_triangles != 0.0f) {
+			int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples);
+			float num_samples_inv = num_samples_adjust/num_samples;
 
-				if(kernel_data.integrator.num_all_lights)
-					num_samples_inv *= 0.5f;
+			if(kernel_data.integrator.num_all_lights)
+				num_samples_inv *= 0.5f;
 
-				for(int j = 0; j < num_samples; j++) {
-					float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT);
-					float light_u, light_v;
-					path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+			for(int j = 0; j < num_samples; j++) {
+				float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT);
+				float light_u, light_v;
+				path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
 
-					/* only sample triangle lights */
-					if(kernel_data.integrator.num_all_lights)
-						light_t = 0.5f*light_t;
+				/* only sample triangle lights */
+				if(kernel_data.integrator.num_all_lights)
+					light_t = 0.5f*light_t;
 
-					LightSample ls;
-					light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+				LightSample ls;
+				light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls, false);
 
-					if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
-						/* trace shadow ray */
-						float3 shadow;
+				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+					/* trace shadow ray */
+					float3 shadow;
 
-						if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
-							/* accumulate */
-							path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
-						}
+					if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+						/* accumulate */
+						path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
 					}
 				}
 			}
 		}
-		else {
-			float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
-			float light_u, light_v;
-			path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-
-			LightSample ls;
-			light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
-
-			/* sample random light */
-			if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
-				/* trace shadow ray */
-				float3 shadow;
-
-				if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
-					/* accumulate */
-					path_radiance_accum_light(L, throughput, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
-				}
+	}
+	else {
+		/* sample one light at random */
+		float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+		float light_u, light_v;
+		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+
+		LightSample ls;
+		light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls, false);
+
+		/* evaluate direct emission for the sampled light */
+		if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+			/* trace shadow ray */
+			float3 shadow;
+
+			if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+				/* accumulate */
+				path_radiance_accum_light(L, throughput, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
 			}
 		}
 	}
+#endif
 }
 
 /* branched path tracing: bounce off or through surface to with new direction stored in ray */
@@ -196,7 +200,7 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
 #endif
 
 	LightSample ls;
-	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls, false);
 
 	if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 		/* trace shadow ray */
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index 6196a35a184..6cc3085b96e 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -18,11 +18,12 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __VOLUME__
 
-ccl_device_inline void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L, float num_samples_adjust)
+ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
+	ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L,
+	float num_samples_adjust)
 {
 #ifdef __EMISSION__
-	if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)))
+	if(!kernel_data.integrator.use_direct_light)
 		return;
 
 	/* sample illumination from lights to find path contribution */
@@ -32,15 +33,19 @@ ccl_device_inline void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *
 
 	Ray light_ray;
 	BsdfEval L_light;
+	LightSample ls;
 	bool is_lamp;
 
+	/* connect to light from given point where shader has been evaluated */
 #ifdef __OBJECT_MOTION__
 	light_ray.time = sd->time;
 #endif
 
-	LightSample ls;
-	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
-
+	if(!light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls, true))
+		return;
+	else if(ls.pdf == 0.0f)
+		return;
+	
 	if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 		/* trace shadow ray */
 		float3 shadow;
@@ -53,7 +58,7 @@ ccl_device_inline void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *
 #endif
 }
 
-ccl_device_inline bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
+ccl_device bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
 	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray,
 	float num_samples_adjust)
 {
@@ -98,6 +103,178 @@ ccl_device_inline bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
 	return true;
 }
 
+#ifdef __KERNEL_CPU__
+/* direct lighting for a volume: sample all lights or one random light; with a segment, a scatter position on it is sampled per light sample */
+ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
+	ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L,
+	float num_samples_adjust, bool sample_all_lights, Ray *ray, const VolumeSegment *segment)
+{
+#ifdef __EMISSION__
+	if(!kernel_data.integrator.use_direct_light)
+		return;
+
+	Ray light_ray;
+	BsdfEval L_light;
+	bool is_lamp;
+
+#ifdef __OBJECT_MOTION__
+	light_ray.time = sd->time;
+#endif
+
+	if(sample_all_lights) {
+		/* lamp sampling: each lamp takes its own number of samples */
+		for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
+			int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
+			float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
+			RNG lamp_rng = cmj_hash(*rng, i);
+			/* halve lamp weight when mesh lights are sampled as well (see below) */
+			if(kernel_data.integrator.pdf_triangles != 0.0f)
+				num_samples_inv *= 0.5f;
+
+			for(int j = 0; j < num_samples; j++) {
+				/* sample random position on given light */
+				float light_u, light_v;
+				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+
+				LightSample ls;
+				if(!light_select(kg, i, light_u, light_v, ray->P, &ls, true))
+					continue;
+
+				float3 tp = throughput;
+
+				/* sample scatter position on volume segment, passing the light position when it is at finite distance */
+				if(segment) {
+					float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE);
+					float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE);
+
+					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+						state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
+
+					if(result != VOLUME_PATH_SCATTERED)
+						continue;
+
+					/* todo: split up light_sample so we don't have to call it again with new position */
+					if(!light_select(kg, i, light_u, light_v, sd->P, &ls, true))
+						continue;
+				}
+
+				if(ls.pdf == 0.0f)
+					continue;
+
+				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+					/* trace shadow ray */
+					float3 shadow;
+
+					if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+						/* accumulate */
+						path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+					}
+				}
+			}
+		}
+
+		/* mesh light sampling */
+		if(kernel_data.integrator.pdf_triangles != 0.0f) {
+			int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples);
+			float num_samples_inv = num_samples_adjust/num_samples;
+			/* halve mesh light weight when lamps are sampled as well (see above) */
+			if(kernel_data.integrator.num_all_lights)
+				num_samples_inv *= 0.5f;
+
+			for(int j = 0; j < num_samples; j++) {
+				/* sample random position on random triangle */
+				float light_t = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_LIGHT);
+				float light_u, light_v;
+				path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+
+				/* only sample triangle lights */
+				if(kernel_data.integrator.num_all_lights)
+					light_t = 0.5f*light_t;
+
+				LightSample ls;
+				if(!light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls, true))
+					continue;
+
+				float3 tp = throughput;
+
+				/* sample scatter position on volume segment, passing the light position when it is at finite distance */
+				if(segment) {
+					float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE);
+					float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE);
+
+					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+						state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
+
+					if(result != VOLUME_PATH_SCATTERED)
+						continue;
+
+					/* todo: split up light_sample so we don't have to call it again with new position */
+					if(!light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls, true))
+						continue;
+				}
+
+				if(ls.pdf == 0.0f)
+					continue;
+
+				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+					/* trace shadow ray */
+					float3 shadow;
+
+					if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+						/* accumulate */
+						path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+					}
+				}
+			}
+		}
+	}
+	else {
+		/* sample random position on random light */
+		float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+		float light_u, light_v;
+		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+
+		LightSample ls;
+		if(!light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls, true))
+			return;
+
+		float3 tp = throughput;
+
+		/* sample scatter position on volume segment, passing the light position when it is at finite distance */
+		if(segment) {
+			float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
+			float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+
+			VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+				state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
+
+			if(result != VOLUME_PATH_SCATTERED)
+				return;
+
+			/* todo: split up light_sample so we don't have to call it again with new position */
+			if(!light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls, true))
+				return;
+		}
+
+		if(ls.pdf == 0.0f)
+			return;
+
+		/* evaluate direct emission for the sampled light */
+		if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+			/* trace shadow ray */
+			float3 shadow;
+
+			if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+				/* accumulate, forwarding num_samples_adjust like the surface variant does */
+				path_radiance_accum_light(L, tp, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
+			}
+		}
+	}
+#endif
+}
+
+#endif
+
 #endif
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index ac04b3168a1..236f74c0a82 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -261,12 +261,12 @@ ccl_device uint lcg_init(uint seed)
  * For branches in the path we must be careful not to reuse the same number
  * in a sequence and offset accordingly. */
 
-ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension)
 {
 	return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension);
 }
 
-ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension)
 {
 	/* the rng_offset is not increased for transparent bounces. if we do then
 	 * fully transparent objects can become subtly visible by the different
@@ -279,17 +279,23 @@ ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *r
 	return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension);
 }
 
-ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, PathState *state, int dimension, float *fx, float *fy)
+ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension, float *fx, float *fy)
 {
 	path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy);
 }
 
-ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
 {
 	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension);
 }
 
-ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
+ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+{
+	int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
+	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension);
+}
+
+ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
 {
 	path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy);
 }
@@ -303,7 +309,7 @@ ccl_device_inline void path_state_branch(PathState *state, int branch, int num_b
 	state->num_samples = state->num_samples*num_branches;
 }
 
-ccl_device_inline uint lcg_state_init(RNG *rng, PathState *state, uint scramble)
+ccl_device_inline uint lcg_state_init(RNG *rng, const PathState *state, uint scramble)
 {
 	return lcg_init(*rng + state->rng_offset + state->sample*scramble);
 }
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 0ea21a85fcb..842b9f68840 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -858,7 +858,7 @@ ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, con
 
 			if(phase_pdf != 0.0f) {
 				bsdf_eval_accum(result_eval, sc->type, eval);
-				sum_pdf += phase_pdf;
+				sum_pdf += phase_pdf*sc->sample_weight;
 			}
 
 			sum_sample_weight += sc->sample_weight;
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index 49e4cf64de4..61e4989147b 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -136,7 +136,7 @@ ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *s
 ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput)
 {
 	float3 tp = *throughput;
-	const float tp_eps = 1e-10f; /* todo: this is likely not the right value */
+	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
 
 	/* prepare for stepping */
 	int max_steps = kernel_data.integrator.volume_max_steps;
@@ -226,25 +226,6 @@ ccl_device float kernel_volume_equiangular_pdf(Ray *ray, float3 light_P, float s
 	return pdf;
 }
 
-ccl_device bool kernel_volume_equiangular_light_position(KernelGlobals *kg, PathState *state, Ray *ray, RNG *rng, float3 *light_P, bool *distant)
-{
-	/* light RNGs */
-	float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
-	float light_u, light_v;
-	path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-
-	/* light sample */
-	LightSample ls;
-	light_sample(kg, light_t, light_u, light_v, ray->time, ray->P, &ls);
-	if(ls.pdf == 0.0f)
-		return false;
-	
-	*light_P = ls.P;
-	*distant = ls.t == FLT_MAX;
-
-	return true;
-}
-
 /* Distance sampling */
 
 ccl_device float kernel_volume_distance_sample(float max_t, float3 sigma_t, int channel, float xi, float3 *transmittance, float3 *pdf)
@@ -304,7 +285,7 @@ ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coe
  * the volume shading coefficient for the entire line segment */
 ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGlobals *kg,
 	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput,
-	RNG *rng)
+	RNG *rng, bool probalistic_scatter)
 {
 	VolumeShaderCoefficients coeff;
 
@@ -326,47 +307,37 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 		int channel = (int)(rphase*3.0f);
 		sd->randb_closure = rphase*3.0f - channel;
 
+		/* decide if we will hit or miss */
+		bool scatter = true;
 		float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
 
-		/* decide if we will hit or miss */
-		float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
-		float sample_transmittance = expf(-sample_sigma_t * t);
+		if(probalistic_scatter) {
+			float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
+			float sample_transmittance = expf(-sample_sigma_t * t);
 
-		if(xi >= sample_transmittance) {
-			/* scattering */
-			float3 pdf;
-			float3 transmittance;
-			float sample_t;
+			if(1.0f - xi >= sample_transmittance) {
+				scatter = true;
 
-			/* rescale random number so we can reuse it */
-			xi = (xi - sample_transmittance)/(1.0f - sample_transmittance);
+				/* rescale random number so we can reuse it */
+				xi = 1.0f - (1.0f - xi - sample_transmittance)/(1.0f - sample_transmittance);
 
-			if(kernel_data.integrator.volume_homogeneous_sampling == 0 || !kernel_data.integrator.num_all_lights) { 
-				/* distance sampling */
-				sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf);
 			}
-			else {
-				/* equiangular sampling */
-				float3 light_P;
-				float equi_pdf;
-				bool light_distant;
+			else
+				scatter = false;
+		}
 
-				if(!kernel_volume_equiangular_light_position(kg, state, ray, rng, &light_P, &light_distant))
-					return VOLUME_PATH_MISSED;
+		if(scatter) {
+			/* scattering */
+			float3 pdf;
+			float3 transmittance;
+			float sample_t;
 
-				if(light_distant) {
-					/* distant light, revert to distance sampling because position is infinitely far away */
-					sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf);
-				}
-				else {
-					sample_t = kernel_volume_equiangular_sample(ray, light_P, xi, &equi_pdf);
-					transmittance = volume_color_transmittance(sigma_t, sample_t);
-					pdf = make_float3(equi_pdf, equi_pdf, equi_pdf);
-				}
-			}
+			/* distance sampling */
+			sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf);
 
 			/* modifiy pdf for hit/miss decision */
-			pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(sigma_t, t);
+			if(probalistic_scatter)
+				pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(sigma_t, t);
 
 			new_tp = *throughput * coeff.sigma_s * transmittance / average(pdf);
 			t = sample_t;
@@ -385,7 +356,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 	}
 
 	/* integrate emission attenuated by extinction */
-	if(closure_flag & SD_EMISSION) {
+	if(L && (closure_flag & SD_EMISSION)) {
 		float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
 		float3 transmittance = volume_color_transmittance(sigma_t, ray->t);
 		float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, ray->t);
@@ -408,13 +379,15 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 	return VOLUME_PATH_ATTENUATED;
 }
 
-/* heterogeneous volume: integrate stepping through the volume until we
- * reach the end, get absorbed entirely, or run out of iterations */
-ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlobals *kg,
+/* heterogeneous volume distance sampling: integrate stepping through the
+ * volume until we reach the end, get absorbed entirely, or run out of
+ * iterations. this probabilistically either scatters or is transmitted
+ * through, for path tracing where we don't want to branch. */
+ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
 	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, RNG *rng)
 {
 	float3 tp = *throughput;
-	const float tp_eps = 1e-10f; /* todo: this is likely not the right value */
+	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
 
 	/* prepare for stepping */
 	int max_steps = kernel_data.integrator.volume_max_steps;
@@ -425,9 +398,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 	float t = 0.0f;
 	float3 accum_transmittance = make_float3(1.0f, 1.0f, 1.0f);
 
-	/* cache some constant variables */
-	float xi;
-	int channel = -1;
+	/* pick random color channel, we use the Veach one-sample
+	 * model with balance heuristic for the channels */
+	float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+	float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
+	int channel = (int)(rphase*3.0f);
+	sd->randb_closure = rphase*3.0f - channel;
 	bool has_scatter = false;
 
 	for(int i = 0; i < max_steps; i++) {
@@ -449,25 +425,13 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 			float3 transmittance;
 			bool scatter = false;
 
-			/* randomly scatter, and if we do dt and new_t are shortened */
+			/* distance sampling */
 			if((closure_flag & SD_SCATTER) || (has_scatter && (closure_flag & SD_ABSORPTION))) {
 				has_scatter = true;
 
-				/* average sigma_t and sigma_s over segment */
 				float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
 				float3 sigma_s = coeff.sigma_s;
 
-				/* lazily set up variables for sampling */
-				if(channel == -1) {
-					/* pick random color channel, we use the Veach one-sample
-					 * model with balance heuristic for the channels */
-					xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
-
-					float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
-					channel = (int)(rphase*3.0f);
-					sd->randb_closure = rphase*3.0f - channel;
-				}
-
 				/* compute transmittance over full step */
 				transmittance = volume_color_transmittance(sigma_t, dt);
 
@@ -480,10 +444,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 					float new_dt = -logf(1.0f - xi)/sample_sigma_t;
 					new_t = t + new_dt;
 
-					/* transmittance, throughput */
+					/* transmittance and pdf */
 					float3 new_transmittance = volume_color_transmittance(sigma_t, new_dt);
-					float pdf = average(sigma_t * new_transmittance);
-					new_tp = tp * sigma_s * new_transmittance / pdf;
+					float3 pdf = sigma_t * new_transmittance;
+
+					/* throughput */
+					new_tp = tp * sigma_s * new_transmittance / average(pdf);
 					scatter = true;
 				}
 				else {
@@ -504,7 +470,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 			}
 
 			/* integrate emission attenuated by absorption */
-			if(closure_flag & SD_EMISSION) {
+			if(L && (closure_flag & SD_EMISSION)) {
 				float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, dt);
 				path_radiance_accum_emission(L, tp, emission, state->bounce);
 			}
@@ -518,19 +484,19 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 					tp = make_float3(0.0f, 0.0f, 0.0f);
 					break;
 				}
+			}
 
-				/* prepare to scatter to new direction */
-				if(scatter) {
-					/* adjust throughput and move to new location */
-					sd->P = ray->P + new_t*ray->D;
-					*throughput = tp;
+			/* prepare to scatter to new direction */
+			if(scatter) {
+				/* adjust throughput and move to new location */
+				sd->P = ray->P + new_t*ray->D;
+				*throughput = tp;
 
-					return VOLUME_PATH_SCATTERED;
-				}
-				else {
-					/* accumulate transmittance */
-					accum_transmittance *= transmittance;
-				}
+				return VOLUME_PATH_SCATTERED;
+			}
+			else {
+				/* accumulate transmittance */
+				accum_transmittance *= transmittance;
 			}
 		}
 
@@ -545,14 +511,35 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 	return VOLUME_PATH_ATTENUATED;
 }
 
+/* get the volume attenuation and emission over line segment defined by
+ * ray, with the assumption that there are no surfaces blocking light
+ * between the endpoints. distance sampling is used to decide if we will
+ * scatter or not. */
+ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg,
+	PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng)
+{
+	/* workaround to fix correlation bug in T38710, can find better solution
+	 * in random number generator later, for now this is done here to not impact
+	 * performance of rendering without volumes */
+	RNG tmp_rng = cmj_hash(*rng, state->rng_offset);
+	bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+
+	shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce);
+
+	if(heterogeneous)
+		return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput, &tmp_rng);
+	else
+		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, &tmp_rng, true);
+}
+
 /* Decoupled Volume Sampling
  *
  * VolumeSegment is list of coefficients and transmittance stored at all steps
  * through a volume. This can then latter be used for decoupled sampling as in:
- * "Importance Sampling Techniques for Path Tracing in Participating Media" */
-
-/* CPU only because of malloc/free */
-#ifdef __KERNEL_CPU__
+ * "Importance Sampling Techniques for Path Tracing in Participating Media"
+ *
+ * On the GPU this is only supported for homogeneous volumes (1 step), due to
+ * no support for malloc/free and too much stack usage with a fixed size array. */
 
 typedef struct VolumeStep {
 	float3 sigma_s;				/* scatter coefficient */
@@ -565,7 +552,11 @@ typedef struct VolumeStep {
 } VolumeStep;
 
 typedef struct VolumeSegment {
+#ifdef __KERNEL_CPU__
 	VolumeStep *steps;			/* recorded steps */
+#else
+	VolumeStep steps[1];		/* recorded steps */
+#endif
 	int numsteps;				/* number of steps */
 	int closure_flag;			/* accumulated closure flags from all steps */
 
@@ -582,6 +573,8 @@ typedef struct VolumeSegment {
 ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state,
 	Ray *ray, ShaderData *sd, VolumeSegment *segment, bool heterogeneous)
 {
+	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
+
 	/* prepare for volume stepping */
 	int max_steps;
 	float step_size, random_jitter_offset;
@@ -608,7 +601,11 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 
 	segment->closure_flag = 0;
 	segment->numsteps = 0;
+#ifdef __KERNEL_CPU__
 	segment->steps = (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps);
+#else
+	kernel_assert(max_steps == 1);
+#endif
 
 	VolumeStep *step = segment->steps;
 
@@ -669,6 +666,10 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 		t = new_t;
 		if(t == ray->t)
 			break;
+
+		/* stop if nearly all light blocked */
+		if(accum_transmittance.x < tp_eps && accum_transmittance.y < tp_eps && accum_transmittance.z < tp_eps)
+			break;
 	}
 
 	/* store total emission and transmittance */
@@ -690,7 +691,9 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 
 ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *segment)
 {
+#ifdef __KERNEL_CPU__
 	free(segment->steps);
+#endif
 }
 
 /* scattering for homogeneous and heterogeneous volumes, using decoupled ray
@@ -701,7 +704,8 @@ ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *s
  * these also do not do emission or modify throughput. */
 ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 	KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd,
-	float3 *throughput, RNG *rng, VolumeSegment *segment)
+	float3 *throughput, float rphase, float rscatter,
+	const VolumeSegment *segment, const float3 *light_P, bool probalistic_scatter)
 {
 	int closure_flag = segment->closure_flag;
 
@@ -710,38 +714,56 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 
 	/* pick random color channel, we use the Veach one-sample
 	 * model with balance heuristic for the channels */
-	float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
 	int channel = (int)(rphase*3.0f);
 	sd->randb_closure = rphase*3.0f - channel;
+	float xi = rscatter;
 
-	float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+	/* probabilistic scattering decision based on transmittance */
+	if(probalistic_scatter) {
+		float sample_transmittance = kernel_volume_channel_get(segment->accum_transmittance, channel);
+
+		if(1.0f - xi >= sample_transmittance) {
+			/* rescale random number so we can reuse it */
+			xi = 1.0f - (1.0f - xi - sample_transmittance)/(1.0f - sample_transmittance);
+		}
+		else
+			return VOLUME_PATH_MISSED;
+	}
 
 	VolumeStep *step;
 	float3 transmittance;
 	float pdf, sample_t;
+	float mis_weight = 1.0f;
+	bool distance_sample = true;
+	bool use_mis = false;
+
+	if(kernel_data.integrator.volume_homogeneous_sampling && light_P) {
+		if(kernel_data.integrator.volume_homogeneous_sampling == 2) {
+			/* multiple importance sample: randomly pick between
+			 * equiangular and distance sampling strategy */
+			if(xi < 0.5f) {
+				xi *= 2.0f;
+			}
+			else {
+				xi = (xi - 0.5f)*2.0f;
+				distance_sample = false;
+			}
 
-	/* pick position on light for equiangular */
-	bool equiangular = (kernel_data.integrator.volume_homogeneous_sampling != 0 && kernel_data.integrator.num_all_lights);
-	float3 light_P;
-
-	if(equiangular) {
-		bool light_distant;
-
-		if(!kernel_volume_equiangular_light_position(kg, state, ray, rng, &light_P, &light_distant))
-			return VOLUME_PATH_MISSED;
-
-		/* distant light, revert to distance sampling because position is infinitely far away */
-		if(light_distant)
-			equiangular = false;
+			use_mis = true;
+		}
+		else {
+			/* only equiangular sampling */
+			distance_sample = false;
+		}
 	}
 
 	/* distance sampling */
-	if(!equiangular) {
+	if(distance_sample) {
 		/* find step in cdf */
 		step = segment->steps;
 
 		float prev_t = 0.0f;
-		float3 step_pdf = make_float3(1.0f, 1.0f, 1.0f);
+		float3 step_pdf_distance = make_float3(1.0f, 1.0f, 1.0f);
 
 		if(segment->numsteps > 1) {
 			float prev_cdf = 0.0f;
@@ -764,7 +786,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 			xi = (xi - prev_cdf)/(step_cdf - prev_cdf);
 
 			/* pdf for picking step */
-			step_pdf = step->cdf_distance - prev_cdf_distance;
+			step_pdf_distance = step->cdf_distance - prev_cdf_distance;
 		}
 
 		/* determine range in which we will sample */
@@ -773,30 +795,59 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 		/* sample distance and compute transmittance */
 		float3 distance_pdf;
 		sample_t = prev_t + kernel_volume_distance_sample(step_t, step->sigma_t, channel, xi, &transmittance, &distance_pdf);
-		pdf = average(distance_pdf * step_pdf);
+
+		/* modify pdf for hit/miss decision */
+		if(probalistic_scatter)
+			distance_pdf *= make_float3(1.0f, 1.0f, 1.0f) - segment->accum_transmittance;
+
+		pdf = average(distance_pdf * step_pdf_distance);
+
+		/* multiple importance sampling */
+		if(use_mis) {
+			float equi_pdf = kernel_volume_equiangular_pdf(ray, *light_P, sample_t);
+			mis_weight = 2.0f*power_heuristic(pdf, equi_pdf);
+		}
 	}
 	/* equi-angular sampling */
 	else {
 		/* sample distance */
-		sample_t = kernel_volume_equiangular_sample(ray, light_P, xi, &pdf);
+		sample_t = kernel_volume_equiangular_sample(ray, *light_P, xi, &pdf);
 
 		/* find step in which sampled distance is located */
 		step = segment->steps;
 
 		float prev_t = 0.0f;
+		float3 step_pdf_distance = make_float3(1.0f, 1.0f, 1.0f);
 
 		if(segment->numsteps > 1) {
 			/* todo: optimize using binary search */
+			float3 prev_cdf_distance = make_float3(0.0f, 0.0f, 0.0f);
+
 			for(int i = 0; i < segment->numsteps-1; i++, step++) {
 				if(sample_t < step->t)
 					break;
 
 				prev_t = step->t;
+				prev_cdf_distance = step->cdf_distance;
 			}
+
+			/* pdf for picking step with distance sampling */
+			step_pdf_distance = step->cdf_distance - prev_cdf_distance;
 		}
-		
+
+		/* determine range in which we will sample */
+		float step_t = step->t - prev_t;
+		float step_sample_t = sample_t - prev_t;
+
 		/* compute transmittance */
-		transmittance = volume_color_transmittance(step->sigma_t, sample_t - prev_t);
+		transmittance = volume_color_transmittance(step->sigma_t, step_sample_t);
+
+		/* multiple importance sampling */
+		if(use_mis) {
+			float3 distance_pdf3 = kernel_volume_distance_pdf(step_t, step->sigma_t, step_sample_t);
+			float distance_pdf = average(distance_pdf3 * step_pdf_distance);
+			mis_weight = 2.0f*power_heuristic(pdf, distance_pdf);
+		}
 	}
 
 	/* compute transmittance up to this step */
@@ -804,7 +855,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 		transmittance *= (step-1)->accum_transmittance;
 
 	/* modify throughput */
-	*throughput *= step->sigma_s * transmittance / pdf;
+	*throughput *= step->sigma_s * transmittance * (mis_weight / pdf);
 
 	/* evaluate shader to create closures at shading point */
 	if(segment->numsteps > 1) {
@@ -820,40 +871,28 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 	return VOLUME_PATH_SCATTERED;
 }
 
-#endif
-
-/* get the volume attenuation and emission over line segment defined by
- * ray, with the assumption that there are no surfaces blocking light
- * between the endpoints */
-ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg,
-	PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng)
+/* decide if we need to use decoupled or not */
+ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneous, bool direct)
 {
-	/* workaround to fix correlation bug in T38710, can find better solution
-	 * in random number generator later, for now this is done here to not impact
-	 * performance of rendering without volumes */
-	RNG tmp_rng = cmj_hash(*rng, state->rng_offset);
-	bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
-
-#if 0
-	/* debugging code to compare decoupled ray marching */
-	VolumeSegment segment;
-
-	shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce);
-	kernel_volume_decoupled_record(kg, state, ray, sd, &segment, heterogeneous);
-
-	VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, throughput, &tmp_rng, &segment);
-
-	kernel_volume_decoupled_free(kg, &segment);
+	/* decoupled ray marching for heterogeneous volumes is not supported on the
+	 * GPU, which also means equiangular and multiple importance sampling are
+	 * not supported for that case */
+#ifdef __KERNEL_GPU__
+	if(heterogeneous)
+		return false;
+#endif
 
-	return result;
-#else
-	shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce);
+	/* equiangular sampling only implemented for decoupled */
+	bool equiangular = kernel_data.integrator.volume_homogeneous_sampling != 0;
+	if(equiangular)
+		return true;
 
-	if(heterogeneous)
-		return kernel_volume_integrate_heterogeneous(kg, state, ray, sd, L, throughput, &tmp_rng);
+	/* for all light sampling use decoupled, reusing shader evaluations is
+	 * typically faster in that case */
+	if(direct)
+		return kernel_data.integrator.sample_all_lights_direct;
 	else
-		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, &tmp_rng);
-#endif
+		return kernel_data.integrator.sample_all_lights_indirect;
 }
 
 /* Volume Stack
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index 051ba1baf3a..ee3419b055c 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -101,7 +101,11 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	if(!transparent_shadows)
 		kintegrator->transparent_shadows = false;
 
-	kintegrator->volume_homogeneous_sampling = volume_homogeneous_sampling;
+	if(kintegrator->num_all_lights > 0)
+		kintegrator->volume_homogeneous_sampling = volume_homogeneous_sampling;
+	else
+		kintegrator->volume_homogeneous_sampling = 0;
+
 	kintegrator->volume_max_steps = volume_max_steps;
 	kintegrator->volume_step_size = volume_step_size;
 
@@ -125,8 +129,15 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	kintegrator->mesh_light_samples = mesh_light_samples;
 	kintegrator->subsurface_samples = subsurface_samples;
 	kintegrator->volume_samples = volume_samples;
-	kintegrator->sample_all_lights_direct = sample_all_lights_direct;
-	kintegrator->sample_all_lights_indirect = sample_all_lights_indirect;
+
+	if(method == BRANCHED_PATH) {
+		kintegrator->sample_all_lights_direct = sample_all_lights_direct;
+		kintegrator->sample_all_lights_indirect = sample_all_lights_indirect;
+	}
+	else {
+		kintegrator->sample_all_lights_direct = false;
+		kintegrator->sample_all_lights_indirect = false;
+	}
 
 	kintegrator->sampling_pattern = sampling_pattern;
 	kintegrator->aa_samples = aa_samples;