16 files changed, 580 insertions, 131 deletions
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index aa2da00838e..74f94d6eafc 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -220,6 +220,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 default=1,
                 )
 
+        cls.volume_samples = IntProperty(
+                name="Volume Samples",
+                description="Number of volume scattering samples to render for each AA sample",
+                min=1, max=10000,
+                default=1,
+                )
+
         cls.sampling_pattern = EnumProperty(
                 name="Sampling Pattern",
                 description="Random sampling pattern used by the integrator",
@@ -280,6 +287,12 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 min=0, max=1024,
                 default=12,
                 )
+        cls.volume_bounces = IntProperty(
+                name="Volume Bounces",
+                description="Maximum number of volumetric scattering events",
+                min=0, max=1024,
+                default=1,
+                )
 
         cls.transparent_min_bounces = IntProperty(
                 name="Transparent Min Bounces",
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 969f13017ca..d5f14473230 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -139,6 +139,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
             sub.prop(cscene, "ao_samples", text="AO")
             sub.prop(cscene, "mesh_light_samples", text="Mesh Light")
             sub.prop(cscene, "subsurface_samples", text="Subsurface")
+            sub.prop(cscene, "volume_samples", text="Volume")
 
         if cscene.feature_set == 'EXPERIMENTAL' and (device_type == 'NONE' or cscene.device == 'CPU'):
             layout.row().prop(cscene, "sampling_pattern", text="Pattern")
@@ -208,6 +209,7 @@ class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel):
         sub.prop(cscene, "diffuse_bounces", text="Diffuse")
         sub.prop(cscene, "glossy_bounces", text="Glossy")
         sub.prop(cscene, "transmission_bounces", text="Transmission")
+        sub.prop(cscene, "volume_bounces", text="Volume")
 
 
 class CyclesRender_PT_motion_blur(CyclesButtonsPanel, Panel):
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 826402c8ba6..14e9f27ee39 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -166,6 +166,7 @@ void BlenderSync::sync_integrator()
 	integrator->max_diffuse_bounce = get_int(cscene, "diffuse_bounces");
 	integrator->max_glossy_bounce = get_int(cscene, "glossy_bounces");
 	integrator->max_transmission_bounce = get_int(cscene, "transmission_bounces");
+	integrator->max_volume_bounce = get_int(cscene, "volume_bounces");
 
 	integrator->transparent_max_bounce = get_int(cscene, "transparent_max_bounces");
 	integrator->transparent_min_bounce = get_int(cscene, "transparent_min_bounces");
@@ -201,6 +202,7 @@ void BlenderSync::sync_integrator()
 	int ao_samples = get_int(cscene, "ao_samples");
 	int mesh_light_samples = get_int(cscene, "mesh_light_samples");
 	int subsurface_samples = get_int(cscene, "subsurface_samples");
+	int volume_samples = get_int(cscene, "volume_samples");
 
 	if(get_boolean(cscene, "use_square_samples")) {
 		integrator->diffuse_samples = diffuse_samples * diffuse_samples;
@@ -209,6 +211,7 @@ void BlenderSync::sync_integrator()
 		integrator->ao_samples = ao_samples * ao_samples;
 		integrator->mesh_light_samples = mesh_light_samples * mesh_light_samples;
 		integrator->subsurface_samples = subsurface_samples * subsurface_samples;
+		integrator->volume_samples = volume_samples * volume_samples;
 	} 
 	else {
 		integrator->diffuse_samples = diffuse_samples;
@@ -217,6 +220,7 @@ void BlenderSync::sync_integrator()
 		integrator->ao_samples = ao_samples;
 		integrator->mesh_light_samples = mesh_light_samples;
 		integrator->subsurface_samples = subsurface_samples;
+		integrator->volume_samples = volume_samples;
 	}
 	
 
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index b3141d1154f..8d458c552ac 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -32,6 +32,9 @@
 #ifdef __SUBSURFACE__
 #include "../closure/bssrdf.h"
 #endif
+#ifdef __VOLUME__
+#include "../closure/volume.h"
+#endif
 
 CCL_NAMESPACE_BEGIN
 
@@ -124,6 +127,9 @@ ccl_device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const Shader
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
+		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
+			label = volume_henyey_greenstein_sample(sc, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+			break;
 		default:
 			label = LABEL_NONE;
 			break;
@@ -204,6 +210,11 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
 				eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 #endif
+#ifdef __VOLUME__
+			case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
+				eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
+				break;
+#endif
 			default:
 				eval = make_float3(0.0f, 0.0f, 0.0f);
 				break;
@@ -266,6 +277,11 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
 				eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 #endif
+#ifdef __VOLUME__
+			case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
+				eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
+				break;
+#endif
 			default:
 				eval = make_float3(0.0f, 0.0f, 0.0f);
 				break;
@@ -344,6 +360,7 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
 			bsdf_hair_reflection_blur(sc, roughness);
 			break;
 #endif
+		/* todo: do we want to blur volume closures? */
 		default:
 			break;
 	}
diff --git a/intern/cycles/kernel/closure/volume.h b/intern/cycles/kernel/closure/volume.h
index 951a03bce71..058c4b8408f 100644
--- a/intern/cycles/kernel/closure/volume.h
+++ b/intern/cycles/kernel/closure/volume.h
@@ -39,7 +39,7 @@ ccl_device int volume_henyey_greenstein_setup(ShaderClosure *sc)
 	/* clamp anisotropy to avoid delta function */
 	sc->data0 = signf(sc->data0) * min(fabsf(sc->data0), 1.0f - 1e-3f);
 
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_SCATTER;
+	return SD_SCATTER|SD_PHASE_HAS_EVAL;
 }
 
 ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderClosure *sc, const float3 I, float3 omega_in, float *pdf)
@@ -103,13 +103,13 @@ ccl_device int volume_absorption_setup(ShaderClosure *sc)
 
 /* VOLUME CLOSURE */
 
-ccl_device float3 volume_eval_phase(const ShaderClosure *sc, const float3 I, float3 omega_in, float *pdf)
+ccl_device float3 volume_phase_eval(const ShaderData *sd, const ShaderClosure *sc, float3 omega_in, float *pdf)
 {
 	float3 eval;
 
 	switch(sc->type) {
 		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-			eval = volume_henyey_greenstein_eval_phase(sc, I, omega_in, pdf);
+			eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
 			break;
 		default:
 			eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -119,7 +119,7 @@ ccl_device float3 volume_eval_phase(const ShaderClosure *sc, const float3 I, flo
 	return eval;
 }
 
-ccl_device int volume_sample(const ShaderData *sd, const ShaderClosure *sc, float randu,
+ccl_device int volume_phase_sample(const ShaderData *sd, const ShaderClosure *sc, float randu,
 	float randv, float3 *eval, float3 *omega_in, differential3 *domega_in, float *pdf)
 {
 	int label;
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index f4febd7cf2c..24e9d7b8858 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -35,7 +35,7 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v
 
 		if(type == CLOSURE_BSDF_TRANSPARENT_ID)
 			eval->transparent = value;
-		else if(CLOSURE_IS_BSDF_DIFFUSE(type))
+		else if(CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_PHASE(type))
 			eval->diffuse = value;
 		else if(CLOSURE_IS_BSDF_GLOSSY(type))
 			eval->glossy = value;
@@ -55,7 +55,7 @@ ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3
 {
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
-		if(CLOSURE_IS_BSDF_DIFFUSE(type))
+		if(CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_PHASE(type))
 			eval->diffuse += value;
 		else if(CLOSURE_IS_BSDF_GLOSSY(type))
 			eval->glossy += value;
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index e0db9435239..1ec116b7635 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -103,7 +103,14 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, int
 	/* evaluate BSDF at shading point */
 	float bsdf_pdf;
 
+#ifdef __VOLUME__
+	if(sd->prim != ~0)
+		shader_bsdf_eval(kg, sd, ls.D, eval, &bsdf_pdf);
+	else
+		shader_volume_phase_eval(kg, sd, ls.D, eval, &bsdf_pdf);
+#else
 	shader_bsdf_eval(kg, sd, ls.D, eval, &bsdf_pdf);
+#endif
 
 	if(ls.shader & SHADER_USE_MIS) {
 		/* multiple importance sampling */
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index fda6b45e149..8460af85d4c 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -48,6 +48,89 @@
 
 CCL_NAMESPACE_BEGIN
 
+#ifdef __VOLUME__
+
+ccl_device_inline bool kernel_path_integrate_scatter_lighting(KernelGlobals *kg, RNG *rng,
+	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray,
+	float num_samples_adjust)
+{
+#ifdef __EMISSION__
+	if(kernel_data.integrator.use_direct_light) {
+		/* sample illumination from lights to find path contribution */
+		if(sd->flag & SD_BSDF_HAS_EVAL) {
+			float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+#ifdef __MULTI_CLOSURE__
+			float light_o = 0.0f;
+#else
+			float light_o = path_state_rng_1D(kg, rng, state, PRNG_LIGHT_F);
+#endif
+			float light_u, light_v;
+			path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+
+			Ray light_ray;
+			BsdfEval L_light;
+			bool is_lamp;
+
+#ifdef __OBJECT_MOTION__
+			light_ray.time = sd->time;
+#endif
+
+			if(direct_emission(kg, sd, -1, light_t, light_o, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce)) {
+				/* trace shadow ray */
+				float3 shadow;
+
+				if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+					/* accumulate */
+					path_radiance_accum_light(L, *throughput * num_samples_adjust, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+				}
+			}
+		}
+	}
+#endif
+
+	/* sample phase function */
+	float phase_pdf;
+	BsdfEval phase_eval;
+	float3 phase_omega_in;
+	differential3 phase_domega_in;
+	float phase_u, phase_v;
+	path_state_rng_2D(kg, rng, state, PRNG_PHASE_U, &phase_u, &phase_v);
+	int label;
+
+	label = shader_volume_phase_sample(kg, sd, phase_u, phase_v, &phase_eval,
+		&phase_omega_in, &phase_domega_in, &phase_pdf);
+
+	if(phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval))
+		return false;
+	
+	/* modify throughput */
+	path_radiance_bsdf_bounce(L, throughput, &phase_eval, phase_pdf, state->bounce, label);
+
+	/* set labels */
+	state->ray_pdf = phase_pdf;
+#ifdef __LAMP_MIS__
+	state->ray_t = 0.0f;
+#endif
+	state->min_ray_pdf = fminf(phase_pdf, state->min_ray_pdf);
+
+	/* update path state */
+	path_state_next(kg, state, label);
+
+	/* setup ray */
+	ray->P = sd->P;
+	ray->D = phase_omega_in;
+	ray->t = FLT_MAX;
+
+#ifdef __RAY_DIFFERENTIALS__
+	ray->dP = sd->dP;
+	ray->dD = phase_domega_in;
+#endif
+
+	return true;
+}
+
+#endif
+
 #if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__)
 
 ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, ccl_global float *buffer,
@@ -91,7 +174,17 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, ccl_g
 		if(state.volume_stack[0].shader != SHADER_NO_ID) {
 			Ray volume_ray = ray;
 			volume_ray.t = (hit)? isect.t: FLT_MAX;
-			kernel_volume_integrate(kg, &state, &volume_ray, L, &throughput);
+
+			ShaderData volume_sd;
+			VolumeIntegrateResult result = kernel_volume_integrate(kg, &state,
+				&volume_sd, &volume_ray, L, &throughput, rng);
+
+			if(result == VOLUME_PATH_SCATTERED) {
+				if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &throughput, &state, L, &ray, 1.0f))
+					continue;
+				else
+					break;
+			}
 		}
 #endif
 
@@ -498,7 +591,17 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 		if(state.volume_stack[0].shader != SHADER_NO_ID) {
 			Ray volume_ray = ray;
 			volume_ray.t = (hit)? isect.t: FLT_MAX;
-			kernel_volume_integrate(kg, &state, &volume_ray, &L, &throughput);
+
+			ShaderData volume_sd;
+			VolumeIntegrateResult result = kernel_volume_integrate(kg, &state,
+				&volume_sd, &volume_ray, &L, &throughput, rng);
+
+			if(result == VOLUME_PATH_SCATTERED) {
+				if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &throughput, &state, &L, &ray, 1.0f))
+					continue;
+				else
+					break;
+			}
 		}
 #endif
 
@@ -934,7 +1037,7 @@ ccl_device_noinline void kernel_branched_path_integrate_lighting(KernelGlobals *
 				kernel_volume_stack_enter_exit(kg, sd, ps.volume_stack);
 #endif
 
-			/* update RNG state */
+			/* branch RNG state */
 			path_state_branch(&ps, j, num_samples);
 
 			/* set MIS state */
@@ -996,7 +1099,42 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 		if(state.volume_stack[0].shader != SHADER_NO_ID) {
 			Ray volume_ray = ray;
 			volume_ray.t = (hit)? isect.t: FLT_MAX;
-			kernel_volume_integrate(kg, &state, &volume_ray, &L, &throughput);
+
+			int num_samples = kernel_data.integrator.volume_samples;
+			float num_samples_inv = 1.0f/num_samples;
+			float3 avg_tp = make_float3(0.0f, 0.0f, 0.0f);
+
+			/* todo: we should cache the shader evaluations from stepping
+			 * through the volume, for now we redo them multiple times */
+
+			for(int j = 0; j < num_samples; j++) {
+				PathState ps = state;
+				Ray pray = ray;
+				ShaderData volume_sd;
+				float3 tp = throughput;
+
+				/* branch RNG state */
+				path_state_branch(&ps, j, num_samples);
+
+				VolumeIntegrateResult result = kernel_volume_integrate(kg, &ps,
+					&volume_sd, &volume_ray, &L, &tp, rng);
+				
+				if(result == VOLUME_PATH_SCATTERED) {
+					/* todo: use all-light sampling */
+					if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) {
+						kernel_path_indirect(kg, rng, pray, buffer, tp*num_samples_inv, num_samples, ps, &L);
+
+						/* for render passes, sum and reset indirect light pass variables
+						 * for the next samples */
+						path_radiance_sum_indirect(&L);
+						path_radiance_reset_indirect(&L);
+					}
+				}
+				else
+					avg_tp += tp;
+			}
+
+			throughput = avg_tp * num_samples_inv;
 		}
 #endif
 
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index fe4d38b10d6..5def19f2c42 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -42,6 +42,8 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG
 
 #ifdef __VOLUME__
 	if(kernel_data.integrator.use_volumes) {
+		state->volume_bounce = 0;
+
 		/* initialize volume stack with volume we are inside of */
 		kernel_volume_stack_init(kg, state->volume_stack);
 		/* seed RNG for cases where we can't use stratified samples */
@@ -61,6 +63,9 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, PathState *state, int
 		state->flag |= PATH_RAY_TRANSPARENT;
 		state->transparent_bounce++;
 
+		/* random number generator next bounce */
+		state->rng_offset += PRNG_BOUNCE_NUM;
+
 		if(!kernel_data.integrator.transparent_shadows)
 			state->flag |= PATH_RAY_MIS_SKIP;
 
@@ -69,39 +74,51 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, PathState *state, int
 
 	state->bounce++;
 
-	/* reflection/transmission */
-	if(label & LABEL_REFLECT) {
-		state->flag |= PATH_RAY_REFLECT;
-		state->flag &= ~(PATH_RAY_TRANSMIT|PATH_RAY_CAMERA|PATH_RAY_TRANSPARENT);
+#ifdef __VOLUME__
+	if(label & LABEL_VOLUME_SCATTER) {
+		/* volume scatter */
+		state->flag |= PATH_RAY_VOLUME_SCATTER;
+		state->flag &= ~(PATH_RAY_REFLECT|PATH_RAY_TRANSMIT|PATH_RAY_CAMERA|PATH_RAY_TRANSPARENT|PATH_RAY_DIFFUSE|PATH_RAY_GLOSSY|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP);
 
-		if(label & LABEL_DIFFUSE)
-			state->diffuse_bounce++;
-		else
-			state->glossy_bounce++;
+		state->volume_bounce++;
 	}
-	else {
-		kernel_assert(label & LABEL_TRANSMIT);
+	else
+#endif
+	{
+		/* surface reflection/transmission */
+		if(label & LABEL_REFLECT) {
+			state->flag |= PATH_RAY_REFLECT;
+			state->flag &= ~(PATH_RAY_TRANSMIT|PATH_RAY_VOLUME_SCATTER|PATH_RAY_CAMERA|PATH_RAY_TRANSPARENT);
+
+			if(label & LABEL_DIFFUSE)
+				state->diffuse_bounce++;
+			else
+				state->glossy_bounce++;
+		}
+		else {
+			kernel_assert(label & LABEL_TRANSMIT);
 
-		state->flag |= PATH_RAY_TRANSMIT;
-		state->flag &= ~(PATH_RAY_REFLECT|PATH_RAY_CAMERA|PATH_RAY_TRANSPARENT);
+			state->flag |= PATH_RAY_TRANSMIT;
+			state->flag &= ~(PATH_RAY_REFLECT|PATH_RAY_VOLUME_SCATTER|PATH_RAY_CAMERA|PATH_RAY_TRANSPARENT);
 
-		state->transmission_bounce++;
-	}
+			state->transmission_bounce++;
+		}
 
-	/* diffuse/glossy/singular */
-	if(label & LABEL_DIFFUSE) {
-		state->flag |= PATH_RAY_DIFFUSE|PATH_RAY_DIFFUSE_ANCESTOR;
-		state->flag &= ~(PATH_RAY_GLOSSY|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP);
-	}
-	else if(label & LABEL_GLOSSY) {
-		state->flag |= PATH_RAY_GLOSSY|PATH_RAY_GLOSSY_ANCESTOR;
-		state->flag &= ~(PATH_RAY_DIFFUSE|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP);
-	}
-	else {
-		kernel_assert(label & LABEL_SINGULAR);
+		/* diffuse/glossy/singular */
+		if(label & LABEL_DIFFUSE) {
+			state->flag |= PATH_RAY_DIFFUSE|PATH_RAY_DIFFUSE_ANCESTOR;
+			state->flag &= ~(PATH_RAY_GLOSSY|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP);
+		}
+		else if(label & LABEL_GLOSSY) {
+			state->flag |= PATH_RAY_GLOSSY|PATH_RAY_GLOSSY_ANCESTOR;
+			state->flag &= ~(PATH_RAY_DIFFUSE|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP);
+		}
+		else {
+			kernel_assert(label & LABEL_SINGULAR);
 
-		state->flag |= PATH_RAY_GLOSSY|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP;
-		state->flag &= ~PATH_RAY_DIFFUSE;
+			state->flag |= PATH_RAY_GLOSSY|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP;
+			state->flag &= ~PATH_RAY_DIFFUSE;
+		}
 	}
 
 	/* random number generator next bounce */
@@ -136,7 +153,8 @@ ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, Path
 		if((state->bounce >= kernel_data.integrator.max_bounce) ||
 		   (state->diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) ||
 		   (state->glossy_bounce >= kernel_data.integrator.max_glossy_bounce) ||
-		   (state->transmission_bounce >= kernel_data.integrator.max_transmission_bounce))
+		   (state->transmission_bounce >= kernel_data.integrator.max_transmission_bounce) ||
+		   (state->volume_bounce >= kernel_data.integrator.max_volume_bounce))
 		{
 			return 0.0f;
 		}
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 7b9a4ab12b1..35844dc4f65 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -27,7 +27,6 @@
 #include "closure/bsdf_util.h"
 #include "closure/bsdf.h"
 #include "closure/emissive.h"
-#include "closure/volume.h"
 
 #include "svm/svm.h"
 
@@ -958,37 +957,120 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int
 /* Volume */
 
 #ifdef __VOLUME__
-ccl_device float3 shader_volume_eval_phase(KernelGlobals *kg, ShaderData *sd,
-	float3 omega_in, float3 omega_out)
-{
-#ifdef __MULTI_CLOSURE__
-	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
-	float pdf;
 
+ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, const float3 omega_in, float *pdf,
+	int skip_phase, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight)
+{
 	for(int i = 0; i< sd->num_closure; i++) {
+		if(i == skip_phase)
+			continue;
+
 		const ShaderClosure *sc = &sd->closure[i];
 
-		if(CLOSURE_IS_VOLUME(sc->type))
-			eval += volume_eval_phase(sc, omega_in, omega_out, &pdf);
+		if(CLOSURE_IS_PHASE(sc->type)) {
+			float phase_pdf = 0.0f;
+			float3 eval = volume_phase_eval(sd, sc, omega_in, &phase_pdf);
+
+			if(phase_pdf != 0.0f) {
+				bsdf_eval_accum(result_eval, sc->type, eval);
+				sum_pdf += phase_pdf;
+			}
+
+			sum_sample_weight += sc->sample_weight;
+		}
 	}
 
-	return eval;
-#else
-	float pdf;
-	return volume_eval_phase(&sd->closure, omega_in, omega_out, &pdf);
-#endif
+	*pdf = (sum_sample_weight > 0.0f)? sum_pdf/sum_sample_weight: 0.0f;
+}
+
+ccl_device void shader_volume_phase_eval(KernelGlobals *kg, const ShaderData *sd,
+	const float3 omega_in, BsdfEval *eval, float *pdf)
+{
+	bsdf_eval_init(eval, NBUILTIN_CLOSURES, make_float3(0.0f, 0.0f, 0.0f), kernel_data.film.use_light_pass);
+
+	_shader_volume_phase_multi_eval(sd, omega_in, pdf, -1, eval, 0.0f, 0.0f);
 }
 
+ccl_device int shader_volume_phase_sample(KernelGlobals *kg, const ShaderData *sd,
+	float randu, float randv, BsdfEval *phase_eval,
+	float3 *omega_in, differential3 *domega_in, float *pdf)
+{
+	int sampled = 0;
+
+	if(sd->num_closure > 1) {
+		/* pick a phase closure based on sample weights */
+		float sum = 0.0f;
+
+		for(sampled = 0; sampled < sd->num_closure; sampled++) {
+			const ShaderClosure *sc = &sd->closure[sampled];
+			
+			if(CLOSURE_IS_PHASE(sc->type))
+				sum += sc->sample_weight;
+		}
+
+		float r = sd->randb_closure*sum;
+		sum = 0.0f;
+
+		for(sampled = 0; sampled < sd->num_closure; sampled++) {
+			const ShaderClosure *sc = &sd->closure[sampled];
+			
+			if(CLOSURE_IS_PHASE(sc->type)) {
+				sum += sc->sample_weight;
+
+				if(r <= sum)
+					break;
+			}
+		}
+
+		if(sampled == sd->num_closure) {
+			*pdf = 0.0f;
+			return LABEL_NONE;
+		}
+	}
+
+	/* todo: this isn't quite correct, we don't weight anisotropy properly
+	 * depending on color channels, even if this is perhaps not a common case */
+	const ShaderClosure *sc = &sd->closure[sampled];
+	int label;
+	float3 eval;
+
+	*pdf = 0.0f;
+	label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
+
+	if(*pdf != 0.0f) {
+		bsdf_eval_init(phase_eval, sc->type, eval, kernel_data.film.use_light_pass);
+	}
+
+	return label;
+}
+
+ccl_device int shader_phase_sample_closure(KernelGlobals *kg, const ShaderData *sd,
+	const ShaderClosure *sc, float randu, float randv, BsdfEval *phase_eval,
+	float3 *omega_in, differential3 *domega_in, float *pdf)
+{
+	int label;
+	float3 eval;
+
+	*pdf = 0.0f;
+	label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
+
+	if(*pdf != 0.0f)
+		bsdf_eval_init(phase_eval, sc->type, eval, kernel_data.film.use_light_pass);
+
+	return label;
+}
+
+#endif
+
 /* Volume Evaluation */
 
 ccl_device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd,
-	VolumeStack *stack, float randb, int path_flag, ShaderContext ctx)
+	VolumeStack *stack, int path_flag, ShaderContext ctx)
 {
 	/* reset closures once at the start, we will be accumulating the closures
 	 * for all volumes in the stack into a single array of closures */
 #ifdef __MULTI_CLOSURE__
 	sd->num_closure = 0;
-	sd->randb_closure = randb;
 #else
 	sd->closure.type = NBUILTIN_CLOSURES;
 #endif
@@ -1022,7 +1104,7 @@ ccl_device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd,
 		else
 #endif
 		{
-			svm_eval_nodes(kg, sd, SHADER_TYPE_VOLUME, randb, path_flag);
+			svm_eval_nodes(kg, sd, SHADER_TYPE_VOLUME, 0.0f, path_flag);
 		}
 #endif
 
@@ -1031,7 +1113,6 @@ ccl_device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd,
 			shader_merge_closures(sd);
 	}
 }
-#endif
 
 /* Displacement Evaluation */
 
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index faed1e7a71e..77ecc2b8dd8 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -190,7 +190,16 @@ enum PathTraceDimension {
 	PRNG_LIGHT_V = 5,
 	PRNG_LIGHT_F = 6,
 	PRNG_TERMINATE = 7,
-	PRNG_BOUNCE_NUM = 8
+
+#ifdef __VOLUME__
+	PRNG_PHASE_U = 8,
+	PRNG_PHASE_V = 9,
+	PRNG_PHASE = 10,
+	PRNG_SCATTER_DISTANCE = 11,
+	PRNG_BOUNCE_NUM = 12,
+#else
+	PRNG_BOUNCE_NUM = 8,
+#endif
 };
 
 enum SamplingPattern {
@@ -503,11 +512,12 @@ enum ShaderDataFlag {
 	SD_EMISSION = 2,		/* have emissive closure? */
 	SD_BSDF = 4,			/* have bsdf closure? */
 	SD_BSDF_HAS_EVAL = 8,	/* have non-singular bsdf closure? */
+	SD_PHASE_HAS_EVAL = 8,	/* have non-singular phase closure? */
 	SD_BSDF_GLOSSY = 16,	/* have glossy bsdf */
 	SD_BSSRDF = 32,			/* have bssrdf */
 	SD_HOLDOUT = 64,		/* have holdout closure? */
 	SD_ABSORPTION = 128,	/* have volume absorption closure? */
-	SD_SCATTER = 256,		/* have volume scattering closure? */
+	SD_SCATTER = 256,		/* have volume phase closure? */
 	SD_AO = 512,			/* have ao closure? */
 
 	SD_CLOSURE_FLAGS = (SD_EMISSION|SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY|SD_BSSRDF|SD_HOLDOUT|SD_ABSORPTION|SD_SCATTER|SD_AO),
@@ -646,6 +656,7 @@ typedef struct PathState {
 
 	/* volume rendering */
 #ifdef __VOLUME__
+	int volume_bounce;
 	RNG rng_congruential;
 	VolumeStack volume_stack[VOLUME_STACK_SIZE];
 #endif
@@ -791,6 +802,7 @@ typedef struct KernelIntegrator {
 	int max_diffuse_bounce;
 	int max_glossy_bounce;
 	int max_transmission_bounce;
+	int max_volume_bounce;
 
 	/* transparent */
 	int transparent_min_bounce;
@@ -830,7 +842,7 @@ typedef struct KernelIntegrator {
 	int use_volumes;
 	int volume_max_steps;
 	float volume_step_size;
-	int pad1, pad2;
+	int volume_samples;
 } KernelIntegrator;
 
 typedef struct KernelBVH {
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index 65333190350..dc2ddf1098e 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -17,10 +17,9 @@
 CCL_NAMESPACE_BEGIN
 
 typedef enum VolumeIntegrateResult {
-	VOLUME_PATH_TERMINATED = 0,
-	VOLUME_PATH_SCATTERED = 1,
-	VOLUME_PATH_ATTENUATED = 2,
-	VOLUME_PATH_MISSED = 3
+	VOLUME_PATH_SCATTERED = 0,
+	VOLUME_PATH_ATTENUATED = 1,
+	VOLUME_PATH_MISSED = 2
 } VolumeIntegrateResult;
 
 /* Volume shader properties
@@ -35,10 +34,10 @@ typedef struct VolumeShaderCoefficients {
 } VolumeShaderCoefficients;
 
 /* evaluate shader to get extinction coefficient at P */
-ccl_device bool volume_shader_extinction_sample(KernelGlobals *kg, ShaderData *sd, VolumeStack *stack, int path_flag, ShaderContext ctx, float3 P, float3 *extinction)
+ccl_device bool volume_shader_extinction_sample(KernelGlobals *kg, ShaderData *sd, PathState *state, float3 P, float3 *extinction)
 {
 	sd->P = P;
-	shader_eval_volume(kg, sd, stack, 0.0f, path_flag, ctx);
+	shader_eval_volume(kg, sd, state->volume_stack, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
 
 	if(!(sd->flag & (SD_ABSORPTION|SD_SCATTER)))
 		return false;
@@ -57,27 +56,37 @@ ccl_device bool volume_shader_extinction_sample(KernelGlobals *kg, ShaderData *s
 }
 
 /* evaluate shader to get absorption, scattering and emission at P */
-ccl_device bool volume_shader_sample(KernelGlobals *kg, ShaderData *sd, VolumeStack *stack, int path_flag, ShaderContext ctx, float3 P, VolumeShaderCoefficients *sample)
+ccl_device bool volume_shader_sample(KernelGlobals *kg, ShaderData *sd, PathState *state, float3 P, VolumeShaderCoefficients *coeff)
 {
 	sd->P = P;
-	shader_eval_volume(kg, sd, stack, 0.0f, path_flag, ctx);
+	shader_eval_volume(kg, sd, state->volume_stack, state->flag, SHADER_CONTEXT_VOLUME);
 
 	if(!(sd->flag & (SD_ABSORPTION|SD_SCATTER|SD_EMISSION)))
 		return false;
-
-	sample->sigma_a = make_float3(0.0f, 0.0f, 0.0f);
-	sample->sigma_s = make_float3(0.0f, 0.0f, 0.0f);
-	sample->emission = make_float3(0.0f, 0.0f, 0.0f);
+	
+	coeff->sigma_a = make_float3(0.0f, 0.0f, 0.0f);
+	coeff->sigma_s = make_float3(0.0f, 0.0f, 0.0f);
+	coeff->emission = make_float3(0.0f, 0.0f, 0.0f);
 
 	for(int i = 0; i < sd->num_closure; i++) {
 		const ShaderClosure *sc = &sd->closure[i];
 
 		if(sc->type == CLOSURE_VOLUME_ABSORPTION_ID)
-			sample->sigma_a += sc->weight;
+			coeff->sigma_a += sc->weight;
 		else if(sc->type == CLOSURE_EMISSION_ID)
-			sample->emission += sc->weight;
+			coeff->emission += sc->weight;
 		else if(CLOSURE_IS_VOLUME(sc->type))
-			sample->sigma_s += sc->weight;
+			coeff->sigma_s += sc->weight;
+	}
+
+	/* when at the max number of bounces, treat scattering as absorption */
+	if(sd->flag & SD_SCATTER) {
+		if(state->volume_bounce >= kernel_data.integrator.max_volume_bounce) {
+			coeff->sigma_a += coeff->sigma_s;
+			coeff->sigma_s = make_float3(0.0f, 0.0f, 0.0f);
+			sd->flag &= ~SD_SCATTER;
+			sd->flag |= SD_ABSORPTION;
+		}
 	}
 
 	return true;
@@ -100,7 +109,7 @@ ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, VolumeStack *st
 	return false;
 }
 
-/* Volumetric Shadows
+/* Volume Shadows
  *
  * These functions are used to attenuate shadow rays to lights. Both absorption
  * and scattering will block light, represented by the extinction coefficient. */
@@ -109,11 +118,9 @@ ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, VolumeStack *st
  * the extinction coefficient for the entire line segment */
 ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput)
 {
-	ShaderContext ctx = SHADER_CONTEXT_SHADOW;
-	int path_flag = PATH_RAY_SHADOW;
 	float3 sigma_t;
 
-	if(volume_shader_extinction_sample(kg, sd, state->volume_stack, path_flag, ctx, ray->P, &sigma_t))
+	if(volume_shader_extinction_sample(kg, sd, state, ray->P, &sigma_t))
 		*throughput *= volume_color_attenuation(sigma_t, ray->t);
 }
 
@@ -121,8 +128,6 @@ ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *s
  * reach the end, get absorbed entirely, or run out of iterations */
 ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput)
 {
-	ShaderContext ctx = SHADER_CONTEXT_SHADOW;
-	int path_flag = PATH_RAY_SHADOW;
 	float3 tp = *throughput;
 	const float tp_eps = 1e-10f; /* todo: this is likely not the right value */
 
@@ -136,7 +141,7 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 	float3 P = ray->P;
 	float3 sigma_t;
 
-	if(!volume_shader_extinction_sample(kg, sd, state->volume_stack, path_flag, ctx, P, &sigma_t))
+	if(!volume_shader_extinction_sample(kg, sd, state, P, &sigma_t))
 		sigma_t = make_float3(0.0f, 0.0f, 0.0f);
 
 	for(int i = 0; i < max_steps; i++) {
@@ -146,7 +151,7 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 		float3 new_sigma_t;
 
 		/* compute attenuation over segment */
-		if(volume_shader_extinction_sample(kg, sd, state->volume_stack, path_flag, ctx, new_P, &new_sigma_t)) {
+		if(volume_shader_extinction_sample(kg, sd, state, new_P, &new_sigma_t)) {
 			/* todo: we could avoid computing expf() for each step by summing,
 			 * because exp(a)*exp(b) = exp(a+b), but we still want a quick
 			 * tp_eps check too */
@@ -174,7 +179,7 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 
 /* get the volume attenuation over line segment defined by ray, with the
  * assumption that there are no surfaces blocking light between the endpoints */
-ccl_device void kernel_volume_shadow(KernelGlobals *kg, PathState *state, Ray *ray, float3 *throughput)
+ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg, PathState *state, Ray *ray, float3 *throughput)
 {
 	ShaderData sd;
 	shader_setup_from_volume(kg, &sd, ray, state->bounce);
@@ -185,46 +190,81 @@ ccl_device void kernel_volume_shadow(KernelGlobals *kg, PathState *state, Ray *r
 		kernel_volume_shadow_homogeneous(kg, state, ray, &sd, throughput);
 }
 
-/* Volumetric Path */
+/* Volume Path */
 
 /* homogenous volume: assume shader evaluation at the starts gives
  * the volume shading coefficient for the entire line segment */
-ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput)
+ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGlobals *kg,
+	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput,
+	RNG *rng)
 {
-	ShaderContext ctx = SHADER_CONTEXT_VOLUME;
-	int path_flag = PATH_RAY_SHADOW;
 	VolumeShaderCoefficients coeff;
 
-	if(!volume_shader_sample(kg, sd, state->volume_stack, path_flag, ctx, ray->P, &coeff))
+	if(!volume_shader_sample(kg, sd, state, ray->P, &coeff))
 		return VOLUME_PATH_MISSED;
 
-	/* todo: in principle the SD_EMISSION, SD_ABSORPTION and SD_SCATTER flags
-	 * should ensure that one of the components is > 0 and so no division by
-	 * zero occurs, however this needs to be double-checked and tested */
-	
 	int closure_flag = sd->flag;
 	float t = ray->t;
+	float3 new_tp;
+	float3 transmittance;
+
+	/* randomly scatter, and if we do t is shortened */
+	if(closure_flag & SD_SCATTER) {
+		float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
+
+		/* set up variables for sampling */
+		float rphase = path_state_rng_1D(kg, rng, state, PRNG_PHASE);
+		int channel = (int)(rphase*3.0f);
+		sd->randb_closure = rphase*3.0f - channel;
+
+		/* pick random color channel, we use the Veach one-sample
+		 * model with balance heuristic for the channels */
+		float sample_sigma_t;
+
+		if(channel == 0)
+			sample_sigma_t = sigma_t.x;
+		else if(channel == 1)
+			sample_sigma_t = sigma_t.y;
+		else
+			sample_sigma_t = sigma_t.z;
+
+		/* xi is [0, 1[ so log(0) should never happen, division by zero is
+		 * avoided because sample_sigma_t > 0 when SD_SCATTER is set */
+		float xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE);
+		float sample_t = min(t, -logf(1.0f - xi)/sample_sigma_t);
 
-	/* compute attenuation from absorption */
-	float3 attenuation;
+		transmittance = volume_color_attenuation(sigma_t, sample_t);
 
-	if(closure_flag & SD_ABSORPTION)
-		attenuation = volume_color_attenuation(coeff.sigma_a, t);
+		if(sample_t < t) {
+			float pdf = dot(sigma_t, transmittance);
+			new_tp = *throughput * coeff.sigma_s * transmittance * (3.0f / pdf);
+			t = sample_t;
+		}
+		else {
+			float pdf = (transmittance.x + transmittance.y + transmittance.z);
+			new_tp = *throughput * transmittance * (3.0f / pdf);
+		}
+	}
+	else if(closure_flag & SD_ABSORPTION) {
+		/* absorption only, no sampling needed */
+		transmittance = volume_color_attenuation(coeff.sigma_a, t);
+		new_tp = *throughput * transmittance;
+	}
 
-	/* integrate emission attenuated by absorption
-	 * integral E * exp(-sigma_a * t) from 0 to t = E * (1 - exp(-sigma_a * t))/sigma_a
-	 * this goes to E * t as sigma_a goes to zero
+	/* integrate emission attenuated by extinction
+	 * integral E * exp(-sigma_t * t) from 0 to t = E * (1 - exp(-sigma_t * t))/sigma_t
+	 * this goes to E * t as sigma_t goes to zero
 	 *
-	 * todo: we should use an epsilon to avoid precision issues near zero sigma_a */
+	 * todo: we should use an epsilon to avoid precision issues near zero sigma_t */
 	if(closure_flag & SD_EMISSION) {
 		float3 emission = coeff.emission;
 
 		if(closure_flag & SD_ABSORPTION) {
-			float3 sigma_a = coeff.sigma_a;
+			float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
 
-			emission.x *= (sigma_a.x > 0.0f)? (1.0f - attenuation.x)/sigma_a.x: t;
-			emission.y *= (sigma_a.y > 0.0f)? (1.0f - attenuation.y)/sigma_a.y: t;
-			emission.z *= (sigma_a.z > 0.0f)? (1.0f - attenuation.z)/sigma_a.z: t;
+			emission.x *= (sigma_t.x > 0.0f)? (1.0f - transmittance.x)/sigma_t.x: t;
+			emission.y *= (sigma_t.y > 0.0f)? (1.0f - transmittance.y)/sigma_t.y: t;
+			emission.z *= (sigma_t.z > 0.0f)? (1.0f - transmittance.z)/sigma_t.z: t;
 		}
 		else
 			emission *= t;
@@ -233,18 +273,26 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 	}
 
 	/* modify throughput */
-	if(closure_flag & SD_ABSORPTION)
-		*throughput *= attenuation;
+	if(closure_flag & (SD_ABSORPTION|SD_SCATTER)) {
+		*throughput = new_tp;
+
+		/* prepare to scatter to new direction */
+		if(t < ray->t) {
+			/* adjust throughput and move to new location */
+			sd->P = ray->P + t*ray->D;
+
+			return VOLUME_PATH_SCATTERED;
+		}
+	}
 
 	return VOLUME_PATH_ATTENUATED;
 }
 
 /* heterogeneous volume: integrate stepping through the volume until we
  * reach the end, get absorbed entirely, or run out of iterations */
-ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput)
+ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlobals *kg,
+	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, RNG *rng)
 {
-	ShaderContext ctx = SHADER_CONTEXT_VOLUME;
-	int path_flag = PATH_RAY_SHADOW;
 	VolumeShaderCoefficients coeff;
 	float3 tp = *throughput;
 	const float tp_eps = 1e-10f; /* todo: this is likely not the right value */
@@ -258,46 +306,125 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 	float t = 0.0f;
 	float3 P = ray->P;
 
-	if(!volume_shader_sample(kg, sd, state->volume_stack, path_flag, ctx, P, &coeff)) {
+	if(!volume_shader_sample(kg, sd, state, P, &coeff)) {
 		coeff.sigma_a = make_float3(0.0f, 0.0f, 0.0f);
 		coeff.sigma_s = make_float3(0.0f, 0.0f, 0.0f);
 		coeff.emission = make_float3(0.0f, 0.0f, 0.0f);
 	}
 
+	/* accumulate these values so we can use a single stratified number to sample */
+	float3 accum_transmittance = make_float3(1.0f, 1.0f, 1.0f);
+	float3 accum_sigma_t = make_float3(0.0f, 0.0f, 0.0f);
+	float3 accum_sigma_s = make_float3(0.0f, 0.0f, 0.0f);
+
+	/* cache some constant variables */
+	float nlogxi;
+	int channel = -1;
+	bool has_scatter = false;
+
 	for(int i = 0; i < max_steps; i++) {
 		/* advance to new position */
 		float new_t = min(ray->t, t + random_jitter_offset + i * step);
 		float3 new_P = ray->P + ray->D * new_t;
 		VolumeShaderCoefficients new_coeff;
 
-		/* compute attenuation over segment */
-		if(volume_shader_sample(kg, sd, state->volume_stack, path_flag, ctx, new_P, &new_coeff)) {
+		/* compute segment */
+		if(volume_shader_sample(kg, sd, state, new_P, &new_coeff)) {
 			int closure_flag = sd->flag;
 			float dt = new_t - t;
+			float3 new_tp;
+			float3 transmittance;
+			bool scatter = false;
+
+			/* randomly scatter, and if we do dt and new_t are shortened */
+			if((closure_flag & SD_SCATTER) || (has_scatter && (closure_flag & SD_ABSORPTION))) {
+				has_scatter = true;
+
+				/* average sigma_t and sigma_s over segment */
+				float3 last_sigma_t = coeff.sigma_a + coeff.sigma_s;
+				float3 new_sigma_t = new_coeff.sigma_a + new_coeff.sigma_s;
+				float3 sigma_t = 0.5f*(last_sigma_t + new_sigma_t);
+				float3 sigma_s = 0.5f*(coeff.sigma_s + new_coeff.sigma_s);
+
+				/* lazily set up variables for sampling */
+				if(channel == -1) {
+					float xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE);
+					nlogxi = -logf(1.0f - xi);
+
+					float rphase = path_state_rng_1D(kg, rng, state, PRNG_PHASE);
+					channel = (int)(rphase*3.0f);
+					sd->randb_closure = rphase*3.0f - channel;
+				}
 
-			/* compute attenuation from absorption */
-			float3 attenuation, sigma_a;
+				/* pick random color channel, we use the Veach one-sample
+				 * model with balance heuristic for the channels */
+				float sample_sigma_t;
+
+				if(channel == 0)
+					sample_sigma_t = accum_sigma_t.x + dt*sigma_t.x;
+				else if(channel == 1)
+					sample_sigma_t = accum_sigma_t.y + dt*sigma_t.y;
+				else
+					sample_sigma_t = accum_sigma_t.z + dt*sigma_t.z;
+
+				if(nlogxi < sample_sigma_t) {
+					/* compute sampling distance */
+					sample_sigma_t /= new_t;
+					new_t = nlogxi/sample_sigma_t;
+					dt = new_t - t;
+
+					transmittance = volume_color_attenuation(sigma_t, dt);
+
+					accum_transmittance *= transmittance;
+					accum_sigma_t = (accum_sigma_t + dt*sigma_t)/new_t;
+					accum_sigma_s = (accum_sigma_s + dt*sigma_s)/new_t;
+
+					/* todo: it's not clear to me that this is correct if we move
+					 * through a color volumed, needs verification */
+					float pdf = dot(accum_sigma_t, accum_transmittance);
+					new_tp = tp * accum_sigma_s * transmittance * (3.0f / pdf);
+
+					scatter = true;
+				}
+				else {
+					transmittance = volume_color_attenuation(sigma_t, dt);
+
+					accum_transmittance *= transmittance;
+					accum_sigma_t += dt*sigma_t;
+					accum_sigma_s += dt*sigma_s;
+
+					new_tp = tp * transmittance;
+				}
+			}
+			else if(closure_flag & SD_ABSORPTION) {
+				/* absorption only, no sampling needed */
+				float3 sigma_a = 0.5f*(coeff.sigma_a + new_coeff.sigma_a);
+				transmittance = volume_color_attenuation(sigma_a, dt);
+
+				accum_transmittance *= transmittance;
+				accum_sigma_t += dt*sigma_a;
+
+				new_tp = tp * transmittance;
 
-			if(closure_flag & SD_ABSORPTION) {
 				/* todo: we could avoid computing expf() for each step by summing,
 				 * because exp(a)*exp(b) = exp(a+b), but we still want a quick
 				 * tp_eps check too */
-				sigma_a = 0.5f*(coeff.sigma_a + new_coeff.sigma_a);
-				attenuation = volume_color_attenuation(sigma_a, dt);
 			}
 
 			/* integrate emission attenuated by absorption 
-			 * integral E * exp(-sigma_a * t) from 0 to t = E * (1 - exp(-sigma_a * t))/sigma_a
-			 * this goes to E * t as sigma_a goes to zero
+			 * integral E * exp(-sigma_t * t) from 0 to t = E * (1 - exp(-sigma_t * t))/sigma_t
+			 * this goes to E * t as sigma_t goes to zero
 			 *
-			 * todo: we should use an epsilon to avoid precision issues near zero sigma_a */
+			 * todo: we should use an epsilon to avoid precision issues near zero sigma_t */
 			if(closure_flag & SD_EMISSION) {
 				float3 emission = 0.5f*(coeff.emission + new_coeff.emission);
 
 				if(closure_flag & SD_ABSORPTION) {
-					emission.x *= (sigma_a.x > 0.0f)? (1.0f - attenuation.x)/sigma_a.x: dt;
-					emission.y *= (sigma_a.y > 0.0f)? (1.0f - attenuation.y)/sigma_a.y: dt;
-					emission.z *= (sigma_a.z > 0.0f)? (1.0f - attenuation.z)/sigma_a.z: dt;
+					float3 sigma_t = 0.5f*(coeff.sigma_a + coeff.sigma_s + new_coeff.sigma_a + new_coeff.sigma_s);
+
+					emission.x *= (sigma_t.x > 0.0f)? (1.0f - transmittance.x)/sigma_t.x: dt;
+					emission.y *= (sigma_t.y > 0.0f)? (1.0f - transmittance.y)/sigma_t.y: dt;
+					emission.z *= (sigma_t.z > 0.0f)? (1.0f - transmittance.z)/sigma_t.z: dt;
 				}
 				else
 					emission *= dt;
@@ -306,14 +433,23 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 			}
 
 			/* modify throughput */
-			if(closure_flag & SD_ABSORPTION) {
-				tp *= attenuation;
+			if(closure_flag & (SD_ABSORPTION|SD_SCATTER)) {
+				tp = new_tp;
 
 				/* stop if nearly all light blocked */
 				if(tp.x < tp_eps && tp.y < tp_eps && tp.z < tp_eps) {
 					tp = make_float3(0.0f, 0.0f, 0.0f);
 					break;
 				}
+
+				/* prepare to scatter to new direction */
+				if(scatter) {
+					/* adjust throughput and move to new location */
+					sd->P = ray->P + new_t*ray->D;
+					*throughput = tp;
+
+					return VOLUME_PATH_SCATTERED;
+				}
 			}
 
 			coeff = new_coeff;
@@ -331,6 +467,13 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 			break;
 	}
 
+	/* include pdf for volumes with scattering */
+	if(has_scatter) {
+		float pdf = (accum_transmittance.x + accum_transmittance.y + accum_transmittance.z);
+		if(pdf > 0.0f)
+			tp *= (3.0f/pdf);
+	}
+
 	*throughput = tp;
 
 	return VOLUME_PATH_ATTENUATED;
@@ -339,15 +482,15 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 /* get the volume attenuation and emission over line segment defined by
  * ray, with the assumption that there are no surfaces blocking light
  * between the endpoints */
-ccl_device VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg, PathState *state, Ray *ray, PathRadiance *L, float3 *throughput)
+ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg,
+	PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng)
 {
-	ShaderData sd;
-	shader_setup_from_volume(kg, &sd, ray, state->bounce);
+	shader_setup_from_volume(kg, sd, ray, state->bounce);
 
 	if(volume_stack_is_heterogeneous(kg, state->volume_stack))
-		return kernel_volume_integrate_heterogeneous(kg, state, ray, &sd, L, throughput);
+		return kernel_volume_integrate_heterogeneous(kg, state, ray, sd, L, throughput, rng);
 	else
-		return kernel_volume_integrate_homogeneous(kg, state, ray, &sd, L, throughput);
+		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, rng);
 }
 
 /* Volume Stack
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index 9850a058611..52bea153a2a 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -410,6 +410,7 @@ typedef enum ClosureType {
 #define CLOSURE_IS_HOLDOUT(type) (type == CLOSURE_HOLDOUT_ID)
 #define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID)
 #define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID)
+#define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
 
 #define CLOSURE_WEIGHT_CUTOFF 1e-5f
 
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index ed8883c52f7..1042a5c4102 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -33,6 +33,7 @@ Integrator::Integrator()
 	max_diffuse_bounce = max_bounce;
 	max_glossy_bounce = max_bounce;
 	max_transmission_bounce = max_bounce;
+	max_volume_bounce = max_bounce;
 	probalistic_termination = true;
 
 	transparent_min_bounce = min_bounce;
@@ -57,6 +58,7 @@ Integrator::Integrator()
 	ao_samples = 1;
 	mesh_light_samples = 1;
 	subsurface_samples = 1;
+	volume_samples = 1;
 	method = PATH;
 
 	sampling_pattern = SAMPLING_PATTERN_SOBOL;
@@ -88,6 +90,11 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	kintegrator->max_glossy_bounce = max_glossy_bounce + 1;
 	kintegrator->max_transmission_bounce = max_transmission_bounce + 1;
 
+	if(kintegrator->use_volumes)
+		kintegrator->max_volume_bounce = max_volume_bounce + 1;
+	else
+		kintegrator->max_volume_bounce = 1;
+
 	kintegrator->transparent_max_bounce = transparent_max_bounce + 1;
 	if(transparent_probalistic)
 		kintegrator->transparent_min_bounce = transparent_min_bounce + 1;
@@ -118,6 +125,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	kintegrator->ao_samples = ao_samples;
 	kintegrator->mesh_light_samples = mesh_light_samples;
 	kintegrator->subsurface_samples = subsurface_samples;
+	kintegrator->volume_samples = volume_samples;
 
 	kintegrator->sampling_pattern = sampling_pattern;
 
@@ -130,6 +138,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 
 		max_samples = max(max_samples, max(diffuse_samples, max(glossy_samples, transmission_samples)));
 		max_samples = max(max_samples, max(ao_samples, max(mesh_light_samples, subsurface_samples)));
+		max_samples = max(max_samples, volume_samples);
 	}
 
 	max_samples *= (max_bounce + transparent_max_bounce + 3);
@@ -159,6 +168,7 @@ bool Integrator::modified(const Integrator& integrator)
 		max_diffuse_bounce == integrator.max_diffuse_bounce &&
 		max_glossy_bounce == integrator.max_glossy_bounce &&
 		max_transmission_bounce == integrator.max_transmission_bounce &&
+		max_volume_bounce == integrator.max_volume_bounce &&
 		probalistic_termination == integrator.probalistic_termination &&
 		transparent_min_bounce == integrator.transparent_min_bounce &&
 		transparent_max_bounce == integrator.transparent_max_bounce &&
@@ -179,6 +189,7 @@ bool Integrator::modified(const Integrator& integrator)
 		ao_samples == integrator.ao_samples &&
 		mesh_light_samples == integrator.mesh_light_samples &&
 		subsurface_samples == integrator.subsurface_samples &&
+		volume_samples == integrator.volume_samples &&
 		motion_blur == integrator.motion_blur &&
 		sampling_pattern == integrator.sampling_pattern);
 }
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index d9e49d5906e..cc4a28a4e54 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -33,6 +33,7 @@ public:
 	int max_diffuse_bounce;
 	int max_glossy_bounce;
 	int max_transmission_bounce;
+	int max_volume_bounce;
 	bool probalistic_termination;
 
 	int transparent_min_bounce;
@@ -59,6 +60,7 @@ public:
 	int ao_samples;
 	int mesh_light_samples;
 	int subsurface_samples;
+	int volume_samples;
 
 	enum Method {
 		BRANCHED_PATH = 0,
diff --git a/source/blender/blenkernel/intern/node.c b/source/blender/blenkernel/intern/node.c
index 11c27eb0334..6e04f7e0167 100644
--- a/source/blender/blenkernel/intern/node.c
+++ b/source/blender/blenkernel/intern/node.c
@@ -3500,7 +3500,7 @@ static void registerShaderNodes(void)
 	register_node_type_sh_emission();
 	register_node_type_sh_holdout();
 	register_node_type_sh_volume_absorption();
-	//register_node_type_sh_volume_scatter();
+	register_node_type_sh_volume_scatter();
 	register_node_type_sh_subsurface_scattering();
 	register_node_type_sh_mix_shader();
 	register_node_type_sh_add_shader();