From cdb0b3b1dcd4e9962426422868b2f40535670a5c Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brechtvanlommel@gmail.com>
Date: Sun, 8 Oct 2017 04:32:25 +0200
Subject: Code refactor: use DeviceInfo to enable QBVH and decoupled volume
 shading.

---
 intern/cycles/blender/blender_session.cpp   |  13 +-
 intern/cycles/blender/blender_sync.cpp      |  17 +-
 intern/cycles/blender/blender_sync.h        |   7 +-
 intern/cycles/device/device.cpp             |   4 +
 intern/cycles/device/device.h               |   4 +
 intern/cycles/device/device_cpu.cpp         |   2 +
 intern/cycles/device/device_cuda.cpp        |   2 +
 intern/cycles/device/device_network.cpp     |   6 +-
 intern/cycles/device/device_opencl.cpp      |   2 +
 intern/cycles/kernel/kernel_path.h          | 125 ++++++------
 intern/cycles/kernel/kernel_path_branched.h | 305 +++++++++++++++-------------
 intern/cycles/kernel/kernel_types.h         |   2 +-
 intern/cycles/kernel/kernel_volume.h        |   3 +
 intern/cycles/render/integrator.cpp         |   1 +
 intern/cycles/render/mesh.cpp               |  16 +-
 intern/cycles/render/mesh.h                 |   3 +-
 intern/cycles/render/scene.h                |   2 +-
 intern/cycles/render/shader.cpp             |  10 +-
 18 files changed, 285 insertions(+), 239 deletions(-)

diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 12de3da063f..9e54b7de573 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -115,8 +115,7 @@ void BlenderSession::create()
 void BlenderSession::create_session()
 {
 	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
-	bool is_cpu = session_params.device.type == DEVICE_CPU;
-	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background, is_cpu);
+	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
 	bool session_pause = BlenderSync::get_session_pause(b_scene, background);
 
 	/* reset status/progress */
@@ -141,7 +140,7 @@ void BlenderSession::create_session()
 	session->set_pause(session_pause);
 
 	/* create sync */
-	sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress, is_cpu);
+	sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress);
 	BL::Object b_camera_override(b_engine.camera_override());
 	if(b_v3d) {
 		if(session_pause == false) {
@@ -179,8 +178,7 @@ void BlenderSession::reset_session(BL::BlendData& b_data_, BL::Scene& b_scene_)
 	b_scene = b_scene_;
 
 	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
-	const bool is_cpu = session_params.device.type == DEVICE_CPU;
-	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background, is_cpu);
+	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
 
 	width = render_resolution_x(b_render);
 	height = render_resolution_y(b_render);
@@ -211,7 +209,7 @@ void BlenderSession::reset_session(BL::BlendData& b_data_, BL::Scene& b_scene_)
 	session->stats.mem_peak = session->stats.mem_used;
 
 	/* sync object should be re-created */
-	sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress, is_cpu);
+	sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress);
 
 	/* for final render we will do full data sync per render layer, only
 	 * do some basic syncing here, no objects or materials for speed */
@@ -736,8 +734,7 @@ void BlenderSession::synchronize()
 
 	/* on session/scene parameter changes, we recreate session entirely */
 	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
-	const bool is_cpu = session_params.device.type == DEVICE_CPU;
-	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background, is_cpu);
+	SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
 	bool session_pause = BlenderSync::get_session_pause(b_scene, background);
 
 	if(session->params.modified(session_params) ||
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 42e3721883f..2e3301c4209 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -47,8 +47,7 @@ BlenderSync::BlenderSync(BL::RenderEngine& b_engine,
                          BL::Scene& b_scene,
                          Scene *scene,
                          bool preview,
-                         Progress &progress,
-                         bool is_cpu)
+                         Progress &progress)
 : b_engine(b_engine),
   b_data(b_data),
   b_scene(b_scene),
@@ -62,7 +61,6 @@ BlenderSync::BlenderSync(BL::RenderEngine& b_engine,
   scene(scene),
   preview(preview),
   experimental(false),
-  is_cpu(is_cpu),
   dicing_rate(1.0f),
   max_subdivisions(12),
   progress(progress)
@@ -613,8 +611,7 @@ array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
 /* Scene Parameters */
 
 SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene,
-                                          bool background,
-                                          bool is_cpu)
+                                          bool background)
 {
 	BL::RenderSettings r = b_scene.render();
 	SceneParams params;
@@ -654,15 +651,7 @@ SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene,
 		params.texture_limit = 0;
 	}
 
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-	if(is_cpu) {
-		params.use_qbvh = DebugFlags().cpu.qbvh && system_cpu_support_sse2();
-	}
-	else
-#endif
-	{
-		params.use_qbvh = false;
-	}
+	params.use_qbvh = DebugFlags().cpu.qbvh;
 
 	return params;
 }
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index 4ec46424b5a..11e279b81c4 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -54,8 +54,7 @@ public:
 	            BL::Scene& b_scene,
 	            Scene *scene,
 	            bool preview,
-	            Progress &progress,
-	            bool is_cpu);
+	            Progress &progress);
 	~BlenderSync();
 
 	/* sync */
@@ -83,8 +82,7 @@ public:
 
 	/* get parameters */
 	static SceneParams get_scene_params(BL::Scene& b_scene,
-	                                    bool background,
-	                                    bool is_cpu);
+	                                    bool background);
 	static SessionParams get_session_params(BL::RenderEngine& b_engine,
 	                                        BL::UserPreferences& b_userpref,
 	                                        BL::Scene& b_scene,
@@ -177,7 +175,6 @@ private:
 	Scene *scene;
 	bool preview;
 	bool experimental;
-	bool is_cpu;
 
 	float dicing_rate;
 	int max_subdivisions;
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index f64436aec7b..533294407ea 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -379,10 +379,14 @@ DeviceInfo Device::get_multi_device(vector<DeviceInfo> subdevices)
 	info.num = 0;
 
 	info.has_bindless_textures = true;
+	info.has_volume_decoupled = true;
+	info.has_qbvh = true;
 	foreach(DeviceInfo &device, subdevices) {
 		assert(device.type == info.multi_devices[0].type);
 
 		info.has_bindless_textures &= device.has_bindless_textures;
+		info.has_volume_decoupled &= device.has_volume_decoupled;
+		info.has_qbvh &= device.has_qbvh;
 	}
 
 	return info;
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 0e0a0079209..c134fc9411e 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -55,6 +55,8 @@ public:
 	bool display_device;
 	bool advanced_shading;
 	bool has_bindless_textures; /* flag for GPU and Multi device */
+	bool has_volume_decoupled;
+	bool has_qbvh;
 	bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */
 	vector<DeviceInfo> multi_devices;
 
@@ -66,6 +68,8 @@ public:
 		display_device = false;
 		advanced_shading = true;
 		has_bindless_textures = false;
+		has_volume_decoupled = false;
+		has_qbvh = false;
 		use_split_kernel = false;
 	}
 
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index ac6d3246d38..a17caabc850 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -1024,6 +1024,8 @@ void device_cpu_info(vector<DeviceInfo>& devices)
 	info.id = "CPU";
 	info.num = 0;
 	info.advanced_shading = true;
+	info.has_qbvh = system_cpu_support_sse2();
+	info.has_volume_decoupled = true;
 
 	devices.insert(devices.begin(), info);
 }
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index dcbe6033bcc..56a56c5217c 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -2128,6 +2128,8 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 
 		info.advanced_shading = (major >= 2);
 		info.has_bindless_textures = (major >= 3);
+		info.has_volume_decoupled = false;
+		info.has_qbvh = false;
 
 		int pci_location[3] = {0, 0, 0};
 		cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index deea59f1d23..ced10c98dc9 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -343,7 +343,11 @@ void device_network_info(vector<DeviceInfo>& devices)
 	info.description = "Network Device";
 	info.id = "NETWORK";
 	info.num = 0;
-	info.advanced_shading = true; /* todo: get this info from device */
+
+	/* todo: get this info from device */
+	info.advanced_shading = true;
+	info.has_volume_decoupled = false;
+	info.has_qbvh = false;
 
 	devices.push_back(info);
 }
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 9d89decaaaf..5808a31e605 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -123,6 +123,8 @@ void device_opencl_info(vector<DeviceInfo>& devices)
 		info.advanced_shading = OpenCLInfo::kernel_use_advanced_shading(platform_name);
 		info.use_split_kernel = OpenCLInfo::kernel_use_split(platform_name,
 		                                                     device_type);
+		info.has_volume_decoupled = false;
+		info.has_qbvh = false;
 		info.id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id;
 		devices.push_back(info);
 		num_devices++;
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 6b6c5603b70..652777a77a0 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -170,87 +170,90 @@ ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(
 	if(!hit) {
 		kernel_volume_clean_stack(kg, state->volume_stack);
 	}
+
+	if(state->volume_stack[0].shader == SHADER_NONE) {
+		return VOLUME_PATH_ATTENUATED;
+	}
+
 	/* volume attenuation, emission, scatter */
-	if(state->volume_stack[0].shader != SHADER_NONE) {
-		Ray volume_ray = *ray;
-		volume_ray.t = (hit)? isect->t: FLT_MAX;
+	Ray volume_ray = *ray;
+	volume_ray.t = (hit)? isect->t: FLT_MAX;
 
-		bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+	bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
 
 #  ifdef __VOLUME_DECOUPLED__
-		int sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
-		bool direct = (state->flag & PATH_RAY_CAMERA) != 0;
-		bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, direct, sampling_method);
+	int sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
+	bool direct = (state->flag & PATH_RAY_CAMERA) != 0;
+	bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, direct, sampling_method);
 
-		if(decoupled) {
-			/* cache steps along volume for repeated sampling */
-			VolumeSegment volume_segment;
+	if(decoupled) {
+		/* cache steps along volume for repeated sampling */
+		VolumeSegment volume_segment;
 
-			shader_setup_from_volume(kg, sd, &volume_ray);
-			kernel_volume_decoupled_record(kg, state,
-				&volume_ray, sd, &volume_segment, heterogeneous);
+		shader_setup_from_volume(kg, sd, &volume_ray);
+		kernel_volume_decoupled_record(kg, state,
+			&volume_ray, sd, &volume_segment, heterogeneous);
 
-			volume_segment.sampling_method = sampling_method;
+		volume_segment.sampling_method = sampling_method;
 
-			/* emission */
-			if(volume_segment.closure_flag & SD_EMISSION)
-				path_radiance_accum_emission(L, state, *throughput, volume_segment.accum_emission);
+		/* emission */
+		if(volume_segment.closure_flag & SD_EMISSION)
+			path_radiance_accum_emission(L, state, *throughput, volume_segment.accum_emission);
 
-			/* scattering */
-			VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
+		/* scattering */
+		VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
 
-			if(volume_segment.closure_flag & SD_SCATTER) {
-				int all = kernel_data.integrator.sample_all_lights_indirect;
+		if(volume_segment.closure_flag & SD_SCATTER) {
+			int all = kernel_data.integrator.sample_all_lights_indirect;
 
-				/* direct light sampling */
-				kernel_branched_path_volume_connect_light(kg, sd,
-					emission_sd, *throughput, state, L, all,
-					&volume_ray, &volume_segment);
+			/* direct light sampling */
+			kernel_branched_path_volume_connect_light(kg, sd,
+				emission_sd, *throughput, state, L, all,
+				&volume_ray, &volume_segment);
 
-				/* indirect sample. if we use distance sampling and take just
-				 * one sample for direct and indirect light, we could share
-				 * this computation, but makes code a bit complex */
-				float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
-				float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
+			/* indirect sample. if we use distance sampling and take just
+			 * one sample for direct and indirect light, we could share
+			 * this computation, but makes code a bit complex */
+			float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
+			float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
 
-				result = kernel_volume_decoupled_scatter(kg,
-					state, &volume_ray, sd, throughput,
-					rphase, rscatter, &volume_segment, NULL, true);
-			}
+			result = kernel_volume_decoupled_scatter(kg,
+				state, &volume_ray, sd, throughput,
+				rphase, rscatter, &volume_segment, NULL, true);
+		}
 
-			/* free cached steps */
-			kernel_volume_decoupled_free(kg, &volume_segment);
+		/* free cached steps */
+		kernel_volume_decoupled_free(kg, &volume_segment);
 
-			if(result == VOLUME_PATH_SCATTERED) {
-				if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray))
-					return VOLUME_PATH_SCATTERED;
-				else
-					return VOLUME_PATH_MISSED;
-			}
-			else {
-				*throughput *= volume_segment.accum_transmittance;
-			}
+		if(result == VOLUME_PATH_SCATTERED) {
+			if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray))
+				return VOLUME_PATH_SCATTERED;
+			else
+				return VOLUME_PATH_MISSED;
 		}
-		else
+		else {
+			*throughput *= volume_segment.accum_transmittance;
+		}
+	}
+	else
 #  endif  /* __VOLUME_DECOUPLED__ */
-		{
-			/* integrate along volume segment with distance sampling */
-			VolumeIntegrateResult result = kernel_volume_integrate(
-				kg, state, sd, &volume_ray, L, throughput, heterogeneous);
+	{
+		/* integrate along volume segment with distance sampling */
+		VolumeIntegrateResult result = kernel_volume_integrate(
+			kg, state, sd, &volume_ray, L, throughput, heterogeneous);
 
 #  ifdef __VOLUME_SCATTER__
-			if(result == VOLUME_PATH_SCATTERED) {
-				/* direct lighting */
-				kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);
-
-				/* indirect light bounce */
-				if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray))
-					return VOLUME_PATH_SCATTERED;
-				else
-					return VOLUME_PATH_MISSED;
-			}
-#  endif  /* __VOLUME_SCATTER__ */
+		if(result == VOLUME_PATH_SCATTERED) {
+			/* direct lighting */
+			kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);
+
+			/* indirect light bounce */
+			if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray))
+				return VOLUME_PATH_SCATTERED;
+			else
+				return VOLUME_PATH_MISSED;
 		}
+#  endif  /* __VOLUME_SCATTER__ */
 	}
 
 	return VOLUME_PATH_ATTENUATED;
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index 2597d684a36..42df7e85b41 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -64,6 +64,164 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
 
 #ifndef __SPLIT_KERNEL__
 
+#ifdef __VOLUME__
+ccl_device_forceinline void kernel_branched_path_volume(
+	KernelGlobals *kg,
+	ShaderData *sd,
+	PathState *state,
+	Ray *ray,
+	float3 *throughput,
+	ccl_addr_space Intersection *isect,
+	bool hit,
+	ShaderData *indirect_sd,
+	ShaderData *emission_sd,
+	PathRadiance *L)
+{
+	/* Sanitize volume stack. */
+	if(!hit) {
+		kernel_volume_clean_stack(kg, state->volume_stack);
+	}
+
+	if(state->volume_stack[0].shader == SHADER_NONE) {
+		return;
+	}
+
+	/* volume attenuation, emission, scatter */
+	Ray volume_ray = *ray;
+	volume_ray.t = (hit)? isect->t: FLT_MAX;
+
+	bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+
+#  ifdef __VOLUME_DECOUPLED__
+	/* decoupled ray marching only supported on CPU */
+	if(kernel_data.integrator.volume_decoupled) {
+		/* cache steps along volume for repeated sampling */
+		VolumeSegment volume_segment;
+
+		shader_setup_from_volume(kg, sd, &volume_ray);
+		kernel_volume_decoupled_record(kg, state,
+			&volume_ray, sd, &volume_segment, heterogeneous);
+
+		/* direct light sampling */
+		if(volume_segment.closure_flag & SD_SCATTER) {
+			volume_segment.sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
+
+			int all = kernel_data.integrator.sample_all_lights_direct;
+
+			kernel_branched_path_volume_connect_light(kg, sd,
+				emission_sd, *throughput, state, L, all,
+				&volume_ray, &volume_segment);
+
+			/* indirect light sampling */
+			int num_samples = kernel_data.integrator.volume_samples;
+			float num_samples_inv = 1.0f/num_samples;
+
+			for(int j = 0; j < num_samples; j++) {
+				PathState ps = *state;
+				Ray pray = *ray;
+				float3 tp = *throughput;
+
+				/* branch RNG state */
+				path_state_branch(&ps, j, num_samples);
+
+				/* scatter sample. if we use distance sampling and take just one
+				 * sample for direct and indirect light, we could share this
+				 * computation, but makes code a bit complex */
+				float rphase = path_state_rng_1D(kg, &ps, PRNG_PHASE_CHANNEL);
+				float rscatter = path_state_rng_1D(kg, &ps, PRNG_SCATTER_DISTANCE);
+
+				VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+					&ps, &pray, sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
+
+				if(result == VOLUME_PATH_SCATTERED &&
+				   kernel_path_volume_bounce(kg,
+				                             sd,
+				                             &tp,
+				                             &ps,
+				                             &L->state,
+				                             &pray))
+				{
+					kernel_path_indirect(kg,
+					                     indirect_sd,
+					                     emission_sd,
+					                     &pray,
+					                     tp*num_samples_inv,
+					                     &ps,
+					                     L);
+
+					/* for render passes, sum and reset indirect light pass variables
+					 * for the next samples */
+					path_radiance_sum_indirect(L);
+					path_radiance_reset_indirect(L);
+				}
+			}
+		}
+
+		/* emission and transmittance */
+		if(volume_segment.closure_flag & SD_EMISSION)
+			path_radiance_accum_emission(L, state, *throughput, volume_segment.accum_emission);
+		*throughput *= volume_segment.accum_transmittance;
+
+		/* free cached steps */
+		kernel_volume_decoupled_free(kg, &volume_segment);
+	}
+	else
+#  endif  /* __VOLUME_DECOUPLED__ */
+	{
+		/* GPU: no decoupled ray marching, scatter probalistically */
+		int num_samples = kernel_data.integrator.volume_samples;
+		float num_samples_inv = 1.0f/num_samples;
+
+		/* todo: we should cache the shader evaluations from stepping
+		 * through the volume, for now we redo them multiple times */
+
+		for(int j = 0; j < num_samples; j++) {
+			PathState ps = *state;
+			Ray pray = *ray;
+			float3 tp = (*throughput) * num_samples_inv;
+
+			/* branch RNG state */
+			path_state_branch(&ps, j, num_samples);
+
+			VolumeIntegrateResult result = kernel_volume_integrate(
+				kg, &ps, sd, &volume_ray, L, &tp, heterogeneous);
+
+#  ifdef __VOLUME_SCATTER__
+			if(result == VOLUME_PATH_SCATTERED) {
+				/* todo: support equiangular, MIS and all light sampling.
+				 * alternatively get decoupled ray marching working on the GPU */
+				kernel_path_volume_connect_light(kg, sd, emission_sd, tp, state, L);
+
+				if(kernel_path_volume_bounce(kg,
+				                             sd,
+				                             &tp,
+				                             &ps,
+				                             &L->state,
+				                             &pray))
+				{
+					kernel_path_indirect(kg,
+					                     indirect_sd,
+					                     emission_sd,
+					                     &pray,
+					                     tp,
+					                     &ps,
+					                     L);
+
+					/* for render passes, sum and reset indirect light pass variables
+					 * for the next samples */
+					path_radiance_sum_indirect(L);
+					path_radiance_reset_indirect(L);
+				}
+			}
+# endif  /* __VOLUME_SCATTER__ */
+		}
+
+		/* todo: avoid this calculation using decoupled ray marching */
+		kernel_volume_shadow(kg, emission_sd, state, &volume_ray, throughput);
+	}
+}
+#endif  /* __VOLUME__ */
+
 /* bounce off surface and integrate indirect light */
 ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
 	ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
@@ -293,142 +451,17 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
 		bool hit = kernel_path_scene_intersect(kg, &state, &ray, &isect, L);
 
 #ifdef __VOLUME__
-		/* Sanitize volume stack. */
-		if(!hit) {
-			kernel_volume_clean_stack(kg, state.volume_stack);
-		}
-		/* volume attenuation, emission, scatter */
-		if(state.volume_stack[0].shader != SHADER_NONE) {
-			Ray volume_ray = ray;
-			volume_ray.t = (hit)? isect.t: FLT_MAX;
-			
-			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
-
-#ifdef __VOLUME_DECOUPLED__
-			/* decoupled ray marching only supported on CPU */
-
-			/* cache steps along volume for repeated sampling */
-			VolumeSegment volume_segment;
-
-			shader_setup_from_volume(kg, &sd, &volume_ray);
-			kernel_volume_decoupled_record(kg, &state,
-				&volume_ray, &sd, &volume_segment, heterogeneous);
-
-			/* direct light sampling */
-			if(volume_segment.closure_flag & SD_SCATTER) {
-				volume_segment.sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
-
-				int all = kernel_data.integrator.sample_all_lights_direct;
-
-				kernel_branched_path_volume_connect_light(kg, &sd,
-					&emission_sd, throughput, &state, L, all,
-					&volume_ray, &volume_segment);
-
-				/* indirect light sampling */
-				int num_samples = kernel_data.integrator.volume_samples;
-				float num_samples_inv = 1.0f/num_samples;
-
-				for(int j = 0; j < num_samples; j++) {
-					PathState ps = state;
-					Ray pray = ray;
-					float3 tp = throughput;
-
-					/* branch RNG state */
-					path_state_branch(&ps, j, num_samples);
-
-					/* scatter sample. if we use distance sampling and take just one
-					 * sample for direct and indirect light, we could share this
-					 * computation, but makes code a bit complex */
-					float rphase = path_state_rng_1D(kg, &ps, PRNG_PHASE_CHANNEL);
-					float rscatter = path_state_rng_1D(kg, &ps, PRNG_SCATTER_DISTANCE);
-
-					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
-						&ps, &pray, &sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
-
-					if(result == VOLUME_PATH_SCATTERED &&
-					   kernel_path_volume_bounce(kg,
-					                             &sd,
-					                             &tp,
-					                             &ps,
-					                             &L->state,
-					                             &pray))
-					{
-						kernel_path_indirect(kg,
-						                     &indirect_sd,
-						                     &emission_sd,
-						                     &pray,
-						                     tp*num_samples_inv,
-						                     &ps,
-						                     L);
-
-						/* for render passes, sum and reset indirect light pass variables
-						 * for the next samples */
-						path_radiance_sum_indirect(L);
-						path_radiance_reset_indirect(L);
-					}
-				}
-			}
-
-			/* emission and transmittance */
-			if(volume_segment.closure_flag & SD_EMISSION)
-				path_radiance_accum_emission(L, &state, throughput, volume_segment.accum_emission);
-			throughput *= volume_segment.accum_transmittance;
-
-			/* free cached steps */
-			kernel_volume_decoupled_free(kg, &volume_segment);
-#else
-			/* GPU: no decoupled ray marching, scatter probalistically */
-			int num_samples = kernel_data.integrator.volume_samples;
-			float num_samples_inv = 1.0f/num_samples;
-
-			/* todo: we should cache the shader evaluations from stepping
-			 * through the volume, for now we redo them multiple times */
-
-			for(int j = 0; j < num_samples; j++) {
-				PathState ps = state;
-				Ray pray = ray;
-				float3 tp = throughput * num_samples_inv;
-
-				/* branch RNG state */
-				path_state_branch(&ps, j, num_samples);
-
-				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, &ps, &sd, &volume_ray, L, &tp, heterogeneous);
-
-#ifdef __VOLUME_SCATTER__
-				if(result == VOLUME_PATH_SCATTERED) {
-					/* todo: support equiangular, MIS and all light sampling.
-					 * alternatively get decoupled ray marching working on the GPU */
-					kernel_path_volume_connect_light(kg, &sd, &emission_sd, tp, &state, L);
-
-					if(kernel_path_volume_bounce(kg,
-					                             &sd,
-					                             &tp,
-					                             &ps,
-					                             &L->state,
-					                             &pray))
-					{
-						kernel_path_indirect(kg,
-						                     &indirect_sd,
-						                     &emission_sd,
-						                     &pray,
-						                     tp,
-						                     &ps,
-						                     L);
-
-						/* for render passes, sum and reset indirect light pass variables
-						 * for the next samples */
-						path_radiance_sum_indirect(L);
-						path_radiance_reset_indirect(L);
-					}
-				}
-#endif  /* __VOLUME_SCATTER__ */
-			}
-
-			/* todo: avoid this calculation using decoupled ray marching */
-			kernel_volume_shadow(kg, &emission_sd, &state, &volume_ray, &throughput);
-#endif  /* __VOLUME_DECOUPLED__ */
-		}
+		/* Volume integration. */
+		kernel_branched_path_volume(kg,
+		                            &sd,
+		                            &state,
+		                            &ray,
+		                            &throughput,
+		                            &isect,
+		                            hit,
+		                            &indirect_sd,
+		                            &emission_sd,
+		                            L);
 #endif  /* __VOLUME__ */
 
 		/* Shade background. */
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 19c77c1ed4f..f76d6c2e556 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -1262,6 +1262,7 @@ typedef struct KernelIntegrator {
 
 	/* branched path */
 	int branched;
+	int volume_decoupled;
 	int diffuse_samples;
 	int glossy_samples;
 	int transmission_samples;
@@ -1287,7 +1288,6 @@ typedef struct KernelIntegrator {
 	float light_inv_rr_threshold;
 
 	int start_sample;
-	int pad1;
 } KernelIntegrator;
 static_assert_align(KernelIntegrator, 16);
 
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index d9c310a893e..5905fb3bf12 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -1026,6 +1026,9 @@ ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneou
 	/* decoupled ray marching for heterogeneous volumes not supported on the GPU,
 	 * which also means equiangular and multiple importance sampling is not
 	 * support for that case */
+	if(!kernel_data.integrator.volume_decoupled)
+		return false;
+
 #ifdef __KERNEL_GPU__
 	if(heterogeneous)
 		return false;
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index 15b728d6e02..b268478e6d3 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -145,6 +145,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	kintegrator->sample_clamp_indirect = (sample_clamp_indirect == 0.0f)? FLT_MAX: sample_clamp_indirect*3.0f;
 
 	kintegrator->branched = (method == BRANCHED_PATH);
+	kintegrator->volume_decoupled = device->info.has_volume_decoupled;
 	kintegrator->diffuse_samples = diffuse_samples;
 	kintegrator->glossy_samples = glossy_samples;
 	kintegrator->transmission_samples = transmission_samples;
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index 84537bf5993..c02a5222463 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -1016,7 +1016,8 @@ void Mesh::pack_patches(uint *patch_data, uint vert_offset, uint face_offset, ui
 	}
 }
 
-void Mesh::compute_bvh(DeviceScene *dscene,
+void Mesh::compute_bvh(Device *device,
+                       DeviceScene *dscene,
                        SceneParams *params,
                        Progress *progress,
                        int n,
@@ -1050,7 +1051,7 @@ void Mesh::compute_bvh(DeviceScene *dscene,
 
 			BVHParams bparams;
 			bparams.use_spatial_split = params->use_bvh_spatial_split;
-			bparams.use_qbvh = params->use_qbvh;
+			bparams.use_qbvh = params->use_qbvh && device->info.has_qbvh;
 			bparams.use_unaligned_nodes = dscene->data.bvh.have_curves &&
 			                              params->use_bvh_unaligned_nodes;
 			bparams.num_motion_triangle_steps = params->num_bvh_time_steps;
@@ -1814,18 +1815,18 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene *
 	/* bvh build */
 	progress.set_status("Updating Scene BVH", "Building");
 
-	VLOG(1) << (scene->params.use_qbvh ? "Using QBVH optimization structure"
-	                                   : "Using regular BVH optimization structure");
-
 	BVHParams bparams;
 	bparams.top_level = true;
-	bparams.use_qbvh = scene->params.use_qbvh;
+	bparams.use_qbvh = scene->params.use_qbvh && device->info.has_qbvh;
 	bparams.use_spatial_split = scene->params.use_bvh_spatial_split;
 	bparams.use_unaligned_nodes = dscene->data.bvh.have_curves &&
 	                              scene->params.use_bvh_unaligned_nodes;
 	bparams.num_motion_triangle_steps = scene->params.num_bvh_time_steps;
 	bparams.num_motion_curve_steps = scene->params.num_bvh_time_steps;
 
+	VLOG(1) << (bparams.use_qbvh ? "Using QBVH optimization structure"
+	                             : "Using regular BVH optimization structure");
+
 	delete bvh;
 	bvh = BVH::create(bparams, scene->objects);
 	bvh->build(progress);
@@ -1879,7 +1880,7 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene *
 	}
 
 	dscene->data.bvh.root = pack.root_index;
-	dscene->data.bvh.use_qbvh = scene->params.use_qbvh;
+	dscene->data.bvh.use_qbvh = bparams.use_qbvh;
 	dscene->data.bvh.use_bvh_steps = (scene->params.num_bvh_time_steps != 0);
 }
 
@@ -2084,6 +2085,7 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 		if(mesh->need_update) {
 			pool.push(function_bind(&Mesh::compute_bvh,
 			                        mesh,
+			                        device,
 			                        dscene,
 			                        &scene->params,
 			                        &progress,
diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h
index 043ce9d0ffc..9a51ca73950 100644
--- a/intern/cycles/render/mesh.h
+++ b/intern/cycles/render/mesh.h
@@ -282,7 +282,8 @@ public:
 	void pack_curves(Scene *scene, float4 *curve_key_co, float4 *curve_data, size_t curvekey_offset);
 	void pack_patches(uint *patch_data, uint vert_offset, uint face_offset, uint corner_offset);
 
-	void compute_bvh(DeviceScene *dscene,
+	void compute_bvh(Device *device,
+	                 DeviceScene *dscene,
 	                 SceneParams *params,
 	                 Progress *progress,
 	                 int n,
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index 0194327f567..a1966afd23b 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -149,7 +149,7 @@ public:
 		use_bvh_spatial_split = false;
 		use_bvh_unaligned_nodes = true;
 		num_bvh_time_steps = 0;
-		use_qbvh = false;
+		use_qbvh = true;
 		persistent_data = false;
 		texture_limit = 0;
 	}
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index 864875361c0..3992ada2e85 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -451,10 +451,12 @@ void ShaderManager::device_update_common(Device *device,
 			flag |= SD_HETEROGENEOUS_VOLUME;
 		if(shader->has_bssrdf_bump)
 			flag |= SD_HAS_BSSRDF_BUMP;
-		if(shader->volume_sampling_method == VOLUME_SAMPLING_EQUIANGULAR)
-			flag |= SD_VOLUME_EQUIANGULAR;
-		if(shader->volume_sampling_method == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE)
-			flag |= SD_VOLUME_MIS;
+		if(device->info.has_volume_decoupled) {
+			if(shader->volume_sampling_method == VOLUME_SAMPLING_EQUIANGULAR)
+				flag |= SD_VOLUME_EQUIANGULAR;
+			if(shader->volume_sampling_method == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE)
+				flag |= SD_VOLUME_MIS;
+		}
 		if(shader->volume_interpolation_method == VOLUME_INTERPOLATION_CUBIC)
 			flag |= SD_VOLUME_CUBIC;
 		if(shader->has_bump)
-- 
cgit v1.2.3