git.blender.org/blender.git
author     Lukas Stockner <lukas.stockner@freenet.de>   2017-05-03 19:33:02 +0300
committer  Lukas Stockner <lukas.stockner@freenet.de>   2017-05-03 19:33:02 +0300
commit     82e242cc7287ce56d10f88a76de5418a7b20e239 (patch)
tree       c513d6537cfd3d3a1b3d098a8a4f45c55611f165 /intern
parent     2564d929fc5f75403bc5ca051fb33bc7b8874646 (diff)
parent     15189baa5231d70c1363192eb01b83fa946f36f0 (diff)
Merge branch 'master' into blender2.8
Diffstat (limited to 'intern')
-rw-r--r--  intern/cycles/blender/addon/__init__.py | 3
-rw-r--r--  intern/cycles/blender/addon/engine.py | 33
-rw-r--r--  intern/cycles/blender/addon/properties.py | 35
-rw-r--r--  intern/cycles/blender/addon/ui.py | 14
-rw-r--r--  intern/cycles/blender/blender_curves.cpp | 14
-rw-r--r--  intern/cycles/blender/blender_mesh.cpp | 10
-rw-r--r--  intern/cycles/blender/blender_python.cpp | 8
-rw-r--r--  intern/cycles/blender/blender_session.cpp | 117
-rw-r--r--  intern/cycles/blender/blender_sync.cpp | 88
-rw-r--r--  intern/cycles/blender/blender_sync.h | 4
-rw-r--r--  intern/cycles/device/device_split_kernel.cpp | 13
-rw-r--r--  intern/cycles/device/device_split_kernel.h | 2
-rw-r--r--  intern/cycles/device/opencl/opencl.h | 3
-rw-r--r--  intern/cycles/kernel/CMakeLists.txt | 7
-rw-r--r--  intern/cycles/kernel/kernel_path.h | 6
-rw-r--r--  intern/cycles/kernel/kernel_path_branched.h | 5
-rw-r--r--  intern/cycles/kernel/kernel_path_surface.h | 2
-rw-r--r--  intern/cycles/kernel/kernel_subsurface.h | 4
-rw-r--r--  intern/cycles/kernel/kernel_types.h | 74
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_cpu.h | 2
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h | 10
-rw-r--r--  intern/cycles/kernel/kernels/cuda/kernel_split.cu | 8
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl | 3
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl | 27
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl | 28
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_split.cl | 2
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl | 3
-rw-r--r--  intern/cycles/kernel/split/kernel_branched.h | 150
-rw-r--r--  intern/cycles/kernel/split/kernel_data_init.h | 19
-rw-r--r--  intern/cycles/kernel/split/kernel_direct_lighting.h | 66
-rw-r--r--  intern/cycles/kernel/split/kernel_do_volume.h | 180
-rw-r--r--  intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h | 105
-rw-r--r--  intern/cycles/kernel/split/kernel_indirect_background.h | 6
-rw-r--r--  intern/cycles/kernel/split/kernel_indirect_subsurface.h | 37
-rw-r--r--  intern/cycles/kernel/split/kernel_next_iteration_setup.h | 220
-rw-r--r--  intern/cycles/kernel/split/kernel_shader_eval.h | 69
-rw-r--r--  intern/cycles/kernel/split/kernel_shader_setup.h | 70
-rw-r--r--  intern/cycles/kernel/split/kernel_shader_sort.h | 97
-rw-r--r--  intern/cycles/kernel/split/kernel_shadow_blocked_ao.h | 42
-rw-r--r--  intern/cycles/kernel/split/kernel_shadow_blocked_dl.h | 79
-rw-r--r--  intern/cycles/kernel/split/kernel_split_common.h | 61
-rw-r--r--  intern/cycles/kernel/split/kernel_split_data_types.h | 50
-rw-r--r--  intern/cycles/kernel/split/kernel_subsurface_scatter.h | 309
-rw-r--r--  intern/cycles/kernel/svm/svm_image.h | 46
-rw-r--r--  intern/cycles/util/util_color.h | 43
45 files changed, 1581 insertions, 593 deletions
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py
index 7c58df68011..1cb7835d14d 100644
--- a/intern/cycles/blender/addon/__init__.py
+++ b/intern/cycles/blender/addon/__init__.py
@@ -102,6 +102,9 @@ class CyclesRender(bpy.types.RenderEngine):
else:
self.report({'ERROR'}, "OSL support disabled in this build.")
+ def update_render_passes(self, scene, srl):
+ engine.register_passes(self, scene, srl)
+
def engine_exit():
engine.exit()
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index a418cdd6821..d74ae5f3061 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -208,3 +208,36 @@ def with_network():
def system_info():
import _cycles
return _cycles.system_info()
+
+def register_passes(engine, scene, srl):
+ engine.register_pass(scene, srl, "Combined", 4, "RGBA", 'COLOR')
+
+ if srl.use_pass_z: engine.register_pass(scene, srl, "Depth", 1, "Z", 'VALUE')
+ if srl.use_pass_mist: engine.register_pass(scene, srl, "Mist", 1, "Z", 'VALUE')
+ if srl.use_pass_normal: engine.register_pass(scene, srl, "Normal", 3, "XYZ", 'VECTOR')
+ if srl.use_pass_vector: engine.register_pass(scene, srl, "Vector", 4, "XYZW", 'VECTOR')
+ if srl.use_pass_uv: engine.register_pass(scene, srl, "UV", 3, "UVA", 'VECTOR')
+ if srl.use_pass_object_index: engine.register_pass(scene, srl, "IndexOB", 1, "X", 'VALUE')
+ if srl.use_pass_material_index: engine.register_pass(scene, srl, "IndexMA", 1, "X", 'VALUE')
+ if srl.use_pass_shadow: engine.register_pass(scene, srl, "Shadow", 3, "RGB", 'COLOR')
+ if srl.use_pass_ambient_occlusion: engine.register_pass(scene, srl, "AO", 3, "RGB", 'COLOR')
+ if srl.use_pass_diffuse_direct: engine.register_pass(scene, srl, "DiffDir", 3, "RGB", 'COLOR')
+ if srl.use_pass_diffuse_indirect: engine.register_pass(scene, srl, "DiffInd", 3, "RGB", 'COLOR')
+ if srl.use_pass_diffuse_color: engine.register_pass(scene, srl, "DiffCol", 3, "RGB", 'COLOR')
+ if srl.use_pass_glossy_direct: engine.register_pass(scene, srl, "GlossDir", 3, "RGB", 'COLOR')
+ if srl.use_pass_glossy_indirect: engine.register_pass(scene, srl, "GlossInd", 3, "RGB", 'COLOR')
+ if srl.use_pass_glossy_color: engine.register_pass(scene, srl, "GlossCol", 3, "RGB", 'COLOR')
+ if srl.use_pass_transmission_direct: engine.register_pass(scene, srl, "TransDir", 3, "RGB", 'COLOR')
+ if srl.use_pass_transmission_indirect: engine.register_pass(scene, srl, "TransInd", 3, "RGB", 'COLOR')
+ if srl.use_pass_transmission_color: engine.register_pass(scene, srl, "TransCol", 3, "RGB", 'COLOR')
+ if srl.use_pass_subsurface_direct: engine.register_pass(scene, srl, "SubsurfaceDir", 3, "RGB", 'COLOR')
+ if srl.use_pass_subsurface_indirect: engine.register_pass(scene, srl, "SubsurfaceInd", 3, "RGB", 'COLOR')
+ if srl.use_pass_subsurface_color: engine.register_pass(scene, srl, "SubsurfaceCol", 3, "RGB", 'COLOR')
+ if srl.use_pass_emit: engine.register_pass(scene, srl, "Emit", 3, "RGB", 'COLOR')
+ if srl.use_pass_environment: engine.register_pass(scene, srl, "Env", 3, "RGB", 'COLOR')
+
+ crl = srl.cycles
+ if crl.pass_debug_bvh_traversed_nodes: engine.register_pass(scene, srl, "Debug BVH Traversed Nodes", 1, "X", 'VALUE')
+ if crl.pass_debug_bvh_traversed_instances: engine.register_pass(scene, srl, "Debug BVH Traversed Instances", 1, "X", 'VALUE')
+ if crl.pass_debug_bvh_intersections: engine.register_pass(scene, srl, "Debug BVH Intersections", 1, "X", 'VALUE')
+ if crl.pass_debug_ray_bounces: engine.register_pass(scene, srl, "Debug Ray Bounces", 1, "X", 'VALUE')
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 53740efb627..a8a0f0bfc70 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -1166,6 +1166,39 @@ class CyclesCurveRenderSettings(bpy.types.PropertyGroup):
def unregister(cls):
del bpy.types.Scene.cycles_curves
+class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
+ @classmethod
+ def register(cls):
+ bpy.types.SceneRenderLayer.cycles = PointerProperty(
+ name="Cycles SceneRenderLayer Settings",
+ description="Cycles SceneRenderLayer Settings",
+ type=cls,
+ )
+ cls.pass_debug_bvh_traversed_nodes = BoolProperty(
+ name="Debug BVH Traversed Nodes",
+ description="Store Debug BVH Traversed Nodes pass",
+ default=False,
+ )
+ cls.pass_debug_bvh_traversed_instances = BoolProperty(
+ name="Debug BVH Traversed Instances",
+ description="Store Debug BVH Traversed Instances pass",
+ default=False,
+ )
+ cls.pass_debug_bvh_intersections = BoolProperty(
+ name="Debug BVH Intersections",
+ description="Store Debug BVH Intersections",
+ default=False,
+ )
+ cls.pass_debug_ray_bounces = BoolProperty(
+ name="Debug Ray Bounces",
+ description="Store Debug Ray Bounces pass",
+ default=False,
+ )
+
+ @classmethod
+ def unregister(cls):
+ del bpy.types.SceneRenderLayer.cycles
+
class CyclesCurveSettings(bpy.types.PropertyGroup):
@classmethod
@@ -1324,6 +1357,7 @@ def register():
bpy.utils.register_class(CyclesCurveSettings)
bpy.utils.register_class(CyclesDeviceSettings)
bpy.utils.register_class(CyclesPreferences)
+ bpy.utils.register_class(CyclesRenderLayerSettings)
def unregister():
@@ -1339,3 +1373,4 @@ def unregister():
bpy.utils.unregister_class(CyclesCurveSettings)
bpy.utils.unregister_class(CyclesDeviceSettings)
bpy.utils.unregister_class(CyclesPreferences)
+ bpy.utils.unregister_class(CyclesRenderLayerSettings)
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 4423f3c266b..e81f7ba52b3 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -78,7 +78,7 @@ def use_cuda(context):
def use_branched_path(context):
cscene = context.scene.cycles
- return (cscene.progressive == 'BRANCHED_PATH' and not use_opencl(context))
+ return (cscene.progressive == 'BRANCHED_PATH')
def use_sample_all_lights(context):
@@ -156,7 +156,6 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
row = layout.row()
sub = row.row()
- sub.active = get_device_type(context) != 'OPENCL' or use_cpu(context)
sub.prop(cscene, "progressive", text="")
row.prop(cscene, "use_square_samples")
@@ -477,11 +476,14 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel):
bl_options = {'DEFAULT_CLOSED'}
def draw(self, context):
+ import _cycles
+
layout = self.layout
scene = context.scene
rd = scene.render
rl = rd.layers.active
+ crl = rl.cycles
split = layout.split()
@@ -528,8 +530,12 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel):
col.prop(rl, "use_pass_emit", text="Emission")
col.prop(rl, "use_pass_environment")
- if hasattr(rd, "debug_pass_type"):
- layout.prop(rd, "debug_pass_type")
+ if _cycles.with_cycles_debug:
+ col = layout.column()
+ col.prop(crl, "pass_debug_bvh_traversed_nodes")
+ col.prop(crl, "pass_debug_bvh_traversed_instances")
+ col.prop(crl, "pass_debug_bvh_intersections")
+ col.prop(crl, "pass_debug_ray_bounces")
class CyclesRender_PT_views(CyclesButtonsPanel, Panel):
diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp
index ed361cc971e..42b985305ea 100644
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -776,17 +776,17 @@ static void ExportCurveTriangleVcol(ParticleCurveData *CData,
for(int curvekey = CData->curve_firstkey[curve]; curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1; curvekey++) {
for(int section = 0; section < resol; section++) {
- cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
+ cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve]));
vertexindex++;
- cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
+ cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve]));
vertexindex++;
- cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
+ cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve]));
vertexindex++;
- cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
+ cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve]));
vertexindex++;
- cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
+ cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve]));
vertexindex++;
- cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
+ cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve]));
vertexindex++;
}
}
@@ -1004,7 +1004,7 @@ void BlenderSync::sync_curves(Mesh *mesh,
for(size_t curve = 0; curve < CData.curve_vcol.size(); curve++)
if(!(CData.curve_keynum[curve] <= 1 || CData.curve_length[curve] == 0.0f))
- fdata[i++] = color_srgb_to_scene_linear(CData.curve_vcol[curve]);
+ fdata[i++] = color_srgb_to_scene_linear_v3(CData.curve_vcol[curve]);
}
}
}
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp
index 54571b1fea1..f13b9db7013 100644
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -356,7 +356,7 @@ static void attr_create_vertex_color(Scene *scene,
int n = p->loop_total();
for(int i = 0; i < n; i++) {
float3 color = get_float3(l->data[p->loop_start() + i].color());
- *(cdata++) = color_float_to_byte(color_srgb_to_scene_linear(color));
+ *(cdata++) = color_float_to_byte(color_srgb_to_scene_linear_v3(color));
}
}
}
@@ -380,11 +380,11 @@ static void attr_create_vertex_color(Scene *scene,
face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b);
uchar4 colors[4];
- colors[0] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color1())));
- colors[1] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color2())));
- colors[2] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color3())));
+ colors[0] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color1())));
+ colors[1] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color2())));
+ colors[2] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color3())));
if(nverts[i] == 4) {
- colors[3] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color4())));
+ colors[3] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color4())));
}
cdata[0] = colors[tri_a[0]];
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 92393c10893..3c769881bb7 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -815,6 +815,14 @@ void *CCL_python_module_init()
PyModule_AddStringConstant(mod, "osl_version_string", "unknown");
#endif
+#ifdef WITH_CYCLES_DEBUG
+ PyModule_AddObject(mod, "with_cycles_debug", Py_True);
+ Py_INCREF(Py_True);
+#else
+ PyModule_AddObject(mod, "with_cycles_debug", Py_False);
+ Py_INCREF(Py_False);
+#endif
+
#ifdef WITH_NETWORK
PyModule_AddObject(mod, "with_network", Py_True);
Py_INCREF(Py_True);
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 3e851adbf54..6f2e7065d97 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -247,90 +247,6 @@ void BlenderSession::free_session()
delete session;
}
-static PassType get_pass_type(BL::RenderPass& b_pass)
-{
- switch(b_pass.type()) {
- case BL::RenderPass::type_COMBINED:
- return PASS_COMBINED;
-
- case BL::RenderPass::type_Z:
- return PASS_DEPTH;
- case BL::RenderPass::type_MIST:
- return PASS_MIST;
- case BL::RenderPass::type_NORMAL:
- return PASS_NORMAL;
- case BL::RenderPass::type_OBJECT_INDEX:
- return PASS_OBJECT_ID;
- case BL::RenderPass::type_UV:
- return PASS_UV;
- case BL::RenderPass::type_VECTOR:
- return PASS_MOTION;
- case BL::RenderPass::type_MATERIAL_INDEX:
- return PASS_MATERIAL_ID;
-
- case BL::RenderPass::type_DIFFUSE_DIRECT:
- return PASS_DIFFUSE_DIRECT;
- case BL::RenderPass::type_GLOSSY_DIRECT:
- return PASS_GLOSSY_DIRECT;
- case BL::RenderPass::type_TRANSMISSION_DIRECT:
- return PASS_TRANSMISSION_DIRECT;
- case BL::RenderPass::type_SUBSURFACE_DIRECT:
- return PASS_SUBSURFACE_DIRECT;
-
- case BL::RenderPass::type_DIFFUSE_INDIRECT:
- return PASS_DIFFUSE_INDIRECT;
- case BL::RenderPass::type_GLOSSY_INDIRECT:
- return PASS_GLOSSY_INDIRECT;
- case BL::RenderPass::type_TRANSMISSION_INDIRECT:
- return PASS_TRANSMISSION_INDIRECT;
- case BL::RenderPass::type_SUBSURFACE_INDIRECT:
- return PASS_SUBSURFACE_INDIRECT;
-
- case BL::RenderPass::type_DIFFUSE_COLOR:
- return PASS_DIFFUSE_COLOR;
- case BL::RenderPass::type_GLOSSY_COLOR:
- return PASS_GLOSSY_COLOR;
- case BL::RenderPass::type_TRANSMISSION_COLOR:
- return PASS_TRANSMISSION_COLOR;
- case BL::RenderPass::type_SUBSURFACE_COLOR:
- return PASS_SUBSURFACE_COLOR;
-
- case BL::RenderPass::type_EMIT:
- return PASS_EMISSION;
- case BL::RenderPass::type_ENVIRONMENT:
- return PASS_BACKGROUND;
- case BL::RenderPass::type_AO:
- return PASS_AO;
- case BL::RenderPass::type_SHADOW:
- return PASS_SHADOW;
-
- case BL::RenderPass::type_DIFFUSE:
- case BL::RenderPass::type_COLOR:
- case BL::RenderPass::type_REFRACTION:
- case BL::RenderPass::type_SPECULAR:
- case BL::RenderPass::type_REFLECTION:
- return PASS_NONE;
-#ifdef WITH_CYCLES_DEBUG
- case BL::RenderPass::type_DEBUG:
- {
- switch(b_pass.debug_type()) {
- case BL::RenderPass::debug_type_BVH_TRAVERSED_NODES:
- return PASS_BVH_TRAVERSED_NODES;
- case BL::RenderPass::debug_type_BVH_TRAVERSED_INSTANCES:
- return PASS_BVH_TRAVERSED_INSTANCES;
- case BL::RenderPass::debug_type_BVH_INTERSECTIONS:
- return PASS_BVH_INTERSECTIONS;
- case BL::RenderPass::debug_type_RAY_BOUNCES:
- return PASS_RAY_BOUNCES;
- }
- break;
- }
-#endif
- }
-
- return PASS_NONE;
-}
-
static ShaderEvalType get_shader_type(const string& pass_type)
{
const char *shader_type = pass_type.c_str();
@@ -487,22 +403,11 @@ void BlenderSession::render()
/* add passes */
array<Pass> passes;
- Pass::add(PASS_COMBINED, passes);
-
if(session_params.device.advanced_shading) {
-
- /* loop over passes */
- BL::RenderLayer::passes_iterator b_pass_iter;
-
- for(b_rlay.passes.begin(b_pass_iter); b_pass_iter != b_rlay.passes.end(); ++b_pass_iter) {
- BL::RenderPass b_pass(*b_pass_iter);
- PassType pass_type = get_pass_type(b_pass);
-
- if(pass_type == PASS_MOTION && scene->integrator->motion_blur)
- continue;
- if(pass_type != PASS_NONE)
- Pass::add(pass_type, passes);
- }
+ passes = sync->sync_render_passes(b_rlay, *b_layer_iter);
+ }
+ else {
+ Pass::add(PASS_COMBINED, passes);
}
buffer_params.passes = passes;
@@ -757,19 +662,25 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult& b_rr,
BL::RenderPass b_pass(*b_iter);
/* find matching pass type */
- PassType pass_type = get_pass_type(b_pass);
+ PassType pass_type = BlenderSync::get_pass_type(b_pass);
int components = b_pass.channels();
- /* copy pixels */
- if(!buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0]))
+ bool read = false;
+ if(pass_type != PASS_NONE) {
+ /* copy pixels */
+ read = buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0]);
+ }
+
+ if(!read) {
memset(&pixels[0], 0, pixels.size()*sizeof(float));
+ }
b_pass.rect(&pixels[0]);
}
}
else {
/* copy combined pass */
- BL::RenderPass b_combined_pass(b_rlay.passes.find_by_type(BL::RenderPass::type_COMBINED, b_rview_name.c_str()));
+ BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str()));
if(buffers->get_pass_rect(PASS_COMBINED, exposure, sample, 4, &pixels[0]))
b_combined_pass.rect(&pixels[0]);
}
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index abec9616cff..08120a2ca28 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -481,6 +481,94 @@ void BlenderSync::sync_images()
}
}
+/* Passes */
+PassType BlenderSync::get_pass_type(BL::RenderPass& b_pass)
+{
+ string name = b_pass.name();
+#define MAP_PASS(passname, passtype) if(name == passname) return passtype;
+ /* NOTE: Keep in sync with defined names from DNA_scene_types.h */
+ MAP_PASS("Combined", PASS_COMBINED);
+ MAP_PASS("Depth", PASS_DEPTH);
+ MAP_PASS("Mist", PASS_MIST);
+ MAP_PASS("Normal", PASS_NORMAL);
+ MAP_PASS("IndexOB", PASS_OBJECT_ID);
+ MAP_PASS("UV", PASS_UV);
+ MAP_PASS("Vector", PASS_MOTION);
+ MAP_PASS("IndexMA", PASS_MATERIAL_ID);
+
+ MAP_PASS("DiffDir", PASS_DIFFUSE_DIRECT);
+ MAP_PASS("GlossDir", PASS_GLOSSY_DIRECT);
+ MAP_PASS("TransDir", PASS_TRANSMISSION_DIRECT);
+ MAP_PASS("SubsurfaceDir", PASS_SUBSURFACE_DIRECT);
+
+ MAP_PASS("DiffInd", PASS_DIFFUSE_INDIRECT);
+ MAP_PASS("GlossInd", PASS_GLOSSY_INDIRECT);
+ MAP_PASS("TransInd", PASS_TRANSMISSION_INDIRECT);
+ MAP_PASS("SubsurfaceInd", PASS_SUBSURFACE_INDIRECT);
+
+ MAP_PASS("DiffCol", PASS_DIFFUSE_COLOR);
+ MAP_PASS("GlossCol", PASS_GLOSSY_COLOR);
+ MAP_PASS("TransCol", PASS_TRANSMISSION_COLOR);
+ MAP_PASS("SubsurfaceCol", PASS_SUBSURFACE_COLOR);
+
+ MAP_PASS("Emit", PASS_EMISSION);
+ MAP_PASS("Env", PASS_BACKGROUND);
+ MAP_PASS("AO", PASS_AO);
+ MAP_PASS("Shadow", PASS_SHADOW);
+
+#ifdef __KERNEL_DEBUG__
+ MAP_PASS("Debug BVH Traversed Nodes", PASS_BVH_TRAVERSED_NODES);
+ MAP_PASS("Debug BVH Traversed Instances", PASS_BVH_TRAVERSED_INSTANCES);
+ MAP_PASS("Debug BVH Intersections", PASS_BVH_INTERSECTIONS);
+ MAP_PASS("Debug Ray Bounces", PASS_RAY_BOUNCES);
+#endif
+#undef MAP_PASS
+
+ return PASS_NONE;
+}
+
+array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
+ BL::SceneRenderLayer& b_srlay)
+{
+ array<Pass> passes;
+ Pass::add(PASS_COMBINED, passes);
+
+ /* loop over passes */
+ BL::RenderLayer::passes_iterator b_pass_iter;
+
+ for(b_rlay.passes.begin(b_pass_iter); b_pass_iter != b_rlay.passes.end(); ++b_pass_iter) {
+ BL::RenderPass b_pass(*b_pass_iter);
+ PassType pass_type = get_pass_type(b_pass);
+
+ if(pass_type == PASS_MOTION && scene->integrator->motion_blur)
+ continue;
+ if(pass_type != PASS_NONE)
+ Pass::add(pass_type, passes);
+ }
+
+#ifdef __KERNEL_DEBUG__
+ PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles");
+ if(get_boolean(crp, "pass_debug_bvh_traversed_nodes")) {
+ b_engine.add_pass("Debug BVH Traversed Nodes", 1, "X", b_srlay.name().c_str());
+ Pass::add(PASS_BVH_TRAVERSED_NODES, passes);
+ }
+ if(get_boolean(crp, "pass_debug_bvh_traversed_instances")) {
+ b_engine.add_pass("Debug BVH Traversed Instances", 1, "X", b_srlay.name().c_str());
+ Pass::add(PASS_BVH_TRAVERSED_INSTANCES, passes);
+ }
+ if(get_boolean(crp, "pass_debug_bvh_intersections")) {
+ b_engine.add_pass("Debug BVH Intersections", 1, "X", b_srlay.name().c_str());
+ Pass::add(PASS_BVH_INTERSECTIONS, passes);
+ }
+ if(get_boolean(crp, "pass_debug_ray_bounces")) {
+ b_engine.add_pass("Debug Ray Bounces", 1, "X", b_srlay.name().c_str());
+ Pass::add(PASS_RAY_BOUNCES, passes);
+ }
+#endif
+
+ return passes;
+}
+
/* Scene Parameters */
SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene,
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index 1388e59f36d..bc7a4de3b36 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -68,6 +68,8 @@ public:
void **python_thread_state,
const char *layer = 0);
void sync_render_layers(BL::SpaceView3D& b_v3d, const char *layer);
+ array<Pass> sync_render_passes(BL::RenderLayer& b_rlay,
+ BL::SceneRenderLayer& b_srlay);
void sync_integrator();
void sync_camera(BL::RenderSettings& b_render,
BL::Object& b_override,
@@ -94,6 +96,8 @@ public:
Camera *cam,
int width, int height);
+ static PassType get_pass_type(BL::RenderPass& b_pass);
+
private:
/* sync */
void sync_lamps(bool update_all);
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index 71d52bb8097..9118793aad6 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -19,6 +19,7 @@
#include "kernel/kernel_types.h"
#include "kernel/split/kernel_split_data_types.h"
+#include "util/util_logging.h"
#include "util/util_time.h"
CCL_NAMESPACE_BEGIN
@@ -38,6 +39,8 @@ DeviceSplitKernel::DeviceSplitKernel(Device *device) : device(device)
kernel_do_volume = NULL;
kernel_queue_enqueue = NULL;
kernel_indirect_background = NULL;
+ kernel_shader_setup = NULL;
+ kernel_shader_sort = NULL;
kernel_shader_eval = NULL;
kernel_holdout_emission_blurring_pathtermination_ao = NULL;
kernel_subsurface_scatter = NULL;
@@ -63,6 +66,8 @@ DeviceSplitKernel::~DeviceSplitKernel()
delete kernel_do_volume;
delete kernel_queue_enqueue;
delete kernel_indirect_background;
+ delete kernel_shader_setup;
+ delete kernel_shader_sort;
delete kernel_shader_eval;
delete kernel_holdout_emission_blurring_pathtermination_ao;
delete kernel_subsurface_scatter;
@@ -88,6 +93,8 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe
LOAD_KERNEL(do_volume);
LOAD_KERNEL(queue_enqueue);
LOAD_KERNEL(indirect_background);
+ LOAD_KERNEL(shader_setup);
+ LOAD_KERNEL(shader_sort);
LOAD_KERNEL(shader_eval);
LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
LOAD_KERNEL(subsurface_scatter);
@@ -108,6 +115,9 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe
size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size)
{
uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
+ VLOG(1) << "Split state element size: "
+ << string_human_readable_number(size_per_element) << " bytes. ("
+ << string_human_readable_size(size_per_element) << ").";
return max_buffer_size / size_per_element;
}
@@ -237,9 +247,12 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
index 55548122c0c..58c2fdbb077 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -61,6 +61,8 @@ private:
SplitKernelFunction *kernel_do_volume;
SplitKernelFunction *kernel_queue_enqueue;
SplitKernelFunction *kernel_indirect_background;
+ SplitKernelFunction *kernel_shader_setup;
+ SplitKernelFunction *kernel_shader_sort;
SplitKernelFunction *kernel_shader_eval;
SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao;
SplitKernelFunction *kernel_subsurface_scatter;
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index 764216d0dfa..d061973dcb7 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -31,19 +31,16 @@ CCL_NAMESPACE_BEGIN
/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */
# undef clEnqueueNDRangeKernel
# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
- clFinish(a); \
CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \
clFinish(a);
# undef clEnqueueWriteBuffer
# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
- clFinish(a); \
CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \
clFinish(a);
# undef clEnqueueReadBuffer
# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
- clFinish(a); \
CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \
clFinish(a);
#endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 3750225571d..9bb0455b9d5 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -21,6 +21,8 @@ set(SRC
kernels/opencl/kernel_lamp_emission.cl
kernels/opencl/kernel_do_volume.cl
kernels/opencl/kernel_indirect_background.cl
+ kernels/opencl/kernel_shader_setup.cl
+ kernels/opencl/kernel_shader_sort.cl
kernels/opencl/kernel_shader_eval.cl
kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
kernels/opencl/kernel_subsurface_scatter.cl
@@ -235,6 +237,7 @@ set(SRC_UTIL_HEADERS
)
set(SRC_SPLIT_HEADERS
+ split/kernel_branched.h
split/kernel_buffer_update.h
split/kernel_data_init.h
split/kernel_direct_lighting.h
@@ -247,6 +250,8 @@ set(SRC_SPLIT_HEADERS
split/kernel_path_init.h
split/kernel_queue_enqueue.h
split/kernel_scene_intersect.h
+ split/kernel_shader_setup.h
+ split/kernel_shader_sort.h
split/kernel_shader_eval.h
split/kernel_shadow_blocked_ao.h
split/kernel_shadow_blocked_dl.h
@@ -456,6 +461,8 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_interse
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_do_volume.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_background.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_sort.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_subsurface_scatter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index e7957042182..58da141aed3 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -58,7 +58,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
ShaderData *sd,
ShaderData *emission_sd,
PathRadiance *L,
- PathState *state,
+ ccl_addr_space PathState *state,
RNG *rng,
float3 throughput,
float3 ao_alpha)
@@ -98,6 +98,8 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
}
}
+#ifndef __SPLIT_KERNEL__
+
ccl_device void kernel_path_indirect(KernelGlobals *kg,
ShaderData *sd,
ShaderData *emission_sd,
@@ -818,5 +820,7 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
path_rng_end(kg, rng_state, rng);
}
+#endif /* __SPLIT_KERNEL__ */
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index 085eb42325d..ddcb57161ea 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -22,7 +22,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
ShaderData *sd,
ShaderData *emission_sd,
PathRadiance *L,
- PathState *state,
+ ccl_addr_space PathState *state,
RNG *rng,
float3 throughput)
{
@@ -65,6 +65,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
}
}
+#ifndef __SPLIT_KERNEL__
/* bounce off surface and integrate indirect light */
ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
@@ -648,6 +649,8 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
path_rng_end(kg, rng_state, rng);
}
+#endif /* __SPLIT_KERNEL__ */
+
#endif /* __BRANCHED_PATH__ */
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index 076c82f3853..bd4ba775b4d 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -155,7 +155,7 @@ ccl_device bool kernel_branched_path_surface_bounce(
ccl_addr_space float3 *throughput,
ccl_addr_space PathState *state,
PathRadiance *L,
- Ray *ray)
+ ccl_addr_space Ray *ray)
{
/* sample BSDF */
float bsdf_pdf;
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index baf629342b9..274713addc2 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -417,9 +417,8 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
subsurface_scatter_setup_diffuse_bsdf(sd, sc, weight, true, N);
}
-#ifndef __SPLIT_KERNEL__
/* subsurface scattering step, from a point on the surface to another nearby point on the same object */
-ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathState *state,
+ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_global PathState *state,
int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
{
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -507,7 +506,6 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS
/* setup diffuse bsdf */
subsurface_scatter_setup_diffuse_bsdf(sd, sc, eval, (ss_isect.num_hits > 0), N);
}
-#endif /* ! __SPLIT_KERNEL__ */
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index b1269cdb6b4..9b354457b91 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -64,6 +64,18 @@ CCL_NAMESPACE_BEGIN
# define WORK_POOL_SIZE WORK_POOL_SIZE_CPU
#endif
+
+#define SHADER_SORT_BLOCK_SIZE 2048
+
+#ifdef __KERNEL_OPENCL__
+# define SHADER_SORT_LOCAL_SIZE 64
+#elif defined(__KERNEL_CUDA__)
+# define SHADER_SORT_LOCAL_SIZE 32
+#else
+# define SHADER_SORT_LOCAL_SIZE 1
+#endif
+
+
/* device capabilities */
#ifdef __KERNEL_CPU__
# ifdef __KERNEL_SSE2__
@@ -71,22 +83,18 @@ CCL_NAMESPACE_BEGIN
# endif
# define __KERNEL_SHADING__
# define __KERNEL_ADV_SHADING__
-# ifndef __SPLIT_KERNEL__
-# define __BRANCHED_PATH__
-# endif
+# define __BRANCHED_PATH__
# ifdef WITH_OSL
# define __OSL__
# endif
-# define __SUBSURFACE__
# define __PRINCIPLED__
+# define __SUBSURFACE__
# define __CMJ__
# define __VOLUME__
# define __VOLUME_SCATTER__
# define __SHADOW_RECORD_ALL__
-# ifndef __SPLIT_KERNEL__
-# define __VOLUME_DECOUPLED__
-# define __VOLUME_RECORD_ALL__
-# endif
+# define __VOLUME_DECOUPLED__
+# define __VOLUME_RECORD_ALL__
#endif /* __KERNEL_CPU__ */
#ifdef __KERNEL_CUDA__
@@ -138,6 +146,7 @@ CCL_NAMESPACE_BEGIN
# define __VOLUME_SCATTER__
# define __SHADOW_RECORD_ALL__
# define __CMJ__
+# define __BRANCHED_PATH__
# endif /* __KERNEL_OPENCL_AMD__ */
# ifdef __KERNEL_OPENCL_INTEL_CPU__
@@ -1300,7 +1309,6 @@ typedef ccl_addr_space struct DebugData {
* Queue 3 - Shadow ray cast kernel - AO
* Queue 4 - Shadow ray cast kernel - direct lighting
*/
-#define NUM_QUEUES 4
/* Queue names */
enum QueueNumber {
@@ -1313,22 +1321,40 @@ enum QueueNumber {
* 3. Rays to be regenerated
* are enqueued here.
*/
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS = 1,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
/* All rays for which a shadow ray should be cast to determine radiance
* contribution for AO are enqueued here.
*/
- QUEUE_SHADOW_RAY_CAST_AO_RAYS = 2,
+ QUEUE_SHADOW_RAY_CAST_AO_RAYS,
/* All rays for which a shadow ray should be cast to determine radiance
* contributing for direct lighting are enqueued here.
*/
- QUEUE_SHADOW_RAY_CAST_DL_RAYS = 3,
+ QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+
+ /* Rays sorted according to shader->id */
+ QUEUE_SHADER_SORTED_RAYS,
+
+#ifdef __BRANCHED_PATH__
+ /* All rays moving to next iteration of the indirect loop for light */
+ QUEUE_LIGHT_INDIRECT_ITER,
+# ifdef __VOLUME__
+ /* All rays moving to next iteration of the indirect loop for volumes */
+ QUEUE_VOLUME_INDIRECT_ITER,
+# endif
+# ifdef __SUBSURFACE__
+ /* All rays moving to next iteration of the indirect loop for subsurface */
+ QUEUE_SUBSURFACE_INDIRECT_ITER,
+# endif
+#endif /* __BRANCHED_PATH__ */
+
+ NUM_QUEUES
};
-/* We use RAY_STATE_MASK to get ray_state (enums 0 to 5) */
-#define RAY_STATE_MASK 0x007
-#define RAY_FLAG_MASK 0x0F8
+/* We use RAY_STATE_MASK to get ray_state */
+#define RAY_STATE_MASK 0x0F
+#define RAY_FLAG_MASK 0xF0
enum RayState {
RAY_INVALID = 0,
/* Denotes ray is actively involved in path-iteration. */
@@ -1343,14 +1369,22 @@ enum RayState {
RAY_TO_REGENERATE,
/* Denotes ray has been regenerated */
RAY_REGENERATED,
- /* Flag's ray has to execute shadow blocked function in AO part */
- RAY_SHADOW_RAY_CAST_AO = 16,
- /* Flag's ray has to execute shadow blocked function in direct lighting part. */
- RAY_SHADOW_RAY_CAST_DL = 32,
+ /* Denotes ray is moving to next iteration of the branched indirect loop */
+ RAY_LIGHT_INDIRECT_NEXT_ITER,
+ RAY_VOLUME_INDIRECT_NEXT_ITER,
+ RAY_SUBSURFACE_INDIRECT_NEXT_ITER,
+
+ /* Ray flags */
+
+ /* Flags to denote that the ray is currently evaluating the branched indirect loop */
+ RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4),
+ RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5),
+ RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6),
+ RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT | RAY_BRANCHED_SUBSURFACE_INDIRECT),
};
#define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state))
-#define IS_STATE(ray_state, ray_index, state) ((ray_state[ray_index] & RAY_STATE_MASK) == state)
+#define IS_STATE(ray_state, ray_index, state) ((ray_index) != QUEUE_EMPTY_SLOT && ((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state))
#define ADD_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] | flag))
#define REMOVE_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] & (~flag)))
#define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag)
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 896b80d783e..39c9a9cf33c 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -77,6 +77,8 @@ DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission)
DECLARE_SPLIT_KERNEL_FUNCTION(do_volume)
DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background)
+DECLARE_SPLIT_KERNEL_FUNCTION(shader_setup)
+DECLARE_SPLIT_KERNEL_FUNCTION(shader_sort)
DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval)
DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 148b2eef568..8c05dd1d9ef 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -44,6 +44,8 @@
# include "kernel/split/kernel_do_volume.h"
# include "kernel/split/kernel_queue_enqueue.h"
# include "kernel/split/kernel_indirect_background.h"
+# include "kernel/split/kernel_shader_setup.h"
+# include "kernel/split/kernel_shader_sort.h"
# include "kernel/split/kernel_shader_eval.h"
# include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
# include "kernel/split/kernel_subsurface_scatter.h"
@@ -181,9 +183,11 @@ DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
@@ -209,6 +213,8 @@ void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name,
REGISTER(do_volume);
REGISTER(queue_enqueue);
REGISTER(indirect_background);
+ REGISTER(shader_setup);
+ REGISTER(shader_sort);
REGISTER(shader_eval);
REGISTER(holdout_emission_blurring_pathtermination_ao);
REGISTER(subsurface_scatter);
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
index a679eff8409..8b7f1a8d405 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -31,6 +31,8 @@
#include "kernel/split/kernel_do_volume.h"
#include "kernel/split/kernel_queue_enqueue.h"
#include "kernel/split/kernel_indirect_background.h"
+#include "kernel/split/kernel_shader_setup.h"
+#include "kernel/split/kernel_shader_sort.h"
#include "kernel/split/kernel_shader_eval.h"
#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
#include "kernel/split/kernel_subsurface_scatter.h"
@@ -108,9 +110,11 @@ DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
index 6baee460986..5bfb31b193a 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
@@ -22,6 +22,5 @@ __kernel void kernel_ocl_path_trace_shader_eval(
ccl_global char *kg,
ccl_constant KernelData *data)
{
- ccl_local unsigned int local_queue_atomics;
- kernel_shader_eval((KernelGlobals*)kg, &local_queue_atomics);
+ kernel_shader_eval((KernelGlobals*)kg);
}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
new file mode 100644
index 00000000000..38bfd04ad4c
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shader_setup.h"
+
+__kernel void kernel_ocl_path_trace_shader_setup(
+ ccl_global char *kg,
+ ccl_constant KernelData *data)
+{
+ ccl_local unsigned int local_queue_atomics;
+ kernel_shader_setup((KernelGlobals*)kg, &local_queue_atomics);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
new file mode 100644
index 00000000000..6f722915d45
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shader_sort.h"
+
+__attribute__((reqd_work_group_size(64, 1, 1)))
+__kernel void kernel_ocl_path_trace_shader_sort(
+ ccl_global char *kg,
+ ccl_constant KernelData *data)
+{
+ ccl_local ShaderSortLocals locals;
+ kernel_shader_sort((KernelGlobals*)kg, &locals);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
index 732cda30115..8de82db7afe 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_split.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
@@ -23,6 +23,8 @@
#include "kernel/kernels/opencl/kernel_do_volume.cl"
#include "kernel/kernels/opencl/kernel_indirect_background.cl"
#include "kernel/kernels/opencl/kernel_queue_enqueue.cl"
+#include "kernel/kernels/opencl/kernel_shader_setup.cl"
+#include "kernel/kernels/opencl/kernel_shader_sort.cl"
#include "kernel/kernels/opencl/kernel_shader_eval.cl"
#include "kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl"
#include "kernel/kernels/opencl/kernel_subsurface_scatter.cl"
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
index 7a1838e485f..99b74a1802b 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
@@ -22,6 +22,5 @@ __kernel void kernel_ocl_path_trace_subsurface_scatter(
ccl_global char *kg,
ccl_constant KernelData *data)
{
- ccl_local unsigned int local_queue_atomics;
- kernel_subsurface_scatter((KernelGlobals*)kg, &local_queue_atomics);
+ kernel_subsurface_scatter((KernelGlobals*)kg);
}
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
new file mode 100644
index 00000000000..c7bc1b4df0a
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_branched.h
@@ -0,0 +1,150 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __BRANCHED_PATH__
+
+/* sets up the various state needed to do an indirect loop */
+ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg, int ray_index)
+{
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ /* save a copy of the state to restore later */
+#define BRANCHED_STORE(name) \
+ branched_state->name = kernel_split_state.name[ray_index];
+
+ BRANCHED_STORE(path_state);
+ BRANCHED_STORE(throughput);
+ BRANCHED_STORE(ray);
+ BRANCHED_STORE(sd);
+ BRANCHED_STORE(isect);
+ BRANCHED_STORE(ray_state);
+
+#undef BRANCHED_STORE
+
+ /* set loop counters to initial position */
+ branched_state->next_closure = 0;
+ branched_state->next_sample = 0;
+}
+
+/* ends an indirect loop and restores the previous state */
+ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg, int ray_index)
+{
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ /* restore state */
+#define BRANCHED_RESTORE(name) \
+ kernel_split_state.name[ray_index] = branched_state->name;
+
+ BRANCHED_RESTORE(path_state);
+ BRANCHED_RESTORE(throughput);
+ BRANCHED_RESTORE(ray);
+ BRANCHED_RESTORE(sd);
+ BRANCHED_RESTORE(isect);
+ BRANCHED_RESTORE(ray_state);
+
+#undef BRANCHED_RESTORE
+
+ /* leave indirect loop */
+ REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT);
+}
+
+/* bounce off surface and integrate indirect light */
+ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(KernelGlobals *kg,
+ int ray_index,
+ float num_samples_adjust,
+ ShaderData *saved_sd,
+ bool reset_path_state)
+{
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ ShaderData *sd = saved_sd;
+ RNG rng = kernel_split_state.rng[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ float3 throughput = branched_state->throughput;
+
+ for(int i = branched_state->next_closure; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ if(!CLOSURE_IS_BSDF(sc->type))
+ continue;
+ /* transparency is not handled here, but in outer loop */
+ if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
+ continue;
+
+ int num_samples;
+
+ if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
+ num_samples = kernel_data.integrator.diffuse_samples;
+ else if(CLOSURE_IS_BSDF_BSSRDF(sc->type))
+ num_samples = 1;
+ else if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
+ num_samples = kernel_data.integrator.glossy_samples;
+ else
+ num_samples = kernel_data.integrator.transmission_samples;
+
+ num_samples = ceil_to_int(num_samples_adjust*num_samples);
+
+ float num_samples_inv = num_samples_adjust/num_samples;
+ RNG bsdf_rng = cmj_hash(rng, i);
+
+ for(int j = branched_state->next_sample; j < num_samples; j++) {
+ ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
+ if(reset_path_state) {
+ *ps = branched_state->path_state;
+ }
+
+ ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
+ *tp = throughput;
+
+ ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index];
+
+ if(!kernel_branched_path_surface_bounce(kg,
+ &bsdf_rng,
+ sd,
+ sc,
+ j,
+ num_samples,
+ tp,
+ ps,
+ L,
+ bsdf_ray))
+ {
+ continue;
+ }
+
+ /* update state for next iteration */
+ branched_state->next_closure = i;
+ branched_state->next_sample = j+1;
+ branched_state->num_samples = num_samples;
+
+ /* start the indirect path */
+ *tp *= num_samples_inv;
+
+ return true;
+ }
+
+ branched_state->next_sample = 0;
+ }
+
+ return false;
+}
+
+#endif /* __BRANCHED_PATH__ */
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index 9d3d01fff75..642ccac8239 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -105,21 +105,16 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
/* Initialize queue data and queue index. */
if(thread_index < queuesize) {
- /* Initialize active ray queue. */
- kernel_split_state.queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- /* Initialize background and buffer update queue. */
- kernel_split_state.queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- /* Initialize shadow ray cast of AO queue. */
- kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- /* Initialize shadow ray cast of direct lighting queue. */
- kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+ for(int i = 0; i < NUM_QUEUES; i++) {
+ kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+ }
}
if(thread_index == 0) {
- Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
- Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
- Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
- Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+ for(int i = 0; i < NUM_QUEUES; i++) {
+ Queue_index[i] = 0;
+ }
+
/* The scene-intersect kernel should not use the queues very first time.
* since the queue would be empty.
*/
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
index bdbf7387b95..3336c968a44 100644
--- a/intern/cycles/kernel/split/kernel_direct_lighting.h
+++ b/intern/cycles/kernel/split/kernel_direct_lighting.h
@@ -56,23 +56,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
kernel_split_params.queue_size,
0);
-#ifdef __COMPUTE_DEVICE_GPU__
- /* If we are executing on a GPU device, we exit all threads that are not
- * required.
- *
- * If we are executing on a CPU device, then we need to keep all threads
- * active since we have barrier() calls later in the kernel. CPU devices,
- * expect all threads to execute barrier statement.
- */
- if(ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
- if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-
if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
ShaderData *sd = &kernel_split_state.sd[ray_index];
@@ -80,25 +63,24 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
/* direct lighting */
#ifdef __EMISSION__
RNG rng = kernel_split_state.rng[ray_index];
+
bool flag = (kernel_data.integrator.use_direct_light &&
(sd->flag & SD_BSDF_HAS_EVAL));
+
+# ifdef __BRANCHED_PATH__
+ if(flag && kernel_data.integrator.branched) {
+ flag = false;
+ enqueue_flag = 1;
+ }
+# endif /* __BRANCHED_PATH__ */
+
# ifdef __SHADOW_TRICKS__
if(flag && state->flag & PATH_RAY_SHADOW_CATCHER) {
flag = false;
- ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
- float3 throughput = kernel_split_state.throughput[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- kernel_branched_path_surface_connect_light(kg,
- &rng,
- sd,
- emission_sd,
- state,
- throughput,
- 1.0f,
- L,
- 1);
+ enqueue_flag = 1;
}
# endif /* __SHADOW_TRICKS__ */
+
if(flag) {
/* Sample illumination from lights to find path contribution. */
float light_t = path_state_rng_1D(kg, &rng, state, PRNG_LIGHT);
@@ -129,7 +111,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
kernel_split_state.bsdf_eval[ray_index] = L_light;
kernel_split_state.is_lamp[ray_index] = is_lamp;
/* Mark ray state for next shadow kernel. */
- ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
enqueue_flag = 1;
}
}
@@ -138,10 +119,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
#endif /* __EMISSION__ */
}
-#ifndef __COMPUTE_DEVICE_GPU__
- }
-#endif
-
#ifdef __EMISSION__
/* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
enqueue_ray_index_local(ray_index,
@@ -152,6 +129,27 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
kernel_split_state.queue_data,
kernel_split_params.queue_index);
#endif
+
+#ifdef __BRANCHED_PATH__
+ /* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays
+ * this is the last kernel before next_iteration_setup that uses local atomics so we do this here
+ */
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ enqueue_ray_index_local(ray_index,
+ QUEUE_LIGHT_INDIRECT_ITER,
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER),
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+
+#endif /* __BRANCHED_PATH__ */
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h
index 47d3c280831..182e6c6e4fa 100644
--- a/intern/cycles/kernel/split/kernel_do_volume.h
+++ b/intern/cycles/kernel/split/kernel_do_volume.h
@@ -16,6 +16,81 @@
CCL_NAMESPACE_BEGIN
+#if defined(__BRANCHED_PATH__) && defined(__VOLUME__)
+
+ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg, int ray_index)
+{
+ kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+ ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT);
+}
+
+ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg, int ray_index)
+{
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ RNG rng = kernel_split_state.rng[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+ /* GPU: no decoupled ray marching, scatter probabilistically */
+ int num_samples = kernel_data.integrator.volume_samples;
+ float num_samples_inv = 1.0f/num_samples;
+
+ Ray volume_ray = branched_state->ray;
+ volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ? branched_state->isect.t : FLT_MAX;
+
+ bool heterogeneous = volume_stack_is_heterogeneous(kg, branched_state->path_state.volume_stack);
+
+ for(int j = branched_state->next_sample; j < num_samples; j++) {
+ ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
+ *ps = branched_state->path_state;
+
+ ccl_global Ray *pray = &kernel_split_state.ray[ray_index];
+ *pray = branched_state->ray;
+
+ ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
+ *tp = branched_state->throughput * num_samples_inv;
+
+ /* branch RNG state */
+ path_state_branch(ps, j, num_samples);
+
+ /* integrate along volume segment with distance sampling */
+ VolumeIntegrateResult result = kernel_volume_integrate(
+ kg, ps, sd, &volume_ray, L, tp, &rng, heterogeneous);
+
+# ifdef __VOLUME_SCATTER__
+ if(result == VOLUME_PATH_SCATTERED) {
+ /* direct lighting */
+ kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *tp, &branched_state->path_state, L);
+
+ /* indirect light bounce */
+ if(!kernel_path_volume_bounce(kg, &rng, sd, tp, ps, L, pray)) {
+ continue;
+ }
+
+ /* start the indirect path */
+ branched_state->next_closure = 0;
+ branched_state->next_sample = j+1;
+ branched_state->num_samples = num_samples;
+
+ return true;
+ }
+# endif
+ }
+
+ kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+ /* todo: avoid this calculation using decoupled ray marching */
+ float3 throughput = kernel_split_state.throughput[ray_index];
+ kernel_volume_shadow(kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput);
+ kernel_split_state.throughput[ray_index] = throughput;
+
+ return false;
+}
+
+#endif /* __BRANCHED_PATH__ && __VOLUME__ */
ccl_device void kernel_do_volume(KernelGlobals *kg)
{
@@ -23,37 +98,37 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
/* We will empty this queue in this kernel. */
if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+# ifdef __BRANCHED_PATH__
+ kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0;
+# endif /* __BRANCHED_PATH__ */
}
- /* Fetch use_queues_flag. */
- char local_use_queues_flag = *kernel_split_params.use_queues_flag;
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if(local_use_queues_flag) {
+
+ if(*kernel_split_params.use_queues_flag) {
ray_index = get_ray_index(kg, ray_index,
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
kernel_split_state.queue_data,
kernel_split_params.queue_size,
1);
- if(ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
}
- if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+ ccl_global char *ray_state = kernel_split_state.ray_state;
- bool hit = ! IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND);
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+ RNG rng = kernel_split_state.rng[ray_index];
+ ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- RNG rng = kernel_split_state.rng[ray_index];
- ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
- ShaderData *sd = &kernel_split_state.sd[ray_index];
- ShaderData *sd_input = &kernel_split_state.sd_DL_shadow[ray_index];
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
+ IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+
+ bool hit = !IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
/* Sanitize volume stack. */
if(!hit) {
@@ -64,31 +139,68 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
Ray volume_ray = *ray;
volume_ray.t = (hit)? isect->t: FLT_MAX;
- bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+# ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+# endif /* __BRANCHED_PATH__ */
+ bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
- {
- /* integrate along volume segment with distance sampling */
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous);
+ {
+ /* integrate along volume segment with distance sampling */
+ VolumeIntegrateResult result = kernel_volume_integrate(
+ kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous);
# ifdef __VOLUME_SCATTER__
- if(result == VOLUME_PATH_SCATTERED) {
- /* direct lighting */
- kernel_path_volume_connect_light(kg, &rng, sd, sd_input, *throughput, state, L);
-
- /* indirect light bounce */
- if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray))
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED);
- else
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER);
+ if(result == VOLUME_PATH_SCATTERED) {
+ /* direct lighting */
+ kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *throughput, state, L);
+
+ /* indirect light bounce */
+ if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ else {
+ kernel_split_path_end(kg, ray_index);
+ }
+ }
+# endif /* __VOLUME_SCATTER__ */
+ }
+
+# ifdef __BRANCHED_PATH__
+ }
+ else {
+ kernel_split_branched_path_volume_indirect_light_init(kg, ray_index);
+
+ if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
}
-# endif
}
+# endif /* __BRANCHED_PATH__ */
}
+
kernel_split_state.rng[ray_index] = rng;
}
-#endif
+# ifdef __BRANCHED_PATH__
+ /* iter loop */
+ ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+ QUEUE_VOLUME_INDIRECT_ITER,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+
+ if(IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) {
+ /* for render passes, sum and reset indirect light pass variables
+ * for the next samples */
+ path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
+ path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
+
+ if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ }
+# endif /* __BRANCHED_PATH__ */
+
+#endif /* __VOLUME__ */
}
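kernel_split_branched_path_volume_indirect_light_iter() above turns a nested sampling loop into something that can be suspended and resumed: the loop counter lives in the persistent SplitBranchedState, the function returns true as soon as one indirect bounce is spawned, and a ray that finishes that bounce is later marked RAY_VOLUME_INDIRECT_NEXT_ITER so the tail of kernel_do_volume can pick the loop up again. A minimal sketch of that resumable-loop idea follows; BranchedState and spawn_bounce are hypothetical names, not the Cycles API.

// Illustrative sketch of the resumable loop used by the branched volume
// indirect light iteration: the sample counter lives in persistent per-ray
// state so the kernel can stop as soon as one indirect bounce is spawned and
// continue from the same place on the next pass.
#include <cstdio>

struct BranchedState {
  int next_sample = 0;
  int num_samples = 4;
};

// Returns true when an indirect path was started and the ray should be
// re-queued as RAY_REGENERATED; false when all samples are finished.
bool indirect_light_iter(BranchedState *bs, bool (*spawn_bounce)(int))
{
  for(int j = bs->next_sample; j < bs->num_samples; j++) {
    if(spawn_bounce(j)) {
      bs->next_sample = j + 1;   /* resume after this sample next time */
      return true;
    }
  }
  bs->next_sample = 0;           /* loop finished, normal path state is restored */
  return false;
}

int main()
{
  BranchedState bs;
  auto spawn = [](int j) { return j % 2 == 0; };  /* pretend even samples scatter */
  while(indirect_light_iter(&bs, spawn))
    printf("regenerated at sample %d\n", bs.next_sample - 1);
  printf("volume indirect loop done\n");
  return 0;
}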
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 89adeb64c8a..87498910d38 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -52,6 +52,7 @@ CCL_NAMESPACE_BEGIN
* - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with
* flag RAY_SHADOW_RAY_CAST_AO
*/
+
ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
KernelGlobals *kg,
ccl_local_param BackgroundAOLocals *locals)
@@ -62,8 +63,9 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
}
ccl_barrier(CCL_LOCAL_MEM_FENCE);
+#ifdef __AO__
char enqueue_flag = 0;
- char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
+#endif
int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
ray_index = get_ray_index(kg, ray_index,
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
@@ -155,8 +157,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput);
}
if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- enqueue_flag = 1;
+ kernel_split_path_end(kg, ray_index);
}
}
#endif /* __HOLDOUT__ */
@@ -164,18 +165,31 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- /* Holdout mask objects do not write data passes. */
- kernel_write_data_passes(kg,
- buffer,
- L,
- sd,
- sample,
- state,
- throughput);
+
+#ifdef __BRANCHED_PATH__
+ if(!IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT))
+#endif /* __BRANCHED_PATH__ */
+ {
+ /* Holdout mask objects do not write data passes. */
+ kernel_write_data_passes(kg,
+ buffer,
+ L,
+ sd,
+ sample,
+ state,
+ throughput);
+ }
+
/* Blurring of bsdf after bounces, for rays that have a small likelihood
* of following this particular path (diffuse, rough glossy).
*/
- if(kernel_data.integrator.filter_glossy != FLT_MAX) {
+#ifndef __BRANCHED_PATH__
+ if(kernel_data.integrator.filter_glossy != FLT_MAX)
+#else
+ if(kernel_data.integrator.filter_glossy != FLT_MAX &&
+ (!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)))
+#endif /* __BRANCHED_PATH__ */
+ {
float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
if(blur_pdf < 1.0f) {
float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
@@ -201,19 +215,32 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
* mainly due to the mixed in MIS that we use. gives too many unneeded
* shader evaluations, only need emission if we are going to terminate.
*/
+#ifndef __BRANCHED_PATH__
float probability = path_state_terminate_probability(kg, state, throughput);
+#else
+ float probability = 1.0f;
+
+ if(!kernel_data.integrator.branched) {
+ probability = path_state_terminate_probability(kg, state, throughput);
+ }
+ else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+ int num_samples = kernel_split_state.branched_state[ray_index].num_samples;
+ probability = path_state_terminate_probability(kg, state, throughput*num_samples);
+ }
+ else if(state->flag & PATH_RAY_TRANSPARENT) {
+ probability = path_state_terminate_probability(kg, state, throughput);
+ }
+#endif
if(probability == 0.0f) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- enqueue_flag = 1;
+ kernel_split_path_end(kg, ray_index);
}
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
if(probability != 1.0f) {
float terminate = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_TERMINATE);
if(terminate >= probability) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- enqueue_flag = 1;
+ kernel_split_path_end(kg, ray_index);
}
else {
kernel_split_state.throughput[ray_index] = throughput/probability;
@@ -225,61 +252,23 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
#ifdef __AO__
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
/* ambient occlusion */
- if(kernel_data.integrator.use_ambient_occlusion ||
- (sd->flag & SD_AO))
- {
- /* todo: solve correlation */
- float bsdf_u, bsdf_v;
- path_state_rng_2D(kg, &rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
- float ao_factor = kernel_data.background.ao_factor;
- float3 ao_N;
- kernel_split_state.ao_bsdf[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
- kernel_split_state.ao_alpha[ray_index] = shader_bsdf_alpha(kg, sd);
-
- float3 ao_D;
- float ao_pdf;
- sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
- if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
- Ray _ray;
- _ray.P = ray_offset(sd->P, sd->Ng);
- _ray.D = ao_D;
- _ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
- _ray.time = sd->time;
-#endif
- _ray.dP = sd->dP;
- _ray.dD = differential3_zero();
- kernel_split_state.ao_light_ray[ray_index] = _ray;
-
- ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
- enqueue_flag_AO_SHADOW_RAY_CAST = 1;
- }
+ if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
+ enqueue_flag = 1;
}
}
#endif /* __AO__ */
- kernel_split_state.rng[ray_index] = rng;
+ kernel_split_state.rng[ray_index] = rng;
#ifndef __COMPUTE_DEVICE_GPU__
}
#endif
- /* Enqueue RAY_UPDATE_BUFFER rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- &locals->queue_atomics_bg,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
#ifdef __AO__
/* Enqueue to-shadow-ray-cast rays. */
enqueue_ray_index_local(ray_index,
QUEUE_SHADOW_RAY_CAST_AO_RAYS,
- enqueue_flag_AO_SHADOW_RAY_CAST,
+ enqueue_flag,
kernel_split_params.queue_size,
&locals->queue_atomics_ao,
kernel_split_state.queue_data,
diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h
index 8192528622e..6fbc888e358 100644
--- a/intern/cycles/kernel/split/kernel_indirect_background.h
+++ b/intern/cycles/kernel/split/kernel_indirect_background.h
@@ -34,7 +34,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
if(state->bounce > kernel_data.integrator.ao_bounces) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ kernel_split_path_end(kg, ray_index);
}
}
}
@@ -63,7 +63,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
#ifdef __PASSES__
if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
#endif
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ kernel_split_path_end(kg, ray_index);
}
if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
@@ -72,7 +72,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
float3 L_background = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray);
path_radiance_accum_background(L, state, (*throughput), L_background);
#endif
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ kernel_split_path_end(kg, ray_index);
}
}
diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
index a56e85abeb9..82bc2f01fd7 100644
--- a/intern/cycles/kernel/split/kernel_indirect_subsurface.h
+++ b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
@@ -49,26 +49,29 @@ ccl_device void kernel_indirect_subsurface(KernelGlobals *kg)
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
- kernel_path_subsurface_accum_indirect(ss_indirect, L);
+#ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched) {
+#endif
+ if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+ ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
+ kernel_path_subsurface_accum_indirect(ss_indirect, L);
- /* Trace indirect subsurface rays by restarting the loop. this uses less
- * stack memory than invoking kernel_path_indirect.
- */
- if(ss_indirect->num_rays) {
- kernel_path_subsurface_setup_indirect(kg,
- ss_indirect,
- state,
- ray,
- L,
- throughput);
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- else {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ /* Trace indirect subsurface rays by restarting the loop. this uses less
+ * stack memory than invoking kernel_path_indirect.
+ */
+ if(ss_indirect->num_rays) {
+ kernel_path_subsurface_setup_indirect(kg,
+ ss_indirect,
+ state,
+ ray,
+ L,
+ throughput);
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
}
+#ifdef __BRANCHED_PATH__
}
+#endif
#endif /* __SUBSURFACE__ */
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
index 1bebc16e25b..71017fed19e 100644
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -44,6 +44,52 @@ CCL_NAMESPACE_BEGIN
* - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
* RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays.
*/
+
+#ifdef __BRANCHED_PATH__
+ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index)
+{
+ kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+ ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT);
+}
+
+ccl_device void kernel_split_branched_indirect_light_end(KernelGlobals *kg, int ray_index)
+{
+ kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+ ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+
+ /* continue in case of transparency */
+ *throughput *= shader_bsdf_transparency(kg, sd);
+
+ if(is_zero(*throughput)) {
+ kernel_split_path_end(kg, ray_index);
+ }
+ else {
+ /* Update Path State */
+ state->flag |= PATH_RAY_TRANSPARENT;
+ state->transparent_bounce++;
+
+ ray->P = ray_offset(sd->P, -sd->Ng);
+ ray->t -= sd->ray_length; /* clipping works through transparent */
+
+# ifdef __RAY_DIFFERENTIALS__
+ ray->dP = sd->dP;
+ ray->dD.dx = -sd->dI.dx;
+ ray->dD.dy = -sd->dI.dy;
+# endif /* __RAY_DIFFERENTIALS__ */
+
+# ifdef __VOLUME__
+ /* enter/exit volume */
+ kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
+# endif /* __VOLUME__ */
+ }
+}
+#endif /* __BRANCHED_PATH__ */
+
ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
ccl_local_param unsigned int *local_queue_atomics)
{
@@ -67,7 +113,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
}
- char enqueue_flag = 0;
int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
ray_index = get_ray_index(kg, ray_index,
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
@@ -75,102 +120,125 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
kernel_split_params.queue_size,
0);
-#ifdef __COMPUTE_DEVICE_GPU__
- /* If we are executing on a GPU device, we exit all threads that are not
- * required.
- *
- * If we are executing on a CPU device, then we need to keep all threads
- * active since we have barrier() calls later in the kernel. CPU devices,
- * expect all threads to execute barrier statement.
- */
- if(ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
- if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-
- /* Load ShaderData structure. */
- PathRadiance *L = NULL;
- ccl_global PathState *state = NULL;
ccl_global char *ray_state = kernel_split_state.ray_state;
- /* Path radiance update for AO/Direct_lighting's shadow blocked. */
- if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
- IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
- {
- state = &kernel_split_state.path_state[ray_index];
- L = &kernel_split_state.path_radiance[ray_index];
- float3 _throughput = kernel_split_state.throughput[ray_index];
-
- if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
- float3 shadow = kernel_split_state.ao_light_ray[ray_index].P;
- // TODO(mai): investigate correctness here
- char update_path_radiance = (char)kernel_split_state.ao_light_ray[ray_index].t;
- if(update_path_radiance) {
- path_radiance_accum_ao(L,
- _throughput,
- kernel_split_state.ao_alpha[ray_index],
- kernel_split_state.ao_bsdf[ray_index],
- shadow,
- state->bounce);
- }
- else {
- path_radiance_accum_total_ao(L, _throughput, kernel_split_state.ao_bsdf[ray_index]);
+ bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE);
+ if(active) {
+ ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+ RNG rng = kernel_split_state.rng[ray_index];
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+
+#ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+#endif
+ /* Compute direct lighting and next bounce. */
+ if(!kernel_path_surface_bounce(kg, &rng, sd, throughput, state, L, ray)) {
+ kernel_split_path_end(kg, ray_index);
}
- REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+#ifdef __BRANCHED_PATH__
}
-
- if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
- float3 shadow = kernel_split_state.light_ray[ray_index].P;
- // TODO(mai): investigate correctness here
- char update_path_radiance = (char)kernel_split_state.light_ray[ray_index].t;
- BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
- if(update_path_radiance) {
- path_radiance_accum_light(L,
- _throughput,
- &L_light,
- shadow,
- 1.0f,
- state->bounce,
- kernel_split_state.is_lamp[ray_index]);
+ else {
+ kernel_split_branched_indirect_light_init(kg, ray_index);
+
+ if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+ ray_index,
+ 1.0f,
+ &kernel_split_state.branched_state[ray_index].sd,
+ true))
+ {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
}
else {
- path_radiance_accum_total_light(L, _throughput, &L_light);
+ kernel_split_branched_indirect_light_end(kg, ray_index);
}
- REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
}
+#endif /* __BRANCHED_PATH__ */
+
+ kernel_split_state.rng[ray_index] = rng;
}
- if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- RNG rng = kernel_split_state.rng[ray_index];
- state = &kernel_split_state.path_state[ray_index];
- L = &kernel_split_state.path_radiance[ray_index];
+ /* Enqueue RAY_UPDATE_BUFFER rays. */
+ enqueue_ray_index_local(ray_index,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active,
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+
+#ifdef __BRANCHED_PATH__
+ /* iter loop */
+ if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+ kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0;
+ }
- /* Compute direct lighting and next bounce. */
- if(!kernel_path_surface_bounce(kg, &rng, &kernel_split_state.sd[ray_index], throughput, state, L, ray)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- enqueue_flag = 1;
+ ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+ QUEUE_LIGHT_INDIRECT_ITER,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+
+ if(IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) {
+ /* for render passes, sum and reset indirect light pass variables
+ * for the next samples */
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+
+ path_radiance_sum_indirect(L);
+ path_radiance_reset_indirect(L);
+
+ if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+ ray_index,
+ 1.0f,
+ &kernel_split_state.branched_state[ray_index].sd,
+ true))
+ {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ else {
+ kernel_split_branched_indirect_light_end(kg, ray_index);
}
- kernel_split_state.rng[ray_index] = rng;
}
-#ifndef __COMPUTE_DEVICE_GPU__
+# ifdef __VOLUME__
+ /* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
}
-#endif
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
- /* Enqueue RAY_UPDATE_BUFFER rays. */
+ ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- enqueue_flag,
+ QUEUE_VOLUME_INDIRECT_ITER,
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER),
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+
+# endif /* __VOLUME__ */
+
+# ifdef __SUBSURFACE__
+ /* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ enqueue_ray_index_local(ray_index,
+ QUEUE_SUBSURFACE_INDIRECT_ITER,
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER),
kernel_split_params.queue_size,
local_queue_atomics,
kernel_split_state.queue_data,
kernel_split_params.queue_index);
+# endif /* __SUBSURFACE__ */
+#endif /* __BRANCHED_PATH__ */
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
index 0f1696e34a0..2801b32f285 100644
--- a/intern/cycles/kernel/split/kernel_shader_eval.h
+++ b/intern/cycles/kernel/split/kernel_shader_eval.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2015 Blender Foundation
+ * Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -16,54 +16,61 @@
CCL_NAMESPACE_BEGIN
-/* This kernel sets up the ShaderData structure from the values computed
+/* This kernel evaluates the ShaderData structure from the values computed
* by the previous kernels.
- *
- * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them
- * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
*/
-ccl_device void kernel_shader_eval(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
+ccl_device void kernel_shader_eval(KernelGlobals *kg)
{
- /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
- if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ /* Sorting on the CUDA split kernel is not implemented. */
+#ifdef __KERNEL_CUDA__
+ int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
+#else
+ int queue_index = kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS];
+#endif
+ if(ray_index >= queue_index) {
+ return;
+ }
ray_index = get_ray_index(kg, ray_index,
+#ifdef __KERNEL_CUDA__
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+#else
+ QUEUE_SHADER_SORTED_RAYS,
+#endif
kernel_split_state.queue_data,
kernel_split_params.queue_size,
0);
- char enqueue_flag = 0;
- if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) {
- enqueue_flag = 1;
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
}
- enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
- /* Continue on with shader evaluation. */
- if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
- Intersection isect = kernel_split_state.isect[ray_index];
+ ccl_global char *ray_state = kernel_split_state.ray_state;
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
RNG rng = kernel_split_state.rng[ray_index];
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- Ray ray = kernel_split_state.ray[ray_index];
- shader_setup_from_ray(kg,
- &kernel_split_state.sd[ray_index],
- &isect,
- &ray);
+#ifndef __BRANCHED_PATH__
float rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF);
shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
+#else
+ ShaderContext ctx = SHADER_CONTEXT_MAIN;
+ float rbsdf = 0.0f;
+
+ if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+ rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF);
+
+ }
+
+ if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+ ctx = SHADER_CONTEXT_INDIRECT;
+ }
+
+ shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, ctx);
+ shader_merge_closures(&kernel_split_state.sd[ray_index]);
+#endif /* __BRANCHED_PATH__ */
+
kernel_split_state.rng[ray_index] = rng;
}
}
diff --git a/intern/cycles/kernel/split/kernel_shader_setup.h b/intern/cycles/kernel/split/kernel_shader_setup.h
new file mode 100644
index 00000000000..0432689d9fa
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shader_setup.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* This kernel sets up the ShaderData structure from the values computed
+ * by the previous kernels.
+ *
+ * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them
+ * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+ */
+ccl_device void kernel_shader_setup(KernelGlobals *kg,
+ ccl_local_param unsigned int *local_queue_atomics)
+{
+ /* Enqueue RAY_TO_REGENERATE rays into the QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
+ if(ray_index >= queue_index) {
+ return;
+ }
+ ray_index = get_ray_index(kg, ray_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 0);
+
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+
+ char enqueue_flag = (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
+ enqueue_ray_index_local(ray_index,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ enqueue_flag,
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+
+ /* Continue on with shader evaluation. */
+ if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+ Intersection isect = kernel_split_state.isect[ray_index];
+ Ray ray = kernel_split_state.ray[ray_index];
+
+ shader_setup_from_ray(kg,
+ &kernel_split_state.sd[ray_index],
+ &isect,
+ &ray);
+ }
+}
+
+CCL_NAMESPACE_END
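kernel_shader_setup() (like most of the kernels above) goes from its thread index to a ray index through get_ray_index(): the thread reads its own slot from the chosen queue, and the final argument says whether the slot is consumed (the kernels that reset the corresponding queue_index to 0 pass 1). The sketch below is a guess at semantics consistent with that usage, not the real helper from kernel_queues.h.

// Hypothetical re-creation of the get_ray_index() contract as used above:
// queues are stored back to back in one flat array, and a thread owns the
// slot matching its thread index. Passing clear_slot consumes the entry.
#include <cstdio>

enum { QUEUE_EMPTY_SLOT = -1 };

int get_ray_index(int thread_index, int queue_number,
                  int *queue_data, int queue_size, bool clear_slot)
{
  int ray_index = queue_data[queue_number * queue_size + thread_index];
  if(clear_slot) {
    queue_data[queue_number * queue_size + thread_index] = QUEUE_EMPTY_SLOT;
  }
  return ray_index;
}

int main()
{
  int queue_data[2 * 4] = { 5, 9, QUEUE_EMPTY_SLOT, 2,      /* queue 0 */
                            7, QUEUE_EMPTY_SLOT, 1, 3 };    /* queue 1 */
  printf("thread 1, queue 0 -> ray %d\n", get_ray_index(1, 0, queue_data, 4, true));
  printf("same slot again   -> %d (consumed)\n", get_ray_index(1, 0, queue_data, 4, false));
  return 0;
}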
diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h
new file mode 100644
index 00000000000..297decb0bc2
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shader_sort.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+
+ccl_device void kernel_shader_sort(KernelGlobals *kg,
+ ccl_local_param ShaderSortLocals *locals)
+{
+#ifndef __KERNEL_CUDA__
+ int tid = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ uint qsize = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
+ if(tid == 0) {
+ kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS] = qsize;
+ }
+
+ uint offset = (tid/SHADER_SORT_LOCAL_SIZE)*SHADER_SORT_BLOCK_SIZE;
+ if(offset >= qsize) {
+ return;
+ }
+
+ int lid = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
+ uint input = QUEUE_ACTIVE_AND_REGENERATED_RAYS * (kernel_split_params.queue_size);
+ uint output = QUEUE_SHADER_SORTED_RAYS * (kernel_split_params.queue_size);
+ ccl_local uint *local_value = &locals->local_value[0];
+ ccl_local ushort *local_index = &locals->local_index[0];
+
+ /* copy to local memory */
+ for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
+ uint idx = offset + i + lid;
+ uint add = input + idx;
+ uint value = (~0);
+ if(idx < qsize) {
+ int ray_index = kernel_split_state.queue_data[add];
+ bool valid = (ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
+ if(valid) {
+ value = kernel_split_state.sd[ray_index].shader & SHADER_MASK;
+ }
+ }
+ local_value[i + lid] = value;
+ local_index[i + lid] = i + lid;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ /* skip sorting for the CPU split kernel */
+# ifdef __KERNEL_OPENCL__
+
+ /* bitonic sort */
+ for(uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) {
+ for(uint inc = length; inc > 0; inc >>= 1) {
+ for(uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) {
+ uint i = lid + ii;
+ bool direction = ((i & (length << 1)) != 0);
+ uint j = i ^ inc;
+ ushort ioff = local_index[i];
+ ushort joff = local_index[j];
+ uint iKey = local_value[ioff];
+ uint jKey = local_value[joff];
+ bool smaller = (jKey < iKey) || (jKey == iKey && j < i);
+ bool swap = smaller ^ (j < i) ^ direction;
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ local_index[i] = (swap) ? joff : ioff;
+ local_index[j] = (swap) ? ioff : joff;
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ }
+ }
+ }
+# endif /* __KERNEL_OPENCL__ */
+
+ /* copy to destination */
+ for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
+ uint idx = offset + i + lid;
+ uint lidx = local_index[i + lid];
+ uint outi = output + idx;
+ uint ini = input + offset + lidx;
+ uint value = local_value[lidx];
+ if(idx < qsize) {
+ kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT : kernel_split_state.queue_data[ini];
+ }
+ }
+#endif /* __KERNEL_CUDA__ */
+}
+
+CCL_NAMESPACE_END
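kernel_shader_sort() above keys the sort on the shader ID (sd.shader & SHADER_MASK) but only permutes a local index table with a bitonic network; the queue entries themselves are moved once in the final copy, and empty slots carry the ~0 key so they sink to the end of each block. A serial reference version of the same key/index bitonic sort follows; it assumes a power-of-two block size and uses plain names instead of the kernel's macros.

// Serial reference for the bitonic key/index sort: shader IDs are the keys and
// only the index table is permuted, so rays with the same shader end up
// adjacent in the sorted queue. Size must be a power of two; empty slots use
// the ~0 key so they sort to the end, as in the kernel.
#include <cstdint>
#include <cstdio>
#include <vector>
#include <utility>

void bitonic_sort_indices(const std::vector<uint32_t>& key, std::vector<int>& idx)
{
  const int n = (int)idx.size();  /* power of two */
  for(int length = 1; length < n; length <<= 1) {
    for(int inc = length; inc > 0; inc >>= 1) {
      for(int i = 0; i < n; i++) {
        const int j = i ^ inc;
        if(j <= i) continue;                       /* handle each pair once */
        const bool descending = (i & (length << 1)) != 0;
        const bool wrong_order = descending ? key[idx[i]] < key[idx[j]]
                                            : key[idx[i]] > key[idx[j]];
        if(wrong_order) std::swap(idx[i], idx[j]);
      }
    }
  }
}

int main()
{
  std::vector<uint32_t> shader_key = {7, 2, ~0u, 2, 5, 7, 1, ~0u};
  std::vector<int> index = {0, 1, 2, 3, 4, 5, 6, 7};
  bitonic_sort_indices(shader_key, index);
  for(int i : index) printf("%d ", i);             /* indices grouped by shader id */
  printf("\n");
  return 0;
}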
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
index 4243e18de72..474286285a9 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
@@ -29,31 +29,29 @@ ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
}
- if(ray_index == QUEUE_EMPTY_SLOT)
+ if(ray_index == QUEUE_EMPTY_SLOT) {
return;
+ }
- /* Flag determining if we need to update L. */
- char update_path_radiance = 0;
-
- if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- ccl_global Ray *light_ray_global = &kernel_split_state.ao_light_ray[ray_index];
-
- float3 shadow;
- Ray ray = *light_ray_global;
- update_path_radiance = !(shadow_blocked(kg,
- &kernel_split_state.sd_DL_shadow[ray_index],
- state,
- &ray,
- &shadow));
-
- *light_ray_global = ray;
- /* We use light_ray_global's P and t to store shadow and
- * update_path_radiance.
- */
- light_ray_global->P = shadow;
- light_ray_global->t = update_path_radiance;
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ RNG rng = kernel_split_state.rng[ray_index];
+ float3 throughput = kernel_split_state.throughput[ray_index];
+
+#ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched || IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+#endif
+ kernel_path_ao(kg, sd, emission_sd, L, state, &rng, throughput, shader_bsdf_alpha(kg, sd));
+#ifdef __BRANCHED_PATH__
+ }
+ else {
+ kernel_branched_path_ao(kg, sd, emission_sd, L, state, &rng, throughput);
}
+#endif
+
+ kernel_split_state.rng[ray_index] = rng;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
index bb8f0157965..452b6e45a36 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
@@ -32,28 +32,71 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
if(ray_index == QUEUE_EMPTY_SLOT)
return;
- /* Flag determining if we need to update L. */
- char update_path_radiance = 0;
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ Ray ray = kernel_split_state.light_ray[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ float3 throughput = kernel_split_state.throughput[ray_index];
+ RNG rng = kernel_split_state.rng[ray_index];
- if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- ccl_global Ray *light_ray_global = &kernel_split_state.light_ray[ray_index];
+ BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+ bool is_lamp = kernel_split_state.is_lamp[ray_index];
+# if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)
+ bool use_branched = false;
+ int all = 0;
+
+ if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+ use_branched = true;
+ all = 1;
+ }
+# if defined(__BRANCHED_PATH__)
+ else if(kernel_data.integrator.branched) {
+ use_branched = true;
+
+ if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+ all = (kernel_data.integrator.sample_all_lights_indirect);
+ }
+ else
+ {
+ all = (kernel_data.integrator.sample_all_lights_direct);
+ }
+ }
+# endif /* __BRANCHED_PATH__ */
+
+ if(use_branched) {
+ kernel_branched_path_surface_connect_light(kg,
+ &rng,
+ sd,
+ emission_sd,
+ state,
+ throughput,
+ 1.0f,
+ L,
+ all);
+ }
+ else
+# endif /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__) */
+ {
+ /* trace shadow ray */
float3 shadow;
- Ray ray = *light_ray_global;
- update_path_radiance = !(shadow_blocked(kg,
- &kernel_split_state.sd_DL_shadow[ray_index],
- state,
- &ray,
- &shadow));
-
- *light_ray_global = ray;
- /* We use light_ray_global's P and t to store shadow and
- * update_path_radiance.
- */
- light_ray_global->P = shadow;
- light_ray_global->t = update_path_radiance;
+
+ if(!shadow_blocked(kg,
+ emission_sd,
+ state,
+ &ray,
+ &shadow))
+ {
+ /* accumulate */
+ path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+ }
+ else {
+ path_radiance_accum_total_light(L, throughput, &L_light);
+ }
}
+
+ kernel_split_state.rng[ray_index] = rng;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
index 4303ba0a905..57f070d51e0 100644
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -37,41 +37,42 @@
#include "util/util_atomic.h"
-#include "kernel/kernel_random.h"
-#include "kernel/kernel_projection.h"
-#include "kernel/kernel_montecarlo.h"
-#include "kernel/kernel_differential.h"
-#include "kernel/kernel_camera.h"
-
-#include "kernel/geom/geom.h"
-#include "kernel/bvh/bvh.h"
-
-#include "kernel/kernel_accumulate.h"
-#include "kernel/kernel_shader.h"
-#include "kernel/kernel_light.h"
-#include "kernel/kernel_passes.h"
-
-#ifdef __SUBSURFACE__
-# include "kernel/kernel_subsurface.h"
+#include "kernel/kernel_path.h"
+#ifdef __BRANCHED_PATH__
+# include "kernel/kernel_path_branched.h"
#endif
-#ifdef __VOLUME__
-# include "kernel/kernel_volume.h"
-#endif
+#include "kernel/kernel_queues.h"
+#include "kernel/kernel_work_stealing.h"
-#include "kernel/kernel_path_state.h"
-#include "kernel/kernel_shadow.h"
-#include "kernel/kernel_emission.h"
-#include "kernel/kernel_path_common.h"
-#include "kernel/kernel_path_surface.h"
-#include "kernel/kernel_path_volume.h"
-#include "kernel/kernel_path_subsurface.h"
+#ifdef __BRANCHED_PATH__
+# include "kernel/split/kernel_branched.h"
+#endif
-#ifdef __KERNEL_DEBUG__
-# include "kernel/kernel_debug.h"
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index)
+{
+ ccl_global char *ray_state = kernel_split_state.ray_state;
+
+#ifdef __BRANCHED_PATH__
+ if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER);
+ }
+ else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER);
+ }
+ else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER);
+ }
+ else {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ }
+#else
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
#endif
+}
-#include "kernel/kernel_queues.h"
-#include "kernel/kernel_work_stealing.h"
+CCL_NAMESPACE_END
#endif /* __KERNEL_SPLIT_H__ */
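kernel_split_path_end() above relies on ray_state packing one exclusive state together with independent RAY_BRANCHED_* flags: a path that terminates inside a branched indirect loop is redirected to the matching *_NEXT_ITER state instead of RAY_UPDATE_BUFFER. The sketch below assumes a simple low-bits-state / high-bits-flags layout purely to illustrate the macro semantics; the actual bit layout in Cycles may differ.

// Sketch of the flag/state split assumed by kernel_split_path_end(): low bits
// hold one exclusive state, high bits hold independent flags. Names mirror the
// kernel but the layout here is illustrative only.
#include <cstdio>

enum { STATE_MASK = 0x0F, FLAG_SHIFT = 4 };
enum RayState { RAY_ACTIVE = 0, RAY_UPDATE_BUFFER = 1, RAY_LIGHT_INDIRECT_NEXT_ITER = 2 };
enum RayFlag  { RAY_BRANCHED_LIGHT_INDIRECT = 1 << (FLAG_SHIFT + 0) };

static inline void assign_state(unsigned char *rs, int i, int s) { rs[i] = (rs[i] & ~STATE_MASK) | s; }
static inline bool is_state(const unsigned char *rs, int i, int s) { return (rs[i] & STATE_MASK) == s; }
static inline void add_flag(unsigned char *rs, int i, int f) { rs[i] |= f; }
static inline bool is_flag(const unsigned char *rs, int i, int f) { return (rs[i] & f) != 0; }

void split_path_end(unsigned char *rs, int i)
{
  if(is_flag(rs, i, RAY_BRANCHED_LIGHT_INDIRECT))
    assign_state(rs, i, RAY_LIGHT_INDIRECT_NEXT_ITER);  /* resume the branched loop */
  else
    assign_state(rs, i, RAY_UPDATE_BUFFER);             /* really terminate the path */
}

int main()
{
  unsigned char ray_state[2] = {RAY_ACTIVE, RAY_ACTIVE};
  add_flag(ray_state, 1, RAY_BRANCHED_LIGHT_INDIRECT);
  split_path_end(ray_state, 0);
  split_path_end(ray_state, 1);
  printf("%d %d\n", is_state(ray_state, 0, RAY_UPDATE_BUFFER),
                    is_state(ray_state, 1, RAY_LIGHT_INDIRECT_NEXT_ITER));  /* prints 1 1 */
  return 0;
}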
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
index 0af8bfc89d5..913e0dfd08d 100644
--- a/intern/cycles/kernel/split/kernel_split_data_types.h
+++ b/intern/cycles/kernel/split/kernel_split_data_types.h
@@ -62,7 +62,46 @@ typedef struct SplitParams {
SPLIT_DATA_ENTRY(DebugData, debug_data, 1)
#else
# define SPLIT_DATA_DEBUG_ENTRIES
-#endif
+#endif /* DEBUG */
+
+#ifdef __BRANCHED_PATH__
+
+typedef ccl_global struct SplitBranchedState {
+ /* various state that must be kept and restored after an indirect loop */
+ PathState path_state;
+ float3 throughput;
+ Ray ray;
+
+ struct ShaderData sd;
+ Intersection isect;
+
+ char ray_state;
+
+ /* indirect loop state */
+ int next_closure;
+ int next_sample;
+ int num_samples;
+
+#ifdef __SUBSURFACE__
+ int ss_next_closure;
+ int ss_next_sample;
+ int next_hit;
+ int num_hits;
+
+ uint lcg_state;
+ SubsurfaceIntersection ss_isect;
+
+# ifdef __VOLUME__
+ VolumeStack volume_stack[VOLUME_STACK_SIZE];
+# endif /* __VOLUME__ */
+#endif /* __SUBSURFACE__ */
+} SplitBranchedState;
+
+#define SPLIT_DATA_BRANCHED_ENTRIES \
+ SPLIT_DATA_ENTRY(SplitBranchedState, branched_state, 1)
+#else
+#define SPLIT_DATA_BRANCHED_ENTRIES
+#endif /* __BRANCHED_PATH__ */
#define SPLIT_DATA_ENTRIES \
SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \
@@ -72,9 +111,6 @@ typedef struct SplitParams {
SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
- SPLIT_DATA_ENTRY(ccl_global float3, ao_alpha, 1) \
- SPLIT_DATA_ENTRY(ccl_global float3, ao_bsdf, 1) \
- SPLIT_DATA_ENTRY(ccl_global Ray, ao_light_ray, 1) \
SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
@@ -82,6 +118,7 @@ typedef struct SplitParams {
SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \
SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
+ SPLIT_DATA_BRANCHED_ENTRIES \
SPLIT_DATA_DEBUG_ENTRIES \
/* struct that holds pointers to data in the shared state buffer */
@@ -125,6 +162,11 @@ typedef struct BackgroundAOLocals {
uint queue_atomics_ao;
} BackgroundAOLocals;
+typedef struct ShaderSortLocals {
+ uint local_value[SHADER_SORT_BLOCK_SIZE];
+ ushort local_index[SHADER_SORT_BLOCK_SIZE];
+} ShaderSortLocals;
+
CCL_NAMESPACE_END
#endif /* __KERNEL_SPLIT_DATA_TYPES_H__ */
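The SPLIT_DATA_ENTRY / SPLIT_DATA_ENTRIES pair above is an X-macro: the same entry list is expanded once into struct members and once into the per-ray buffer size calculation, which is why adding SPLIT_DATA_BRANCHED_ENTRIES grows both automatically. A stripped-down sketch of the technique follows; the real macros also handle alignment and address-space qualifiers, which is omitted here, and the member types are placeholders.

// Simplified sketch of the SPLIT_DATA_ENTRY X-macro idea: one entry list,
// expanded once into a struct of pointers and once into a size computation.
#include <cstdio>

struct PathState { int flag, bounce; };
struct Intersection { float t; int prim; };

#define SPLIT_DATA_ENTRIES \
  SPLIT_DATA_ENTRY(PathState, path_state, 1) \
  SPLIT_DATA_ENTRY(Intersection, isect, 1) \
  SPLIT_DATA_ENTRY(char, ray_state, 1)

struct SplitData {
#define SPLIT_DATA_ENTRY(type, name, num) type *name;
  SPLIT_DATA_ENTRIES
#undef SPLIT_DATA_ENTRY
};

size_t split_data_buffer_size(size_t num_rays)
{
  size_t size = 0;
#define SPLIT_DATA_ENTRY(type, name, num) size += sizeof(type) * (num) * num_rays;
  SPLIT_DATA_ENTRIES
#undef SPLIT_DATA_ENTRY
  return size;
}

int main()
{
  printf("pointer table: %zu bytes, per-ray state for 1024 rays: %zu bytes\n",
         sizeof(SplitData), split_data_buffer_size(1024));
  return 0;
}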
diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
index 0b4d50c70ee..8364f185d75 100644
--- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h
+++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
@@ -16,42 +16,206 @@
CCL_NAMESPACE_BEGIN
+#if defined(__BRANCHED_PATH__) && defined(__SUBSURFACE__)
-ccl_device void kernel_subsurface_scatter(KernelGlobals *kg,
- ccl_local_param unsigned int* local_queue_atomics)
+ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg, int ray_index)
{
-#ifdef __SUBSURFACE__
- if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
+ kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ branched_state->ss_next_closure = 0;
+ branched_state->ss_next_sample = 0;
+
+ branched_state->num_hits = 0;
+ branched_state->next_hit = 0;
+
+ ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT);
+}
+
+ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(KernelGlobals *kg, int ray_index)
+{
+ SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+ ShaderData *sd = &branched_state->sd;
+ RNG rng = kernel_split_state.rng[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+ for(int i = branched_state->ss_next_closure; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
+
+ if(!CLOSURE_IS_BSSRDF(sc->type))
+ continue;
+
+ /* set up random number generator */
+ if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 &&
+ branched_state->next_closure == 0 && branched_state->next_sample == 0)
+ {
+ branched_state->lcg_state = lcg_state_init(&rng,
+ branched_state->path_state.rng_offset,
+ branched_state->path_state.sample,
+ 0x68bc21eb);
+ }
+ int num_samples = kernel_data.integrator.subsurface_samples;
+ float num_samples_inv = 1.0f/num_samples;
+ RNG bssrdf_rng = cmj_hash(rng, i);
+
+ /* do the subsurface scatter step with a copy of the shader data; this will
+ * replace the BSSRDF with a diffuse BSDF closure */
+ for(int j = branched_state->ss_next_sample; j < num_samples; j++) {
+ ccl_global SubsurfaceIntersection *ss_isect = &branched_state->ss_isect;
+ float bssrdf_u, bssrdf_v;
+ path_branched_rng_2D(kg,
+ &bssrdf_rng,
+ &branched_state->path_state,
+ j,
+ num_samples,
+ PRNG_BSDF_U,
+ &bssrdf_u,
+ &bssrdf_v);
+
+ /* intersection is expensive, so avoid doing it multiple times for the same input */
+ if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+ RNG lcg_state = branched_state->lcg_state;
+ SubsurfaceIntersection ss_isect_private;
+
+ branched_state->num_hits = subsurface_scatter_multi_intersect(kg,
+ &ss_isect_private,
+ sd,
+ sc,
+ &lcg_state,
+ bssrdf_u, bssrdf_v,
+ true);
+
+ branched_state->lcg_state = lcg_state;
+ *ss_isect = ss_isect_private;
+ }
+
+#ifdef __VOLUME__
+ Ray volume_ray = branched_state->ray;
+ bool need_update_volume_stack =
+ kernel_data.integrator.use_volumes &&
+ sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
+#endif /* __VOLUME__ */
+
+ /* compute lighting with the BSDF closure */
+ for(int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) {
+ ShaderData *bssrdf_sd = &kernel_split_state.sd[ray_index];
+ *bssrdf_sd = *sd; /* note: the copy happens on each iteration of the inner loop;
+ * this is important as the indirect path will write into bssrdf_sd */
+
+ SubsurfaceIntersection ss_isect_private = *ss_isect;
+ subsurface_scatter_multi_setup(kg,
+ &ss_isect_private,
+ hit,
+ bssrdf_sd,
+ &branched_state->path_state,
+ branched_state->path_state.flag,
+ sc,
+ true);
+ *ss_isect = ss_isect_private;
+
+ ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index];
+ *hit_state = branched_state->path_state;
+
+ path_state_branch(hit_state, j, num_samples);
+
+#ifdef __VOLUME__
+ if(need_update_volume_stack) {
+ /* Setup ray from previous surface point to the new one. */
+ float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng);
+ volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t);
+
+ /* this next part is expensive as it does scene intersection, so only do it once */
+ if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+ for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
+ branched_state->volume_stack[k] = hit_state->volume_stack[k];
+ }
+
+ kernel_volume_stack_update_for_subsurface(kg,
+ emission_sd,
+ &volume_ray,
+ branched_state->volume_stack);
+ }
+
+ for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
+ hit_state->volume_stack[k] = branched_state->volume_stack[k];
+ }
+ }
+#endif /* __VOLUME__ */
+
+#ifdef __EMISSION__
+ if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+ /* direct light */
+ if(kernel_data.integrator.use_direct_light) {
+ int all = (kernel_data.integrator.sample_all_lights_direct) ||
+ (branched_state->path_state.flag & PATH_RAY_SHADOW_CATCHER);
+ kernel_branched_path_surface_connect_light(kg,
+ &rng,
+ bssrdf_sd,
+ emission_sd,
+ hit_state,
+ branched_state->throughput,
+ num_samples_inv,
+ L,
+ all);
+ }
+ }
+#endif /* __EMISSION__ */
+
+ /* indirect light */
+ if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+ ray_index,
+ num_samples_inv,
+ bssrdf_sd,
+ false))
+ {
+ branched_state->ss_next_closure = i;
+ branched_state->ss_next_sample = j;
+ branched_state->next_hit = hit;
+
+ return true;
+ }
+
+ branched_state->next_closure = 0;
+ }
+
+ branched_state->next_hit = 0;
+ }
+
+ branched_state->ss_next_sample = 0;
+ }
+
+ kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+ return false;
+}
+
+#endif /* __BRANCHED_PATH__ && __SUBSURFACE__ */
+
+ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
+{
+ int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if(thread_index == 0) {
+ /* We will empty both queues in this kernel. */
+ kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+ kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
}
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
ray_index = get_ray_index(kg, ray_index,
QUEUE_ACTIVE_AND_REGENERATED_RAYS,
kernel_split_state.queue_data,
kernel_split_params.queue_size,
- 0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
- /* If we are executing on a GPU device, we exit all threads that are not
- * required.
- *
- * If we are executing on a CPU device, then we need to keep all threads
- * active since we have barrier() calls later in the kernel. CPU devices,
- * expect all threads to execute barrier statement.
- */
- if(ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-#endif
-
- char enqueue_flag = 0;
-
-#ifndef __COMPUTE_DEVICE_GPU__
- if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
+ 1);
+ get_ray_index(kg, thread_index,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+#ifdef __SUBSURFACE__
ccl_global char *ray_state = kernel_split_state.ray_state;
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
@@ -64,34 +228,85 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg,
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
if(sd->flag & SD_BSSRDF) {
- if(kernel_path_subsurface_scatter(kg,
- sd,
- emission_sd,
- L,
- state,
- &rng,
- ray,
- throughput,
- ss_indirect)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- enqueue_flag = 1;
+
+#ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched) {
+#endif
+ if(kernel_path_subsurface_scatter(kg,
+ sd,
+ emission_sd,
+ L,
+ state,
+ &rng,
+ ray,
+ throughput,
+ ss_indirect)) {
+ kernel_split_path_end(kg, ray_index);
+ }
+#ifdef __BRANCHED_PATH__
+ }
+ else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+ float bssrdf_probability;
+ ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+
+ /* modify throughput for picking bssrdf or bsdf */
+ *throughput *= bssrdf_probability;
+
+ /* do bssrdf scatter step if we picked a bssrdf closure */
+ if(sc) {
+ uint lcg_state = lcg_state_init(&rng, state->rng_offset, state->sample, 0x68bc21eb);
+
+ float bssrdf_u, bssrdf_v;
+ path_state_rng_2D(kg,
+ &rng,
+ state,
+ PRNG_BSDF_U,
+ &bssrdf_u, &bssrdf_v);
+ subsurface_scatter_step(kg,
+ sd,
+ state,
+ state->flag,
+ sc,
+ &lcg_state,
+ bssrdf_u, bssrdf_v,
+ false);
+ }
+ }
+ else {
+ kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index);
+
+ if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
}
+#endif
}
kernel_split_state.rng[ray_index] = rng;
}
-#ifndef __COMPUTE_DEVICE_GPU__
+# ifdef __BRANCHED_PATH__
+ if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+ kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0;
}
-#endif
- /* Enqueue RAY_UPDATE_BUFFER rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
+ /* iter loop */
+ ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+ QUEUE_SUBSURFACE_INDIRECT_ITER,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+
+ if(IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) {
+ /* for render passes, sum and reset indirect light pass variables
+ * for the next samples */
+ path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
+ path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
+
+ if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ }
+ }
+# endif /* __BRANCHED_PATH__ */
#endif /* __SUBSURFACE__ */
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 4b5e4ebac00..328ff79223b 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -32,13 +32,7 @@ CCL_NAMESPACE_BEGIN
ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha)
{
#ifdef __KERNEL_CPU__
-# ifdef __KERNEL_SSE2__
- ssef r_ssef;
- float4 &r = (float4 &)r_ssef;
- r = kernel_tex_image_interp(id, x, y);
-# else
float4 r = kernel_tex_image_interp(id, x, y);
-# endif
#elif defined(__KERNEL_OPENCL__)
float4 r = kernel_tex_image_interp(kg, id, x, y);
#else
@@ -152,7 +146,10 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
/* float4, byte4 and half4 */
const int texture_type = kernel_tex_type(id);
- if(texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 || texture_type == IMAGE_DATA_TYPE_HALF4) {
+ if(texture_type == IMAGE_DATA_TYPE_FLOAT4 ||
+ texture_type == IMAGE_DATA_TYPE_BYTE4 ||
+ texture_type == IMAGE_DATA_TYPE_HALF4)
+ {
r = kernel_tex_image_interp_float4(tex, x, y);
}
/* float, byte and half */
@@ -163,43 +160,22 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
# endif
#endif
-#ifdef __KERNEL_SSE2__
- float alpha = r.w;
+ const float alpha = r.w;
if(use_alpha && alpha != 1.0f && alpha != 0.0f) {
- r_ssef = r_ssef / ssef(alpha);
+ r /= alpha;
const int texture_type = kernel_tex_type(id);
- if(texture_type == IMAGE_DATA_TYPE_BYTE4 || texture_type == IMAGE_DATA_TYPE_BYTE) {
- r_ssef = min(r_ssef, ssef(1.0f));
+ if(texture_type == IMAGE_DATA_TYPE_BYTE4 ||
+ texture_type == IMAGE_DATA_TYPE_BYTE)
+ {
+ r = min(r, make_float4(1.0f, 1.0f, 1.0f, 1.0f));
}
r.w = alpha;
}
if(srgb) {
- r_ssef = color_srgb_to_scene_linear(r_ssef);
- r.w = alpha;
+ r = color_srgb_to_scene_linear_v4(r);
}
-#else
- if(use_alpha && r.w != 1.0f && r.w != 0.0f) {
- float invw = 1.0f/r.w;
- r.x *= invw;
- r.y *= invw;
- r.z *= invw;
-
- const int texture_type = kernel_tex_type(id);
- if(texture_type == IMAGE_DATA_TYPE_BYTE4 || texture_type == IMAGE_DATA_TYPE_BYTE) {
- r.x = min(r.x, 1.0f);
- r.y = min(r.y, 1.0f);
- r.z = min(r.z, 1.0f);
- }
- }
-
- if(srgb) {
- r.x = color_srgb_to_scene_linear(r.x);
- r.y = color_srgb_to_scene_linear(r.y);
- r.z = color_srgb_to_scene_linear(r.z);
- }
-#endif
return r;
}
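svm_image_texture() above now defers the sRGB conversion to color_srgb_to_scene_linear_v4() from util_color.h (changed below). For reference, the scalar conversion those vector helpers are built from is the standard sRGB transfer function; the snippet is a minimal standalone version for illustration, not the Cycles scalar helper itself, which lives elsewhere in util_color.h.

// Standard sRGB-to-linear transfer function, standalone for reference.
#include <cmath>
#include <cstdio>

float srgb_to_linear(float c)
{
  return (c <= 0.04045f) ? c * (1.0f / 12.92f)
                         : powf((c + 0.055f) * (1.0f / 1.055f), 2.4f);
}

int main()
{
  const float vals[] = {0.0f, 0.04045f, 0.5f, 1.0f};
  for(float c : vals)
    printf("srgb %.5f -> linear %.5f\n", c, srgb_to_linear(c));
  return 0;
}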
diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h
index 4d673dc34d8..c73beab98dc 100644
--- a/intern/cycles/util/util_color.h
+++ b/intern/cycles/util/util_color.h
@@ -157,16 +157,6 @@ ccl_device float3 xyz_to_rgb(float x, float y, float z)
0.055648f * x + -0.204043f * y + 1.057311f * z);
}
-#ifndef __KERNEL_OPENCL__
-
-ccl_device float3 color_srgb_to_scene_linear(float3 c)
-{
- return make_float3(
- color_srgb_to_scene_linear(c.x),
- color_srgb_to_scene_linear(c.y),
- color_srgb_to_scene_linear(c.z));
-}
-
#ifdef __KERNEL_SSE2__
/*
* Calculate initial guess for arg^exp based on float representation
@@ -222,17 +212,38 @@ ccl_device ssef color_srgb_to_scene_linear(const ssef &c)
ssef gte = fastpow24(gtebase);
return select(cmp, lt, gte);
}
-#endif
+#endif /* __KERNEL_SSE2__ */
-ccl_device float3 color_scene_linear_to_srgb(float3 c)
+ccl_device float3 color_srgb_to_scene_linear_v3(float3 c)
{
- return make_float3(
- color_scene_linear_to_srgb(c.x),
- color_scene_linear_to_srgb(c.y),
- color_scene_linear_to_srgb(c.z));
+ return make_float3(color_srgb_to_scene_linear(c.x),
+ color_srgb_to_scene_linear(c.y),
+ color_srgb_to_scene_linear(c.z));
}
+ccl_device float3 color_scene_linear_to_srgb_v3(float3 c)
+{
+ return make_float3(color_scene_linear_to_srgb(c.x),
+ color_scene_linear_to_srgb(c.y),
+ color_scene_linear_to_srgb(c.z));
+}
+
+ccl_device float4 color_srgb_to_scene_linear_v4(float4 c)
+{
+#ifdef __KERNEL_SSE2__
+ ssef r_ssef;
+ float4 &r = (float4 &)r_ssef;
+ r = c;
+ r_ssef = color_srgb_to_scene_linear(r_ssef);
+ r.w = c.w;
+ return r;
+#else
+ return make_float4(color_srgb_to_scene_linear(c.x),
+ color_srgb_to_scene_linear(c.y),
+ color_srgb_to_scene_linear(c.z),
+ c.w);
#endif
+}
ccl_device float linear_rgb_to_gray(float3 c)
{