46 files changed, 643 insertions, 658 deletions
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 18e8e274172..ec15a254f81 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -228,6 +228,14 @@ public:
 	DeviceInfo info;
 	virtual const string& error_message() { return error_msg; }
 	bool have_error() { return !error_message().empty(); }
+	virtual void set_error(const string& error)
+	{
+		if(!have_error()) {
+			error_msg = error;
+		}
+		fprintf(stderr, "%s\n", error.c_str());
+		fflush(stderr);
+	}
 	virtual bool show_samples() const { return false; }
 
 	/* statistics */
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index 5b892038ebb..8925ef47b2e 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -205,6 +205,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
 		 */
 		device->mem_zero(work_pool_wgs);
 		device->mem_zero(split_data);
+		device->mem_zero(ray_state);
 
 		if(!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
 		                                   subtile,
@@ -254,7 +255,15 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
 			activeRaysAvailable = false;
 
 			for(int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) {
-				if(int8_t(ray_state.get_data()[rayStateIter]) != RAY_INACTIVE) {
+				int8_t state = ray_state.get_data()[rayStateIter];
+
+				if(state != RAY_INACTIVE) {
+					if(state == RAY_INVALID) {
+						/* Something went wrong, abort to avoid looping endlessly. */
+						device->set_error("Split kernel error: invalid ray state");
+						return false;
+					}
+
 					/* Not all rays are RAY_INACTIVE. */
 					activeRaysAvailable = true;
 					break;
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
index e2d0ff71786..a689c7eae26 100644
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -356,10 +356,10 @@ bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src)
 	cl_int ciErr;
 
 	program = clCreateProgramWithSource(device->cxContext,
-	                                   1,
-	                                   &source_str,
-	                                   &source_len,
-	                                   &ciErr);
+	                                    1,
+	                                    &source_str,
+	                                    &source_len,
+	                                    &ciErr);
 
 	if(ciErr != CL_SUCCESS) {
 		add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr));
@@ -761,10 +761,10 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices
 		num_devices = 0;
 		cl_int ciErr;
 		if((ciErr = clGetDeviceIDs(platform_id,
-		                  device_type,
-		                  0,
-		                  NULL,
-		                  &num_devices)) != CL_SUCCESS || num_devices == 0)
+		                           device_type,
+		                           0,
+		                           NULL,
+		                           &num_devices)) != CL_SUCCESS || num_devices == 0)
 		{
 			FIRST_VLOG(2) << "Ignoring platform " << platform_name
 			              << ", failed to fetch number of devices: " << string(clewErrorString(ciErr));
diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h
index 4894ea58dba..e799855a65e 100644
--- a/intern/cycles/kernel/closure/alloc.h
+++ b/intern/cycles/kernel/closure/alloc.h
@@ -62,7 +62,7 @@ ccl_device_inline ShaderClosure *bsdf_alloc(ShaderData *sd, int size, float3 wei
 {
 	ShaderClosure *sc = closure_alloc(sd, size, CLOSURE_NONE_ID, weight);
 
-	if(!sc)
+	if(sc == NULL)
 		return NULL;
 
 	float sample_weight = fabsf(average(weight));
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index 4a1316fd2a9..1c7b3eb9ddd 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -266,7 +266,7 @@ ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosur
 	       (bsdf_a->alpha_y == bsdf_b->alpha_y) &&
 	       (isequal_float3(bsdf_a->T, bsdf_b->T)) &&
 	       (bsdf_a->ior == bsdf_b->ior) &&
-	       ((!bsdf_a->extra && !bsdf_b->extra) ||
+	       ((bsdf_a->extra == NULL && bsdf_b->extra == NULL) ||
 	        ((bsdf_a->extra && bsdf_b->extra) &&
 	         (isequal_float3(bsdf_a->extra->color, bsdf_b->extra->color))));
 }
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
index 2e63909a38c..96bc636d5ac 100644
--- a/intern/cycles/kernel/kernel_queues.h
+++ b/intern/cycles/kernel/kernel_queues.h
@@ -22,9 +22,10 @@ CCL_NAMESPACE_BEGIN
 /*
  * Queue utility functions for split kernel
  */
-
+#ifdef __KERNEL_OPENCL__
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#endif
 
 /*
  * Enqueue ray index into the queue
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index f2ba3586c22..b6b891627bf 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -1297,20 +1297,19 @@ enum QueueNumber {
 #define RAY_STATE_MASK 0x007
 #define RAY_FLAG_MASK 0x0F8
 enum RayState {
+	RAY_INVALID = 0,
 	/* Denotes ray is actively involved in path-iteration. */
-	RAY_ACTIVE = 0,
+	RAY_ACTIVE,
 	/* Denotes ray has completed processing all samples and is inactive. */
-	RAY_INACTIVE = 1,
+	RAY_INACTIVE,
 	/* Denoted ray has exited path-iteration and needs to update output buffer. */
-	RAY_UPDATE_BUFFER = 2,
+	RAY_UPDATE_BUFFER,
 	/* Donotes ray has hit background */
-	RAY_HIT_BACKGROUND = 3,
+	RAY_HIT_BACKGROUND,
 	/* Denotes ray has to be regenerated */
-	RAY_TO_REGENERATE = 4,
+	RAY_TO_REGENERATE,
 	/* Denotes ray has been regenerated */
-	RAY_REGENERATED = 5,
-	/* Denotes ray should skip direct lighting */
-	RAY_SKIP_DL = 6,
+	RAY_REGENERATED,
 	/* Flag's ray has to execute shadow blocked function in AO part */
 	RAY_SHADOW_RAY_CAST_AO = 16,
 	/* Flag's ray has to execute shadow blocked function in direct lighting part. */
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
index e42605c88e7..e8f574c5546 100644
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -16,58 +16,27 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_background_buffer_update kernel.
- * This is the fourth kernel in the ray tracing logic, and the third
- * of the path iteration kernels. This kernel takes care of rays that hit
- * the background (sceneintersect kernel), and for the rays of
- * state RAY_UPDATE_BUFFER it updates the ray's accumulated radiance in
- * the output buffer. This kernel also takes care of rays that have been determined
- * to-be-regenerated.
+/* This kernel takes care of rays that hit the background (sceneintersect
+ * kernel), and for the rays of state RAY_UPDATE_BUFFER it updates the ray's
+ * accumulated radiance in the output buffer. This kernel also takes care of
+ * rays that have been determined to-be-regenerated.
  *
- * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel
+ * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel.
  *
  * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER
- * will be eventually set to RAY_TO_REGENERATE state in this kernel. Finally all rays of ray_state
- * RAY_TO_REGENERATE will be regenerated and put in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ * will be eventually set to RAY_TO_REGENERATE state in this kernel.
+ * Finally all rays of ray_state RAY_TO_REGENERATE will be regenerated and put
+ * in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
  *
- * The input and output are as follows,
- *
- * rng_coop ---------------------------------------------|--- kernel_background_buffer_update --|--- PathRadiance_coop
- * throughput_coop --------------------------------------|                                      |--- L_transparent_coop
- * per_sample_output_buffers ----------------------------|                                      |--- per_sample_output_buffers
- * Ray_coop ---------------------------------------------|                                      |--- ray_state
- * PathState_coop ---------------------------------------|                                      |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * L_transparent_coop -----------------------------------|                                      |--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * ray_state --------------------------------------------|                                      |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ----|                                      |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                                      |--- work_array
- * parallel_samples -------------------------------------|                                      |--- PathState_coop
- * end_sample -------------------------------------------|                                      |--- throughput_coop
- * kg (globals) -----------------------------------------|                                      |--- rng_coop
- * rng_state --------------------------------------------|                                      |--- Ray
- * PathRadiance_coop ------------------------------------|                                      |
- * sw ---------------------------------------------------|                                      |
- * sh ---------------------------------------------------|                                      |
- * sx ---------------------------------------------------|                                      |
- * sy ---------------------------------------------------|                                      |
- * stride -----------------------------------------------|                                      |
- * work_array -------------------------------------------|                                      |--- work_array
- * queuesize --------------------------------------------|                                      |
- * start_sample -----------------------------------------|                                      |--- work_pool_wgs
- * work_pool_wgs ----------------------------------------|                                      |
- * num_samples ------------------------------------------|                                      |
- *
- * note on sd : sd argument is neither an input nor an output for this kernel. It is just filled and consumed here itself.
- * Note on Queues :
- * This kernel fetches rays from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
- *
- * State of queues when this kernel is called :
+ * State of queues when this kernel is called:
  * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays.
  * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and
+ *     RAY_REGENERATED rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
  */
 ccl_device void kernel_buffer_update(KernelGlobals *kg)
 {
@@ -225,4 +194,3 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg)
 }
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index 9b62d65ffd9..ed447049e48 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -16,16 +16,16 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_data_initialization kernel
- * This kernel Initializes structures needed in path-iteration kernels.
+/* This kernel initializes structures needed in path-iteration kernels.
  *
- * Note on Queues :
+ * Note on Queues:
  * All slots in queues are initialized to queue empty slot;
  * The number of elements in the queues is initialized to 0;
  */
 
-/* distributes an amount of work across all threads
- * note: work done inside the loop may not show up to all threads till after the current kernel has completed
+/* Distributes an amount of work across all threads.
+ * Note: work done inside the loop may not show up to all threads till after
+ * the current kernel has completed.
  */
 #define parallel_for(kg, iter_name, work_size) \
 	for(size_t _size = (work_size), \
@@ -151,4 +151,3 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
 }
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
index 5163b8edc04..dfe461fb357 100644
--- a/intern/cycles/kernel/split/kernel_direct_lighting.h
+++ b/intern/cycles/kernel/split/kernel_direct_lighting.h
@@ -16,36 +16,29 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_direct_lighting kernel.
- * This is the eighth kernel in the ray tracing logic. This is the seventh
- * of the path iteration kernels. This kernel takes care of direct lighting
- * logic. However, the "shadow ray cast" part of direct lighting is handled
+/* This kernel takes care of direct lighting logic.
+ * However, the "shadow ray cast" part of direct lighting is handled
  * in the next kernel.
  *
- * This kernels determines the rays for which a shadow_blocked() function associated with direct lighting should be executed.
- * Those rays for which a shadow_blocked() function for direct-lighting must be executed, are marked with flag RAY_SHADOW_RAY_CAST_DL and
- * enqueued into the queue QUEUE_SHADOW_RAY_CAST_DL_RAYS
+ * This kernel determines the rays for which a shadow_blocked() function
+ * associated with direct lighting should be executed. Those rays for which
+ * a shadow_blocked() function for direct-lighting must be executed are
+ * marked with flag RAY_SHADOW_RAY_CAST_DL and enqueued into the queue
+ * QUEUE_SHADOW_RAY_CAST_DL_RAYS.
  *
- * The input and output are as follows,
+ * Note on Queues:
+ * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue
+ * and processes only the rays of state RAY_ACTIVE; If a ray needs to execute
+ * the corresponding shadow_blocked part, after direct lighting, the ray is
+ * marked with RAY_SHADOW_RAY_CAST_DL flag.
  *
- * rng_coop -----------------------------------------|--- kernel_direct_lighting --|--- BSDFEval_coop
- * PathState_coop -----------------------------------|                             |--- ISLamp_coop
- * sd -----------------------------------------------|                             |--- LightRay_coop
- * ray_state ----------------------------------------|                             |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                             |
- * kg (globals) -------------------------------------|                             |
- * queuesize ----------------------------------------|                             |
- *
- * Note on Queues :
- * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
- * only the rays of state RAY_ACTIVE; If a ray needs to execute the corresponding shadow_blocked
- * part, after direct lighting, the ray is marked with RAY_SHADOW_RAY_CAST_DL flag.
- *
- * State of queues when this kernel is called :
- * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
- * before and after this kernel call.
- * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this
- * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
+ * State of queues when this kernel is called:
+ * - State of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and
+ *   QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same before and after this
+ *   kernel call.
+ * - QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a
+ *   shadow_blocked function must be executed, after this kernel call.
+ *   Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
  */
 ccl_device void kernel_direct_lighting(KernelGlobals *kg)
 {
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 7168efa59ae..bb948ad24b0 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -16,59 +16,41 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_holdout_emission_blurring_pathtermination_ao kernel.
- * This is the sixth kernel in the ray tracing logic. This is the fifth
- * of the path iteration kernels. This kernel takes care of the logic to process
- * "material of type holdout", indirect primitive emission, bsdf blurring,
- * probabilistic path termination and AO.
+/* This kernel takes care of the logic to process "material of type holdout",
+ * indirect primitive emission, bsdf blurring, probabilistic path termination
+ * and AO.
  *
- * This kernels determines the rays for which a shadow_blocked() function associated with AO should be executed.
- * Those rays for which a shadow_blocked() function for AO must be executed are marked with flag RAY_SHADOW_RAY_CAST_ao and
- * enqueued into the queue QUEUE_SHADOW_RAY_CAST_AO_RAYS
+ * This kernel determines the rays for which a shadow_blocked() function
+ * associated with AO should be executed. Those rays for which a
+ * shadow_blocked() function for AO must be executed are marked with flag
+ * RAY_SHADOW_RAY_CAST_AO and enqueued into the queue
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS.
  *
  * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
  *
- * The input and output are as follows,
+ * Note on Queues:
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS
+ * and processes only the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and
+ * reach RAY_UPDATE_BUFFER state. These rays are enqueued into
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present
+ * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has
+ * been changed to RAY_UPDATE_BUFFER, there is no problem.
  *
- * rng_coop ---------------------------------------------|--- kernel_holdout_emission_blurring_pathtermination_ao ---|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * throughput_coop --------------------------------------|                                                           |--- PathState_coop
- * PathRadiance_coop ------------------------------------|                                                           |--- throughput_coop
- * Intersection_coop ------------------------------------|                                                           |--- L_transparent_coop
- * PathState_coop ---------------------------------------|                                                           |--- per_sample_output_buffers
- * L_transparent_coop -----------------------------------|                                                           |--- PathRadiance_coop
- * sd ---------------------------------------------------|                                                           |--- ShaderData
- * ray_state --------------------------------------------|                                                           |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------|                                                           |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                                           |--- AOAlpha_coop
- * kg (globals) -----------------------------------------|                                                           |--- AOBSDF_coop
- * parallel_samples -------------------------------------|                                                           |--- AOLightRay_coop
- * per_sample_output_buffers ----------------------------|                                                           |
- * sw ---------------------------------------------------|                                                           |
- * sh ---------------------------------------------------|                                                           |
- * sx ---------------------------------------------------|                                                           |
- * sy ---------------------------------------------------|                                                           |
- * stride -----------------------------------------------|                                                           |
- * work_array -------------------------------------------|                                                           |
- * queuesize --------------------------------------------|                                                           |
- * start_sample -----------------------------------------|                                                           |
- *
- * Note on Queues :
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
- * the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER
- * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
- * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
- * changed to RAY_UPDATE_BUFFER, there is no problem.
- *
- * State of queues when this kernel is called :
+ * State of queues when this kernel is called:
  * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and
+ *     RAY_REGENERATED rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE rays.
+ *   - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
  * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
+ *     RAY_REGENERATED and RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with
+ *     flag RAY_SHADOW_RAY_CAST_AO.
  */
 ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(KernelGlobals *kg)
 {
@@ -288,4 +270,3 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(KernelGlobal
 }
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h
index e314a98105e..96ca0f094b1 100644
--- a/intern/cycles/kernel/split/kernel_indirect_background.h
+++ b/intern/cycles/kernel/split/kernel_indirect_background.h
@@ -18,40 +18,39 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device void kernel_indirect_background(KernelGlobals *kg)
 {
-	/*
-	ccl_local unsigned int local_queue_atomics;
-	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
-		local_queue_atomics = 0;
+
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	int ray_index;
+
+	if(kernel_data.integrator.ao_bounces) {
+		ray_index = get_ray_index(kg, thread_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          kernel_split_state.queue_data,
+		                          kernel_split_params.queue_size,
+		                          0);
+
+		if(ray_index != QUEUE_EMPTY_SLOT) {
+			if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+				ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+				if(state->bounce > kernel_data.integrator.ao_bounces) {
+					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+				}
+			}
+		}
 	}
-	ccl_barrier(CCL_LOCAL_MEM_FENCE);
-	// */
 
-	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
-	ray_index = get_ray_index(kg, ray_index,
+	ray_index = get_ray_index(kg, thread_index,
 	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
 	                          kernel_split_state.queue_data,
 	                          kernel_split_params.queue_size,
 	                          0);
 
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not
-	 * required.
-	 *
-	 * If we are executing on a CPU device, then we need to keep all threads
-	 * active since we have barrier() calls later in the kernel. CPU devices,
-	 * expect all threads to execute barrier statement.
-	 */
 	if(ray_index == QUEUE_EMPTY_SLOT) {
 		return;
 	}
-#endif
 
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-
-
-	ccl_global char *ray_state = kernel_split_state.ray_state;
 	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
@@ -78,9 +77,6 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
 		}
 	}
 
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif
 
 }
 
diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h
index 84de231b78c..f61643cceef 100644
--- a/intern/cycles/kernel/split/kernel_lamp_emission.h
+++ b/intern/cycles/kernel/split/kernel_lamp_emission.h
@@ -16,25 +16,9 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_lamp_emission
- * This is the 3rd kernel in the ray-tracing logic. This is the second of the
- * path-iteration kernels. This kernel takes care of the indirect lamp emission logic.
- * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE
- * and RAY_HIT_BACKGROUND.
+/* This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ * It processes rays of state RAY_ACTIVE and RAY_HIT_BACKGROUND.
  * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel.
- * The input/output of the kernel is as follows,
- * Throughput_coop ------------------------------------|--- kernel_lamp_emission --|--- PathRadiance_coop
- * Ray_coop -------------------------------------------|                           |--- Queue_data(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * PathState_coop -------------------------------------|                           |--- Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * kg (globals) ---------------------------------------|                           |
- * Intersection_coop ----------------------------------|                           |
- * ray_state ------------------------------------------|                           |
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -----|                           |
- * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ----|                           |
- * queuesize ------------------------------------------|                           |
- * use_queues_flag ------------------------------------|                           |
- * sw -------------------------------------------------|                           |
- * sh -------------------------------------------------|                           |
  */
 ccl_device void kernel_lamp_emission(KernelGlobals *kg)
 {
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
index a6f26278116..ad1f6c78e8f 100644
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -16,48 +16,33 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_setup_next_iteration kernel.
- * This is the tenth kernel in the ray tracing logic. This is the ninth
- * of the path iteration kernels. This kernel takes care of setting up
- * Ray for the next iteration of path-iteration and accumulating radiance
- * corresponding to AO and direct-lighting
+/* This kernel takes care of setting up the ray for the next iteration of
+ * path-iteration and accumulating radiance corresponding to AO and
+ * direct-lighting.
  *
- * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
+ * Ray state of rays that are terminated in this kernel are changed
+ * to RAY_UPDATE_BUFFER.
  *
- * The input and output are as follows,
+ * Note on queues:
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS
+ * and processes only the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and
+ * reach RAY_UPDATE_BUFFER state. These rays are enqueued into
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present
+ * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has
+ * been changed to RAY_UPDATE_BUFFER, there is no problem.
  *
- * rng_coop ---------------------------------------------|--- kernel_next_iteration_setup -|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * throughput_coop --------------------------------------|                                 |--- Queue_data (QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
- * PathRadiance_coop ------------------------------------|                                 |--- throughput_coop
- * PathState_coop ---------------------------------------|                                 |--- PathRadiance_coop
- * sd ---------------------------------------------------|                                 |--- PathState_coop
- * ray_state --------------------------------------------|                                 |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS) --------|                                 |--- Ray_coop
- * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                 |--- use_queues_flag
- * Ray_coop ---------------------------------------------|                                 |
- * kg (globals) -----------------------------------------|                                 |
- * LightRay_dl_coop -------------------------------------|
- * ISLamp_coop ------------------------------------------|
- * BSDFEval_coop ----------------------------------------|
- * LightRay_ao_coop -------------------------------------|
- * AOBSDF_coop ------------------------------------------|
- * AOAlpha_coop -----------------------------------------|
- *
- * Note on queues,
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
- * the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFF
- * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
- * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
- * changed to RAY_UPDATE_BUFF, there is no problem.
- *
- * State of queues when this kernel is called :
+ * State of queues when this kernel is called:
  * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
+ *     RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
  * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
+ *     RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays.
  */
 ccl_device void kernel_next_iteration_setup(KernelGlobals *kg)
 {
@@ -182,4 +167,3 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg)
 }
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h
index f44aff30fa9..f879fca5009 100644
--- a/intern/cycles/kernel/split/kernel_path_init.h
+++ b/intern/cycles/kernel/split/kernel_path_init.h
@@ -21,7 +21,6 @@ CCL_NAMESPACE_BEGIN
  *
  * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE
  */
-
 ccl_device void kernel_path_init(KernelGlobals *kg) {
 	int ray_index = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);
 
@@ -101,4 +100,3 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
 }
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h
index 70ec92b394b..f4a4657d23f 100644
--- a/intern/cycles/kernel/split/kernel_queue_enqueue.h
+++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h
@@ -16,36 +16,24 @@
 
 CCL_NAMESPACE_BEGIN
 
-/*
- * The kernel "kernel_queue_enqueue" enqueues rays of
- * different ray state into their appropriate Queues;
- * 1. Rays that have been determined to hit the background from the
- * "kernel_scene_intersect" kernel
- * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
- * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * The input and output of the kernel is as follows,
- *
- * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                           |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                           |
- * queuesize -------------------------------------------|                           |
+/* This kernel enqueues rays of different ray state into their
+ * appropriate queues:
  *
- * Note on Queues :
- * State of queues during the first time this kernel is called :
- * At entry,
- * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
- * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays.
+ * 1. Rays that have been determined to hit the background from the
+ *    "kernel_scene_intersect" kernel are enqueued in
+ *    QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+ * 2. Rays that have been determined to be actively participating in
+ *    path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
  *
- * State of queue during other times this kernel is called :
+ * State of queue during other times this kernel is called:
  * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE
+ *     and RAY_UPDATE_BUFFER rays.
  * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
  */
 ccl_device void kernel_queue_enqueue(KernelGlobals *kg)
 {
@@ -101,4 +89,3 @@ ccl_device void kernel_queue_enqueue(KernelGlobals *kg)
 }
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
index 144cba67e23..33d0df7a2cb 100644
--- a/intern/cycles/kernel/split/kernel_scene_intersect.h
+++ b/intern/cycles/kernel/split/kernel_scene_intersect.h
@@ -16,51 +16,13 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_scene_intersect kernel.
- * This is the second kernel in the ray tracing logic. This is the first
- * of the path iteration kernels. This kernel takes care of scene_intersect function.
+/* This kernel takes care of scene_intersect function.
  *
  * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE.
  * This kernel processes rays of ray state RAY_ACTIVE
- * This kernel determines the rays that have hit the background and changes their ray state to RAY_HIT_BACKGROUND.
- *
- * The input and output are as follows,
- *
- * Ray_coop ---------------------------------------|--------- kernel_scene_intersect----------|--- PathState
- * PathState_coop ---------------------------------|                                          |--- Intersection
- * ray_state --------------------------------------|                                          |--- ray_state
- * use_queues_flag --------------------------------|                                          |
- * QueueData(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                                          |
- * kg (globals) -----------------------------------|                                          |
- * rng_coop ---------------------------------------|                                          |
- * sw ---------------------------------------------|                                          |
- * sh ---------------------------------------------|                                          |
- * queuesize --------------------------------------|                                          |
- *
- * Note on Queues :
- * Ideally we would want kernel_scene_intersect to work on queues.
- * But during the very first time, the queues will be empty and hence we perform a direct mapping
- * between ray-index and thread-index; From the next time onward, the queue will be filled and
- * we may start operating on queues.
- *
- * State of queue during the first time this kernel is called :
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.before and after this kernel
- *
- * State of queues during other times this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will have a mix of RAY_ACTIVE, RAY_UPDATE_BUFFER and RAY_REGENERATED rays;
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays ;
- * (The rays that are in the state RAY_UPDATE_BUFFER in both the queues are actually the same rays; These
- * are the rays that were in RAY_ACTIVE state during the initial enqueue but on further processing
- * , by different kernels, have turned into RAY_UPDATE_BUFFER rays. Since all kernel, even after fetching from
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS, proceed further based on ray state information, RAY_UPDATE_BUFFER rays
- * being present in QUEUE_ACTIVE_AND_REGENERATED_RAYS does not cause any logical issues)
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS - All RAY_REGENERATED rays will have been converted to RAY_ACTIVE and
- * Some rays in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue will move to state RAY_HIT_BACKGROUND
- * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS - no change
+ * This kernel determines the rays that have hit the background and changes
+ * their ray state to RAY_HIT_BACKGROUND.
  */
-
 ccl_device void kernel_scene_intersect(KernelGlobals *kg)
 {
 	/* Fetch use_queues_flag */
@@ -116,6 +78,11 @@ ccl_device void kernel_scene_intersect(KernelGlobals *kg)
 		lcg_state = lcg_state_init(&rng, &state, 0x51633e2d);
 	}
 
+	if(state.bounce > kernel_data.integrator.ao_bounces) {
+		visibility = PATH_RAY_SHADOW;
+		ray.t = kernel_data.background.ao_distance;
+	}
+
 	bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
 #else
 	bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
@@ -141,4 +108,3 @@ ccl_device void kernel_scene_intersect(KernelGlobals *kg)
 }
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
index 4bd5c8b6eb0..43872c6f388 100644
--- a/intern/cycles/kernel/split/kernel_shader_eval.h
+++ b/intern/cycles/kernel/split/kernel_shader_eval.h
@@ -16,35 +16,12 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_shader_eval kernel
- * This kernel is the 5th kernel in the ray tracing logic. This is
- * the 4rd kernel in path iteration. This kernel sets up the ShaderData
- * structure from the values computed by the previous kernels. It also identifies
- * the rays of state RAY_TO_REGENERATE and enqueues them in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+/* This kernel sets up the ShaderData structure from the values computed
+ * by the previous kernels.
  *
- * The input and output of the kernel is as follows,
- * rng_coop -------------------------------------------|--- kernel_shader_eval --|--- sd
- * Ray_coop -------------------------------------------|                         |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * PathState_coop -------------------------------------|                         |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Intersection_coop ----------------------------------|                         |
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS)-------|                         |
- * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)---|                         |
- * ray_state ------------------------------------------|                         |
- * kg (globals) ---------------------------------------|                         |
- * queuesize ------------------------------------------|                         |
- *
- * Note on Queues :
- * This kernel reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
- * only the rays of state RAY_ACTIVE;
- * State of queues when this kernel is called,
- * at entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
- * at exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays
+ * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them
+ * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
  */
-
 ccl_device void kernel_shader_eval(KernelGlobals *kg)
 {
 	/* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
@@ -91,4 +68,3 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg)
 }
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
index e153c16bd68..a1eb0d1eccd 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
@@ -16,33 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_shadow_blocked kernel.
- * This is the ninth kernel in the ray tracing logic. This is the eighth
- * of the path iteration kernels. This kernel takes care of "shadow ray cast"
- * logic of the direct lighting and AO  part of ray tracing.
- *
- * The input and output are as follows,
- *
- * PathState_coop ----------------------------------|--- kernel_shadow_blocked --|
- * LightRay_dl_coop --------------------------------|                            |--- LightRay_dl_coop
- * LightRay_ao_coop --------------------------------|                            |--- LightRay_ao_coop
- * ray_state ---------------------------------------|                            |--- ray_state
- * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS &       |                            |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_AO_RAYS)
-              QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                            |
- * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS&
-              QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                            |
- * kg (globals) ------------------------------------|                            |
- * queuesize ---------------------------------------|                            |
- *
- * Note on sd_shadow : sd_shadow is neither input nor output to this kernel. sd_shadow is filled and consumed in this kernel itself.
- * Note on queues :
- * The kernel fetches from QUEUE_SHADOW_RAY_CAST_AO_RAYS queue. We will empty this queues in this kernel.
- * State of queues when this kernel is called :
- * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
- * before and after this kernel call.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flags RAY_SHADOW_RAY_CAST_AO during kernel entry.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty at kernel exit.
- */
+/* Shadow ray cast for AO. */
 ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
 {
 	int lidx = ccl_local_id(1) * ccl_local_id(0) + ccl_local_id(0);
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
index cfd8d78c2de..2e5629944dc 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
@@ -16,33 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_shadow_blocked kernel.
- * This is the ninth kernel in the ray tracing logic. This is the eighth
- * of the path iteration kernels. This kernel takes care of "shadow ray cast"
- * logic of the direct lighting and AO  part of ray tracing.
- *
- * The input and output are as follows,
- *
- * PathState_coop ----------------------------------|--- kernel_shadow_blocked --|
- * LightRay_dl_coop --------------------------------|                            |--- LightRay_dl_coop
- * LightRay_ao_coop --------------------------------|                            |--- LightRay_ao_coop
- * ray_state ---------------------------------------|                            |--- ray_state
- * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS &       |                            |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_AO_RAYS)
-              QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                            |
- * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS&
-              QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                            |
- * kg (globals) ------------------------------------|                            |
- * queuesize ---------------------------------------|                            |
- *
- * Note on sd_shadow : sd_shadow is neither input nor output to this kernel. sd_shadow is filled and consumed in this kernel itself.
- * Note on queues :
- * The kernel fetches from QUEUE_SHADOW_RAY_CAST_DL_RAYS queue. We will empty this queue in this kernel.
- * State of queues when this kernel is called :
- * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
- * before and after this kernel call.
- * QUEUE_SHADOW_RAY_CAST_DL_RAYS will be filled with rays marked with flags RAY_SHADOW_RAY_CAST_DL, during kernel entry.
- * QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit.
- */
+/* Shadow ray cast for direct visible light. */
 ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
 {
 	int lidx = ccl_local_id(1) * ccl_local_id(0) + ccl_local_id(0);
@@ -88,4 +62,3 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
 }
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
index fcdd805f27b..e282ac00a63 100644
--- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h
+++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
@@ -1,4 +1,18 @@
-
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/release/scripts/modules/bpy_types.py b/release/scripts/modules/bpy_types.py
index d64acd2ce3b..b6a9b2f2f62 100644
--- a/release/scripts/modules/bpy_types.py
+++ b/release/scripts/modules/bpy_types.py
@@ -725,11 +725,30 @@ class Header(StructRNA, _GenericUI, metaclass=RNAMeta):
 class Menu(StructRNA, _GenericUI, metaclass=RNAMeta):
     __slots__ = ()
 
-    def path_menu(self, searchpaths, operator,
-                  props_default=None, filter_ext=None):
+    def path_menu(self, searchpaths, operator, *,
+                  props_default=None, prop_filepath="filepath",
+                  filter_ext=None, display_name=None):
+        """
+        Populate a menu from a list of paths.
+
+        :arg searchpaths: Paths to scan.
+        :type searchpaths: sequence of strings.
+        :arg operator: The operator id to use with each file.
+        :type operator: string
+        :arg prop_filepath: Optional operator filepath property (defaults to "filepath").
+        :type prop_filepath: string
+        :arg props_default: Properties to assign to each operator.
+        :type props_default: dict
+        :arg filter_ext: Optional callback that takes the file extensions.
+
+           Returning false excludes the file from the list.
+
+        :type filter_ext: Callable that takes a string and returns a bool.
+        :arg display_name: Optional callback that takes the full path, returns the name to display.
+        :type display_name: Callable that takes a string and returns a string.
+        """
 
         layout = self.layout
-        # hard coded to set the operators 'filepath' to the filename.
 
         import os
         import bpy.utils
@@ -752,15 +771,19 @@ class Menu(StructRNA, _GenericUI, metaclass=RNAMeta):
         files.sort()
 
         for f, filepath in files:
-            props = layout.operator(operator,
-                                    text=bpy.path.display_name(f),
-                                    translate=False)
+            # Intentionally pass the full path to 'display_name' callback,
+            # since the callback may want to use part of a directory in the name.
+            props = layout.operator(
+                operator,
+                text=display_name(filepath) if display_name else bpy.path.display_name(f),
+                translate=False,
+            )
 
             if props_default is not None:
                 for attr, value in props_default.items():
                     setattr(props, attr, value)
 
-            props.filepath = filepath
+            setattr(props, prop_filepath, filepath)
             if operator == "script.execute_preset":
                 props.menu_idname = self.bl_idname
 
diff --git a/release/scripts/startup/bl_operators/presets.py b/release/scripts/startup/bl_operators/presets.py
index e01e509b292..cb332c18127 100644
--- a/release/scripts/startup/bl_operators/presets.py
+++ b/release/scripts/startup/bl_operators/presets.py
@@ -135,7 +135,7 @@ class AddPresetBase:
 
                             file_preset.write("%s = %r\n" % (rna_path_step, value))
 
-                    file_preset = open(filepath, 'w')
+                    file_preset = open(filepath, 'w', encoding="utf-8")
                     file_preset.write("import bpy\n")
 
                     if hasattr(self, "preset_defines"):
diff --git a/release/scripts/startup/bl_ui/space_text.py b/release/scripts/startup/bl_ui/space_text.py
index 1fd10575e07..8c5418161ca 100644
--- a/release/scripts/startup/bl_ui/space_text.py
+++ b/release/scripts/startup/bl_ui/space_text.py
@@ -215,20 +215,22 @@ class TEXT_MT_templates_py(Menu):
     bl_label = "Python"
 
     def draw(self, context):
-        self.path_menu(bpy.utils.script_paths("templates_py"),
-                       "text.open",
-                       {"internal": True},
-                       )
+        self.path_menu(
+            bpy.utils.script_paths("templates_py"),
+            "text.open",
+            props_default={"internal": True},
+        )
 
 
 class TEXT_MT_templates_osl(Menu):
     bl_label = "Open Shading Language"
 
     def draw(self, context):
-        self.path_menu(bpy.utils.script_paths("templates_osl"),
-                       "text.open",
-                       {"internal": True},
-                       )
+        self.path_menu(
+            bpy.utils.script_paths("templates_osl"),
+            "text.open",
+            props_default={"internal": True},
+        )
 
 
 class TEXT_MT_templates(Menu):
diff --git a/source/blender/blenkernel/intern/mesh.c b/source/blender/blenkernel/intern/mesh.c
index 54f1dc569d5..0f0d84cc87d 100644
--- a/source/blender/blenkernel/intern/mesh.c
+++ b/source/blender/blenkernel/intern/mesh.c
@@ -2149,6 +2149,8 @@ static int split_faces_prepare_new_verts(
 				/* If vert is already used by another smooth fan, we need a new vert for this one. */
 				const int new_vert_idx = vert_used ? num_verts++ : vert_idx;
 
+				BLI_assert(*lnor_space);
+
 				if ((*lnor_space)->loops) {
 					for (LinkNode *lnode = (*lnor_space)->loops; lnode; lnode = lnode->next) {
 						const int ml_fan_idx = GET_INT_FROM_POINTER(lnode->link);
@@ -2381,19 +2383,24 @@ void BKE_mesh_split_faces(Mesh *mesh, bool free_loop_normals)
 		 * loops' vertex and edge indices to new, to-be-created split ones). */
 
 		const int num_new_edges = split_faces_prepare_new_edges(mesh, &new_edges, memarena);
-		BLI_assert(num_new_edges > 0);
+		/* We may have to split a vertex without having to add a single new edge... */
+		const bool do_edges = (num_new_edges > 0);
 
 		/* Reallocate all vert and edge related data. */
 		mesh->totvert += num_new_verts;
 		mesh->totedge += num_new_edges;
 		CustomData_realloc(&mesh->vdata, mesh->totvert);
-		CustomData_realloc(&mesh->edata, mesh->totedge);
+		if (do_edges) {
+			CustomData_realloc(&mesh->edata, mesh->totedge);
+		}
 		/* Update pointers to a newly allocated memory. */
 		BKE_mesh_update_customdata_pointers(mesh, false);
 
 		/* Perform actual split of vertices and edges. */
 		split_faces_split_new_verts(mesh, new_verts, num_new_verts);
-		split_faces_split_new_edges(mesh, new_edges, num_new_edges);
+		if (do_edges) {
+			split_faces_split_new_edges(mesh, new_edges, num_new_edges);
+		}
 	}
 
 	/* Note: after this point mesh is expected to be valid again. */
diff --git a/source/blender/blenkernel/intern/mesh_evaluate.c b/source/blender/blenkernel/intern/mesh_evaluate.c
index e96a434194c..0d0055113b7 100644
--- a/source/blender/blenkernel/intern/mesh_evaluate.c
+++ b/source/blender/blenkernel/intern/mesh_evaluate.c
@@ -304,13 +304,13 @@ void BKE_mesh_calc_normals_poly(
 void BKE_mesh_calc_normals(Mesh *mesh)
 {
 #ifdef DEBUG_TIME
-	TIMEIT_START(BKE_mesh_calc_normals);
+	TIMEIT_START_AVERAGED(BKE_mesh_calc_normals);
 #endif
 	BKE_mesh_calc_normals_poly(mesh->mvert, NULL, mesh->totvert,
 	                           mesh->mloop, mesh->mpoly, mesh->totloop, mesh->totpoly,
 	                           NULL, false);
 #ifdef DEBUG_TIME
-	TIMEIT_END(BKE_mesh_calc_normals);
+	TIMEIT_END_AVERAGED(BKE_mesh_calc_normals);
 #endif
 }
 
@@ -630,7 +630,6 @@ typedef struct LoopSplitTaskDataCommon {
 	 * Note we do not need to protect it, though, since two different tasks will *always* affect different
 	 * elements in the arrays. */
 	MLoopNorSpaceArray *lnors_spacearr;
-	BLI_bitmap *sharp_verts;
 	float (*loopnors)[3];
 	short (*clnors_data)[2];
 
@@ -643,11 +642,8 @@ typedef struct LoopSplitTaskDataCommon {
 	const int *loop_to_poly;
 	const float (*polynors)[3];
 
+	int numLoops;
 	int numPolys;
-
-	/* ***** Workers communication. ***** */
-	ThreadQueue *task_queue;
-
 } LoopSplitTaskDataCommon;
 
 #define INDEX_UNSET INT_MIN
@@ -655,6 +651,50 @@ typedef struct LoopSplitTaskDataCommon {
 /* See comment about edge_to_loops below. */
 #define IS_EDGE_SHARP(_e2l) (ELEM((_e2l)[1], INDEX_UNSET, INDEX_INVALID))
 
+static void loop_manifold_fan_around_vert_next(
+        const MLoop *mloops, const MPoly *mpolys,
+        const int *loop_to_poly, const int *e2lfan_curr, const uint mv_pivot_index,
+        const MLoop **r_mlfan_curr, int *r_mlfan_curr_index, int *r_mlfan_vert_index, int *r_mpfan_curr_index)
+{
+	const MLoop *mlfan_next;
+	const MPoly *mpfan_next;
+
+	/* Warning! This is rather complex!
+	 * We have to find our next edge around the vertex (fan mode).
+	 * First we find the next loop, which is either previous or next to mlfan_curr_index, depending
+	 * whether both loops using current edge are in the same direction or not, and whether
+	 * mlfan_curr_index actually uses the vertex we are fanning around!
+	 * mlfan_curr_index is the index of mlfan_next here, and mlfan_next is not the real next one
+	 * (i.e. not the future mlfan_curr)...
+	 */
+	*r_mlfan_curr_index = (e2lfan_curr[0] == *r_mlfan_curr_index) ? e2lfan_curr[1] : e2lfan_curr[0];
+	*r_mpfan_curr_index = loop_to_poly[*r_mlfan_curr_index];
+
+	BLI_assert(*r_mlfan_curr_index >= 0);
+	BLI_assert(*r_mpfan_curr_index >= 0);
+
+	mlfan_next = &mloops[*r_mlfan_curr_index];
+	mpfan_next = &mpolys[*r_mpfan_curr_index];
+	if (((*r_mlfan_curr)->v == mlfan_next->v && (*r_mlfan_curr)->v == mv_pivot_index) ||
+	    ((*r_mlfan_curr)->v != mlfan_next->v && (*r_mlfan_curr)->v != mv_pivot_index))
+	{
+		/* We need the previous loop, but current one is our vertex's loop. */
+		*r_mlfan_vert_index = *r_mlfan_curr_index;
+		if (--(*r_mlfan_curr_index) < mpfan_next->loopstart) {
+			*r_mlfan_curr_index = mpfan_next->loopstart + mpfan_next->totloop - 1;
+		}
+	}
+	else {
+		/* We need the next loop, which is also our vertex's loop. */
+		if (++(*r_mlfan_curr_index) >= mpfan_next->loopstart + mpfan_next->totloop) {
+			*r_mlfan_curr_index = mpfan_next->loopstart;
+		}
+		*r_mlfan_vert_index = *r_mlfan_curr_index;
+	}
+	*r_mlfan_curr = &mloops[*r_mlfan_curr_index];
+	/* And now we are back in sync, mlfan_curr_index is the index of mlfan_curr! Pff! */
+}
+
 static void split_loop_nor_single_do(LoopSplitTaskDataCommon *common_data, LoopSplitTaskData *data)
 {
 	MLoopNorSpaceArray *lnors_spacearr = common_data->lnors_spacearr;
@@ -680,7 +720,7 @@ static void split_loop_nor_single_do(LoopSplitTaskDataCommon *common_data, LoopS
 	 */
 	copy_v3_v3(*lnor, polynors[mp_index]);
 
-	/* printf("BASIC: handling loop %d / edge %d / vert %d / poly %d\n", ml_curr_index, ml_curr->e, ml_curr->v, mp_index); */
+//	printf("BASIC: handling loop %d / edge %d / vert %d / poly %d\n", ml_curr_index, ml_curr->e, ml_curr->v, mp_index);
 
 	/* If needed, generate this (simple!) lnor space. */
 	if (lnors_spacearr) {
@@ -747,8 +787,7 @@ static void split_loop_nor_fan_do(LoopSplitTaskDataCommon *common_data, LoopSpli
 	const MEdge *me_org = &medges[ml_curr->e];  /* ml_curr would be mlfan_prev if we needed that one */
 	const int *e2lfan_curr;
 	float vec_curr[3], vec_prev[3], vec_org[3];
-	const MLoop *mlfan_curr, *mlfan_next;
-	const MPoly *mpfan_next;
+	const MLoop *mlfan_curr;
 	float lnor[3] = {0.0f, 0.0f, 0.0f};
 	/* mlfan_vert_index: the loop of our current edge might not be the loop of our current vertex! */
 	int mlfan_curr_index, mlfan_vert_index, mpfan_curr_index;
@@ -787,7 +826,7 @@ static void split_loop_nor_fan_do(LoopSplitTaskDataCommon *common_data, LoopSpli
 		}
 	}
 
-	/* printf("FAN: vert %d, start edge %d\n", mv_pivot_index, ml_curr->e); */
+//	printf("FAN: vert %d, start edge %d\n", mv_pivot_index, ml_curr->e);
 
 	while (true) {
 		const MEdge *me_curr = &medges[mlfan_curr->e];
@@ -803,7 +842,7 @@ static void split_loop_nor_fan_do(LoopSplitTaskDataCommon *common_data, LoopSpli
 			normalize_v3(vec_curr);
 		}
 
-		/* printf("\thandling edge %d / loop %d\n", mlfan_curr->e, mlfan_curr_index); */
+//		printf("\thandling edge %d / loop %d\n", mlfan_curr->e, mlfan_curr_index);
 
 		{
 			/* Code similar to accumulate_vertex_normals_poly. */
@@ -845,46 +884,16 @@ static void split_loop_nor_fan_do(LoopSplitTaskDataCommon *common_data, LoopSpli
 			/* Current edge is sharp and we have finished with this fan of faces around this vert,
 			 * or this vert is smooth, and we have completed a full turn around it.
 			 */
-			/* printf("FAN: Finished!\n"); */
+//			printf("FAN: Finished!\n");
 			break;
 		}
 
 		copy_v3_v3(vec_prev, vec_curr);
 
-		/* Warning! This is rather complex!
-		 * We have to find our next edge around the vertex (fan mode).
-		 * First we find the next loop, which is either previous or next to mlfan_curr_index, depending
-		 * whether both loops using current edge are in the same direction or not, and whether
-		 * mlfan_curr_index actually uses the vertex we are fanning around!
-		 * mlfan_curr_index is the index of mlfan_next here, and mlfan_next is not the real next one
-		 * (i.e. not the future mlfan_curr)...
-		 */
-		mlfan_curr_index = (e2lfan_curr[0] == mlfan_curr_index) ? e2lfan_curr[1] : e2lfan_curr[0];
-		mpfan_curr_index = loop_to_poly[mlfan_curr_index];
-
-		BLI_assert(mlfan_curr_index >= 0);
-		BLI_assert(mpfan_curr_index >= 0);
-
-		mlfan_next = &mloops[mlfan_curr_index];
-		mpfan_next = &mpolys[mpfan_curr_index];
-		if ((mlfan_curr->v == mlfan_next->v && mlfan_curr->v == mv_pivot_index) ||
-		    (mlfan_curr->v != mlfan_next->v && mlfan_curr->v != mv_pivot_index))
-		{
-			/* We need the previous loop, but current one is our vertex's loop. */
-			mlfan_vert_index = mlfan_curr_index;
-			if (--mlfan_curr_index < mpfan_next->loopstart) {
-				mlfan_curr_index = mpfan_next->loopstart + mpfan_next->totloop - 1;
-			}
-		}
-		else {
-			/* We need the next loop, which is also our vertex's loop. */
-			if (++mlfan_curr_index >= mpfan_next->loopstart + mpfan_next->totloop) {
-				mlfan_curr_index = mpfan_next->loopstart;
-			}
-			mlfan_vert_index = mlfan_curr_index;
-		}
-		mlfan_curr = &mloops[mlfan_curr_index];
-		/* And now we are back in sync, mlfan_curr_index is the index of mlfan_curr! Pff! */
+		/* Find next loop of the smooth fan. */
+		loop_manifold_fan_around_vert_next(
+		            mloops, mpolys, loop_to_poly, e2lfan_curr, mv_pivot_index,
+		            &mlfan_curr, &mlfan_curr_index, &mlfan_vert_index, &mpfan_curr_index);
 
 		e2lfan_curr = edge_to_loops[mlfan_curr->e];
 	}
@@ -955,31 +964,25 @@ static void loop_split_worker_do(
 	}
 }
 
-static void loop_split_worker(TaskPool * __restrict UNUSED(pool), void *taskdata, int UNUSED(threadid))
+static void loop_split_worker(TaskPool * __restrict pool, void *taskdata, int UNUSED(threadid))
 {
-	LoopSplitTaskDataCommon *common_data = taskdata;
-	LoopSplitTaskData *data_buff;
+	LoopSplitTaskDataCommon *common_data = BLI_task_pool_userdata(pool);
+	LoopSplitTaskData *data = taskdata;
 
 	/* Temp edge vectors stack, only used when computing lnor spacearr. */
 	BLI_Stack *edge_vectors = common_data->lnors_spacearr ? BLI_stack_new(sizeof(float[3]), __func__) : NULL;
 
 #ifdef DEBUG_TIME
-	TIMEIT_START(loop_split_worker);
+	TIMEIT_START_AVERAGED(loop_split_worker);
 #endif
 
-	while ((data_buff = BLI_thread_queue_pop(common_data->task_queue))) {
-		LoopSplitTaskData *data = data_buff;
-		int i;
-
-		for (i = 0; i < LOOP_SPLIT_TASK_BLOCK_SIZE; i++, data++) {
-			/* A NULL ml_curr is used to tag ended data! */
-			if (data->ml_curr == NULL) {
-				break;
-			}
-			loop_split_worker_do(common_data, data, edge_vectors);
+	for (int i = 0; i < LOOP_SPLIT_TASK_BLOCK_SIZE; i++, data++) {
+		/* A NULL ml_curr is used to tag ended data! */
+		if (data->ml_curr == NULL) {
+			break;
 		}
 
-		MEM_freeN(data_buff);
+		loop_split_worker_do(common_data, data, edge_vectors);
 	}
 
 	if (edge_vectors) {
@@ -987,38 +990,104 @@ static void loop_split_worker(TaskPool * __restrict UNUSED(pool), void *taskdata
 	}
 
 #ifdef DEBUG_TIME
-	TIMEIT_END(loop_split_worker);
+	TIMEIT_END_AVERAGED(loop_split_worker);
 #endif
 }
 
-static void loop_split_generator_do(LoopSplitTaskDataCommon *common_data, const bool threaded)
+/* Check whether given loop is part of an unknown-so-far cyclic smooth fan, or not.
+ * Needed because cyclic smooth fans have no obvious 'entry point', and yet we need to walk them once, and only once. */
+static bool loop_split_generator_check_cyclic_smooth_fan(
+        const MLoop *mloops, const MPoly *mpolys,
+        const int (*edge_to_loops)[2], const int *loop_to_poly, const int *e2l_prev, BLI_bitmap *skip_loops,
+        const MLoop *ml_curr, const MLoop *ml_prev, const int ml_curr_index, const int ml_prev_index,
+        const int mp_curr_index)
+{
+	const unsigned int mv_pivot_index = ml_curr->v;  /* The vertex we are "fanning" around! */
+	const int *e2lfan_curr;
+	const MLoop *mlfan_curr;
+	/* mlfan_vert_index: the loop of our current edge might not be the loop of our current vertex! */
+	int mlfan_curr_index, mlfan_vert_index, mpfan_curr_index;
+
+	e2lfan_curr = e2l_prev;
+	if (IS_EDGE_SHARP(e2lfan_curr)) {
+		/* Sharp loop, so not a cyclic smooth fan... */
+		return false;
+	}
+
+	mlfan_curr = ml_prev;
+	mlfan_curr_index = ml_prev_index;
+	mlfan_vert_index = ml_curr_index;
+	mpfan_curr_index = mp_curr_index;
+
+	BLI_assert(mlfan_curr_index >= 0);
+	BLI_assert(mlfan_vert_index >= 0);
+	BLI_assert(mpfan_curr_index >= 0);
+
+	BLI_assert(!BLI_BITMAP_TEST(skip_loops, mlfan_vert_index));
+	BLI_BITMAP_ENABLE(skip_loops, mlfan_vert_index);
+
+	while(true) {
+		/* Find next loop of the smooth fan. */
+		loop_manifold_fan_around_vert_next(
+		            mloops, mpolys, loop_to_poly, e2lfan_curr, mv_pivot_index,
+		            &mlfan_curr, &mlfan_curr_index, &mlfan_vert_index, &mpfan_curr_index);
+
+		e2lfan_curr = edge_to_loops[mlfan_curr->e];
+
+		if (IS_EDGE_SHARP(e2lfan_curr)) {
+			/* Sharp loop/edge, so not a cyclic smooth fan... */
+			return false;
+		}
+		/* Smooth loop/edge... */
+		else if (BLI_BITMAP_TEST(skip_loops, mlfan_vert_index)) {
+			if (mlfan_vert_index == ml_curr_index) {
+				/* We walked around a whole cyclic smooth fan without finding any already-processed loop, means we can
+				 * use initial ml_curr/ml_prev edge as start for this smooth fan. */
+				return true;
+			}
+			/* ... already checked in some previous looping, we can abort. */
+			return false;
+		}
+		else {
+			/* ... we can skip it in future, and keep checking the smooth fan. */
+			BLI_BITMAP_ENABLE(skip_loops, mlfan_vert_index);
+		}
+	}
+}
+
+static void loop_split_generator(TaskPool *pool, LoopSplitTaskDataCommon *common_data)
 {
 	MLoopNorSpaceArray *lnors_spacearr = common_data->lnors_spacearr;
-	BLI_bitmap *sharp_verts = common_data->sharp_verts;
 	float (*loopnors)[3] = common_data->loopnors;
 
 	const MLoop *mloops = common_data->mloops;
 	const MPoly *mpolys = common_data->mpolys;
+	const int *loop_to_poly = common_data->loop_to_poly;
 	const int (*edge_to_loops)[2] = common_data->edge_to_loops;
+	const int numLoops = common_data->numLoops;
 	const int numPolys = common_data->numPolys;
 
 	const MPoly *mp;
 	int mp_index;
 
-	LoopSplitTaskData *data, *data_buff = NULL, data_mem;
+	const MLoop *ml_curr;
+	const MLoop *ml_prev;
+	int ml_curr_index;
+	int ml_prev_index;
+
+	BLI_bitmap *skip_loops = BLI_BITMAP_NEW(numLoops, __func__);
+
+	LoopSplitTaskData *data_buff = NULL;
 	int data_idx = 0;
 
 	/* Temp edge vectors stack, only used when computing lnor spacearr (and we are not multi-threading). */
 	BLI_Stack *edge_vectors = NULL;
 
 #ifdef DEBUG_TIME
-	TIMEIT_START(loop_split_generator);
+	TIMEIT_START_AVERAGED(loop_split_generator);
 #endif
 
-	if (!threaded) {
-		memset(&data_mem, 0, sizeof(data_mem));
-		data = &data_mem;
-
+	if (!pool) {
 		if (lnors_spacearr) {
 			edge_vectors = BLI_stack_new(sizeof(float[3]), __func__);
 		}
@@ -1028,11 +1097,10 @@ static void loop_split_generator_do(LoopSplitTaskDataCommon *common_data, const
 	 * Now, time to generate the normals.
 	 */
 	for (mp = mpolys, mp_index = 0; mp_index < numPolys; mp++, mp_index++) {
-		const MLoop *ml_curr, *ml_prev;
 		float (*lnors)[3];
 		const int ml_last_index = (mp->loopstart + mp->totloop) - 1;
-		int ml_curr_index = mp->loopstart;
-		int ml_prev_index = ml_last_index;
+		ml_curr_index = mp->loopstart;
+		ml_prev_index = ml_last_index;
 
 		ml_curr = &mloops[ml_curr_index];
 		ml_prev = &mloops[ml_prev_index];
@@ -1042,23 +1110,40 @@ static void loop_split_generator_do(LoopSplitTaskDataCommon *common_data, const
 			const int *e2l_curr = edge_to_loops[ml_curr->e];
 			const int *e2l_prev = edge_to_loops[ml_prev->e];
 
-			if (!IS_EDGE_SHARP(e2l_curr) && (!lnors_spacearr || BLI_BITMAP_TEST_BOOL(sharp_verts, ml_curr->v))) {
-				/* A smooth edge, and we are not generating lnor_spacearr, or the related vertex is sharp.
-				 * We skip it because it is either:
-				 * - in the middle of a 'smooth fan' already computed (or that will be as soon as we hit
-				 *   one of its ends, i.e. one of its two sharp edges), or...
-				 * - the related vertex is a "full smooth" one, in which case pre-populated normals from vertex
-				 *   are just fine (or it has already be handled in a previous loop in case of needed lnors spacearr)!
-				 */
-				/* printf("Skipping loop %d / edge %d / vert %d(%d)\n", ml_curr_index, ml_curr->e, ml_curr->v, sharp_verts[ml_curr->v]); */
+//			printf("Checking loop %d / edge %u / vert %u (sharp edge: %d, skiploop: %d)...",
+//			       ml_curr_index, ml_curr->e, ml_curr->v, IS_EDGE_SHARP(e2l_curr), BLI_BITMAP_TEST_BOOL(skip_loops, ml_curr_index));
+
+			/* A smooth edge, we have to check for cyclic smooth fan case.
+			 * If we find a new, never-processed cyclic smooth fan, we can do it now using that loop/edge as
+			 * 'entry point', otherwise we can skip it. */
+			/* Note: In theory, we could make loop_split_generator_check_cyclic_smooth_fan() store
+			 * mlfan_vert_index'es and edge indexes in two stacks, to avoid having to fan again around the vert during
+			 * actual computation of clnor & clnorspace. However, this would complicate the code, add more memory usage,
+			 * and despite its logical complexity, loop_manifold_fan_around_vert_next() is quite cheap in terms of
+			 * CPU cycles, so it is really not worth it. */
+			if (!IS_EDGE_SHARP(e2l_curr) &&
+			    (BLI_BITMAP_TEST(skip_loops, ml_curr_index) ||
+			     !loop_split_generator_check_cyclic_smooth_fan(
+			              mloops, mpolys, edge_to_loops, loop_to_poly, e2l_prev, skip_loops,
+			              ml_curr, ml_prev, ml_curr_index, ml_prev_index, mp_index)))
+			{
+//				printf("SKIPPING!\n");
 			}
 			else {
-				if (threaded) {
+				LoopSplitTaskData *data, data_local;
+
+//				printf("PROCESSING!\n");
+
+				if (pool) {
 					if (data_idx == 0) {
 						data_buff = MEM_callocN(sizeof(*data_buff) * LOOP_SPLIT_TASK_BLOCK_SIZE, __func__);
 					}
 					data = &data_buff[data_idx];
 				}
+				else {
+					data = &data_local;
+					memset(data, 0, sizeof(*data));
+				}
 
 				if (IS_EDGE_SHARP(e2l_curr) && IS_EDGE_SHARP(e2l_prev)) {
 					data->lnor = lnors;
@@ -1094,22 +1179,18 @@ static void loop_split_generator_do(LoopSplitTaskDataCommon *common_data, const
 					data->mp_index = mp_index;
 					if (lnors_spacearr) {
 						data->lnor_space = BKE_lnor_space_create(lnors_spacearr);
-						/* Tag related vertex as sharp, to avoid fanning around it again (in case it was a smooth one).
-						 * This *has* to be done outside of workers tasks! */
-						BLI_BITMAP_ENABLE(sharp_verts, ml_curr->v);
 					}
 				}
 
-				if (threaded) {
+				if (pool) {
 					data_idx++;
 					if (data_idx == LOOP_SPLIT_TASK_BLOCK_SIZE) {
-						BLI_thread_queue_push(common_data->task_queue, data_buff);
+						BLI_task_pool_push(pool, loop_split_worker, data_buff, true, TASK_PRIORITY_LOW);
 						data_idx = 0;
 					}
 				}
 				else {
 					loop_split_worker_do(common_data, data, edge_vectors);
-					memset(data, 0, sizeof(data_mem));
 				}
 			}
 
@@ -1118,38 +1199,27 @@ static void loop_split_generator_do(LoopSplitTaskDataCommon *common_data, const
 		}
 	}
 
-	if (threaded) {
-		/* Last block of data... Since it is calloc'ed and we use first NULL item as stopper, everything is fine. */
-		if (LIKELY(data_idx)) {
-			BLI_thread_queue_push(common_data->task_queue, data_buff);
-		}
-
-		/* This will signal all other worker threads to wake up and finish! */
-		BLI_thread_queue_nowait(common_data->task_queue);
+	/* Last block of data... Since it is calloc'ed and we use first NULL item as stopper, everything is fine. */
+	if (pool && data_idx) {
+		BLI_task_pool_push(pool, loop_split_worker, data_buff, true, TASK_PRIORITY_LOW);
 	}
 
 	if (edge_vectors) {
 		BLI_stack_free(edge_vectors);
 	}
+	MEM_freeN(skip_loops);
 
 #ifdef DEBUG_TIME
-	TIMEIT_END(loop_split_generator);
+	TIMEIT_END_AVERAGED(loop_split_generator);
 #endif
 }
 
-static void loop_split_generator(TaskPool * __restrict UNUSED(pool), void *taskdata, int UNUSED(threadid))
-{
-	LoopSplitTaskDataCommon *common_data = taskdata;
-
-	loop_split_generator_do(common_data, true);
-}
-
 /**
  * Compute split normals, i.e. vertex normals associated with each poly (hence 'loop normals').
  * Useful to materialize sharp edges (or non-smooth faces) without actually modifying the geometry (splitting edges).
  */
 void BKE_mesh_normals_loop_split(
-        const MVert *mverts, const int numVerts, MEdge *medges, const int numEdges,
+        const MVert *mverts, const int UNUSED(numVerts), MEdge *medges, const int numEdges,
         MLoop *mloops, float (*r_loopnors)[3], const int numLoops,
         MPoly *mpolys, const float (*polynors)[3], const int numPolys,
         const bool use_split_normals, float split_angle,
@@ -1187,8 +1257,6 @@ void BKE_mesh_normals_loop_split(
 		return;
 	}
 
-	{
-
 	/* Mapping edge -> loops.
 	 * If that edge is used by more than two loops (polys), it is always sharp (and tagged as such, see below).
 	 * We also use the second loop index as a kind of flag: smooth edge: > 0,
@@ -1198,33 +1266,25 @@ void BKE_mesh_normals_loop_split(
 	 * store the negated value of loop index instead of INDEX_INVALID to retrieve the real value later in code).
 	 * Note also that lose edges always have both values set to 0!
 	 */
-	int (*edge_to_loops)[2] = MEM_callocN(sizeof(int[2]) * (size_t)numEdges, __func__);
+	int (*edge_to_loops)[2] = MEM_callocN(sizeof(*edge_to_loops) * (size_t)numEdges, __func__);
 
 	/* Simple mapping from a loop to its polygon index. */
-	int *loop_to_poly = r_loop_to_poly ? r_loop_to_poly : MEM_mallocN(sizeof(int) * (size_t)numLoops, __func__);
+	int *loop_to_poly = r_loop_to_poly ? r_loop_to_poly : MEM_mallocN(sizeof(*loop_to_poly) * (size_t)numLoops, __func__);
 
 	MPoly *mp;
-	int mp_index, me_index;
-	bool check_angle = (split_angle < (float)M_PI);
-	int i;
+	int mp_index;
 
-	BLI_bitmap *sharp_verts = NULL;
-	MLoopNorSpaceArray _lnors_spacearr = {NULL};
+	/* When using custom loop normals, disable the angle feature! */
+	const bool check_angle = (split_angle < (float)M_PI) && (clnors_data == NULL);
 
-	LoopSplitTaskDataCommon common_data = {NULL};
+	MLoopNorSpaceArray _lnors_spacearr = {NULL};
 
 #ifdef DEBUG_TIME
-	TIMEIT_START(BKE_mesh_normals_loop_split);
+	TIMEIT_START_AVERAGED(BKE_mesh_normals_loop_split);
 #endif
 
 	if (check_angle) {
-		/* When using custom loop normals, disable the angle feature! */
-		if (clnors_data) {
-			check_angle = false;
-		}
-		else {
-			split_angle = cosf(split_angle);
-		}
+		split_angle = cosf(split_angle);
 	}
 
 	if (!r_lnors_spacearr && clnors_data) {
@@ -1233,7 +1293,6 @@ void BKE_mesh_normals_loop_split(
 	}
 	if (r_lnors_spacearr) {
 		BKE_lnor_spacearr_init(r_lnors_spacearr, numLoops);
-		sharp_verts = BLI_BITMAP_NEW((size_t)numVerts, __func__);
 	}
 
 	/* This first loop check which edges are actually smooth, and compute edge vectors. */
@@ -1287,60 +1346,38 @@ void BKE_mesh_normals_loop_split(
 		}
 	}
 
-	if (r_lnors_spacearr) {
-		/* Tag vertices that have at least one sharp edge as 'sharp' (used for the lnor spacearr computation).
-		 * XXX This third loop over edges is a bit disappointing, could not find any other way yet.
-		 *     Not really performance-critical anyway.
-		 */
-		for (me_index = 0; me_index < numEdges; me_index++) {
-			const int *e2l = edge_to_loops[me_index];
-			const MEdge *me = &medges[me_index];
-			if (IS_EDGE_SHARP(e2l)) {
-				BLI_BITMAP_ENABLE(sharp_verts, me->v1);
-				BLI_BITMAP_ENABLE(sharp_verts, me->v2);
-			}
-		}
-	}
-
 	/* Init data common to all tasks. */
-	common_data.lnors_spacearr = r_lnors_spacearr;
-	common_data.loopnors = r_loopnors;
-	common_data.clnors_data = clnors_data;
-
-	common_data.mverts = mverts;
-	common_data.medges = medges;
-	common_data.mloops = mloops;
-	common_data.mpolys = mpolys;
-	common_data.sharp_verts = sharp_verts;
-	common_data.edge_to_loops = (const int(*)[2])edge_to_loops;
-	common_data.loop_to_poly = loop_to_poly;
-	common_data.polynors = polynors;
-	common_data.numPolys = numPolys;
+	LoopSplitTaskDataCommon common_data = {
+	    .lnors_spacearr = r_lnors_spacearr,
+	    .loopnors = r_loopnors,
+	    .clnors_data = clnors_data,
+	    .mverts = mverts,
+	    .medges = medges,
+	    .mloops = mloops,
+	    .mpolys = mpolys,
+	    .edge_to_loops = (const int(*)[2])edge_to_loops,
+	    .loop_to_poly = loop_to_poly,
+	    .polynors = polynors,
+	    .numLoops = numLoops,
+	    .numPolys = numPolys,
+	};
 
 	if (numLoops < LOOP_SPLIT_TASK_BLOCK_SIZE * 8) {
 		/* Not enough loops to be worth the whole threading overhead... */
-		loop_split_generator_do(&common_data, false);
+		loop_split_generator(NULL, &common_data);
 	}
 	else {
 		TaskScheduler *task_scheduler;
 		TaskPool *task_pool;
-		int nbr_workers;
-
-		common_data.task_queue = BLI_thread_queue_init();
 
 		task_scheduler = BLI_task_scheduler_get();
-		task_pool = BLI_task_pool_create(task_scheduler, NULL);
+		task_pool = BLI_task_pool_create(task_scheduler, &common_data);
+
+		loop_split_generator(task_pool, &common_data);
 
-		nbr_workers = max_ii(2, BLI_task_scheduler_num_threads(task_scheduler));
-		for (i = 1; i < nbr_workers; i++) {
-			BLI_task_pool_push(task_pool, loop_split_worker, &common_data, false, TASK_PRIORITY_HIGH);
-		}
-		BLI_task_pool_push(task_pool, loop_split_generator, &common_data, false, TASK_PRIORITY_HIGH);
 		BLI_task_pool_work_and_wait(task_pool);
 
 		BLI_task_pool_free(task_pool);
-
-		BLI_thread_queue_free(common_data.task_queue);
 	}
 
 	MEM_freeN(edge_to_loops);
@@ -1349,17 +1386,14 @@ void BKE_mesh_normals_loop_split(
 	}
 
 	if (r_lnors_spacearr) {
-		MEM_freeN(sharp_verts);
 		if (r_lnors_spacearr == &_lnors_spacearr) {
 			BKE_lnor_spacearr_free(r_lnors_spacearr);
 		}
 	}
 
 #ifdef DEBUG_TIME
-	TIMEIT_END(BKE_mesh_normals_loop_split);
+	TIMEIT_END_AVERAGED(BKE_mesh_normals_loop_split);
 #endif
-
-	}
 }
 
 #undef INDEX_UNSET
diff --git a/source/blender/bmesh/intern/bmesh_mesh.c b/source/blender/bmesh/intern/bmesh_mesh.c
index 57a6d8d2e1a..d92d0cbb469 100644
--- a/source/blender/bmesh/intern/bmesh_mesh.c
+++ b/source/blender/bmesh/intern/bmesh_mesh.c
@@ -486,8 +486,7 @@ static void bm_mesh_edges_sharp_tag(
         BMesh *bm, const float (*vnos)[3], const float (*fnos)[3], float split_angle,
         float (*r_lnos)[3])
 {
-	BMIter eiter, viter;
-	BMVert *v;
+	BMIter eiter;
 	BMEdge *e;
 	int i;
 
@@ -498,19 +497,13 @@ static void bm_mesh_edges_sharp_tag(
 	}
 
 	{
-		char htype = BM_LOOP;
+		char htype = BM_VERT | BM_LOOP;
 		if (fnos) {
 			htype |= BM_FACE;
 		}
 		BM_mesh_elem_index_ensure(bm, htype);
 	}
 
-	/* Clear all vertices' tags (means they are all smooth for now). */
-	BM_ITER_MESH_INDEX (v, &viter, bm, BM_VERTS_OF_MESH, i) {
-		BM_elem_index_set(v, i); /* set_inline */
-		BM_elem_flag_disable(v, BM_ELEM_TAG);
-	}
-
 	/* This first loop checks which edges are actually smooth, and pre-populate lnos with vnos (as if they were
 	 * all smooth).
 	 */
@@ -551,20 +544,45 @@ static void bm_mesh_edges_sharp_tag(
 				no = vnos ? vnos[BM_elem_index_get(l_b->v)] : l_b->v->no;
 				copy_v3_v3(r_lnos[BM_elem_index_get(l_b)], no);
 			}
-			else {
-				/* Sharp edge, tag its verts as such. */
-				BM_elem_flag_enable(e->v1, BM_ELEM_TAG);
-				BM_elem_flag_enable(e->v2, BM_ELEM_TAG);
+		}
+	}
+
+	bm->elem_index_dirty &= ~BM_EDGE;
+}
+
+/* Check whether given loop is part of an unknown-so-far cyclic smooth fan, or not.
+ * Needed because cyclic smooth fans have no obvious 'entry point', and yet we need to walk them once, and only once. */
+static bool bm_mesh_loop_check_cyclic_smooth_fan(BMLoop *l_curr)
+{
+	BMLoop *lfan_pivot_next = l_curr;
+	BMEdge *e_next = l_curr->e;
+
+	BLI_assert(!BM_elem_flag_test(lfan_pivot_next, BM_ELEM_TAG));
+	BM_elem_flag_enable(lfan_pivot_next, BM_ELEM_TAG);
+
+	while (true) {
+		/* Much simpler than in sibling code with basic Mesh data! */
+		lfan_pivot_next = BM_vert_step_fan_loop(lfan_pivot_next, &e_next);
+
+		if (!lfan_pivot_next || !BM_elem_flag_test(e_next, BM_ELEM_TAG)) {
+			/* Sharp loop/edge, so not a cyclic smooth fan... */
+			return false;
+		}
+		/* Smooth loop/edge... */
+		else if (BM_elem_flag_test(lfan_pivot_next, BM_ELEM_TAG)) {
+			if (lfan_pivot_next == l_curr) {
+				/* We walked around a whole cyclic smooth fan without finding any already-processed loop, means we can
+				 * use initial l_curr/l_prev edge as start for this smooth fan. */
+				return true;
 			}
+			/* ... already checked in some previous looping, we can abort. */
+			return false;
 		}
 		else {
-			/* Sharp edge, tag its verts as such. */
-			BM_elem_flag_enable(e->v1, BM_ELEM_TAG);
-			BM_elem_flag_enable(e->v2, BM_ELEM_TAG);
+			/* ... we can skip it in future, and keep checking the smooth fan. */
+			BM_elem_flag_enable(lfan_pivot_next, BM_ELEM_TAG);
 		}
 	}
-
-	bm->elem_index_dirty &= ~(BM_EDGE | BM_VERT);
 }
 
 /* BMesh version of BKE_mesh_normals_loop_split() in mesh_evaluate.c
@@ -587,13 +605,11 @@ static void bm_mesh_loops_calc_normals(
 	BLI_Stack *edge_vectors = NULL;
 
 	{
-		char htype = BM_LOOP;
+		char htype = 0;
 		if (vcos) {
 			htype |= BM_VERT;
 		}
-		if (fnos) {
-			htype |= BM_FACE;
-		}
+		/* Face/Loop indices are set inline below. */
 		BM_mesh_elem_index_ensure(bm, htype);
 	}
 
@@ -606,6 +622,21 @@ static void bm_mesh_loops_calc_normals(
 		edge_vectors = BLI_stack_new(sizeof(float[3]), __func__);
 	}
 
+	/* Clear all loops' tags (means none are to be skipped for now). */
+	int index_face, index_loop = 0;
+	BM_ITER_MESH_INDEX (f_curr, &fiter, bm, BM_FACES_OF_MESH, index_face) {
+		BMLoop *l_curr, *l_first;
+
+		BM_elem_index_set(f_curr, index_face); /* set_inline */
+
+		l_curr = l_first = BM_FACE_FIRST_LOOP(f_curr);
+		do {
+			BM_elem_index_set(l_curr, index_loop++); /* set_inline */
+			BM_elem_flag_disable(l_curr, BM_ELEM_TAG);
+		} while ((l_curr = l_curr->next) != l_first);
+	}
+	bm->elem_index_dirty &= ~(BM_FACE|BM_LOOP);
+
 	/* We now know edges that can be smoothed (they are tagged), and edges that will be hard (they aren't).
 	 * Now, time to generate the normals.
 	 */
@@ -614,16 +645,16 @@ static void bm_mesh_loops_calc_normals(
 
 		l_curr = l_first = BM_FACE_FIRST_LOOP(f_curr);
 		do {
+			/* A smooth edge, we have to check for cyclic smooth fan case.
+			 * If we find a new, never-processed cyclic smooth fan, we can do it now using that loop/edge as
+			 * 'entry point', otherwise we can skip it. */
+			/* Note: In theory, we could make bm_mesh_loop_check_cyclic_smooth_fan() store mlfan_pivot's in a stack,
+			 * to avoid having to fan again around the vert during actual computation of clnor & clnorspace.
+			 * However, this would complicate the code, add more memory usage, and BM_vert_step_fan_loop()
+			 * is quite cheap in terms of CPU cycles, so it is really not worth it. */
 			if (BM_elem_flag_test(l_curr->e, BM_ELEM_TAG) &&
-			    (!r_lnors_spacearr || BM_elem_flag_test(l_curr->v, BM_ELEM_TAG)))
+			    (BM_elem_flag_test(l_curr, BM_ELEM_TAG) || !bm_mesh_loop_check_cyclic_smooth_fan(l_curr)))
 			{
-				/* A smooth edge, and we are not generating lnors_spacearr, or the related vertex is sharp.
-				 * We skip it because it is either:
-				 * - in the middle of a 'smooth fan' already computed (or that will be as soon as we hit
-				 *   one of its ends, i.e. one of its two sharp edges), or...
-				 * - the related vertex is a "full smooth" one, in which case pre-populated normals from vertex
-				 *   are just fine!
-				 */
 			}
 			else if (!BM_elem_flag_test(l_curr->e, BM_ELEM_TAG) &&
 			         !BM_elem_flag_test(l_curr->prev->e, BM_ELEM_TAG))
@@ -2008,4 +2039,4 @@ void BM_mesh_toolflags_set(BMesh *bm, bool use_toolflags)
 	        vpool_dst, epool_dst, NULL, fpool_dst);
 
 	bm->use_toolflags = use_toolflags;
-}
-\ No newline at end of file
+}
diff --git a/source/blender/bmesh/intern/bmesh_opdefines.c b/source/blender/bmesh/intern/bmesh_opdefines.c
index 0d0fdda2c4c..6b388a75436 100644
--- a/source/blender/bmesh/intern/bmesh_opdefines.c
+++ b/source/blender/bmesh/intern/bmesh_opdefines.c
@@ -1284,7 +1284,7 @@ static BMOpDefine bmo_bisect_plane_def = {
 	 {"clear_inner",   BMO_OP_SLOT_BOOL},    /* when enabled. remove all geometry on the negative side of the plane */
 	 {{'\0'}},
 	},
-	{{"geom_cut.out", BMO_OP_SLOT_ELEMENT_BUF, {BM_VERT | BM_EDGE}},  /* output new geometry from the cut */
+	{{"geom_cut.out", BMO_OP_SLOT_ELEMENT_BUF, {BM_VERT | BM_EDGE}},  /* output geometry aligned with the plane (new and existing) */
 	 {"geom.out",     BMO_OP_SLOT_ELEMENT_BUF, {BM_VERT | BM_EDGE | BM_FACE}},  /* input and output geometry (result of cut)  */
 	 {{'\0'}}},
 	bmo_bisect_plane_exec,
diff --git a/source/blender/bmesh/operators/bmo_bisect_plane.c b/source/blender/bmesh/operators/bmo_bisect_plane.c
index bed1ea5cb94..2c80ff651b8 100644
--- a/source/blender/bmesh/operators/bmo_bisect_plane.c
+++ b/source/blender/bmesh/operators/bmo_bisect_plane.c
@@ -38,7 +38,8 @@
 #include "intern/bmesh_operators_private.h" /* own include */
 
 #define ELE_NEW 1
-#define ELE_INPUT 2
+#define ELE_CUT 2
+#define ELE_INPUT 4
 
 void bmo_bisect_plane_exec(BMesh *bm, BMOperator *op)
 {
@@ -69,7 +70,7 @@ void bmo_bisect_plane_exec(BMesh *bm, BMOperator *op)
 
 
 	BM_mesh_bisect_plane(bm, plane, use_snap_center, true,
-	                     ELE_NEW, dist);
+	                     ELE_CUT, ELE_NEW, dist);
 
 
 	if (clear_outer || clear_inner) {
@@ -108,5 +109,5 @@ void bmo_bisect_plane_exec(BMesh *bm, BMOperator *op)
 	}
 
 	BMO_slot_buffer_from_enabled_flag(bm, op, op->slots_out, "geom.out", BM_ALL_NOLOOP, ELE_NEW | ELE_INPUT);
-	BMO_slot_buffer_from_enabled_flag(bm, op, op->slots_out, "geom_cut.out", BM_VERT | BM_EDGE, ELE_NEW);
+	BMO_slot_buffer_from_enabled_flag(bm, op, op->slots_out, "geom_cut.out", BM_VERT | BM_EDGE, ELE_CUT);
 }
diff --git a/source/blender/bmesh/tools/bmesh_bisect_plane.c b/source/blender/bmesh/tools/bmesh_bisect_plane.c
index 51b92a3c45e..828c50c39fd 100644
--- a/source/blender/bmesh/tools/bmesh_bisect_plane.c
+++ b/source/blender/bmesh/tools/bmesh_bisect_plane.c
@@ -110,7 +110,7 @@ static int bm_vert_sortval_cb(const void *v_a_v, const void *v_b_v)
 }
 
 
-static void bm_face_bisect_verts(BMesh *bm, BMFace *f, const float plane[4], const short oflag_center)
+static void bm_face_bisect_verts(BMesh *bm, BMFace *f, const float plane[4], const short oflag_center, const short oflag_new)
 {
 	/* unlikely more than 2 verts are needed */
 	const unsigned int f_len_orig = (unsigned int)f->len;
@@ -154,10 +154,11 @@ static void bm_face_bisect_verts(BMesh *bm, BMFace *f, const float plane[4], con
 			/* common case, just cut the face once */
 			BM_face_split(bm, f, l_a, l_b, &l_new, NULL, true);
 			if (l_new) {
-				if (oflag_center) {
-					BMO_edge_flag_enable(bm, l_new->e, oflag_center);
-					BMO_face_flag_enable(bm, l_new->f, oflag_center);
-					BMO_face_flag_enable(bm, f,        oflag_center);
+				if (oflag_center | oflag_new) {
+					BMO_edge_flag_enable(bm, l_new->e, oflag_center | oflag_new);
+				}
+				if (oflag_new) {
+					BMO_face_flag_enable(bm, l_new->f, oflag_new);
 				}
 			}
 		}
@@ -269,10 +270,11 @@ static void bm_face_bisect_verts(BMesh *bm, BMFace *f, const float plane[4], con
 						f_tmp = BM_face_split(bm, face_split_arr[j], l_a, l_b, &l_new, NULL, true);
 
 						if (l_new) {
-							if (oflag_center) {
-								BMO_edge_flag_enable(bm, l_new->e,          oflag_center);
-								BMO_face_flag_enable(bm, l_new->f,          oflag_center);
-								BMO_face_flag_enable(bm, face_split_arr[j], oflag_center);
+							if (oflag_center | oflag_new) {
+								BMO_edge_flag_enable(bm, l_new->e, oflag_center | oflag_new);
+							}
+							if (oflag_new) {
+								BMO_face_flag_enable(bm, l_new->f, oflag_new);
 							}
 						}
 
@@ -307,7 +309,7 @@ finally:
 void BM_mesh_bisect_plane(
         BMesh *bm, const float plane[4],
         const bool use_snap_center, const bool use_tag,
-        const short oflag_center, const float eps)
+        const short oflag_center, const short oflag_new, const float eps)
 {
 	unsigned int einput_len;
 	unsigned int i;
@@ -390,7 +392,7 @@ void BM_mesh_bisect_plane(
 		const float dist[2] = {BM_VERT_DIST(e->v1), BM_VERT_DIST(e->v2)};
 
 		if (side[0] && side[1] && (side[0] != side[1])) {
-			const float e_fac = fabsf(dist[0]) / fabsf(dist[0] - dist[1]);
+			const float e_fac = dist[0] / (dist[0] - dist[1]);
 			BMVert *v_new;
 
 			if (e->l) {
@@ -404,10 +406,17 @@ void BM_mesh_bisect_plane(
 				} while ((l_iter = l_iter->radial_next) != l_first);
 			}
 
-			v_new = BM_edge_split(bm, e, e->v1, NULL, e_fac);
+			{
+				BMEdge *e_new;
+				v_new = BM_edge_split(bm, e, e->v1, &e_new, e_fac);
+				if (oflag_new) {
+					BMO_edge_flag_enable(bm, e_new, oflag_new);
+				}
+			}
+
 			vert_is_center_enable(v_new);
-			if (oflag_center) {
-				BMO_vert_flag_enable(bm, v_new, oflag_center);
+			if (oflag_new | oflag_center) {
+				BMO_vert_flag_enable(bm, v_new, oflag_new | oflag_center);
 			}
 
 			BM_VERT_DIR(v_new) = 0;
@@ -448,7 +457,7 @@ void BM_mesh_bisect_plane(
 	MEM_freeN(edges_arr);
 
 	while ((f = BLI_LINKSTACK_POP(face_stack))) {
-		bm_face_bisect_verts(bm, f, plane, oflag_center);
+		bm_face_bisect_verts(bm, f, plane, oflag_center, oflag_new);
 	}
 
 	/* now we have all faces to split in the stack */
diff --git a/source/blender/bmesh/tools/bmesh_bisect_plane.h b/source/blender/bmesh/tools/bmesh_bisect_plane.h
index 7f3a97c4c79..fb99a1c8214 100644
--- a/source/blender/bmesh/tools/bmesh_bisect_plane.h
+++ b/source/blender/bmesh/tools/bmesh_bisect_plane.h
@@ -30,6 +30,6 @@
 void BM_mesh_bisect_plane(
         BMesh *bm, const float plane[4],
         const bool use_snap_center, const bool use_tag,
-        const short oflag_center, const float eps);
+        const short oflag_center, const short oflag_new, const float eps);
 
 #endif /* __BMESH_BISECT_PLANE_H__ */
diff --git a/source/blender/collada/ArmatureExporter.cpp b/source/blender/collada/ArmatureExporter.cpp
index 9c26ba83b44..40065956ecb 100644
--- a/source/blender/collada/ArmatureExporter.cpp
+++ b/source/blender/collada/ArmatureExporter.cpp
@@ -156,11 +156,6 @@ void ArmatureExporter::find_objects_using_armature(Object *ob_arm, std::vector<O
 }
 #endif
 
-std::string ArmatureExporter::get_joint_sid(Bone *bone, Object *ob_arm)
-{
-	return get_joint_id(bone, ob_arm);
-}
-
 // parent_mat is armature-space
 void ArmatureExporter::add_bone_node(Bone *bone, Object *ob_arm, Scene *sce,
                                      SceneExporter *se,
diff --git a/source/blender/collada/ArmatureExporter.h b/source/blender/collada/ArmatureExporter.h
index 883a6aca847..d271b505aa9 100644
--- a/source/blender/collada/ArmatureExporter.h
+++ b/source/blender/collada/ArmatureExporter.h
@@ -83,8 +83,6 @@ private:
 	void find_objects_using_armature(Object *ob_arm, std::vector<Object *>& objects, Scene *sce);
 #endif
 
-	std::string get_joint_sid(Bone *bone, Object *ob_arm);
-
 	// Scene, SceneExporter and the list of child_objects
 	// are required for writing bone parented objects
 	void add_bone_node(Bone *bone, Object *ob_arm, Scene *sce, SceneExporter *se,
diff --git a/source/blender/collada/ArmatureImporter.cpp b/source/blender/collada/ArmatureImporter.cpp
index 17334ca326c..f4ce3992771 100644
--- a/source/blender/collada/ArmatureImporter.cpp
+++ b/source/blender/collada/ArmatureImporter.cpp
@@ -106,7 +106,7 @@ int ArmatureImporter::create_bone(SkinInfo *skin, COLLADAFW::Node *node, EditBon
 	*/
 
 	std::map<COLLADAFW::UniqueId, SkinInfo>::iterator skin_it;
-	bool bone_is_not_skinned = true;
+	bool bone_is_skinned = false;
 	for (skin_it = skin_by_data_uid.begin(); skin_it != skin_by_data_uid.end(); skin_it++) {
 
 		SkinInfo *b = &skin_it->second;
@@ -123,13 +123,13 @@ int ArmatureImporter::create_bone(SkinInfo *skin, COLLADAFW::Node *node, EditBon
 				mul_m4_m4m4(mat, invmat, mat);
 			}
 
-			bone_is_not_skinned = false;
+			bone_is_skinned = true;
 			break;
 		}
 	}
 
 	// create a bone even if there's no joint data for it (i.e. it has no influence)
-	if (bone_is_not_skinned) {
+	if (!bone_is_skinned) {
 		float obmat[4][4];
 		// bone-space
 		get_node_mat(obmat, node, NULL, NULL);
@@ -141,6 +141,7 @@ int ArmatureImporter::create_bone(SkinInfo *skin, COLLADAFW::Node *node, EditBon
 		else {
 			copy_m4_m4(mat, obmat);
 		}
+
 	}
 
 	if (parent) bone->parent = parent;
diff --git a/source/blender/collada/ControllerExporter.cpp b/source/blender/collada/ControllerExporter.cpp
index 06e151c363b..a868adc1e66 100644
--- a/source/blender/collada/ControllerExporter.cpp
+++ b/source/blender/collada/ControllerExporter.cpp
@@ -157,11 +157,6 @@ void ArmatureExporter::find_objects_using_armature(Object *ob_arm, std::vector<O
 }
 #endif
 
-std::string ControllerExporter::get_joint_sid(Bone *bone, Object *ob_arm)
-{
-	return get_joint_id(bone, ob_arm);
-}
-
 std::string ControllerExporter::get_controller_id(Object *ob_arm, Object *ob)
 {
 	return translate_id(id_name(ob_arm)) + "_" + translate_id(id_name(ob)) + SKIN_CONTROLLER_ID_SUFFIX;
diff --git a/source/blender/collada/ControllerExporter.h b/source/blender/collada/ControllerExporter.h
index 0be51187f6f..80b858ca6dd 100644
--- a/source/blender/collada/ControllerExporter.h
+++ b/source/blender/collada/ControllerExporter.h
@@ -84,8 +84,6 @@ private:
 	void find_objects_using_armature(Object *ob_arm, std::vector<Object *>& objects, Scene *sce);
 #endif
 
-	std::string get_joint_sid(Bone *bone, Object *ob_arm);
-
 	std::string get_controller_id(Object *ob_arm, Object *ob);
 
 	std::string get_controller_id(Key *key, Object *ob);
diff --git a/source/blender/collada/collada_internal.cpp b/source/blender/collada/collada_internal.cpp
index e1a13559b08..70b44ebc222 100644
--- a/source/blender/collada/collada_internal.cpp
+++ b/source/blender/collada/collada_internal.cpp
@@ -341,7 +341,12 @@ std::string get_light_id(Object *ob)
 
 std::string get_joint_id(Bone *bone, Object *ob_arm)
 {
-	return translate_id(/*id_name(ob_arm) + "_" +*/ bone->name);
+	return translate_id(id_name(ob_arm) + "_" + bone->name);
+}
+
+std::string get_joint_sid(Bone *bone, Object *ob_arm)
+{
+	return translate_id(bone->name);
 }
 
 std::string get_camera_id(Object *ob)
diff --git a/source/blender/collada/collada_internal.h b/source/blender/collada/collada_internal.h
index 4aa637a6876..482dbf9ab31 100644
--- a/source/blender/collada/collada_internal.h
+++ b/source/blender/collada/collada_internal.h
@@ -104,6 +104,7 @@ extern std::string get_geometry_id(Object *ob, bool use_instantiation);
 extern std::string get_light_id(Object *ob);
 
 extern std::string get_joint_id(Bone *bone, Object *ob_arm);
+extern std::string get_joint_sid(Bone *bone, Object *ob_arm);
 
 extern std::string get_camera_id(Object *ob);
 
diff --git a/source/blender/editors/screen/screen_ops.c b/source/blender/editors/screen/screen_ops.c
index 481c60373ce..66ff0f24506 100644
--- a/source/blender/editors/screen/screen_ops.c
+++ b/source/blender/editors/screen/screen_ops.c
@@ -2908,10 +2908,23 @@ static void SCREEN_OT_spacedata_cleanup(wmOperatorType *ot)
 
 static int repeat_last_exec(bContext *C, wmOperator *UNUSED(op))
 {
-	wmOperator *lastop = CTX_wm_manager(C)->operators.last;
-	
-	if (lastop)
+	wmWindowManager *wm = CTX_wm_manager(C);
+	wmOperator *lastop = wm->operators.last;
+
+	/* Seek last registered operator */
+	while (lastop) {
+		if (lastop->type->flag & OPTYPE_REGISTER) {
+			break;
+		}
+		else {
+			lastop = lastop->prev;
+		}
+	}
+
+	if (lastop) {
+		WM_operator_free_all_after(wm, lastop);
 		WM_operator_repeat(C, lastop);
+	}
 	
 	return OPERATOR_CANCELLED;
 }
@@ -2946,8 +2959,9 @@ static int repeat_history_invoke(bContext *C, wmOperator *op, const wmEvent *UNU
 	layout = UI_popup_menu_layout(pup);
 	
 	for (i = items - 1, lastop = wm->operators.last; lastop; lastop = lastop->prev, i--)
-		if (WM_operator_repeat_check(C, lastop))
+		if ((lastop->type->flag & OPTYPE_REGISTER) && WM_operator_repeat_check(C, lastop)) {
 			uiItemIntO(layout, RNA_struct_ui_name(lastop->type->srna), ICON_NONE, op->type->idname, "index", i);
+		}
 	
 	UI_popup_menu_end(C, pup);
 	
diff --git a/source/blender/editors/space_view3d/drawarmature.c b/source/blender/editors/space_view3d/drawarmature.c
index 3cb83546a44..d61058744d0 100644
--- a/source/blender/editors/space_view3d/drawarmature.c
+++ b/source/blender/editors/space_view3d/drawarmature.c
@@ -1449,7 +1449,7 @@ static void draw_b_bone(const short dt, int armflag, int boneflag, short constfl
 	else {
 		/* wire */
 		if (armflag & ARM_POSEMODE) {
-			if (constflag) {
+			if (constflag && ((G.f & G_PICKSEL) == 0)) {
 				/* set constraint colors */
 				if (set_pchan_color(PCHAN_COLOR_CONSTS, boneflag, constflag)) {
 					glEnable(GL_BLEND);
@@ -1604,7 +1604,7 @@ static void draw_bone(const short dt, int armflag, int boneflag, short constflag
 			set_ebone_color(boneflag);
 		}
 		else if (armflag & ARM_POSEMODE) {
-			if (constflag) {
+			if (constflag && ((G.f & G_PICKSEL) == 0)) {
 				/* draw constraint colors */
 				if (set_pchan_color(PCHAN_COLOR_CONSTS, boneflag, constflag)) {
 					glEnable(GL_BLEND);
diff --git a/source/blender/editors/util/undo.c b/source/blender/editors/util/undo.c
index 73a0afa0692..f5830e451e3 100644
--- a/source/blender/editors/util/undo.c
+++ b/source/blender/editors/util/undo.c
@@ -420,6 +420,9 @@ int ED_undo_operator_repeat(bContext *C, struct wmOperator *op)
 
 			if (G.debug & G_DEBUG)
 				printf("redo_cb: operator redo %s\n", op->type->name);
+
+			WM_operator_free_all_after(wm, op);
+
 			ED_undo_pop_op(C, op);
 
 			if (op->type->check) {
diff --git a/source/blender/makesrna/intern/rna_wm_api.c b/source/blender/makesrna/intern/rna_wm_api.c
index 1f23ab938fb..677ea92aea1 100644
--- a/source/blender/makesrna/intern/rna_wm_api.c
+++ b/source/blender/makesrna/intern/rna_wm_api.c
@@ -87,6 +87,11 @@ static void rna_Operator_report(wmOperator *op, int type, const char *msg)
 	BKE_report(op->reports, type, msg);
 }
 
+static int rna_Operator_is_repeat(wmOperator *op, bContext *C)
+{
+	return WM_operator_is_repeat(C, op);
+}
+
 /* since event isn't needed... */
 static void rna_Operator_enum_search_invoke(bContext *C, wmOperator *op)
 {
@@ -521,6 +526,12 @@ void RNA_api_operator(StructRNA *srna)
 	parm = RNA_def_string(func, "message", NULL, 0, "Report Message", "");
 	RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
 
+	/* utility, not for registering */
+	func = RNA_def_function(srna, "is_repeat", "rna_Operator_is_repeat");
+	RNA_def_function_flag(func, FUNC_USE_CONTEXT);
+	/* return */
+	parm = RNA_def_boolean(func, "result", 0, "result", "");
+	RNA_def_function_return(func, parm);
 
 	/* Registration */
 
diff --git a/source/blender/windowmanager/WM_api.h b/source/blender/windowmanager/WM_api.h
index c8d3c85c5da..f264af899da 100644
--- a/source/blender/windowmanager/WM_api.h
+++ b/source/blender/windowmanager/WM_api.h
@@ -263,6 +263,7 @@ int         WM_operator_confirm_message(struct bContext *C, struct wmOperator *o
 
 		/* operator api */
 void		WM_operator_free		(struct wmOperator *op);
+void		WM_operator_free_all_after(wmWindowManager *wm, struct wmOperator *op);
 void		WM_operator_type_set(struct wmOperator *op, struct wmOperatorType *ot);
 void		WM_operator_stack_clear(struct wmWindowManager *wm);
 void		WM_operator_handlers_clear(wmWindowManager *wm, struct wmOperatorType *ot);
@@ -287,6 +288,7 @@ int			WM_operator_call		(struct bContext *C, struct wmOperator *op);
 int			WM_operator_call_notest(struct bContext *C, struct wmOperator *op);
 int			WM_operator_repeat		(struct bContext *C, struct wmOperator *op);
 bool        WM_operator_repeat_check(const struct bContext *C, struct wmOperator *op);
+bool        WM_operator_is_repeat(const struct bContext *C, const struct wmOperator *op);
 int         WM_operator_name_call_ptr(struct bContext *C, struct wmOperatorType *ot, short context, struct PointerRNA *properties);
 int			WM_operator_name_call(struct bContext *C, const char *opstring, short context, struct PointerRNA *properties);
 int			WM_operator_call_py(struct bContext *C, struct wmOperatorType *ot, short context, struct PointerRNA *properties, struct ReportList *reports, const bool is_undo);
diff --git a/source/blender/windowmanager/intern/wm.c b/source/blender/windowmanager/intern/wm.c
index b76a1f1d422..4351cd22b18 100644
--- a/source/blender/windowmanager/intern/wm.c
+++ b/source/blender/windowmanager/intern/wm.c
@@ -107,6 +107,17 @@ void WM_operator_free(wmOperator *op)
 	MEM_freeN(op);
 }
 
+/* Unlink from `wm->operators` and free every operator that follows `op` (`op` itself is kept). */
+void WM_operator_free_all_after(wmWindowManager *wm, struct wmOperator *op)
+{
+	wmOperator *op_iter, *op_iter_next;
+	for (op_iter = op->next; op_iter; op_iter = op_iter_next) {
+		op_iter_next = op_iter->next;  /* read before `op_iter` is freed */
+		BLI_remlink(&wm->operators, op_iter);
+		WM_operator_free(op_iter);
+	}
+}
+
 /**
  * Use with extreme care!,
  * properties, customdata etc - must be compatible.
@@ -149,18 +160,23 @@ static void wm_reports_free(wmWindowManager *wm)
 void wm_operator_register(bContext *C, wmOperator *op)
 {
 	wmWindowManager *wm = CTX_wm_manager(C);
-	int tot;
+	int tot = 0;
 
 	BLI_addtail(&wm->operators, op);
-	tot = BLI_listbase_count(&wm->operators);
-	
-	while (tot > MAX_OP_REGISTERED) {
-		wmOperator *opt = wm->operators.first;
-		BLI_remlink(&wm->operators, opt);
-		WM_operator_free(opt);
-		tot--;
+
+	/* only count registered operators */
+	while (op) {
+		wmOperator *op_prev = op->prev;
+		if (op->type->flag & OPTYPE_REGISTER) {
+			tot += 1;
+		}
+		if (tot > MAX_OP_REGISTERED) {
+			BLI_remlink(&wm->operators, op);
+			WM_operator_free(op);
+		}
+		op = op_prev;
 	}
-	
+
 	/* so the console is redrawn */
 	WM_event_add_notifier(C, NC_SPACE | ND_SPACE_INFO_REPORT, NULL);
 	WM_event_add_notifier(C, NC_WM | ND_HISTORY, NULL);
diff --git a/source/blender/windowmanager/intern/wm_event_system.c b/source/blender/windowmanager/intern/wm_event_system.c
index a2aca589f72..6107bdd93b5 100644
--- a/source/blender/windowmanager/intern/wm_event_system.c
+++ b/source/blender/windowmanager/intern/wm_event_system.c
@@ -713,7 +713,9 @@ static void wm_operator_reports(bContext *C, wmOperator *op, int retval, bool ca
  */
 static bool wm_operator_register_check(wmWindowManager *wm, wmOperatorType *ot)
 {
-	return wm && (wm->op_undo_depth == 0) && (ot->flag & OPTYPE_REGISTER);
+	/* Check undo flag here since undo operators are also added to the list,
+	 * to support checking if the same operator is run twice. */
+	return wm && (wm->op_undo_depth == 0) && (ot->flag & (OPTYPE_REGISTER | OPTYPE_UNDO));
 }
 
 static void wm_operator_finished(bContext *C, wmOperator *op, const bool repeat)
@@ -876,6 +878,20 @@ bool WM_operator_repeat_check(const bContext *UNUSED(C), wmOperator *op)
 	return false;
 }
 
+/* Check if the operator preceding `op` in the history has the same type as `op`. */
+bool WM_operator_is_repeat(const bContext *C, const wmOperator *op)
+{
+	/* NULL links: `op` is not in the operators list, or is its only entry (must not match itself). */
+	const wmOperator *op_prev;
+	if (op->prev == NULL && op->next == NULL) {
+		op_prev = CTX_wm_manager(C)->operators.last;
+	}
+	else {
+		op_prev = op->prev;
+	}
+	return (op_prev && (op_prev != op) && (op->type == op_prev->type));
+}
+
 static wmOperator *wm_operator_create(wmWindowManager *wm, wmOperatorType *ot,
                                       PointerRNA *properties, ReportList *reports)
 {
diff --git a/source/blender/windowmanager/intern/wm_files.c b/source/blender/windowmanager/intern/wm_files.c
index 760dc526b12..00607695edf 100644
--- a/source/blender/windowmanager/intern/wm_files.c
+++ b/source/blender/windowmanager/intern/wm_files.c
@@ -1426,7 +1426,16 @@ static int wm_homefile_read_exec(bContext *C, wmOperator *op)
 		G.fileflags &= ~G_FILE_NO_UI;
 	}
 
-	return wm_homefile_read(C, op->reports, from_memory, filepath) ? OPERATOR_FINISHED : OPERATOR_CANCELLED;
+	if (wm_homefile_read(C, op->reports, from_memory, filepath)) {
+		/* Load a file but keep the splash open */
+		if (RNA_boolean_get(op->ptr, "use_splash")) {
+			WM_init_splash(C);
+		}
+		return OPERATOR_FINISHED;
+	}
+	else {
+		return OPERATOR_CANCELLED;
+	}
 }
 
 void WM_OT_read_homefile(wmOperatorType *ot)
@@ -1449,6 +1458,10 @@ void WM_OT_read_homefile(wmOperatorType *ot)
 	                       "Load user interface setup from the .blend file");
 	RNA_def_property_flag(prop, PROP_HIDDEN | PROP_SKIP_SAVE);
 
+	/* So the splash can be kept open after loading a file (for templates). */
+	prop = RNA_def_boolean(ot->srna, "use_splash", true, "Splash", "");
+	RNA_def_property_flag(prop, PROP_HIDDEN | PROP_SKIP_SAVE);
+
 	/* omit poll to run in background mode */
 }