20 files changed, 145 insertions, 507 deletions
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
index e2762a85fc8..2313feac089 100644
--- a/intern/cycles/kernel/split/kernel_branched.h
+++ b/intern/cycles/kernel/split/kernel_branched.h
@@ -87,7 +87,6 @@ ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals
 	PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray];
 
 	path_radiance_init(inactive_L, kernel_data.film.use_light_pass);
-	inactive_L->direct_throughput = L->direct_throughput;
 	path_radiance_copy_indirect(inactive_L, L);
 
 	ray_state[inactive_ray] = RAY_REGENERATED;
@@ -110,7 +109,6 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
 	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
 
 	ShaderData *sd = saved_sd;
-	RNG rng = kernel_split_state.rng[ray_index];
 	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 	float3 throughput = branched_state->throughput;
 	ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
@@ -157,37 +155,38 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
 		num_samples = ceil_to_int(num_samples_adjust*num_samples);
 
 		float num_samples_inv = num_samples_adjust/num_samples;
-		RNG bsdf_rng = cmj_hash(rng, i);
 
 		for(int j = branched_state->next_sample; j < num_samples; j++) {
 			if(reset_path_state) {
 				*ps = branched_state->path_state;
 			}
 
+			ps->rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);
+
 			ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
 			*tp = throughput;
 
 			ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index];
 
 			if(!kernel_branched_path_surface_bounce(kg,
-			                                        &bsdf_rng,
 			                                        sd,
 			                                        sc,
 			                                        j,
 			                                        num_samples,
 			                                        tp,
 			                                        ps,
-			                                        L,
+			                                        &L->state,
 			                                        bsdf_ray,
 			                                        sum_sample_weight))
 			{
 				continue;
 			}
 
+			ps->rng_hash = branched_state->path_state.rng_hash;
+
 			/* update state for next iteration */
 			branched_state->next_closure = i;
 			branched_state->next_sample = j+1;
-			branched_state->num_samples = num_samples;
 
 			/* start the indirect path */
 			*tp *= num_samples_inv;
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
index 4c1fdd2d69c..511334e0550 100644
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -75,107 +75,65 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
 	if(ray_index != QUEUE_EMPTY_SLOT) {
 #endif
 
-	ccl_global uint *rng_state = kernel_split_params.rng_state;
-	int stride = kernel_split_params.stride;
-
 	ccl_global char *ray_state = kernel_split_state.ray_state;
-#ifdef __KERNEL_DEBUG__
-	DebugData *debug_data = &kernel_split_state.debug_data[ray_index];
-#endif
 	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
 	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
-	ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index];
-	RNG rng = kernel_split_state.rng[ray_index];
-	ccl_global float *buffer = kernel_split_params.buffer;
-
-	unsigned int work_index;
-	ccl_global uint *initial_rng;
-
-	unsigned int sample;
-	unsigned int tile_x;
-	unsigned int tile_y;
-	unsigned int pixel_x;
-	unsigned int pixel_y;
-
-	work_index = kernel_split_state.work_array[ray_index];
-	sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
-	get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
-	                        &tile_x, &tile_y,
-	                        work_index,
-	                        ray_index);
-	initial_rng = rng_state;
-
-	rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride;
-	buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
 
 	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
-#ifdef __KERNEL_DEBUG__
-		kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
-#endif
+		uint sample = state->sample;
+		uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+		ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
 
 		/* accumulate result in output buffer */
-		bool is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER);
-		kernel_write_result(kg, buffer, sample, L, 1.0f - (*L_transparent), is_shadow_catcher);
-
-		path_rng_end(kg, rng_state, rng);
+		kernel_write_result(kg, buffer, sample, L);
 
 		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
 	}
 
 	if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
 		/* We have completed current work; So get next work */
-		int valid_work = get_next_work(kg, &work_index, ray_index);
-		if(!valid_work) {
+		ccl_global uint *work_pools = kernel_split_params.work_pools;
+		uint total_work_size = kernel_split_params.total_work_size;
+		uint work_index;
+
+		if(!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
 			/* If work is invalid, this means no more work is available and the thread may exit */
 			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
 		}
 
 		if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
-			kernel_split_state.work_array[ray_index] = work_index;
-			/* Get the sample associated with the current work */
-			sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
-			/* Get pixel and tile position associated with current work */
-			get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index);
+			ccl_global WorkTile *tile = &kernel_split_params.tile;
+			uint x, y, sample;
+			get_work_pixel(tile, work_index, &x, &y, &sample);
 
-			/* Remap rng_state according to the current work */
-			rng_state = initial_rng + kernel_split_params.offset + pixel_x + pixel_y*stride;
-			/* Remap buffer according to the current work */
-			buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
+			/* Store buffer offset for writing to passes. */
+			uint buffer_offset = (tile->offset + x + y*tile->stride) * kernel_data.film.pass_stride;
+			kernel_split_state.buffer_offset[ray_index] = buffer_offset;
 
 			/* Initialize random numbers and ray. */
-			kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, &rng, ray);
+			uint rng_hash;
+			kernel_path_trace_setup(kg, sample, x, y, &rng_hash, ray);
 
 			if(ray->t != 0.0f) {
-				/* Initialize throughput, L_transparent, Ray, PathState;
+				/* Initialize throughput, path radiance, Ray, PathState;
 				 * These rays proceed with path-iteration.
 				 */
 				*throughput = make_float3(1.0f, 1.0f, 1.0f);
-				*L_transparent = 0.0f;
 				path_radiance_init(L, kernel_data.film.use_light_pass);
-				path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, &rng, sample, ray);
+				path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, rng_hash, sample, ray);
 #ifdef __SUBSURFACE__
 				kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
 #endif
-#ifdef __KERNEL_DEBUG__
-				debug_data_init(debug_data);
-#endif
 				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
 				enqueue_flag = 1;
 			}
 			else {
-				/* These rays do not participate in path-iteration. */
-				float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-				/* Accumulate result in output buffer. */
-				kernel_write_pass_float4(buffer, sample, L_rad);
-				path_rng_end(kg, rng_state, rng);
-
 				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
 			}
 		}
 	}
-	kernel_split_state.rng[ray_index] = rng;
 
 #ifndef __COMPUTE_DEVICE_GPU__
 	}
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index e4545d66eff..77fb61b80a8 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -23,22 +23,6 @@ CCL_NAMESPACE_BEGIN
  * The number of elements in the queues is initialized to 0;
  */
 
-/* Distributes an amount of work across all threads
- * note: work done inside the loop may not show up to all threads till after
- * the current kernel has completed
- */
-#define parallel_for(kg, iter_name, work_size) \
-	for(size_t _size = (work_size), \
-	    _global_size = ccl_global_size(0) * ccl_global_size(1), \
-	    _n = _size / _global_size, \
-		_thread = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0), \
-	    iter_name = (_n > 0) ? (_thread * _n) : (_thread) \
-		; \
-		(iter_name < (_thread+1) * _n) || (iter_name == _n * _global_size + _thread && _thread < _size % _global_size) \
-		; \
-		iter_name = (iter_name != (_thread+1) * _n - 1) ? (iter_name + 1) : (_n * _global_size + _thread) \
-	)
-
 #ifndef __KERNEL_CPU__
 ccl_device void kernel_data_init(
 #else
@@ -49,12 +33,9 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
         ccl_global void *split_data_buffer,
         int num_elements,
         ccl_global char *ray_state,
-        ccl_global uint *rng_state,
 
 #ifdef __KERNEL_OPENCL__
-#define KERNEL_TEX(type, ttype, name)                                   \
-        ccl_global type *name,
-#include "kernel/kernel_textures.h"
+		KERNEL_BUFFER_PARAMS,
 #endif
 
         int start_sample,
@@ -75,34 +56,32 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
 	kg->data = data;
 #endif
 
-	kernel_split_params.x = sx;
-	kernel_split_params.y = sy;
-	kernel_split_params.w = sw;
-	kernel_split_params.h = sh;
+	kernel_split_params.tile.x = sx;
+	kernel_split_params.tile.y = sy;
+	kernel_split_params.tile.w = sw;
+	kernel_split_params.tile.h = sh;
 
-	kernel_split_params.offset = offset;
-	kernel_split_params.stride = stride;
+	kernel_split_params.tile.start_sample = start_sample;
+	kernel_split_params.tile.num_samples = num_samples;
 
-	kernel_split_params.rng_state = rng_state;
+	kernel_split_params.tile.offset = offset;
+	kernel_split_params.tile.stride = stride;
 
-	kernel_split_params.start_sample = start_sample;
-	kernel_split_params.end_sample = end_sample;
+	kernel_split_params.tile.buffer = buffer;
+
+	kernel_split_params.total_work_size = sw * sh * num_samples;
 
 	kernel_split_params.work_pools = work_pools;
-	kernel_split_params.num_samples = num_samples;
 
 	kernel_split_params.queue_index = Queue_index;
 	kernel_split_params.queue_size = queuesize;
 	kernel_split_params.use_queues_flag = use_queues_flag;
 
-	kernel_split_params.buffer = buffer;
-
 	split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state);
 
 #ifdef __KERNEL_OPENCL__
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel/kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 #endif
 
 	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
@@ -124,30 +103,6 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
 		 */
 		*use_queues_flag = 0;
 	}
-
-	/* zero the tiles pixels and initialize rng_state if this is the first sample */
-	if(start_sample == 0) {
-		parallel_for(kg, i, sw * sh * kernel_data.film.pass_stride) {
-			int pixel = i / kernel_data.film.pass_stride;
-			int pass = i % kernel_data.film.pass_stride;
-
-			int x = sx + pixel % sw;
-			int y = sy + pixel / sw;
-
-			int index = (offset + x + y*stride) * kernel_data.film.pass_stride + pass;
-
-			*(buffer + index) = 0.0f;
-		}
-
-		parallel_for(kg, i, sw * sh) {
-			int x = sx + i % sw;
-			int y = sy + i / sw;
-
-			int index = (offset + x + y*stride);
-			*(rng_state + index) = hash_int_2d(x, y);
-		}
-	}
-
 #endif  /* KERENL_STUB */
 }
 
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
index 3336c968a44..2aac66ecb84 100644
--- a/intern/cycles/kernel/split/kernel_direct_lighting.h
+++ b/intern/cycles/kernel/split/kernel_direct_lighting.h
@@ -62,8 +62,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
 
 		/* direct lighting */
 #ifdef __EMISSION__
-		RNG rng = kernel_split_state.rng[ray_index];
-
 		bool flag = (kernel_data.integrator.use_direct_light &&
 		             (sd->flag & SD_BSDF_HAS_EVAL));
 
@@ -83,23 +81,20 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
 
 		if(flag) {
 			/* Sample illumination from lights to find path contribution. */
-			float light_t = path_state_rng_1D(kg, &rng, state, PRNG_LIGHT);
 			float light_u, light_v;
-			path_state_rng_2D(kg, &rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-			float terminate = path_state_rng_light_termination(kg, &rng, state);
+			path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
+			float terminate = path_state_rng_light_termination(kg, state);
 
 			LightSample ls;
 			if(light_sample(kg,
-			                light_t, light_u, light_v,
+			                light_u, light_v,
 			                sd->time,
 			                sd->P,
 			                state->bounce,
 			                &ls)) {
 
 				Ray light_ray;
-#  ifdef __OBJECT_MOTION__
 				light_ray.time = sd->time;
-#  endif
 
 				BsdfEval L_light;
 				bool is_lamp;
@@ -115,7 +110,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
 				}
 			}
 		}
-		kernel_split_state.rng[ray_index] = rng;
 #endif  /* __EMISSION__ */
 	}
 
diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h
index 9f8dd2392d9..491487f1230 100644
--- a/intern/cycles/kernel/split/kernel_do_volume.h
+++ b/intern/cycles/kernel/split/kernel_do_volume.h
@@ -30,7 +30,6 @@ ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(K
 	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
 
 	ShaderData *sd = &kernel_split_state.sd[ray_index];
-	RNG rng = kernel_split_state.rng[ray_index];
 	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 	ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
 
@@ -58,22 +57,21 @@ ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(K
 
 		/* integrate along volume segment with distance sampling */
 		VolumeIntegrateResult result = kernel_volume_integrate(
-			kg, ps, sd, &volume_ray, L, tp, &rng, heterogeneous);
+			kg, ps, sd, &volume_ray, L, tp, heterogeneous);
 
 #  ifdef __VOLUME_SCATTER__
 		if(result == VOLUME_PATH_SCATTERED) {
 			/* direct lighting */
-			kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *tp, &branched_state->path_state, L);
+			kernel_path_volume_connect_light(kg, sd, emission_sd, *tp, &branched_state->path_state, L);
 
 			/* indirect light bounce */
-			if(!kernel_path_volume_bounce(kg, &rng, sd, tp, ps, L, pray)) {
+			if(!kernel_path_volume_bounce(kg, sd, tp, ps, &L->state, pray)) {
 				continue;
 			}
 
 			/* start the indirect path */
 			branched_state->next_closure = 0;
 			branched_state->next_sample = j+1;
-			branched_state->num_samples = num_samples;
 
 			/* Attempting to share too many samples is slow for volumes as it causes us to
 			 * loop here more and have many calls to kernel_volume_integrate which evaluates
@@ -141,7 +139,6 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
 	   IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
 		ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
 		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
-		RNG rng = kernel_split_state.rng[ray_index];
 		ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
 		ShaderData *sd = &kernel_split_state.sd[ray_index];
 		ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
@@ -165,15 +162,15 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
 				{
 					/* integrate along volume segment with distance sampling */
 					VolumeIntegrateResult result = kernel_volume_integrate(
-						kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous);
+						kg, state, sd, &volume_ray, L, throughput, heterogeneous);
 
 #  ifdef __VOLUME_SCATTER__
 					if(result == VOLUME_PATH_SCATTERED) {
 						/* direct lighting */
-						kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *throughput, state, L);
+						kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);
 
 						/* indirect light bounce */
-						if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray)) {
+						if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) {
 							ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
 						}
 						else {
@@ -194,8 +191,6 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
 			}
 #  endif  /* __BRANCHED_PATH__ */
 		}
-
-		kernel_split_state.rng[ray_index] = rng;
 	}
 
 #  ifdef __BRANCHED_PATH__
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 670a557f084..906bad8ceb6 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -90,163 +90,58 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
 	if(ray_index != QUEUE_EMPTY_SLOT) {
 #endif
 
-	int stride = kernel_split_params.stride;
-
-	unsigned int work_index;
-	unsigned int pixel_x;
-	unsigned int pixel_y;
-
-	unsigned int tile_x;
-	unsigned int tile_y;
-	unsigned int sample;
-
-	RNG rng = kernel_split_state.rng[ray_index];
 	ccl_global PathState *state = 0x0;
 	float3 throughput;
 
 	ccl_global char *ray_state = kernel_split_state.ray_state;
 	ShaderData *sd = &kernel_split_state.sd[ray_index];
-	ccl_global float *buffer = kernel_split_params.buffer;
 
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+		ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+
+		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+		ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 
 		throughput = kernel_split_state.throughput[ray_index];
 		state = &kernel_split_state.path_state[ray_index];
 
-		work_index = kernel_split_state.work_array[ray_index];
-		sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
-		get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
-		                        &tile_x, &tile_y,
-		                        work_index,
-		                        ray_index);
-
-		buffer += (kernel_split_params.offset + pixel_x + pixel_y * stride) * kernel_data.film.pass_stride;
-
-#ifdef __SHADOW_TRICKS__
-		if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
-			if(state->flag & PATH_RAY_CAMERA) {
-				state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
-				state->catcher_object = sd->object;
-				if(!kernel_data.background.transparent) {
-					PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-					ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
-					L->shadow_color = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray);
-				}
-			}
-		}
-		else {
-			state->flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY;
-		}
-#endif  /* __SHADOW_TRICKS__ */
-
-		/* holdout */
-#ifdef __HOLDOUT__
-		if(((sd->flag & SD_HOLDOUT) ||
-		    (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
-		   (state->flag & PATH_RAY_CAMERA))
+		if(!kernel_path_shader_apply(kg,
+		                             sd,
+		                             state,
+		                             ray,
+		                             throughput,
+		                             emission_sd,
+		                             L,
+		                             buffer))
 		{
-			if(kernel_data.background.transparent) {
-				float3 holdout_weight;
-				if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
-					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
-				}
-				else {
-					holdout_weight = shader_holdout_eval(kg, sd);
-				}
-				/* any throughput is ok, should all be identical here */
-				kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput);
-			}
-			if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
-				kernel_split_path_end(kg, ray_index);
-			}
+			kernel_split_path_end(kg, ray_index);
 		}
-#endif  /* __HOLDOUT__ */
 	}
 
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-
-#ifdef __BRANCHED_PATH__
-		if(!IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT))
-#endif  /* __BRANCHED_PATH__ */
-		{
-			/* Holdout mask objects do not write data passes. */
-			kernel_write_data_passes(kg,
-				                     buffer,
-				                     L,
-				                     sd,
-				                     sample,
-				                     state,
-				                     throughput);
-		}
-
-		/* Blurring of bsdf after bounces, for rays that have a small likelihood
-		 * of following this particular path (diffuse, rough glossy.
-		 */
-#ifndef __BRANCHED_PATH__
-		if(kernel_data.integrator.filter_glossy != FLT_MAX)
-#else
-		if(kernel_data.integrator.filter_glossy != FLT_MAX &&
-		   (!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)))
-#endif  /* __BRANCHED_PATH__ */
-		{
-			float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
-			if(blur_pdf < 1.0f) {
-				float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
-				shader_bsdf_blur(kg, sd, blur_roughness);
-			}
-		}
-
-#ifdef __EMISSION__
-		/* emission */
-		if(sd->flag & SD_EMISSION) {
-			/* TODO(sergey): is isect.t wrong here for transparent surfaces? */
-			float3 emission = indirect_primitive_emission(
-			        kg,
-			        sd,
-			        kernel_split_state.isect[ray_index].t,
-			        state->flag,
-			        state->ray_pdf);
-			path_radiance_accum_emission(L, throughput, emission, state->bounce);
-		}
-#endif  /* __EMISSION__ */
-
 		/* Path termination. this is a strange place to put the termination, it's
 		 * mainly due to the mixed in MIS that we use. gives too many unneeded
 		 * shader evaluations, only need emission if we are going to terminate.
 		 */
-#ifndef __BRANCHED_PATH__
-		float probability = path_state_terminate_probability(kg, state, throughput);
-#else
-		float probability = 1.0f;
-
-		if(!kernel_data.integrator.branched) {
-			probability = path_state_terminate_probability(kg, state, throughput);
-		}
-		else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-			int num_samples = kernel_split_state.branched_state[ray_index].num_samples;
-			probability = path_state_terminate_probability(kg, state, throughput*num_samples);
-		}
-		else if(state->flag & PATH_RAY_TRANSPARENT) {
-			probability = path_state_terminate_probability(kg, state, throughput);
-		}
-#endif
+		float probability = path_state_continuation_probability(kg, state, throughput);
 
 		if(probability == 0.0f) {
 			kernel_split_path_end(kg, ray_index);
 		}
-
-		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-			if(probability != 1.0f) {
-				float terminate = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_TERMINATE);
-				if(terminate >= probability) {
-					kernel_split_path_end(kg, ray_index);
-				}
-				else {
-					kernel_split_state.throughput[ray_index] = throughput/probability;
-				}
+		else if(probability < 1.0f) {
+			float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
+			if(terminate >= probability) {
+				kernel_split_path_end(kg, ray_index);
 			}
+			else {
+				kernel_split_state.throughput[ray_index] = throughput/probability;
+			}
+		}
 
+		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+			PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 			kernel_update_denoising_features(kg, sd, state, L);
 		}
 	}
@@ -260,8 +155,6 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
 	}
 #endif  /* __AO__ */
 
-	kernel_split_state.rng[ray_index] = rng;
-
 #ifndef __COMPUTE_DEVICE_GPU__
 	}
 #endif
diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h
index f0ebb90f60a..437043a5971 100644
--- a/intern/cycles/kernel/split/kernel_indirect_background.h
+++ b/intern/cycles/kernel/split/kernel_indirect_background.h
@@ -33,7 +33,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
 		if(ray_index != QUEUE_EMPTY_SLOT) {
 			if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
 				ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-				if(state->bounce > kernel_data.integrator.ao_bounces) {
+				if(path_state_ao_bounce(kg, state)) {
 					kernel_split_path_end(kg, ray_index);
 				}
 			}
@@ -50,33 +50,16 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
 		return;
 	}
 
-	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
-	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
-	ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index];
-
 	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-		/* eval background shader if nothing hit */
-		if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
-			*L_transparent = (*L_transparent) + average((*throughput));
-#ifdef __PASSES__
-			if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif
-				kernel_split_path_end(kg, ray_index);
-		}
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+		float3 throughput = kernel_split_state.throughput[ray_index];
+		ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
 
-		if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-#ifdef __BACKGROUND__
-			/* sample background shader */
-			float3 L_background = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray);
-			path_radiance_accum_background(L, state, (*throughput), L_background);
-#endif
-			kernel_split_path_end(kg, ray_index);
-		}
+		kernel_path_background(kg, state, ray, throughput, emission_sd, L);
+		kernel_split_path_end(kg, ray_index);
 	}
-
-
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
index 82bc2f01fd7..e9fe5552e8c 100644
--- a/intern/cycles/kernel/split/kernel_indirect_subsurface.h
+++ b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
@@ -54,7 +54,6 @@ ccl_device void kernel_indirect_subsurface(KernelGlobals *kg)
 #endif
 		if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
 			ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
-			kernel_path_subsurface_accum_indirect(ss_indirect, L);
 
 			/* Trace indirect subsurface rays by restarting the loop. this uses less
 			 * stack memory than invoking kernel_path_indirect.
diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h
index c669d79ddcd..448456d167d 100644
--- a/intern/cycles/kernel/split/kernel_lamp_emission.h
+++ b/intern/cycles/kernel/split/kernel_lamp_emission.h
@@ -57,27 +57,10 @@ ccl_device void kernel_lamp_emission(KernelGlobals *kg)
 
 		float3 throughput = kernel_split_state.throughput[ray_index];
 		Ray ray = kernel_split_state.ray[ray_index];
+		ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
+		ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
 
-#ifdef __LAMP_MIS__
-		if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
-			/* ray starting from previous non-transparent bounce */
-			Ray light_ray;
-
-			light_ray.P = ray.P - state->ray_t*ray.D;
-			state->ray_t += kernel_split_state.isect[ray_index].t;
-			light_ray.D = ray.D;
-			light_ray.t = state->ray_t;
-			light_ray.time = ray.time;
-			light_ray.dD = ray.dD;
-			light_ray.dP = ray.dP;
-			/* intersect with lamp */
-			float3 emission;
-
-			if(indirect_lamp_emission(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, &light_ray, &emission)) {
-				path_radiance_accum_emission(L, throughput, emission, state->bounce);
-			}
-		}
-#endif  /* __LAMP_MIS__ */
+		kernel_path_lamp_emission(kg, state, &ray, throughput, isect, emission_sd, L);
 	}
 }
 
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
index 7758e35fd32..c3373174582 100644
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -126,7 +126,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
 	if(active) {
 		ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
 		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
-		RNG rng = kernel_split_state.rng[ray_index];
 		ShaderData *sd = &kernel_split_state.sd[ray_index];
 		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
@@ -135,7 +134,7 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
 		if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
 #endif
 			/* Compute direct lighting and next bounce. */
-			if(!kernel_path_surface_bounce(kg, &rng, sd, throughput, state, L, ray)) {
+			if(!kernel_path_surface_bounce(kg, sd, throughput, state, &L->state, ray)) {
 				kernel_split_path_end(kg, ray_index);
 			}
 #ifdef __BRANCHED_PATH__
@@ -157,8 +156,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
 			}
 		}
 #endif  /* __BRANCHED_PATH__ */
-
-		kernel_split_state.rng[ray_index] = rng;
 	}
 
 	/* Enqueue RAY_UPDATE_BUFFER rays. */
diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h
index a7ecde7c80d..5ad62b585fe 100644
--- a/intern/cycles/kernel/split/kernel_path_init.h
+++ b/intern/cycles/kernel/split/kernel_path_init.h
@@ -29,77 +29,53 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
 	 */
 	kernel_split_state.ray_state[ray_index] = RAY_ACTIVE;
 
-	unsigned int my_sample;
-	unsigned int pixel_x;
-	unsigned int pixel_y;
-	unsigned int tile_x;
-	unsigned int tile_y;
-
-	unsigned int work_index = 0;
 	/* Get work. */
-	if(!get_next_work(kg, &work_index, ray_index)) {
+	ccl_global uint *work_pools = kernel_split_params.work_pools;
+	uint total_work_size = kernel_split_params.total_work_size;
+	uint work_index;
+
+	if(!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
 		/* No more work, mark ray as inactive */
 		kernel_split_state.ray_state[ray_index] = RAY_INACTIVE;
 
 		return;
 	}
 
-	/* Get the sample associated with the work. */
-	my_sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
-
-	/* Get pixel and tile position associated with the work. */
-	get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
-	                             &tile_x, &tile_y,
-	                             work_index,
-	                             ray_index);
-	kernel_split_state.work_array[ray_index] = work_index;
-
-	ccl_global uint *rng_state = kernel_split_params.rng_state;
-	rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride;
+	ccl_global WorkTile *tile = &kernel_split_params.tile;
+	uint x, y, sample;
+	get_work_pixel(tile, work_index, &x, &y, &sample);
 
-	ccl_global float *buffer = kernel_split_params.buffer;
-	buffer += (kernel_split_params.offset + pixel_x + pixel_y * kernel_split_params.stride) * kernel_data.film.pass_stride;
-
-	RNG rng = kernel_split_state.rng[ray_index];
+	/* Store buffer offset for writing to passes. */
+	uint buffer_offset = (tile->offset + x + y*tile->stride) * kernel_data.film.pass_stride;
+	kernel_split_state.buffer_offset[ray_index] = buffer_offset;
 
 	/* Initialize random numbers and ray. */
+	uint rng_hash;
 	kernel_path_trace_setup(kg,
-	                        rng_state,
-	                        my_sample,
-	                        pixel_x, pixel_y,
-	                        &rng,
+	                        sample,
+	                        x, y,
+	                        &rng_hash,
 	                        &kernel_split_state.ray[ray_index]);
 
 	if(kernel_split_state.ray[ray_index].t != 0.0f) {
-		/* Initialize throughput, L_transparent, Ray, PathState;
+		/* Initialize throughput, path radiance, Ray, PathState;
 		 * These rays proceed with path-iteration.
 		 */
 		kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
-		kernel_split_state.L_transparent[ray_index] = 0.0f;
 		path_radiance_init(&kernel_split_state.path_radiance[ray_index], kernel_data.film.use_light_pass);
 		path_state_init(kg,
 		                &kernel_split_state.sd_DL_shadow[ray_index],
 		                &kernel_split_state.path_state[ray_index],
-		                &rng,
-		                my_sample,
+		                rng_hash,
+		                sample,
 		                &kernel_split_state.ray[ray_index]);
 #ifdef __SUBSURFACE__
 		kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
 #endif
-
-#ifdef __KERNEL_DEBUG__
-		debug_data_init(&kernel_split_state.debug_data[ray_index]);
-#endif
 	}
 	else {
-		/* These rays do not participate in path-iteration. */
-		float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-		/* Accumulate result in output buffer. */
-		kernel_write_pass_float4(buffer, my_sample, L_rad);
-		path_rng_end(kg, rng_state, kernel_split_state.rng[ray_index]);
 		ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE);
 	}
-	kernel_split_state.rng[ray_index] = rng;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h
index e2e841f36d3..66ce2dfb6f1 100644
--- a/intern/cycles/kernel/split/kernel_queue_enqueue.h
+++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h
@@ -51,7 +51,8 @@ ccl_device void kernel_queue_enqueue(KernelGlobals *kg,
 	int queue_number = -1;
 
 	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) ||
-	   IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+	   IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER) ||
+	   IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) {
 		queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
 	}
 	else if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
index 45984ca509b..f5378bc172b 100644
--- a/intern/cycles/kernel/split/kernel_scene_intersect.h
+++ b/intern/cycles/kernel/split/kernel_scene_intersect.h
@@ -59,52 +59,14 @@ ccl_device void kernel_scene_intersect(KernelGlobals *kg)
 		return;
 	}
 
-#ifdef __KERNEL_DEBUG__
-	DebugData *debug_data = &kernel_split_state.debug_data[ray_index];
-#endif
-	Intersection isect;
-	PathState state = kernel_split_state.path_state[ray_index];
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 	Ray ray = kernel_split_state.ray[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 
-	/* intersect scene */
-	uint visibility = path_state_ray_visibility(kg, &state);
-
-	if(state.bounce > kernel_data.integrator.ao_bounces) {
-		visibility = PATH_RAY_SHADOW;
-		ray.t = kernel_data.background.ao_distance;
-	}
-
-#ifdef __HAIR__
-	float difl = 0.0f, extmax = 0.0f;
-	uint lcg_state = 0;
-	RNG rng = kernel_split_state.rng[ray_index];
-
-	if(kernel_data.bvh.have_curves) {
-		if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {
-			float3 pixdiff = ray.dD.dx + ray.dD.dy;
-			/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
-			difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
-		}
-
-		extmax = kernel_data.curve.maximum_width;
-		lcg_state = lcg_state_init(&rng, state.rng_offset, state.sample, 0x51633e2d);
-	}
-
-	bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
-#else
-	bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#endif
+	Intersection isect;
+	bool hit = kernel_path_scene_intersect(kg, state, &ray, &isect, L);
 	kernel_split_state.isect[ray_index] = isect;
 
-#ifdef __KERNEL_DEBUG__
-	if(state.flag & PATH_RAY_CAMERA) {
-		debug_data->num_bvh_traversed_nodes += isect.num_traversed_nodes;
-		debug_data->num_bvh_traversed_instances += isect.num_traversed_instances;
-		debug_data->num_bvh_intersections += isect.num_intersections;
-	}
-	debug_data->num_ray_bounces++;
-#endif
-
 	if(!hit) {
 		/* Change the state of rays that hit the background;
 		 * These rays undergo special processing in the
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
index 2801b32f285..7032461b04a 100644
--- a/intern/cycles/kernel/split/kernel_shader_eval.h
+++ b/intern/cycles/kernel/split/kernel_shader_eval.h
@@ -48,30 +48,18 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg)
 
 	ccl_global char *ray_state = kernel_split_state.ray_state;
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		RNG rng = kernel_split_state.rng[ray_index];
 		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 
-#ifndef __BRANCHED_PATH__
-		float rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF);
-		shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
-#else
-		ShaderContext ctx = SHADER_CONTEXT_MAIN;
-		float rbsdf = 0.0f;
-
-		if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-			rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF);
-
+		shader_eval_surface(kg, &kernel_split_state.sd[ray_index], state, state->flag);
+#ifdef __BRANCHED_PATH__
+		if(kernel_data.integrator.branched) {
+			shader_merge_closures(&kernel_split_state.sd[ray_index]);
 		}
-
-		if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-			ctx = SHADER_CONTEXT_INDIRECT;
+		else
+#endif
+		{
+			shader_prepare_closures(&kernel_split_state.sd[ray_index], state);
 		}
-
-		shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, ctx);
-		shader_merge_closures(&kernel_split_state.sd[ray_index]);
-#endif  /* __BRANCHED_PATH__ */
-
-		kernel_split_state.rng[ray_index] = rng;
 	}
 }
 
diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h
index 297decb0bc2..5a55b680695 100644
--- a/intern/cycles/kernel/split/kernel_shader_sort.h
+++ b/intern/cycles/kernel/split/kernel_shader_sort.h
@@ -39,7 +39,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg,
 	ccl_local ushort *local_index = &locals->local_index[0];
 
 	/* copy to local memory */
-	for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
+	for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
 		uint idx = offset + i + lid;
 		uint add = input + idx;
 		uint value = (~0);
@@ -59,9 +59,9 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg,
 #  ifdef __KERNEL_OPENCL__
 
 	/* bitonic sort */
-	for (uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) {
-		for (uint inc = length; inc > 0; inc >>= 1) {
-			for (uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) {
+	for(uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) {
+		for(uint inc = length; inc > 0; inc >>= 1) {
+			for(uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) {
 				uint i = lid + ii;
 				bool direction = ((i & (length << 1)) != 0);
 				uint j = i ^ inc;
@@ -81,7 +81,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg,
 #  endif /* __KERNEL_OPENCL__ */
 
 	/* copy to destination */
-	for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
+	for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
 		uint idx = offset + i + lid;
 		uint lidx = local_index[i + lid];
 		uint outi = output + idx;
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
index 474286285a9..79aa2c9435b 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
@@ -37,21 +37,18 @@ ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
 	ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
 	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-	RNG rng = kernel_split_state.rng[ray_index];
 	float3 throughput = kernel_split_state.throughput[ray_index];
 
 #ifdef __BRANCHED_PATH__
 	if(!kernel_data.integrator.branched || IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
 #endif
-		kernel_path_ao(kg, sd, emission_sd, L, state, &rng, throughput, shader_bsdf_alpha(kg, sd));
+		kernel_path_ao(kg, sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, sd));
 #ifdef __BRANCHED_PATH__
 	}
 	else {
-		kernel_branched_path_ao(kg, sd, emission_sd, L, state, &rng, throughput);
+		kernel_branched_path_ao(kg, sd, emission_sd, L, state, throughput);
 	}
 #endif
-
-	kernel_split_state.rng[ray_index] = rng;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
index 78e61709b01..b52f9a5eb81 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
@@ -45,7 +45,6 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
 	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 	ShaderData *sd = &kernel_split_state.sd[ray_index];
 	float3 throughput = kernel_split_state.throughput[ray_index];
-	RNG rng = kernel_split_state.rng[ray_index];
 
 	BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
 	ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
@@ -75,7 +74,6 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
 
 	if(use_branched) {
 		kernel_branched_path_surface_connect_light(kg,
-		                                           &rng,
 		                                           sd,
 		                                           emission_sd,
 		                                           state,
@@ -91,10 +89,11 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
 		float3 shadow;
 
 		if(!shadow_blocked(kg,
-			               emission_sd,
-			               state,
-			               &ray,
-			               &shadow))
+		                   sd,
+		                   emission_sd,
+		                   state,
+		                   &ray,
+		                   &shadow))
 		{
 			/* accumulate */
 			path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
@@ -103,8 +102,6 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
 			path_radiance_accum_total_light(L, state, throughput, &L_light);
 		}
 	}
-
-	kernel_split_state.rng[ray_index] = rng;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
index 08f0124b529..558d327bc76 100644
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -63,7 +63,7 @@ ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index)
 		PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray];
 
 		path_radiance_sum_indirect(L);
-		path_radiance_accum_sample(orig_ray_L, L, 1);
+		path_radiance_accum_sample(orig_ray_L, L);
 
 		atomic_fetch_and_dec_uint32((ccl_global uint*)&kernel_split_state.branched_state[orig_ray].shared_sample_count);
 
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
index 4bb2f0d3d80..b0e6e5f5250 100644
--- a/intern/cycles/kernel/split/kernel_split_data_types.h
+++ b/intern/cycles/kernel/split/kernel_split_data_types.h
@@ -22,28 +22,15 @@ CCL_NAMESPACE_BEGIN
 /* parameters used by the split kernels, we use a single struct to avoid passing these to each kernel */
 
 typedef struct SplitParams {
-	int x;
-	int y;
-	int w;
-	int h;
-
-	int offset;
-	int stride;
-
-	ccl_global uint *rng_state;
-
-	int start_sample;
-	int end_sample;
+	WorkTile tile;
+	uint total_work_size;
 
 	ccl_global unsigned int *work_pools;
-	unsigned int num_samples;
 
 	ccl_global int *queue_index;
 	int queue_size;
 	ccl_global char *use_queues_flag;
 
-	ccl_global float *buffer;
-
 	/* Place for storing sd->flag. AMD GPU OpenCL compiler workaround */
 	int dummy_sd_flag;
 } SplitParams;
@@ -56,14 +43,6 @@ typedef struct SplitParams {
 
 /* SPLIT_DATA_ENTRY(type, name, num) */
 
-#if defined(WITH_CYCLES_DEBUG) || defined(__KERNEL_DEBUG__)
-/* DebugData memory */
-#  define SPLIT_DATA_DEBUG_ENTRIES \
-	SPLIT_DATA_ENTRY(DebugData, debug_data, 1)
-#else
-#  define SPLIT_DATA_DEBUG_ENTRIES
-#endif  /* DEBUG */
-
 #ifdef __BRANCHED_PATH__
 
 typedef ccl_global struct SplitBranchedState {
@@ -80,7 +59,6 @@ typedef ccl_global struct SplitBranchedState {
 	/* indirect loop state */
 	int next_closure;
 	int next_sample;
-	int num_samples;
 
 #ifdef __SUBSURFACE__
 	int ss_next_closure;
@@ -122,9 +100,7 @@ typedef ccl_global struct SplitBranchedState {
 #endif /* __VOLUME__ */
 
 #define SPLIT_DATA_ENTRIES \
-	SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \
 	SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
-	SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \
 	SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
 	SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
 	SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
@@ -133,19 +109,16 @@ typedef ccl_global struct SplitBranchedState {
 	SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
 	SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
 	SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \
-	SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \
+	SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \
 	SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
 	SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
 	SPLIT_DATA_SUBSURFACE_ENTRIES \
 	SPLIT_DATA_VOLUME_ENTRIES \
 	SPLIT_DATA_BRANCHED_ENTRIES \
-	SPLIT_DATA_DEBUG_ENTRIES \
 
 /* entries to be copied to inactive rays when sharing branched samples (TODO: which are actually needed?) */
 #define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \
-	SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \
 	SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
-	SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \
 	SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
 	SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
 	SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
@@ -158,7 +131,6 @@ typedef ccl_global struct SplitBranchedState {
 	SPLIT_DATA_SUBSURFACE_ENTRIES \
 	SPLIT_DATA_VOLUME_ENTRIES \
 	SPLIT_DATA_BRANCHED_ENTRIES \
-	SPLIT_DATA_DEBUG_ENTRIES \
 
 /* struct that holds pointers to data in the shared state buffer */
 typedef struct SplitData {
diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
index d5083b23f80..3b957856aea 100644
--- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h
+++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
@@ -38,7 +38,6 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
 	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
 
 	ShaderData *sd = &branched_state->sd;
-	RNG rng = kernel_split_state.rng[ray_index];
 	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 	ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
 
@@ -52,14 +51,12 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
 		if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 &&
 		   branched_state->next_closure == 0 && branched_state->next_sample == 0)
 		{
-			branched_state->lcg_state = lcg_state_init(&rng,
-			                                           branched_state->path_state.rng_offset,
-			                                           branched_state->path_state.sample,
-			                                           0x68bc21eb);
+			branched_state->lcg_state = lcg_state_init_addrspace(&branched_state->path_state,
+			                                                     0x68bc21eb);
 		}
 		int num_samples = kernel_data.integrator.subsurface_samples;
 		float num_samples_inv = 1.0f/num_samples;
-		RNG bssrdf_rng = cmj_hash(rng, i);
+		uint bssrdf_rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);
 
 		/* do subsurface scatter step with copy of shader data, this will
 		 * replace the BSSRDF with a diffuse BSDF closure */
@@ -67,7 +64,7 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
 			ccl_global SubsurfaceIntersection *ss_isect = &branched_state->ss_isect;
 			float bssrdf_u, bssrdf_v;
 			path_branched_rng_2D(kg,
-			                     &bssrdf_rng,
+			                     bssrdf_rng_hash,
 			                     &branched_state->path_state,
 			                     j,
 			                     num_samples,
@@ -77,7 +74,7 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
 
 			/* intersection is expensive so avoid doing multiple times for the same input */
 			if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) {
-				RNG lcg_state = branched_state->lcg_state;
+				uint lcg_state = branched_state->lcg_state;
 				SubsurfaceIntersection ss_isect_private;
 
 				branched_state->num_hits = subsurface_scatter_multi_intersect(kg,
@@ -152,7 +149,6 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
 						int all = (kernel_data.integrator.sample_all_lights_direct) ||
 							      (branched_state->path_state.flag & PATH_RAY_SHADOW_CATCHER);
 						kernel_branched_path_surface_connect_light(kg,
-						                                           &rng,
 						                                           bssrdf_sd,
 						                                           emission_sd,
 						                                           hit_state,
@@ -229,7 +225,6 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
 		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-		RNG rng = kernel_split_state.rng[ray_index];
 		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
 		ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
 		ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
@@ -246,7 +241,6 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
 				                                  emission_sd,
 				                                  L,
 				                                  state,
-				                                  &rng,
 				                                  ray,
 				                                  throughput,
 				                                  ss_indirect))
@@ -256,21 +250,17 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
 #ifdef __BRANCHED_PATH__
 			}
 			else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-				float bssrdf_probability;
-				ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+				float bssrdf_u, bssrdf_v;
+				path_state_rng_2D(kg,
+				                  state,
+				                  PRNG_BSDF_U,
+				                  &bssrdf_u, &bssrdf_v);
 
-				/* modify throughput for picking bssrdf or bsdf */
-				*throughput *= bssrdf_probability;
+				const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u);
 
 				/* do bssrdf scatter step if we picked a bssrdf closure */
 				if(sc) {
-					uint lcg_state = lcg_state_init(&rng, state->rng_offset, state->sample, 0x68bc21eb);
-					float bssrdf_u, bssrdf_v;
-					path_state_rng_2D(kg,
-					                  &rng,
-					                  state,
-					                  PRNG_BSDF_U,
-					                  &bssrdf_u, &bssrdf_v);
+					uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb);
 					subsurface_scatter_step(kg,
 					                        sd,
 					                        state,
@@ -290,7 +280,6 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
 			}
 #endif
 		}
-		kernel_split_state.rng[ray_index] = rng;
 	}
 
 #  ifdef __BRANCHED_PATH__