From 2c503d8303299c27b874e11e89a1229c00dfa55d Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Thu, 21 May 2015 17:40:04 +0500
Subject: Cycles: Restructure kernel files organization

Since the kernel split work we now have quite a few new files, the majority
of which are related to the kernel entry points. Keeping those files in the
root kernel folder will eventually make it really hard to follow which files are
the actual implementation of the Cycles kernel.

Those files are now moved to kernel/kernels/<device_type>. This way adding extra
entry points will be less noisy. It is also nice to have all device-specific
files grouped together.

Another change is in how the split kernel invokes its logic. Previously all the
logic was implemented directly in the .cl files, which made it a bit tricky to
re-use the logic across other devices. Since we'll likely be looking into doing
the same split work for CUDA devices eventually, it makes sense to move the logic
from the .cl files to header files. Those files are stored in kernel/split. This
does not mean the header files will compile without errors when included from
other devices, and their arguments will likely change, but having such a
separation is a good start anyway.

There should be no functional changes.

Reviewers: juicyfruit, dingto

Differential Revision: https://developer.blender.org/D1314
---
 intern/cycles/kernel/CMakeLists.txt                |  96 ++---
 intern/cycles/kernel/SConscript                    |   7 +-
 intern/cycles/kernel/kernel.cl                     | 174 ---------
 intern/cycles/kernel/kernel.cpp                    | 132 -------
 intern/cycles/kernel/kernel.cu                     | 180 ----------
 intern/cycles/kernel/kernel_avx.cpp                |  86 -----
 intern/cycles/kernel/kernel_avx2.cpp               |  87 -----
 .../kernel/kernel_background_buffer_update.cl      | 282 ---------------
 intern/cycles/kernel/kernel_data_init.cl           | 400 ---------------------
 intern/cycles/kernel/kernel_direct_lighting.cl     | 138 -------
 ...holdout_emission_blurring_pathtermination_ao.cl | 283 ---------------
 intern/cycles/kernel/kernel_lamp_emission.cl       | 209 -----------
 .../cycles/kernel/kernel_next_iteration_setup.cl   | 176 ---------
 intern/cycles/kernel/kernel_queue_enqueue.cl       |  98 -----
 intern/cycles/kernel/kernel_scene_intersect.cl     | 164 ---------
 intern/cycles/kernel/kernel_shader_eval.cl         |  93 -----
 intern/cycles/kernel/kernel_shadow_blocked.cl      | 126 -------
 intern/cycles/kernel/kernel_split.h                |  62 ----
 intern/cycles/kernel/kernel_sse2.cpp               |  83 -----
 intern/cycles/kernel/kernel_sse3.cpp               |  84 -----
 intern/cycles/kernel/kernel_sse41.cpp              |  85 -----
 intern/cycles/kernel/kernel_sum_all_radiance.cl    |  59 ---
 intern/cycles/kernel/kernels/cpu/kernel.cpp        | 132 +++++++
 intern/cycles/kernel/kernels/cpu/kernel_avx.cpp    |  86 +++++
 intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp   |  87 +++++
 intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp   |  83 +++++
 intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp   |  84 +++++
 intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp  |  85 +++++
 intern/cycles/kernel/kernels/cuda/kernel.cu        | 180 ++++++++++
 intern/cycles/kernel/kernels/opencl/kernel.cl      | 174 +++++++++
 .../opencl/kernel_background_buffer_update.cl      |  81 +++++
 .../kernel/kernels/opencl/kernel_data_init.cl      | 242 +++++++++++++
 .../kernels/opencl/kernel_direct_lighting.cl       |  47 +++
 ...holdout_emission_blurring_pathtermination_ao.cl |  67 ++++
 .../kernel/kernels/opencl/kernel_lamp_emission.cl  |  52 +++
 .../kernels/opencl/kernel_next_iteration_setup.cl  |  59 +++
 .../kernel/kernels/opencl/kernel_queue_enqueue.cl  |  29 ++
 .../kernels/opencl/kernel_scene_intersect.cl       |  53 +++
 .../kernel/kernels/opencl/kernel_shader_eval.cl    |  43 +++
 .../kernel/kernels/opencl/kernel_shadow_blocked.cl |  47 +++
 .../kernels/opencl/kernel_sum_all_radiance.cl      |  38 ++
 .../kernel/split/kernel_background_buffer_update.h | 282 +++++++++++++++
 intern/cycles/kernel/split/kernel_data_init.h      | 400 +++++++++++++++++++++
 .../cycles/kernel/split/kernel_direct_lighting.h   | 138 +++++++
 ..._holdout_emission_blurring_pathtermination_ao.h | 283 +++++++++++++++
 intern/cycles/kernel/split/kernel_lamp_emission.h  | 209 +++++++++++
 .../kernel/split/kernel_next_iteration_setup.h     | 176 +++++++++
 intern/cycles/kernel/split/kernel_queue_enqueue.h  |  98 +++++
 .../cycles/kernel/split/kernel_scene_intersect.h   | 164 +++++++++
 intern/cycles/kernel/split/kernel_shader_eval.h    |  93 +++++
 intern/cycles/kernel/split/kernel_shadow_blocked.h | 126 +++++++
 intern/cycles/kernel/split/kernel_split_common.h   |  62 ++++
 .../cycles/kernel/split/kernel_sum_all_radiance.h  |  59 +++
 53 files changed, 3819 insertions(+), 3044 deletions(-)
 delete mode 100644 intern/cycles/kernel/kernel.cl
 delete mode 100644 intern/cycles/kernel/kernel.cpp
 delete mode 100644 intern/cycles/kernel/kernel.cu
 delete mode 100644 intern/cycles/kernel/kernel_avx.cpp
 delete mode 100644 intern/cycles/kernel/kernel_avx2.cpp
 delete mode 100644 intern/cycles/kernel/kernel_background_buffer_update.cl
 delete mode 100644 intern/cycles/kernel/kernel_data_init.cl
 delete mode 100644 intern/cycles/kernel/kernel_direct_lighting.cl
 delete mode 100644 intern/cycles/kernel/kernel_holdout_emission_blurring_pathtermination_ao.cl
 delete mode 100644 intern/cycles/kernel/kernel_lamp_emission.cl
 delete mode 100644 intern/cycles/kernel/kernel_next_iteration_setup.cl
 delete mode 100644 intern/cycles/kernel/kernel_queue_enqueue.cl
 delete mode 100644 intern/cycles/kernel/kernel_scene_intersect.cl
 delete mode 100644 intern/cycles/kernel/kernel_shader_eval.cl
 delete mode 100644 intern/cycles/kernel/kernel_shadow_blocked.cl
 delete mode 100644 intern/cycles/kernel/kernel_split.h
 delete mode 100644 intern/cycles/kernel/kernel_sse2.cpp
 delete mode 100644 intern/cycles/kernel/kernel_sse3.cpp
 delete mode 100644 intern/cycles/kernel/kernel_sse41.cpp
 delete mode 100644 intern/cycles/kernel/kernel_sum_all_radiance.cl
 create mode 100644 intern/cycles/kernel/kernels/cpu/kernel.cpp
 create mode 100644 intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
 create mode 100644 intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
 create mode 100644 intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
 create mode 100644 intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
 create mode 100644 intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
 create mode 100644 intern/cycles/kernel/kernels/cuda/kernel.cu
 create mode 100644 intern/cycles/kernel/kernels/opencl/kernel.cl
 create mode 100644 intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
 create mode 100644 intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
 create mode 100644 intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
 create mode 100644 intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
 create mode 100644 intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
 create mode 100644 intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
 create mode 100644 intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
 create mode 100644 intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
 create mode 100644 intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
 create mode 100644 intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
 create mode 100644 intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
 create mode 100644 intern/cycles/kernel/split/kernel_background_buffer_update.h
 create mode 100644 intern/cycles/kernel/split/kernel_data_init.h
 create mode 100644 intern/cycles/kernel/split/kernel_direct_lighting.h
 create mode 100644 intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
 create mode 100644 intern/cycles/kernel/split/kernel_lamp_emission.h
 create mode 100644 intern/cycles/kernel/split/kernel_next_iteration_setup.h
 create mode 100644 intern/cycles/kernel/split/kernel_queue_enqueue.h
 create mode 100644 intern/cycles/kernel/split/kernel_scene_intersect.h
 create mode 100644 intern/cycles/kernel/split/kernel_shader_eval.h
 create mode 100644 intern/cycles/kernel/split/kernel_shadow_blocked.h
 create mode 100644 intern/cycles/kernel/split/kernel_split_common.h
 create mode 100644 intern/cycles/kernel/split/kernel_sum_all_radiance.h

(limited to 'intern/cycles/kernel')

diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 3e16c4b5fd9..89dd3542ef6 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -12,20 +12,20 @@ set(INC_SYS
 )
 
 set(SRC
-	kernel.cpp
-	kernel.cl
-	kernel_data_init.cl
-	kernel_queue_enqueue.cl
-	kernel_scene_intersect.cl
-	kernel_lamp_emission.cl
-	kernel_background_buffer_update.cl
-	kernel_shader_eval.cl
-	kernel_holdout_emission_blurring_pathtermination_ao.cl
-	kernel_direct_lighting.cl
-	kernel_shadow_blocked.cl
-	kernel_next_iteration_setup.cl
-	kernel_sum_all_radiance.cl
-	kernel.cu
+	kernels/cpu/kernel.cpp
+	kernels/opencl/kernel.cl
+	kernels/opencl/kernel_data_init.cl
+	kernels/opencl/kernel_queue_enqueue.cl
+	kernels/opencl/kernel_scene_intersect.cl
+	kernels/opencl/kernel_lamp_emission.cl
+	kernels/opencl/kernel_background_buffer_update.cl
+	kernels/opencl/kernel_shader_eval.cl
+	kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+	kernels/opencl/kernel_direct_lighting.cl
+	kernels/opencl/kernel_shadow_blocked.cl
+	kernels/opencl/kernel_next_iteration_setup.cl
+	kernels/opencl/kernel_sum_all_radiance.cl
+	kernels/cuda/kernel.cu
 )
 
 set(SRC_HEADERS
@@ -57,7 +57,6 @@ set(SRC_HEADERS
 	kernel_shader.h
 	kernel_shaderdata_vars.h
 	kernel_shadow.h
-	kernel_split.h
 	kernel_subsurface.h
 	kernel_textures.h
 	kernel_types.h
@@ -162,6 +161,22 @@ set(SRC_UTIL_HEADERS
 	../util/util_transform.h
 	../util/util_types.h
 )
+
+set(SRC_SPLIT_HEADERS
+	split/kernel_background_buffer_update.h
+	split/kernel_data_init.h
+	split/kernel_direct_lighting.h
+	split/kernel_holdout_emission_blurring_pathtermination_ao.h
+	split/kernel_lamp_emission.h
+	split/kernel_next_iteration_setup.h
+	split/kernel_queue_enqueue.h
+	split/kernel_scene_intersect.h
+	split/kernel_shader_eval.h
+	split/kernel_shadow_blocked.h
+	split/kernel_split_common.h
+	split/kernel_sum_all_radiance.h
+)
+
 # CUDA module
 
 if(WITH_CYCLES_CUDA_BINARIES)
@@ -187,7 +202,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	endif()
 
 	# build for each arch
-	set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
+	set(cuda_sources kernels/cuda/kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
 	set(cuda_cubins)
 
 	macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
@@ -213,7 +228,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
 			COMMAND ${CUDA_NVCC_EXECUTABLE}
 					-arch=${arch}
 					-m${CUDA_BITS}
-					--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu
+					--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu
 					-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
 					--ptxas-options="-v"
 					${cuda_arch_flags}
@@ -261,28 +276,28 @@ include_directories(SYSTEM ${INC_SYS})
 
 if(CXX_HAS_SSE)
 	list(APPEND SRC
-		kernel_sse2.cpp
-		kernel_sse3.cpp
-		kernel_sse41.cpp
+		kernels/cpu/kernel_sse2.cpp
+		kernels/cpu/kernel_sse3.cpp
+		kernels/cpu/kernel_sse41.cpp
 	)
 
-	set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
-	set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
-	set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
 endif()
 
 if(CXX_HAS_AVX)
 	list(APPEND SRC
-		kernel_avx.cpp
+		kernels/cpu/kernel_avx.cpp
 	)
-	set_source_files_properties(kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
 endif()
 
 if(CXX_HAS_AVX2)
 	list(APPEND SRC
-		kernel_avx2.cpp
+		kernels/cpu/kernel_avx2.cpp
 	)
-	set_source_files_properties(kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
 endif()
 
 add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS})
@@ -301,22 +316,23 @@ endif()
 #add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED})
 #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
 
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cl" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_background_buffer_update.cl" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cu" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_background_buffer_update.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/split)
 
diff --git a/intern/cycles/kernel/SConscript b/intern/cycles/kernel/SConscript
index 24cc4517eaf..e8d51013924 100644
--- a/intern/cycles/kernel/SConscript
+++ b/intern/cycles/kernel/SConscript
@@ -57,8 +57,9 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     build_dir = os.path.join(root_build_dir, 'intern/cycles/kernel')
 
     # source directories and files
+    kernel_file_rel = os.path.join("kernels", "cuda", "kernel.cu")
     source_dir = Dir('.').srcnode().path
-    kernel_file = os.path.join(source_dir, "kernel.cu")
+    kernel_file = os.path.join(source_dir, kernel_file_rel)
     util_dir = os.path.join(source_dir, "../util")
     svm_dir = os.path.join(source_dir, "../svm")
     geom_dir = os.path.join(source_dir, "../geom")
@@ -83,7 +84,7 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
         nvcc_flags += " -D__KERNEL_DEBUG__"
 
     # dependencies
-    dependencies = ['kernel.cu'] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h')
+    dependencies = [kernel_file_rel] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h')
     last_cubin_file = None
 
     configs = (("kernel_%s.cubin", ''),
@@ -105,7 +106,7 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
             else:
                 command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, current_flags, kernel_file, cubin_file)
 
-            kernel.Command(cubin_file, 'kernel.cu', command)
+            kernel.Command(cubin_file, kernel_file_rel, command)
             kernel.Depends(cubin_file, dependencies)
 
             kernel_binaries.append(cubin_file)
diff --git a/intern/cycles/kernel/kernel.cl b/intern/cycles/kernel/kernel.cl
deleted file mode 100644
index cbc0592fe1f..00000000000
--- a/intern/cycles/kernel/kernel.cl
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* OpenCL kernel entry points - unfinished */
-
-#include "kernel_compat_opencl.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-
-#include "kernel_film.h"
-#include "kernel_path.h"
-#include "kernel_bake.h"
-
-#ifdef __COMPILE_ONLY_MEGAKERNEL__
-
-__kernel void kernel_ocl_path_trace(
-	ccl_constant KernelData *data,
-	ccl_global float *buffer,
-	ccl_global uint *rng_state,
-
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "kernel_textures.h"
-
-	int sample,
-	int sx, int sy, int sw, int sh, int offset, int stride)
-{
-	KernelGlobals kglobals, *kg = &kglobals;
-
-	kg->data = data;
-
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel_textures.h"
-
-	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
-
-	if(x < sx + sw && y < sy + sh)
-		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
-}
-
-#else // __COMPILE_ONLY_MEGAKERNEL__
-
-__kernel void kernel_ocl_shader(
-	ccl_constant KernelData *data,
-	ccl_global uint4 *input,
-	ccl_global float4 *output,
-
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "kernel_textures.h"
-
-	int type, int sx, int sw, int offset, int sample)
-{
-	KernelGlobals kglobals, *kg = &kglobals;
-
-	kg->data = data;
-
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel_textures.h"
-
-	int x = sx + get_global_id(0);
-
-	if(x < sx + sw)
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
-}
-
-__kernel void kernel_ocl_bake(
-	ccl_constant KernelData *data,
-	ccl_global uint4 *input,
-	ccl_global float4 *output,
-
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "kernel_textures.h"
-
-	int type, int sx, int sw, int offset, int sample)
-{
-	KernelGlobals kglobals, *kg = &kglobals;
-
-	kg->data = data;
-
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel_textures.h"
-
-	int x = sx + get_global_id(0);
-
-	if(x < sx + sw) {
-#if defined(__KERNEL_OPENCL_NVIDIA__) && __COMPUTE_CAPABILITY__ < 300
-		/* NVidia compiler is spending infinite amount of time trying
-		 * to deal with kernel_bake_evaluate() on architectures prior
-		 * to sm_30.
-		 * For now we disable baking kernel for those devices, so at
-		 * least rendering with split kernel could be compiled.
-		 */
-		output[x] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-#else
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample);
-#endif
-	}
-}
-
-__kernel void kernel_ocl_convert_to_byte(
-	ccl_constant KernelData *data,
-	ccl_global uchar4 *rgba,
-	ccl_global float *buffer,
-
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "kernel_textures.h"
-
-	float sample_scale,
-	int sx, int sy, int sw, int sh, int offset, int stride)
-{
-	KernelGlobals kglobals, *kg = &kglobals;
-
-	kg->data = data;
-
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel_textures.h"
-
-	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
-
-	if(x < sx + sw && y < sy + sh)
-		kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-__kernel void kernel_ocl_convert_to_half_float(
-	ccl_constant KernelData *data,
-	ccl_global uchar4 *rgba,
-	ccl_global float *buffer,
-
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "kernel_textures.h"
-
-	float sample_scale,
-	int sx, int sy, int sw, int sh, int offset, int stride)
-{
-	KernelGlobals kglobals, *kg = &kglobals;
-
-	kg->data = data;
-
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel_textures.h"
-
-	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
-
-	if(x < sx + sw && y < sy + sh)
-		kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-#endif // __COMPILE_ONLY_MEGAKERNEL__
\ No newline at end of file
diff --git a/intern/cycles/kernel/kernel.cpp b/intern/cycles/kernel/kernel.cpp
deleted file mode 100644
index a7eaa758f5d..00000000000
--- a/intern/cycles/kernel/kernel.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CPU kernel entry points */
-
-#include "kernel_compat_cpu.h"
-#include "kernel.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_film.h"
-#include "kernel_path.h"
-#include "kernel_bake.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Memory Copy */
-
-void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size)
-{
-	if(strcmp(name, "__data") == 0)
-		memcpy(&kg->__data, host, size);
-	else
-		assert(0);
-}
-
-void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t width, size_t height, size_t depth, InterpolationType interpolation)
-{
-	if(0) {
-	}
-
-#define KERNEL_TEX(type, ttype, tname) \
-	else if(strcmp(name, #tname) == 0) { \
-		kg->tname.data = (type*)mem; \
-		kg->tname.width = width; \
-	}
-#define KERNEL_IMAGE_TEX(type, ttype, tname)
-#include "kernel_textures.h"
-
-	else if(strstr(name, "__tex_image_float")) {
-		texture_image_float4 *tex = NULL;
-		int id = atoi(name + strlen("__tex_image_float_"));
-		int array_index = id;
-
-		if(array_index >= 0 && array_index < MAX_FLOAT_IMAGES) {
-			tex = &kg->texture_float_images[array_index];
-		}
-
-		if(tex) {
-			tex->data = (float4*)mem;
-			tex->dimensions_set(width, height, depth);
-			tex->interpolation = interpolation;
-		}
-	}
-	else if(strstr(name, "__tex_image")) {
-		texture_image_uchar4 *tex = NULL;
-		int id = atoi(name + strlen("__tex_image_"));
-		int array_index = id - MAX_FLOAT_IMAGES;
-
-		if(array_index >= 0 && array_index < MAX_BYTE_IMAGES) {
-			tex = &kg->texture_byte_images[array_index];
-		}
-
-		if(tex) {
-			tex->data = (uchar4*)mem;
-			tex->dimensions_set(width, height, depth);
-			tex->interpolation = interpolation;
-		}
-	}
-	else
-		assert(0);
-}
-
-/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this one with SSE2 intrinsics */
-#if defined(__x86_64__) || defined(_M_X64)
-#define __KERNEL_SSE2__
-#endif
-
-/* quiet unused define warnings */
-#if defined(__KERNEL_SSE2__)
-	/* do nothing */
-#endif
-
-/* Path Tracing */
-
-void kernel_cpu_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
-{
-#ifdef __BRANCHED_PATH__
-	if(kernel_data.integrator.branched)
-		kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
-	else
-#endif
-		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
-}
-
-/* Film */
-
-void kernel_cpu_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
-{
-	kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
-{
-	kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-/* Shader Evaluation */
-
-void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
-{
-	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
-	else
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
-}
-
-CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel.cu b/intern/cycles/kernel/kernel.cu
deleted file mode 100644
index 64069fc049f..00000000000
--- a/intern/cycles/kernel/kernel.cu
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CUDA kernel entry points */
-
-#include "kernel_compat_cuda.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_film.h"
-#include "kernel_path.h"
-#include "kernel_bake.h"
-
-/* device data taken from CUDA occupancy calculator */
-
-#ifdef __CUDA_ARCH__
-
-/* 2.0 and 2.1 */
-#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
-#define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
-#define CUDA_BLOCK_MAX_THREADS 1024
-#define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-#define CUDA_THREADS_BLOCK_WIDTH 16
-#define CUDA_KERNEL_MAX_REGISTERS 32
-#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
-
-/* 3.0 and 3.5 */
-#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
-#define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#define CUDA_BLOCK_MAX_THREADS 1024
-#define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-#define CUDA_THREADS_BLOCK_WIDTH 16
-#define CUDA_KERNEL_MAX_REGISTERS 63
-#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.2 */
-#elif __CUDA_ARCH__ == 320
-#define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#define CUDA_BLOCK_MAX_THREADS 1024
-#define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-#define CUDA_THREADS_BLOCK_WIDTH 16
-#define CUDA_KERNEL_MAX_REGISTERS 63
-#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 5.0 and 5.2 */
-#elif __CUDA_ARCH__ == 500 || __CUDA_ARCH__ == 520
-#define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
-#define CUDA_BLOCK_MAX_THREADS 1024
-#define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-#define CUDA_THREADS_BLOCK_WIDTH 16
-#define CUDA_KERNEL_MAX_REGISTERS 40
-#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* unknown architecture */
-#else
-#error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
-#endif
-
-/* compute number of threads per block and minimum blocks per multiprocessor
- * given the maximum number of registers per thread */
-
-#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
-	__launch_bounds__( \
-		threads_block_width*threads_block_width, \
-		CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
-		)
-
-/* sanity checks */
-
-#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
-#error "Maximum number of threads per block exceeded"
-#endif
-
-#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
-#error "Maximum number of blocks per multiprocessor exceeded"
-#endif
-
-#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-#error "Maximum number of registers per thread exceeded"
-#endif
-
-#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-#error "Maximum number of registers per thread exceeded"
-#endif
-
-/* kernels */
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
-{
-	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
-	if(x < sx + sw && y < sy + sh)
-		kernel_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride);
-}
-
-#ifdef __BRANCHED_PATH__
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
-kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
-{
-	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
-	if(x < sx + sw && y < sy + sh)
-		kernel_branched_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride);
-}
-#endif
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
-	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
-	if(x < sx + sw && y < sy + sh)
-		kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
-	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
-	if(x < sx + sw && y < sy + sh)
-		kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample)
-{
-	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-
-	if(x < sx + sw)
-		kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x, sample);
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_bake(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample)
-{
-	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-
-	if(x < sx + sw)
-		kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, x, offset, sample);
-}
-
-#endif
-
diff --git a/intern/cycles/kernel/kernel_avx.cpp b/intern/cycles/kernel/kernel_avx.cpp
deleted file mode 100644
index f1027ad413d..00000000000
--- a/intern/cycles/kernel/kernel_avx.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with AVX
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
- 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#define __KERNEL_SSE2__
-#define __KERNEL_SSE3__
-#define __KERNEL_SSSE3__
-#define __KERNEL_SSE41__
-#define __KERNEL_AVX__
-#endif
- 
-#include "util_optimization.h"
- 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-
-#include "kernel_compat_cpu.h"
-#include "kernel.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_film.h"
-#include "kernel_path.h"
-#include "kernel_bake.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Path Tracing */
-
-void kernel_cpu_avx_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
-{
-#ifdef __BRANCHED_PATH__
-	if(kernel_data.integrator.branched)
-		kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
-	else
-#endif
-		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
-}
-
-/* Film */
-
-void kernel_cpu_avx_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
-{
-	kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
-{
-	kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-/* Shader Evaluate */
-
-void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
-{
-	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
-	else
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
-}
-
-CCL_NAMESPACE_END
-#else
-
-/* needed for some linkers in combination with scons making empty compilation unit in a library */
-void __dummy_function_cycles_avx(void);
-void __dummy_function_cycles_avx(void) {}
-
-#endif
diff --git a/intern/cycles/kernel/kernel_avx2.cpp b/intern/cycles/kernel/kernel_avx2.cpp
deleted file mode 100644
index b2f16ff54d8..00000000000
--- a/intern/cycles/kernel/kernel_avx2.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright 2011-2014 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with AVX2
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
- 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#define __KERNEL_SSE2__
-#define __KERNEL_SSE3__
-#define __KERNEL_SSSE3__
-#define __KERNEL_SSE41__
-#define __KERNEL_AVX__
-#define __KERNEL_AVX2__
-#endif
- 
-#include "util_optimization.h"
- 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-
-#include "kernel_compat_cpu.h"
-#include "kernel.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_film.h"
-#include "kernel_path.h"
-#include "kernel_bake.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Path Tracing */
-
-void kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
-{
-#ifdef __BRANCHED_PATH__
-	if(kernel_data.integrator.branched)
-		kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
-	else
-#endif
-		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
-}
-
-/* Film */
-
-void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
-{
-	kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
-{
-	kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-/* Shader Evaluate */
-
-void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
-{
-	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
-	else
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
-}
-
-CCL_NAMESPACE_END
-#else
-
-/* needed for some linkers in combination with scons making empty compilation unit in a library */
-void __dummy_function_cycles_avx2(void);
-void __dummy_function_cycles_avx2(void) {}
-
-#endif
diff --git a/intern/cycles/kernel/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernel_background_buffer_update.cl
deleted file mode 100644
index bf08477cfbf..00000000000
--- a/intern/cycles/kernel/kernel_background_buffer_update.cl
+++ /dev/null
@@ -1,282 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_split.h"
-
-/*
- * Note on kernel_ocl_path_trace_background_buffer_update kernel.
- * This is the fourth kernel in the ray tracing logic, and the third
- * of the path iteration kernels. This kernel takes care of rays that hit
- * the background (sceneintersect kernel), and for the rays of
- * state RAY_UPDATE_BUFFER it updates the ray's accumulated radiance in
- * the output buffer. This kernel also takes care of rays that have been determined
- * to-be-regenerated.
- *
- * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel
- *
- * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER
- * will be eventually set to RAY_TO_REGENERATE state in this kernel. Finally all rays of ray_state
- * RAY_TO_REGENERATE will be regenerated and put in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * The input and output are as follows,
- *
- * rng_coop ---------------------------------------------|--- kernel_ocl_path_trace_background_buffer_update --|--- PathRadiance_coop
- * throughput_coop --------------------------------------|                                                     |--- L_transparent_coop
- * per_sample_output_buffers ----------------------------|                                                     |--- per_sample_output_buffers
- * Ray_coop ---------------------------------------------|                                                     |--- ray_state
- * PathState_coop ---------------------------------------|                                                     |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * L_transparent_coop -----------------------------------|                                                     |--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * ray_state --------------------------------------------|                                                     |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ----|                                                     |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                                                     |--- work_array
- * parallel_samples -------------------------------------|                                                     |--- PathState_coop
- * end_sample -------------------------------------------|                                                     |--- throughput_coop
- * kg (globals + data) ----------------------------------|                                                     |--- rng_coop
- * rng_state --------------------------------------------|                                                     |--- Ray
- * PathRadiance_coop ------------------------------------|                                                     |
- * sw ---------------------------------------------------|                                                     |
- * sh ---------------------------------------------------|                                                     |
- * sx ---------------------------------------------------|                                                     |
- * sy ---------------------------------------------------|                                                     |
- * stride -----------------------------------------------|                                                     |
- * work_array -------------------------------------------|                                                     |--- work_array
- * queuesize --------------------------------------------|                                                     |
- * start_sample -----------------------------------------|                                                     |--- work_pool_wgs
- * work_pool_wgs ----------------------------------------|                                                     |
- * num_samples ------------------------------------------|                                                     |
- *
- * note on shader_data : shader_data argument is neither an input nor an output for this kernel. It is just filled and consumed here itself.
- * Note on Queues :
- * This kernel fetches rays from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
- *
- * State of queues when this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty
- */
-__kernel void kernel_ocl_path_trace_background_buffer_update(
-	ccl_global char *globals,
-	ccl_constant KernelData *data,
-	ccl_global char *shader_data,
-	ccl_global float *per_sample_output_buffers,
-	ccl_global uint *rng_state,
-	ccl_global uint *rng_coop,                   /* Required for buffer Update */
-	ccl_global float3 *throughput_coop,          /* Required for background hit processing */
-	PathRadiance *PathRadiance_coop,  /* Required for background hit processing and buffer Update */
-	ccl_global Ray *Ray_coop,                    /* Required for background hit processing */
-	ccl_global PathState *PathState_coop,        /* Required for background hit processing */
-	ccl_global float *L_transparent_coop,        /* Required for background hit processing and buffer Update */
-	ccl_global char *ray_state,                  /* Stores information on the current state of a ray */
-	int sw, int sh, int sx, int sy, int stride,
-	int rng_state_offset_x,
-	int rng_state_offset_y,
-	int rng_state_stride,
-	ccl_global unsigned int *work_array,         /* Denotes work of each ray */
-	ccl_global int *Queue_data,                  /* Queues memory */
-	ccl_global int *Queue_index,                 /* Tracks the number of elements in each queue */
-	int queuesize,                               /* Size (capacity) of each queue */
-	int end_sample,
-	int start_sample,
-#ifdef __WORK_STEALING__
-	ccl_global unsigned int *work_pool_wgs,
-	unsigned int num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
-	DebugData *debugdata_coop,
-#endif
-	int parallel_samples                         /* Number of samples to be processed in parallel */
-	)
-{
-	ccl_local unsigned int local_queue_atomics;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	if(ray_index == 0) {
-		/* We will empty this queue in this kernel */
-		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
-	}
-	char enqueue_flag = 0;
-	ray_index = get_ray_index(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, Queue_data, queuesize, 1);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not required.
-	 * If we are executing on a CPU device, then we need to keep all threads active
-	 * since we have barrier() calls later in the kernel. CPU devices
-	 * expect all threads to execute barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT)
-		return;
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-		/* Load kernel globals structure and ShaderData strucuture */
-		KernelGlobals *kg = (KernelGlobals *)globals;
-		ShaderData *sd = (ShaderData *)shader_data;
-
-#ifdef __KERNEL_DEBUG__
-		DebugData *debug_data = &debugdata_coop[ray_index];
-#endif
-		ccl_global PathState *state = &PathState_coop[ray_index];
-		PathRadiance *L = L = &PathRadiance_coop[ray_index];
-		ccl_global Ray *ray = &Ray_coop[ray_index];
-		ccl_global float3 *throughput = &throughput_coop[ray_index];
-		ccl_global float *L_transparent = &L_transparent_coop[ray_index];
-		ccl_global uint *rng = &rng_coop[ray_index];
-
-#ifdef __WORK_STEALING__
-		unsigned int my_work;
-		ccl_global float *initial_per_sample_output_buffers;
-		ccl_global uint *initial_rng;
-#endif
-		unsigned int sample;
-		unsigned int tile_x;
-		unsigned int tile_y;
-		unsigned int pixel_x;
-		unsigned int pixel_y;
-		unsigned int my_sample_tile;
-
-#ifdef __WORK_STEALING__
-		my_work = work_array[ray_index];
-		sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-		get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
-		my_sample_tile = 0;
-		initial_per_sample_output_buffers = per_sample_output_buffers;
-		initial_rng = rng_state;
-#else // __WORK_STEALING__
-		sample = work_array[ray_index];
-		int tile_index = ray_index / parallel_samples;
-		/* buffer and rng_state's stride is "stride". Find x and y using ray_index */
-		tile_x = tile_index % sw;
-		tile_y = tile_index / sw;
-		my_sample_tile = ray_index - (tile_index * parallel_samples);
-#endif
-		rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
-		per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
-
-		if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-			/* eval background shader if nothing hit */
-			if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
-				*L_transparent = (*L_transparent) + average((*throughput));
-#ifdef __PASSES__
-			if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-			}
-
-			if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND))
-			{
-#ifdef __BACKGROUND__
-				/* sample background shader */
-				float3 L_background = indirect_background(kg, state, ray, sd);
-				path_radiance_accum_background(L, (*throughput), L_background, state->bounce);
-#endif
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-			}
-		}
-
-		if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
-			float3 L_sum = path_radiance_clamp_and_sum(kg, L);
-			kernel_write_light_passes(kg, per_sample_output_buffers, L, sample);
-#ifdef __KERNEL_DEBUG__
-			kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample);
-#endif
-			float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
-
-			/* accumulate result in output buffer */
-			kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
-			path_rng_end(kg, rng_state, *rng);
-
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
-		}
-
-		if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
-#ifdef __WORK_STEALING__
-			/* We have completed current work; So get next work */
-			int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
-			if(!valid_work) {
-				/* If work is invalid, this means no more work is available and the thread may exit */
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
-			}
-#else
-			if((sample + parallel_samples) >= end_sample) {
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
-			}
-#endif
-			if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
-#ifdef __WORK_STEALING__
-				work_array[ray_index] = my_work;
-				/* Get the sample associated with the current work */
-				sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-				/* Get pixel and tile position associated with current work */
-				get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
-				my_sample_tile = 0;
-
-				/* Remap rng_state according to the current work */
-				rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride);
-				/* Remap per_sample_output_buffers according to the current work */
-				per_sample_output_buffers = initial_per_sample_output_buffers
-											+ (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
-#else
-				work_array[ray_index] = sample + parallel_samples;
-				sample = work_array[ray_index];
-
-				/* Get ray position from ray index */
-				pixel_x = sx + ((ray_index / parallel_samples) % sw);
-				pixel_y = sy + ((ray_index / parallel_samples) / sw);
-#endif
-
-				/* initialize random numbers and ray */
-				kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray);
-
-				if(ray->t != 0.0f) {
-					/* Initialize throughput, L_transparent, Ray, PathState; These rays proceed with path-iteration*/
-					*throughput = make_float3(1.0f, 1.0f, 1.0f);
-					*L_transparent = 0.0f;
-					path_radiance_init(L, kernel_data.film.use_light_pass);
-					path_state_init(kg, state, rng, sample, ray);
-#ifdef __KERNEL_DEBUG__
-					debug_data_init(debug_data);
-#endif
-					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
-					enqueue_flag = 1;
-				} else {
-					/*These rays do not participate in path-iteration */
-					float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-					/* accumulate result in output buffer */
-					kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
-					path_rng_end(kg, rng_state, *rng);
-
-					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
-				}
-			}
-		}
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif
-
-	/* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; These rays
-	 * will be made active during next SceneIntersectkernel
-	 */
-	enqueue_ray_index_local(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
-}
diff --git a/intern/cycles/kernel/kernel_data_init.cl b/intern/cycles/kernel/kernel_data_init.cl
deleted file mode 100644
index 62b5c4e6a29..00000000000
--- a/intern/cycles/kernel/kernel_data_init.cl
+++ /dev/null
@@ -1,400 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_split.h"
-
-/*
- * Note on kernel_ocl_path_trace_data_initialization kernel
- * This kernel Initializes structures needed in path-iteration kernels.
- * This is the first kernel in ray-tracing logic.
- *
- * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE
- *
- * Its input and output are as follows,
- *
- * Un-initialized rng---------------|--- kernel_ocl_path_trace_data_initialization ---|--- Initialized rng
- * Un-initialized throughput -------|                                                 |--- Initialized throughput
- * Un-initialized L_transparent ----|                                                 |--- Initialized L_transparent
- * Un-initialized PathRadiance -----|                                                 |--- Initialized PathRadiance
- * Un-initialized Ray --------------|                                                 |--- Initialized Ray
- * Un-initialized PathState --------|                                                 |--- Initialized PathState
- * Un-initialized QueueData --------|                                                 |--- Initialized QueueData (to QUEUE_EMPTY_SLOT)
- * Un-initilaized QueueIndex -------|                                                 |--- Initialized QueueIndex (to 0)
- * Un-initialized use_queues_flag---|                                                 |--- Initialized use_queues_flag (to false)
- * Un-initialized ray_state --------|                                                 |--- Initialized ray_state
- * parallel_samples --------------- |                                                 |--- Initialized per_sample_output_buffers
- * rng_state -----------------------|                                                 |--- Initialized work_array
- * data ----------------------------|                                                 |--- Initialized work_pool_wgs
- * start_sample --------------------|                                                 |
- * sx ------------------------------|                                                 |
- * sy ------------------------------|                                                 |
- * sw ------------------------------|                                                 |
- * sh ------------------------------|                                                 |
- * stride --------------------------|                                                 |
- * queuesize -----------------------|                                                 |
- * num_samples ---------------------|                                                 |
- *
- * Note on Queues :
- * All slots in queues are initialized to queue empty slot;
- * The number of elements in the queues is initialized to 0;
- */
-__kernel void kernel_ocl_path_trace_data_init(
-	ccl_global char *globals,
-	ccl_global char *shader_data_sd,                  /* Arguments related to ShaderData */
-	ccl_global char *shader_data_sd_DL_shadow,     /* Arguments related to ShaderData */
-
-	ccl_global float3 *P_sd,
-	ccl_global float3 *P_sd_DL_shadow,
-
-	ccl_global float3 *N_sd,
-	ccl_global float3 *N_sd_DL_shadow,
-
-	ccl_global float3 *Ng_sd,
-	ccl_global float3 *Ng_sd_DL_shadow,
-
-	ccl_global float3 *I_sd,
-	ccl_global float3 *I_sd_DL_shadow,
-
-	ccl_global int *shader_sd,
-	ccl_global int *shader_sd_DL_shadow,
-
-	ccl_global int *flag_sd,
-	ccl_global int *flag_sd_DL_shadow,
-
-	ccl_global int *prim_sd,
-	ccl_global int *prim_sd_DL_shadow,
-
-	ccl_global int *type_sd,
-	ccl_global int *type_sd_DL_shadow,
-
-	ccl_global float *u_sd,
-	ccl_global float *u_sd_DL_shadow,
-
-	ccl_global float *v_sd,
-	ccl_global float *v_sd_DL_shadow,
-
-	ccl_global int *object_sd,
-	ccl_global int *object_sd_DL_shadow,
-
-	ccl_global float *time_sd,
-	ccl_global float *time_sd_DL_shadow,
-
-	ccl_global float *ray_length_sd,
-	ccl_global float *ray_length_sd_DL_shadow,
-
-	ccl_global int *ray_depth_sd,
-	ccl_global int *ray_depth_sd_DL_shadow,
-
-	ccl_global int *transparent_depth_sd,
-	ccl_global int *transparent_depth_sd_DL_shadow,
-
-	/* Ray differentials. */
-	ccl_global differential3 *dP_sd,
-	ccl_global differential3 *dP_sd_DL_shadow,
-
-	ccl_global differential3 *dI_sd,
-	ccl_global differential3 *dI_sd_DL_shadow,
-
-	ccl_global differential *du_sd,
-	ccl_global differential *du_sd_DL_shadow,
-
-	ccl_global differential *dv_sd,
-	ccl_global differential *dv_sd_DL_shadow,
-
-	/* Dp/Du */
-	ccl_global float3 *dPdu_sd,
-	ccl_global float3 *dPdu_sd_DL_shadow,
-
-	ccl_global float3 *dPdv_sd,
-	ccl_global float3 *dPdv_sd_DL_shadow,
-
-	/* Object motion. */
-	ccl_global Transform *ob_tfm_sd,
-	ccl_global Transform *ob_tfm_sd_DL_shadow,
-
-	ccl_global Transform *ob_itfm_sd,
-	ccl_global Transform *ob_itfm_sd_DL_shadow,
-
-	ShaderClosure *closure_sd,
-	ShaderClosure *closure_sd_DL_shadow,
-
-	ccl_global int *num_closure_sd,
-	ccl_global int *num_closure_sd_DL_shadow,
-
-	ccl_global float *randb_closure_sd,
-	ccl_global float *randb_closure_sd_DL_shadow,
-
-	ccl_global float3 *ray_P_sd,
-	ccl_global float3 *ray_P_sd_DL_shadow,
-
-	ccl_global differential3 *ray_dP_sd,
-	ccl_global differential3 *ray_dP_sd_DL_shadow,
-
-	ccl_constant KernelData *data,
-	ccl_global float *per_sample_output_buffers,
-	ccl_global uint *rng_state,
-	ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
-	ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
-	ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
-	PathRadiance *PathRadiance_coop,  /* PathRadiance array to store PathRadiance values for all rays */
-	ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
-	ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
-	ccl_global char *ray_state,                  /* Stores information on current state of a ray */
-
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "kernel_textures.h"
-
-	int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
-	int rng_state_offset_x,
-	int rng_state_offset_y,
-	int rng_state_stride,
-	ccl_global int *Queue_data,                  /* Memory for queues */
-	ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
-	int queuesize,                               /* size (capacity) of the queue */
-	ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
-	ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
-#ifdef __WORK_STEALING__
-	ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
-	unsigned int num_samples,                    /* Total number of samples per pixel */
-#endif
-#ifdef __KERNEL_DEBUG__
-	DebugData *debugdata_coop,
-#endif
-	int parallel_samples                         /* Number of samples to be processed in parallel */
-	)
-{
-
-	/* Load kernel globals structure */
-	KernelGlobals *kg = (KernelGlobals *)globals;
-
-	kg->data = data;
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel_textures.h"
-
-	/* Load ShaderData structure */
-	ShaderData *sd = (ShaderData *)shader_data_sd;
-	ShaderData *sd_DL_shadow = (ShaderData *)shader_data_sd_DL_shadow;
-
-	sd->P = P_sd;
-	sd_DL_shadow->P = P_sd_DL_shadow;
-
-	sd->N = N_sd;
-	sd_DL_shadow->N = N_sd_DL_shadow;
-
-	sd->Ng = Ng_sd;
-	sd_DL_shadow->Ng = Ng_sd_DL_shadow;
-
-	sd->I = I_sd;
-	sd_DL_shadow->I = I_sd_DL_shadow;
-
-	sd->shader = shader_sd;
-	sd_DL_shadow->shader = shader_sd_DL_shadow;
-
-	sd->flag = flag_sd;
-	sd_DL_shadow->flag = flag_sd_DL_shadow;
-
-	sd->prim = prim_sd;
-	sd_DL_shadow->prim = prim_sd_DL_shadow;
-
-	sd->type = type_sd;
-	sd_DL_shadow->type = type_sd_DL_shadow;
-
-	sd->u = u_sd;
-	sd_DL_shadow->u = u_sd_DL_shadow;
-
-	sd->v = v_sd;
-	sd_DL_shadow->v = v_sd_DL_shadow;
-
-	sd->object = object_sd;
-	sd_DL_shadow->object = object_sd_DL_shadow;
-
-	sd->time = time_sd;
-	sd_DL_shadow->time = time_sd_DL_shadow;
-
-	sd->ray_length = ray_length_sd;
-	sd_DL_shadow->ray_length = ray_length_sd_DL_shadow;
-
-	sd->ray_depth = ray_depth_sd;
-	sd_DL_shadow->ray_depth = ray_depth_sd_DL_shadow;
-
-	sd->transparent_depth = transparent_depth_sd;
-	sd_DL_shadow->transparent_depth = transparent_depth_sd_DL_shadow;
-
-#ifdef __RAY_DIFFERENTIALS__
-	sd->dP = dP_sd;
-	sd_DL_shadow->dP = dP_sd_DL_shadow;
-
-	sd->dI = dI_sd;
-	sd_DL_shadow->dI = dI_sd_DL_shadow;
-
-	sd->du = du_sd;
-	sd_DL_shadow->du = du_sd_DL_shadow;
-
-	sd->dv = dv_sd;
-	sd_DL_shadow->dv = dv_sd_DL_shadow;
-#ifdef __DPDU__
-	sd->dPdu = dPdu_sd;
-	sd_DL_shadow->dPdu = dPdu_sd_DL_shadow;
-
-	sd->dPdv = dPdv_sd;
-	sd_DL_shadow->dPdv = dPdv_sd_DL_shadow;
-#endif
-#endif
-
-#ifdef __OBJECT_MOTION__
-	sd->ob_tfm = ob_tfm_sd;
-	sd_DL_shadow->ob_tfm = ob_tfm_sd_DL_shadow;
-
-	sd->ob_itfm = ob_itfm_sd;
-	sd_DL_shadow->ob_itfm = ob_itfm_sd_DL_shadow;
-#endif
-
-	sd->closure = closure_sd;
-	sd_DL_shadow->closure = closure_sd_DL_shadow;
-
-	sd->num_closure = num_closure_sd;
-	sd_DL_shadow->num_closure = num_closure_sd_DL_shadow;
-
-	sd->randb_closure = randb_closure_sd;
-	sd_DL_shadow->randb_closure = randb_closure_sd_DL_shadow;
-
-	sd->ray_P = ray_P_sd;
-	sd_DL_shadow->ray_P = ray_P_sd_DL_shadow;
-
-	sd->ray_dP = ray_dP_sd;
-	sd_DL_shadow->ray_dP = ray_dP_sd_DL_shadow;
-
-	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-
-#ifdef __WORK_STEALING__
-	int lid = get_local_id(1) * get_local_size(0) + get_local_id(0);
-	/* Initialize work_pool_wgs */
-	if(lid == 0) {
-		int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0);
-		work_pool_wgs[group_index] = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-#endif // __WORK_STEALING__
-
-	/* Initialize queue data and queue index */
-	if(thread_index < queuesize) {
-		/* Initialize active ray queue */
-		Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
-		/* Initialize background and buffer update queue */
-		Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
-		/* Initialize shadow ray cast of AO queue */
-		Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
-		/* Initialize shadow ray cast of direct lighting queue */
-		Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
-	}
-
-	if(thread_index == 0) {
-		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
-		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
-		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
-		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
-		/* The scene-intersect kernel should not use the queues very first time.
-		 * since the queue would be empty.
-		 */
-		use_queues_flag[0] = 0;
-	}
-
-	int x = get_global_id(0);
-	int y = get_global_id(1);
-
-	if(x < (sw * parallel_samples) && y < sh) {
-
-		int ray_index = x + y * (sw * parallel_samples);
-
-		/* This is the first assignment to ray_state; So we dont use ASSIGN_RAY_STATE macro */
-		ray_state[ray_index] = RAY_ACTIVE;
-
-		unsigned int my_sample;
-		unsigned int pixel_x;
-		unsigned int pixel_y;
-		unsigned int tile_x;
-		unsigned int tile_y;
-		unsigned int my_sample_tile;
-
-#ifdef __WORK_STEALING__
-		unsigned int my_work = 0;
-		/* get work */
-		get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
-		/* Get the sample associated with the work */
-		my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-
-		my_sample_tile = 0;
-
-		/* Get pixel and tile position associated with the work */
-		get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
-		work_array[ray_index] = my_work;
-#else // __WORK_STEALING__
-
-		unsigned int tile_index = ray_index / parallel_samples;
-		tile_x = tile_index % sw;
-		tile_y = tile_index / sw;
-		my_sample_tile = ray_index - (tile_index * parallel_samples);
-		my_sample = my_sample_tile + start_sample;
-
-		/* Initialize work array */
-		work_array[ray_index] = my_sample ;
-
-		/* Calculate pixel position of this ray */
-		pixel_x = sx + tile_x;
-		pixel_y = sy + tile_y;
-#endif // __WORK_STEALING__
-
-		rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
-
-		/* Initialise per_sample_output_buffers to all zeros */
-		per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride;
-		int per_sample_output_buffers_iterator = 0;
-		for(per_sample_output_buffers_iterator = 0; per_sample_output_buffers_iterator < kernel_data.film.pass_stride; per_sample_output_buffers_iterator++) {
-			per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f;
-		}
-
-		/* initialize random numbers and ray */
-		kernel_path_trace_setup(kg, rng_state, my_sample, pixel_x, pixel_y, &rng_coop[ray_index], &Ray_coop[ray_index]);
-
-		if(Ray_coop[ray_index].t != 0.0f) {
-			/* Initialize throuput, L_transparent, Ray, PathState; These rays proceed with path-iteration*/
-			throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
-			L_transparent_coop[ray_index] = 0.0f;
-			path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass);
-			path_state_init(kg, &PathState_coop[ray_index], &rng_coop[ray_index], my_sample, &Ray_coop[ray_index]);
-#ifdef __KERNEL_DEBUG__
-			debug_data_init(&debugdata_coop[ray_index]);
-#endif
-		} else {
-			/*These rays do not participate in path-iteration */
-
-			float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			/* accumulate result in output buffer */
-			kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad);
-			path_rng_end(kg, rng_state, rng_coop[ray_index]);
-
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
-		}
-	}
-
-	/* Mark rest of the ray-state indices as RAY_INACTIVE */
-	if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) {
-		/* First assignment, hence we dont use ASSIGN_RAY_STATE macro */
-		ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE;
-	}
-}
diff --git a/intern/cycles/kernel/kernel_direct_lighting.cl b/intern/cycles/kernel/kernel_direct_lighting.cl
deleted file mode 100644
index f874122c508..00000000000
--- a/intern/cycles/kernel/kernel_direct_lighting.cl
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_split.h"
-
-/*
- * Note on kernel_ocl_path_trace_direct_lighting kernel.
- * This is the eighth kernel in the ray tracing logic. This is the seventh
- * of the path iteration kernels. This kernel takes care of direct lighting
- * logic. However, the "shadow ray cast" part of direct lighting is handled
- * in the next kernel.
- *
- * This kernel determines the rays for which a shadow_blocked() function associated with direct lighting should be executed.
- * Those rays for which a shadow_blocked() function for direct-lighting must be executed, are marked with flag RAY_SHADOW_RAY_CAST_DL and
- * enqueued into the queue QUEUE_SHADOW_RAY_CAST_DL_RAYS
- *
- * The input and output are as follows,
- *
- * rng_coop -----------------------------------------|--- kernel_ocl_path_trace_direct_lighting --|--- BSDFEval_coop
- * PathState_coop -----------------------------------|                                            |--- ISLamp_coop
- * shader_data --------------------------------------|                                            |--- LightRay_coop
- * ray_state ----------------------------------------|                                            |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                                            |
- * kg (globals + data) ------------------------------|                                            |
- * queuesize ----------------------------------------|                                            |
- *
- * Note on shader_DL : shader_DL is neither an input nor an output of this kernel; it is filled and consumed within this kernel itself.
- * Note on Queues :
- * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
- * only the rays of state RAY_ACTIVE; If a ray needs to execute the corresponding shadow_blocked
- * part, after direct lighting, the ray is marked with RAY_SHADOW_RAY_CAST_DL flag.
- *
- * State of queues when this kernel is called :
- * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
- * before and after this kernel call.
- * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this
- * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
- */
-__kernel void kernel_ocl_path_trace_direct_lighting(
-	ccl_global char *globals,               /* Cast to KernelGlobals inside the kernel */
-	ccl_constant KernelData *data,
-	ccl_global char *shader_data,           /* Cast to ShaderData; its flag, time and P are read here */
-	ccl_global char *shader_DL,             /* Cast to ShaderData and handed to direct_emission() */
-	ccl_global uint *rng_coop,              /* Per-ray RNG used to draw the light sample */
-	ccl_global PathState *PathState_coop,   /* Per-ray path state, indexed by ray_index */
-	ccl_global int *ISLamp_coop,            /* Output: is_lamp, written when direct_emission() returns true */
-	ccl_global Ray *LightRay_coop,          /* Output: light ray, written when direct_emission() returns true */
-	ccl_global BsdfEval *BSDFEval_coop,     /* Output: L_light, written when direct_emission() returns true */
-	ccl_global char *ray_state,             /* Denotes the state of each ray */
-	ccl_global int *Queue_data,             /* Queue memory */
-	ccl_global int *Queue_index,            /* Tracks the number of elements in each queue */
-	int queuesize                           /* Size (capacity) of each queue */
-	)
-{
-	ccl_local unsigned int local_queue_atomics; /* Counter handed to enqueue_ray_index_local() below */
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	char enqueue_flag = 0; /* Set to 1 when this ray needs a direct-lighting shadow ray */
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); /* Flattened 2D global id */
-	ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not required.
-	 * If we are executing on a CPU device, then we need to keep all threads active
-	 * since we have barrier() calls later in the kernel. CPU devices
-	 * expect all threads to execute the barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT)
-		return;
-#endif /* __COMPUTE_DEVICE_GPU__ */
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif /* !__COMPUTE_DEVICE_GPU__ */
-		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-			/* Load kernel globals structure and ShaderData structure */
-			KernelGlobals *kg = (KernelGlobals *)globals;
-			ShaderData *sd = (ShaderData *)shader_data;
-			ShaderData *sd_DL  = (ShaderData *)shader_DL;
-
-			ccl_global PathState *state = &PathState_coop[ray_index];
-
-			/* direct lighting */
-#ifdef __EMISSION__
-			if((kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) {
-				/* sample illumination from lights to find path contribution */
-				ccl_global RNG* rng = &rng_coop[ray_index];
-				float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
-				float light_u, light_v;
-				path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-
-				LightSample ls;
-				light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
-
-				Ray light_ray;
-#ifdef __OBJECT_MOTION__
-				light_ray.time = ccl_fetch(sd, time);
-#endif /* __OBJECT_MOTION__ */
-
-				BsdfEval L_light;
-				bool is_lamp;
-				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce, sd_DL)) {
-					/* write intermediate data to global memory to access from the next kernel */
-					LightRay_coop[ray_index] = light_ray;
-					BSDFEval_coop[ray_index] = L_light;
-					ISLamp_coop[ray_index] = is_lamp;
-					/* Mark the ray state for the next (shadow ray cast) kernel. */
-					ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
-					enqueue_flag = 1;
-				}
-			}
-#endif /* __EMISSION__ */
-		}
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif /* !__COMPUTE_DEVICE_GPU__ */
-
-#ifdef __EMISSION__
-	/* Enqueue RAY_SHADOW_RAY_CAST_DL rays */
-	enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_DL_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
-#endif /* __EMISSION__ */
-}
diff --git a/intern/cycles/kernel/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernel_holdout_emission_blurring_pathtermination_ao.cl
deleted file mode 100644
index a2e57771522..00000000000
--- a/intern/cycles/kernel/kernel_holdout_emission_blurring_pathtermination_ao.cl
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_split.h"
-
-/*
- * Note on kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao kernel.
- * This is the sixth kernel in the ray tracing logic. This is the fifth
- * of the path iteration kernels. This kernel takes care of the logic to process
- * "material of type holdout", indirect primitive emission, bsdf blurring,
- * probabilistic path termination and AO.
- *
- * This kernel determines the rays for which a shadow_blocked() function associated with AO should be executed.
- * Those rays for which a shadow_blocked() function for AO must be executed are marked with flag RAY_SHADOW_RAY_CAST_AO and
- * enqueued into the queue QUEUE_SHADOW_RAY_CAST_AO_RAYS
- *
- * The ray state of rays that are terminated in this kernel is changed to RAY_UPDATE_BUFFER
- *
- * The input and output are as follows,
- *
- * rng_coop ---------------------------------------------|--- kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao ---|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * throughput_coop --------------------------------------|                                                                          |--- PathState_coop
- * PathRadiance_coop ------------------------------------|                                                                          |--- throughput_coop
- * Intersection_coop ------------------------------------|                                                                          |--- L_transparent_coop
- * PathState_coop ---------------------------------------|                                                                          |--- per_sample_output_buffers
- * L_transparent_coop -----------------------------------|                                                                          |--- PathRadiance_coop
- * shader_data ------------------------------------------|                                                                          |--- ShaderData
- * ray_state --------------------------------------------|                                                                          |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------|                                                                          |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                                                          |--- AOAlpha_coop
- * kg (globals + data) ----------------------------------|                                                                          |--- AOBSDF_coop
- * parallel_samples -------------------------------------|                                                                          |--- AOLightRay_coop
- * per_sample_output_buffers ----------------------------|                                                                          |
- * sw ---------------------------------------------------|                                                                          |
- * sh ---------------------------------------------------|                                                                          |
- * sx ---------------------------------------------------|                                                                          |
- * sy ---------------------------------------------------|                                                                          |
- * stride -----------------------------------------------|                                                                          |
- * work_array -------------------------------------------|                                                                          |
- * queuesize --------------------------------------------|                                                                          |
- * start_sample -----------------------------------------|                                                                          |
- *
- * Note on Queues :
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
- * the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER
- * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
- * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
- * changed to RAY_UPDATE_BUFFER, there is no problem.
- *
- * State of queues when this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO
- */
-
-__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
-	ccl_global char *globals,
-	ccl_constant KernelData *data,
-	ccl_global char *shader_data,               /* Required throughout the kernel except probabilistic path termination */
-	ccl_global float *per_sample_output_buffers, /* Offset to this ray's slot, then passed to "kernel_write_data_passes" */
-	ccl_global uint *rng_coop,                  /* Required for probabilistic path termination and AO */
-	ccl_global float3 *throughput_coop,         /* Read for holdout, data passes, emission and path termination; divided by the survival probability */
-	ccl_global float *L_transparent_coop,       /* Required for handling holdout material */
-	PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */
-	ccl_global PathState *PathState_coop,       /* Required throughout the kernel and AO */
-	Intersection *Intersection_coop, /* Required for indirect primitive emission */
-	ccl_global float3 *AOAlpha_coop,            /* Required for AO */
-	ccl_global float3 *AOBSDF_coop,             /* Required for AO */
-	ccl_global Ray *AOLightRay_coop,            /* Required for AO */
-	int sw, int sh, int sx, int sy, int stride,
-	ccl_global char *ray_state,                /* Denotes the state of each ray */
-	ccl_global unsigned int *work_array,       /* Work of each ray with __WORK_STEALING__; otherwise the sample of each ray */
-	ccl_global int *Queue_data,                /* Queue memory */
-	ccl_global int *Queue_index,               /* Tracks the number of elements in each queue */
-	int queuesize,                             /* Size (capacity) of each queue */
-#ifdef __WORK_STEALING__
-	unsigned int start_sample,
-#endif /* __WORK_STEALING__ */
-	int parallel_samples                       /* Number of samples to be processed in parallel */
-	)
-{
-	ccl_local unsigned int local_queue_atomics_bg; /* Counter for the QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS enqueue below */
-	ccl_local unsigned int local_queue_atomics_ao; /* Counter for the QUEUE_SHADOW_RAY_CAST_AO_RAYS enqueue below */
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics_bg = 0;
-		local_queue_atomics_ao = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	char enqueue_flag = 0; /* Set to 1 when the ray becomes RAY_UPDATE_BUFFER */
-	char enqueue_flag_AO_SHADOW_RAY_CAST = 0; /* Set to 1 when the ray needs an AO shadow ray */
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not required.
-	 * If we are executing on a CPU device, then we need to keep all threads active
-	 * since we have barrier() calls later in the kernel. CPU devices
-	 * expect all threads to execute the barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT)
-		return;
-#endif /* __COMPUTE_DEVICE_GPU__ */
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif /* !__COMPUTE_DEVICE_GPU__ */
-		/* Load kernel globals structure and ShaderData structure */
-		KernelGlobals *kg = (KernelGlobals *)globals;
-		ShaderData *sd = (ShaderData *)shader_data;
-
-#ifdef __WORK_STEALING__
-		unsigned int my_work;
-		unsigned int pixel_x;
-		unsigned int pixel_y;
-#endif /* __WORK_STEALING__ */
-		unsigned int tile_x;
-		unsigned int tile_y;
-		int my_sample_tile;
-		unsigned int sample;
-
-		ccl_global RNG *rng = 0x0;
-		ccl_global PathState *state = 0x0;
-		float3 throughput;
-
-		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-
-			throughput = throughput_coop[ray_index];
-			state = &PathState_coop[ray_index];
-			rng = &rng_coop[ray_index];
-#ifdef __WORK_STEALING__
-			my_work = work_array[ray_index];
-			sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-			get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
-			my_sample_tile = 0;
-#else // __WORK_STEALING__
-			sample = work_array[ray_index];
-			/* buffer's stride is "stride"; Find x and y using ray_index */
-			int tile_index = ray_index / parallel_samples;
-			tile_x = tile_index % sw;
-			tile_y = tile_index / sw;
-			my_sample_tile = ray_index - (tile_index * parallel_samples);
-#endif // __WORK_STEALING__
-			per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
-
-			/* holdout */
-#ifdef __HOLDOUT__
-			if((ccl_fetch(sd, flag) & (SD_HOLDOUT|SD_HOLDOUT_MASK)) && (state->flag & PATH_RAY_CAMERA)) {
-				if(kernel_data.background.transparent) {
-					float3 holdout_weight;
-
-					if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK)
-						holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
-					else
-						holdout_weight = shader_holdout_eval(kg, sd);
-
-					/* any throughput is ok, should all be identical here */
-					L_transparent_coop[ray_index] += average(holdout_weight*throughput);
-				}
-
-				if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) {
-					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-					enqueue_flag = 1;
-				}
-			}
-#endif /* __HOLDOUT__ */
-		}
-
-		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-
-			PathRadiance *L = &PathRadiance_coop[ray_index];
-			/* holdout mask objects do not write data passes */
-			kernel_write_data_passes(kg, per_sample_output_buffers, L, sd, sample, state, throughput);
-
-			/* blurring of bsdf after bounces, for rays that have a small likelihood
-			 * of following this particular path (diffuse, rough glossy) */
-			if(kernel_data.integrator.filter_glossy != FLT_MAX) {
-				float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
-
-				if(blur_pdf < 1.0f) {
-					float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
-					shader_bsdf_blur(kg, sd, blur_roughness);
-				}
-			}
-
-#ifdef __EMISSION__
-			/* emission */
-			if(ccl_fetch(sd, flag) & SD_EMISSION) {
-				/* todo: is isect.t wrong here for transparent surfaces? */
-				float3 emission = indirect_primitive_emission(kg, sd, Intersection_coop[ray_index].t, state->flag, state->ray_pdf);
-				path_radiance_accum_emission(L, throughput, emission, state->bounce);
-			}
-#endif /* __EMISSION__ */
-
-			/* path termination. this is a strange place to put the termination, it's
-			 * mainly due to the mixed in MIS that we use. gives too many unneeded
-			 * shader evaluations, only need emission if we are going to terminate */
-			float probability = path_state_terminate_probability(kg, state, throughput);
-
-			if(probability == 0.0f) {
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-				enqueue_flag = 1;
-			}
-
-			if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-				if(probability != 1.0f) {
-					float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
-
-					if(terminate >= probability) {
-						ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-						enqueue_flag = 1;
-					} else {
-						throughput_coop[ray_index] = throughput/probability;
-					}
-				}
-			}
-		}
-
-#ifdef __AO__
-		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-			/* ambient occlusion */
-			if(kernel_data.integrator.use_ambient_occlusion || (ccl_fetch(sd, flag) & SD_AO)) {
-				/* todo: solve correlation */
-				float bsdf_u, bsdf_v;
-				path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
-				float ao_factor = kernel_data.background.ao_factor;
-				float3 ao_N;
-				AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
-				AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd);
-
-				float3 ao_D;
-				float ao_pdf;
-				sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
-				if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
-					Ray _ray;
-					_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
-					_ray.D = ao_D;
-					_ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
-					_ray.time = ccl_fetch(sd, time);
-#endif /* __OBJECT_MOTION__ */
-					_ray.dP = ccl_fetch(sd, dP);
-					_ray.dD = differential3_zero();
-					AOLightRay_coop[ray_index] = _ray;
-
-					ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
-					enqueue_flag_AO_SHADOW_RAY_CAST = 1;
-				}
-			}
-		}
-#endif /* __AO__ */
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif /* !__COMPUTE_DEVICE_GPU__ */
-
-	/* Enqueue RAY_UPDATE_BUFFER rays */
-	enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics_bg, Queue_data, Queue_index);
-#ifdef __AO__
-	/* Enqueue to-shadow-ray-cast rays */
-	enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, enqueue_flag_AO_SHADOW_RAY_CAST, queuesize, &local_queue_atomics_ao, Queue_data, Queue_index);
-#endif /* __AO__ */
-}
diff --git a/intern/cycles/kernel/kernel_lamp_emission.cl b/intern/cycles/kernel/kernel_lamp_emission.cl
deleted file mode 100644
index e7f8b227dd8..00000000000
--- a/intern/cycles/kernel/kernel_lamp_emission.cl
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_split.h"
-
-/*
- * Note on kernel_ocl_path_trace_lamp_emission
- * This is the 3rd kernel in the ray-tracing logic. This is the second of the
- * path-iteration kernels. This kernel takes care of the indirect lamp emission logic.
- * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE
- * and RAY_HIT_BACKGROUND.
- * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel.
- * The input/output of the kernel is as follows,
- * throughput_coop ------------------------------------|--- kernel_ocl_path_trace_lamp_emission --|--- PathRadiance_coop
- * Ray_coop -------------------------------------------|                                          |--- Queue_data(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * PathState_coop -------------------------------------|                                          |--- Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * kg (globals + data) --------------------------------|                                          |
- * Intersection_coop ----------------------------------|                                          |
- * ray_state ------------------------------------------|                                          |
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -----|                                          |
- * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ----|                                          |
- * queuesize ------------------------------------------|                                          |
- * use_queues_flag ------------------------------------|                                          |
- * sw -------------------------------------------------|                                          |
- * sh -------------------------------------------------|                                          |
- * parallel_samples -----------------------------------|                                          |
- *
- * Note : shader_data is neither input nor output. It is just filled and consumed within the same kernel, kernel_ocl_path_trace_lamp_emission.
- */
-__kernel void kernel_ocl_path_trace_lamp_emission(
-	ccl_global char *globals,
-	ccl_constant KernelData *data,
-	ccl_global char *shader_data,               /* Required for lamp emission */
-	ccl_global float3 *throughput_coop,         /* Required for lamp emission */
-	PathRadiance *PathRadiance_coop, /* Required for lamp emission */
-	ccl_global Ray *Ray_coop,                   /* Required for lamp emission */
-	ccl_global PathState *PathState_coop,       /* Required for lamp emission */
-	Intersection *Intersection_coop, /* Required for lamp emission */
-	ccl_global char *ray_state,                 /* Denotes the state of each ray */
-	int sw, int sh,
-	ccl_global int *Queue_data,                 /* Memory for queues */
-	ccl_global int *Queue_index,                /* Tracks the number of elements in queues */
-	int queuesize,                              /* Size (capacity) of queues */
-	ccl_global char *use_queues_flag,           /* Used to decide if this kernel should use queues to fetch ray index */
-	int parallel_samples                        /* Number of samples to be processed in parallel */
-	)
-{
-	int x = get_global_id(0);
-	int y = get_global_id(1);
-
-	/* We will empty this queue in this kernel: global thread (0, 0) resets its element count */
-	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
-		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
-	}
-
-	/* Fetch use_queues_flag: local thread (0, 0) reads it, every thread uses the copy after the barrier */
-	ccl_local char local_use_queues_flag;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_use_queues_flag = use_queues_flag[0];
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int ray_index;
-	if(local_use_queues_flag) {
-		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-		ray_index = get_ray_index(thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 1);
-
-		if(ray_index == QUEUE_EMPTY_SLOT) {
-			return;
-		}
-	} else {
-		if(x < (sw * parallel_samples) && y < sh){
-			ray_index = x + y * (sw * parallel_samples);
-		} else {
-			return;
-		}
-	}
-
-	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-		KernelGlobals *kg = (KernelGlobals *)globals;
-		ShaderData *sd = (ShaderData *)shader_data;
-		PathRadiance *L = &PathRadiance_coop[ray_index];
-
-		float3 throughput = throughput_coop[ray_index];
-		Ray ray = Ray_coop[ray_index];
-		PathState state = PathState_coop[ray_index];
-
-#ifdef __LAMP_MIS__
-		if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
-			/* ray starting from previous non-transparent bounce */
-			Ray light_ray;
-
-			light_ray.P = ray.P - state.ray_t*ray.D;
-			state.ray_t += Intersection_coop[ray_index].t;
-			light_ray.D = ray.D;
-			light_ray.t = state.ray_t;
-			light_ray.time = ray.time;
-			light_ray.dD = ray.dD;
-			light_ray.dP = ray.dP;
-			/* intersect with lamp */
-			float3 emission;
-
-			if(indirect_lamp_emission(kg, &state, &light_ray, &emission, sd)) {
-				path_radiance_accum_emission(L, throughput, emission, state.bounce);
-			}
-		}
-#endif /* __LAMP_MIS__ */
-		/* __VOLUME__ feature is disabled: everything up to the matching #endif is compiled out by #if 0 */
-#if 0
-#ifdef __VOLUME__
-		/* volume attenuation, emission, scatter */
-		if(state.volume_stack[0].shader != SHADER_NONE) {
-			Ray volume_ray = ray;
-			volume_ray.t = (hit)? isect.t: FLT_MAX;
-
-			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
-
-#ifdef __VOLUME_DECOUPLED__
-			int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
-			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method);
-
-			if(decoupled) {
-				/* cache steps along volume for repeated sampling */
-				VolumeSegment volume_segment;
-				ShaderData volume_sd;
-
-				shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
-				kernel_volume_decoupled_record(kg, &state,
-					&volume_ray, &volume_sd, &volume_segment, heterogeneous);
-
-				volume_segment.sampling_method = sampling_method;
-
-				/* emission */
-				if(volume_segment.closure_flag & SD_EMISSION)
-					path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
-
-				/* scattering */
-				VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
-
-				if(volume_segment.closure_flag & SD_SCATTER) {
-					bool all = false;
-
-					/* direct light sampling */
-					kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
-						throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment);
-
-					/* indirect sample. if we use distance sampling and take just
-					 * one sample for direct and indirect light, we could share
-					 * this computation, but makes code a bit complex */
-					float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
-					float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
-
-					result = kernel_volume_decoupled_scatter(kg,
-						&state, &volume_ray, &volume_sd, &throughput,
-						rphase, rscatter, &volume_segment, NULL, true);
-				}
-
-				if(result != VOLUME_PATH_SCATTERED)
-					throughput *= volume_segment.accum_transmittance;
-
-				/* free cached steps */
-				kernel_volume_decoupled_free(kg, &volume_segment);
-
-				if(result == VOLUME_PATH_SCATTERED) {
-					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
-						continue;
-					else
-						break;
-				}
-			}
-			else
-#endif /* __VOLUME_DECOUPLED__ */
-			{
-				/* integrate along volume segment with distance sampling */
-				ShaderData volume_sd;
-				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng, heterogeneous);
-
-#ifdef __VOLUME_SCATTER__
-				if(result == VOLUME_PATH_SCATTERED) {
-					/* direct lighting */
-					kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L);
-
-					/* indirect light bounce */
-					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
-						continue;
-					else
-						break;
-				}
-#endif /* __VOLUME_SCATTER__ */
-			}
-		}
-#endif /* __VOLUME__ */
-#endif /* 0 */
-	}
-}
diff --git a/intern/cycles/kernel/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernel_next_iteration_setup.cl
deleted file mode 100644
index 3c0e4e9240d..00000000000
--- a/intern/cycles/kernel/kernel_next_iteration_setup.cl
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_split.h"
-
-/*
- * Note on kernel_ocl_path_trace_next_iteration_setup kernel.
- * This is the tenth kernel in the ray tracing logic. This is the ninth
- * of the path iteration kernels. This kernel takes care of setting up
- * Ray for the next iteration of path-iteration and accumulating radiance
- * corresponding to AO and direct-lighting
- *
- * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
- *
- * The input and output are as follows,
- *
- * rng_coop ---------------------------------------------|--- kernel_ocl_path_trace_next_iteration_setup -|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * throughput_coop --------------------------------------|                                                |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * PathRadiance_coop ------------------------------------|                                                |--- throughput_coop
- * PathState_coop ---------------------------------------|                                                |--- PathRadiance_coop
- * shader_data ------------------------------------------|                                                |--- PathState_coop
- * ray_state --------------------------------------------|                                                |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------|                                                |--- Ray_coop
- * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                                |--- use_queues_flag
- * Ray_coop ---------------------------------------------|                                                |
- * kg (globals + data) ----------------------------------|                                                |
- * LightRay_dl_coop -------------------------------------|
- * ISLamp_coop ------------------------------------------|
- * BSDFEval_coop ----------------------------------------|
- * LightRay_ao_coop -------------------------------------|
- * AOBSDF_coop ------------------------------------------|
- * AOAlpha_coop -----------------------------------------|
- *
- * Note on queues,
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
- * the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER
- * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
- * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
- * changed to RAY_UPDATE_BUFFER, there is no problem.
- *
- * State of queues when this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays
- */
-
-__kernel void kernel_ocl_path_trace_next_iteration_setup(
-	ccl_global char *globals,
-	ccl_constant KernelData *data,
-	ccl_global char *shader_data,               /* Required for setting up ray for next iteration */
-	ccl_global uint *rng_coop,                  /* Required for setting up ray for next iteration */
-	ccl_global float3 *throughput_coop,         /* Required for setting up ray for next iteration */
-	PathRadiance *PathRadiance_coop,            /* Required for setting up ray for next iteration */
-	ccl_global Ray *Ray_coop,                   /* Required for setting up ray for next iteration */
-	ccl_global PathState *PathState_coop,       /* Required for setting up ray for next iteration */
-	ccl_global Ray *LightRay_dl_coop,           /* Required for radiance update - direct lighting */
-	ccl_global int *ISLamp_coop,                /* Required for radiance update - direct lighting */
-	ccl_global BsdfEval *BSDFEval_coop,         /* Required for radiance update - direct lighting */
-	ccl_global Ray *LightRay_ao_coop,           /* Required for radiance update - AO */
-	ccl_global float3 *AOBSDF_coop,             /* Required for radiance update - AO */
-	ccl_global float3 *AOAlpha_coop,            /* Required for radiance update - AO */
-	ccl_global char *ray_state,                 /* Denotes the state of each ray */
-	ccl_global int *Queue_data,                 /* Queue memory */
-	ccl_global int *Queue_index,                /* Tracks the number of elements in each queue */
-	int queuesize,                              /* Size (capacity) of each queue */
-	ccl_global char *use_queues_flag            /* flag to decide if scene_intersect kernel should use queues to fetch ray index */
-	)
-{
-	/* Work-group counter handed to enqueue_ray_index_local() at the end of this kernel */
-	ccl_local unsigned int local_queue_atomics;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
-		/* If we are here, then it means that the scene-intersect kernel
-		 * has already been executed at least once. From the next time on,
-		 * the scene-intersect kernel may operate on queues to fetch ray index
-		 */
-		use_queues_flag[0] = 1;
-
-		/* Reset the element counts of the QUEUE_SHADOW_RAY_CAST_AO_RAYS and
-		 * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues, which the previous kernel consumed
-		 */
-		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
-		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
-	}
-
-	char enqueue_flag = 0; /* set to 1 when this thread's ray becomes RAY_UPDATE_BUFFER below */
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); /* linear thread index, replaced by the queued ray index on the next line */
-	ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not required.
-	 * If we are executing on a CPU device, then we need to keep all threads active
-	 * since we have barrier() calls later in the kernel. CPU devices
-	 * expect all threads to execute the barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT)
-		return;
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) { /* non-GPU build: empty slots skip the work but still reach the enqueue call below */
-#endif
-		/* Load kernel globals structure and ShaderData structure */
-		KernelGlobals *kg = (KernelGlobals *)globals;
-		ShaderData *sd = (ShaderData *)shader_data;
-		PathRadiance *L = 0x0;
-		ccl_global PathState *state = 0x0;
-
-		/* Accumulate the AO / direct lighting shadow ray results into the path radiance */
-		if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
-			state = &PathState_coop[ray_index];
-			L = &PathRadiance_coop[ray_index];
-			float3 _throughput = throughput_coop[ray_index];
-
-			if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
-				float3 shadow = LightRay_ao_coop[ray_index].P; /* shadow value is read from the AO ray's P field */
-				char update_path_radiance = LightRay_ao_coop[ray_index].t; /* the ray's t field acts as the "accumulate" flag */
-				if(update_path_radiance) {
-					path_radiance_accum_ao(L, _throughput, AOAlpha_coop[ray_index], AOBSDF_coop[ray_index], shadow, state->bounce);
-				}
-				REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
-			}
-
-			if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
-				float3 shadow = LightRay_dl_coop[ray_index].P; /* shadow value is read from the direct lighting ray's P field */
-				char update_path_radiance = LightRay_dl_coop[ray_index].t; /* the ray's t field acts as the "accumulate" flag */
-				if(update_path_radiance) {
-					BsdfEval L_light = BSDFEval_coop[ray_index];
-					path_radiance_accum_light(L, _throughput, &L_light, shadow, 1.0f, state->bounce, ISLamp_coop[ray_index]);
-				}
-				REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
-			}
-		}
-
-		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-
-			ccl_global float3 *throughput = &throughput_coop[ray_index];
-			ccl_global Ray *ray = &Ray_coop[ray_index];
-			ccl_global RNG* rng = &rng_coop[ray_index];
-			state = &PathState_coop[ray_index];
-			L = &PathRadiance_coop[ray_index];
-
-			/* set up the next bounce; a path that cannot continue is marked RAY_UPDATE_BUFFER and enqueued below */
-			if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) {
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-				enqueue_flag = 1;
-			}
-		}
-#ifndef __COMPUTE_DEVICE_GPU__
-	} /* if(ray_index != QUEUE_EMPTY_SLOT) */
-#endif
-
-	/* Enqueue RAY_UPDATE_BUFFER rays */
-	enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
-}
diff --git a/intern/cycles/kernel/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernel_queue_enqueue.cl
deleted file mode 100644
index eee7860fb84..00000000000
--- a/intern/cycles/kernel/kernel_queue_enqueue.cl
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_compat_opencl.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_queues.h"
-
-/*
- * The kernel "kernel_ocl_path_trace_queue_enqueue" enqueues rays of
- * different ray state into their appropriate Queues;
- * 1. Rays that have been determined to hit the background from the
- * "kernel_ocl_path_trace_scene_intersect" kernel
- * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
- * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * The input and output of the kernel is as follows,
- *
- * ray_state -------------------------------------------|--- kernel_ocl_path_trace_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                                          |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                          |
- * queuesize -------------------------------------------|                                          |
- *
- * Note on Queues :
- * State of queues during the first time this kernel is called :
- * At entry,
- * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays.
- *
- * State of queue during other times this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
- */
-
-__kernel void kernel_ocl_path_trace_queue_enqueue(
-	ccl_global int *Queue_data,   /* Memory backing the queues */
-	ccl_global int *Queue_index,  /* Number of elements currently in each queue */
-	ccl_global char *ray_state,   /* State of each ray */
-	int queuesize                 /* Capacity of a single queue */
-	)
-{
-	/* One work-group counter per destination queue (hit-background / active) */
-	ccl_local unsigned int local_queue_atomics[2];
-
-	const int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	const int local_tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
-
-	if(local_tid < 2) {
-		local_queue_atomics[local_tid] = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	/* Destination queue for this thread's ray; -1 means the ray is not enqueued */
-	int target_queue = -1;
-	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-		target_queue = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
-	}
-	else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		target_queue = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
-	}
-	const bool has_target = (target_queue != -1);
-
-	unsigned int local_slot;
-	if(has_target) {
-		local_slot = get_local_queue_index(target_queue, local_queue_atomics);
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	if(local_tid == 0) {
-		local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS, local_queue_atomics, Queue_index);
-		local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, local_queue_atomics, Queue_index);
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	if(has_target) {
-		Queue_data[get_global_queue_index(target_queue, queuesize, local_slot, local_queue_atomics)] = ray_index;
-	}
-}
diff --git a/intern/cycles/kernel/kernel_scene_intersect.cl b/intern/cycles/kernel/kernel_scene_intersect.cl
deleted file mode 100644
index 6817e28a302..00000000000
--- a/intern/cycles/kernel/kernel_scene_intersect.cl
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_split.h"
-
-/*
- * Note on kernel_ocl_path_trace_scene_intersect kernel.
- * This is the second kernel in the ray tracing logic. This is the first
- * of the path iteration kernels. This kernel takes care of scene_intersect function.
- *
- * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE.
- * This kernel processes rays of ray state RAY_ACTIVE
- * This kernel determines the rays that have hit the background and changes their ray state to RAY_HIT_BACKGROUND.
- *
- * The input and output are as follows,
- *
- * Ray_coop ---------------------------------------|--------- kernel_ocl_path_trace_scene_intersect----------|--- PathState
- * PathState_coop ---------------------------------|                                                         |--- Intersection
- * ray_state --------------------------------------|                                                         |--- ray_state
- * use_queues_flag --------------------------------|                                                         |
- * parallel_samples -------------------------------|                                                         |
- * QueueData(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                                                         |
- * kg (data + globals) ----------------------------|                                                         |
- * rng_coop ---------------------------------------|                                                         |
- * sw ---------------------------------------------|                                                         |
- * sh ---------------------------------------------|                                                         |
- * queuesize --------------------------------------|                                                         |
- *
- * Note on Queues :
- * Ideally we would want kernel_ocl_path_trace_scene_intersect to work on queues.
- * But during the very first time, the queues will be empty and hence we perform a direct mapping
- * between ray-index and thread-index; From the next time onward, the queue will be filled and
- * we may start operating on queues.
- *
- * State of queue during the first time this kernel is called :
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty before and after this kernel.
- *
- * State of queues during other times this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will have a mix of RAY_ACTIVE, RAY_UPDATE_BUFFER and RAY_REGENERATED rays;
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays ;
- * (The rays that are in the state RAY_UPDATE_BUFFER in both the queues are actually the same rays; These
- * are the rays that were in RAY_ACTIVE state during the initial enqueue but on further processing
- * , by different kernels, have turned into RAY_UPDATE_BUFFER rays. Since all kernel, even after fetching from
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS, proceed further based on ray state information, RAY_UPDATE_BUFFER rays
- * being present in QUEUE_ACTIVE_AND_REGENERATED_RAYS does not cause any logical issues)
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS - All RAY_REGENERATED rays will have been converted to RAY_ACTIVE and
- * Some rays in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue will move to state RAY_HIT_BACKGROUND
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS - no change
- */
-
-__kernel void kernel_ocl_path_trace_scene_intersect(
-	ccl_global char *globals,
-	ccl_constant KernelData *data,
-	ccl_global uint *rng_coop,                  /* Seeds lcg_state for curve intersection (__HAIR__ builds only) */
-	ccl_global Ray *Ray_coop,                   /* Required for scene_intersect */
-	ccl_global PathState *PathState_coop,       /* Required for scene_intersect */
-	Intersection *Intersection_coop,            /* Required for scene_intersect */
-	ccl_global char *ray_state,                 /* Denotes the state of each ray */
-	int sw, int sh,                             /* Bounds of the direct mapping: x < sw * parallel_samples, y < sh */
-	ccl_global int *Queue_data,                 /* Memory for queues */
-	ccl_global int *Queue_index,                /* Tracks the number of elements in queues (not referenced in this kernel body) */
-	int queuesize,                              /* Size (capacity) of queues */
-	ccl_global char *use_queues_flag,           /* used to decide if this kernel should use queues to fetch ray index */
-#ifdef __KERNEL_DEBUG__
-	DebugData *debugdata_coop,                  /* Accumulates BVH traversal steps of camera rays */
-#endif
-	int parallel_samples                        /* Number of samples to be processed in parallel */
-	)
-{
-	int x = get_global_id(0);
-	int y = get_global_id(1);
-
-	/* Fetch use_queues_flag once per work-group; the other threads read the ccl_local copy after the barrier */
-	ccl_local char local_use_queues_flag;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_use_queues_flag = use_queues_flag[0];
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int ray_index;
-	if(local_use_queues_flag) {
-		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-		ray_index = get_ray_index(thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
-
-		if(ray_index == QUEUE_EMPTY_SLOT) {
-			return;
-		}
-	} else { /* queues not in use yet: map the thread position directly to a ray index */
-		if(x < (sw * parallel_samples) && y < sh){
-			ray_index = x + y * (sw * parallel_samples);
-		} else {
-			return;
-		}
-	}
-
-	/* All regenerated rays become active here */
-	if(IS_STATE(ray_state, ray_index, RAY_REGENERATED))
-		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE);
-
-	if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE))
-		return;
-
-	/* Load kernel globals structure */
-	KernelGlobals *kg = (KernelGlobals *)globals;
-
-#ifdef __KERNEL_DEBUG__
-	DebugData *debug_data = &debugdata_coop[ray_index];
-#endif
-	Intersection *isect = &Intersection_coop[ray_index]; /* result is written in place */
-	PathState state = PathState_coop[ray_index];         /* by-value copy */
-	Ray ray = Ray_coop[ray_index];                       /* by-value copy */
-
-	/* intersect scene */
-	uint visibility = path_state_ray_visibility(kg, &state);
-
-#ifdef __HAIR__
-	float difl = 0.0f, extmax = 0.0f;
-	uint lcg_state = 0;
-	RNG rng = rng_coop[ray_index];
-	/* difl, extmax and lcg_state keep their zero defaults unless the BVH contains curves */
-	if(kernel_data.bvh.have_curves) {
-		if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {
-			float3 pixdiff = ray.dD.dx + ray.dD.dy;
-			/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
-			difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
-		}
-
-		extmax = kernel_data.curve.maximum_width;
-		lcg_state = lcg_state_init(&rng, &state, 0x51633e2d);
-	}
-
-	bool hit = scene_intersect(kg, &ray, visibility, isect, &lcg_state, difl, extmax);
-#else
-	bool hit = scene_intersect(kg, &ray, visibility, isect, NULL, 0.0f, 0.0f);
-#endif
-
-#ifdef __KERNEL_DEBUG__
-	if(state.flag & PATH_RAY_CAMERA) {
-		debug_data->num_bvh_traversal_steps += isect->num_traversal_steps;
-	}
-#endif
-
-	if(!hit) {
-		/* Change the state of rays that hit the background;
-		 * These rays undergo special processing in the
-		 * background_bufferUpdate kernel */
-		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
-	}
-}
diff --git a/intern/cycles/kernel/kernel_shader_eval.cl b/intern/cycles/kernel/kernel_shader_eval.cl
deleted file mode 100644
index b3983081be6..00000000000
--- a/intern/cycles/kernel/kernel_shader_eval.cl
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_split.h"
-
-/*
- * Note on kernel_ocl_path_trace_shader_eval kernel
- * This kernel is the 5th kernel in the ray tracing logic. This is
- * the 4th kernel in path iteration. This kernel sets up the ShaderData
- * structure from the values computed by the previous kernels. It also identifies
- * the rays of state RAY_TO_REGENERATE and enqueues them in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
- *
- * The input and output of the kernel is as follows,
- * rng_coop -------------------------------------------|--- kernel_ocl_path_trace_shader_eval --|--- shader_data
- * Ray_coop -------------------------------------------|                                        |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * PathState_coop -------------------------------------|                                        |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Intersection_coop ----------------------------------|                                        |
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)------|                                        |
- * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)---|                                        |
- * ray_state ------------------------------------------|                                        |
- * kg (globals + data) --------------------------------|                                        |
- * queuesize ------------------------------------------|                                        |
- *
- * Note on Queues :
- * This kernel reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
- * only the rays of state RAY_ACTIVE;
- * State of queues when this kernel is called,
- * at entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
- * at exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays
- */
-
-__kernel void kernel_ocl_path_trace_shader_eval(
-	ccl_global char *globals,
-	ccl_constant KernelData *data,
-	ccl_global char *shader_data,               /* ShaderData structure filled by this kernel */
-	ccl_global uint *rng_coop,                  /* Used to draw the rbsdf random number */
-	ccl_global Ray *Ray_coop,                   /* Ray the shader is set up from */
-	ccl_global PathState *PathState_coop,       /* Path state (bounce counts and flags) of each ray */
-	Intersection *Intersection_coop,            /* Intersection the shader is set up from */
-	ccl_global char *ray_state,                 /* State of each ray */
-	ccl_global int *Queue_data,                 /* Memory backing the queues */
-	ccl_global int *Queue_index,                /* Number of elements currently in each queue */
-	int queuesize                               /* Capacity of a single queue */
-	)
-{
-	/* Work-group counter used while enqueueing RAY_TO_REGENERATE rays */
-	ccl_local unsigned int local_queue_atomics;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	/* Rays in state RAY_TO_REGENERATE go into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS */
-	const int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	char enqueue_flag = 0;
-	if(IS_STATE(ray_state, thread_index, RAY_TO_REGENERATE)) {
-		enqueue_flag = 1;
-	}
-	enqueue_ray_index_local(thread_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
-
-	/* Fetch the ray handled by this thread; only RAY_ACTIVE rays are shaded */
-	const int ray_index = get_ray_index(thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
-	if(ray_index == QUEUE_EMPTY_SLOT || !IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		return;
-	}
-
-	/* Set up ShaderData from the ray's intersection, then evaluate the surface shader */
-	KernelGlobals *kg = (KernelGlobals *)globals;
-	ShaderData *sd = (ShaderData *)shader_data;
-	ccl_global PathState *state = &PathState_coop[ray_index];
-	Ray ray = Ray_coop[ray_index];
-
-	shader_setup_from_ray(kg, sd, &Intersection_coop[ray_index], &ray, state->bounce, state->transparent_bounce);
-	float rbsdf = path_state_rng_1D_for_decision(kg, &rng_coop[ray_index], state, PRNG_BSDF);
-	shader_eval_surface(kg, sd, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
-}
diff --git a/intern/cycles/kernel/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernel_shadow_blocked.cl
deleted file mode 100644
index 2fc4be6d528..00000000000
--- a/intern/cycles/kernel/kernel_shadow_blocked.cl
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_split.h"
-
-/*
- * Note on kernel_ocl_path_trace_shadow_blocked kernel.
- * This is the ninth kernel in the ray tracing logic. This is the eighth
- * of the path iteration kernels. This kernel takes care of "shadow ray cast"
- * logic of the direct lighting and AO  part of ray tracing.
- *
- * The input and output are as follows,
- *
- * PathState_coop ----------------------------------|--- kernel_ocl_path_trace_shadow_blocked --|
- * LightRay_dl_coop --------------------------------|                                           |--- LightRay_dl_coop
- * LightRay_ao_coop --------------------------------|                                           |--- LightRay_ao_coop
- * ray_state ---------------------------------------|                                           |--- ray_state
- * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS &       |                                           |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS)
-	      QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                                           |
- * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS&
-	      QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                                           |
- * kg (globals + data) -----------------------------|                                           |
- * queuesize ---------------------------------------|                                           |
- *
- * Note on shader_shadow : shader_shadow is neither input nor output to this kernel. shader_shadow is filled and consumed in this kernel itself.
- * Note on queues :
- * The kernel fetches from QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS queues. We will empty
- * these queues in this kernel.
- * State of queues when this kernel is called :
- * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
- * before and after this kernel call.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS will be filled with rays marked with flags RAY_SHADOW_RAY_CAST_AO
- * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit.
- */
-
-__kernel void kernel_ocl_path_trace_shadow_blocked(
-	ccl_global char *globals,
-	ccl_constant KernelData *data,
-	ccl_global char *shader_shadow,             /* Required for shadow blocked */
-	ccl_global PathState *PathState_coop,       /* Required for shadow blocked */
-	ccl_global Ray *LightRay_dl_coop,           /* Required for direct lighting's shadow blocked */
-	ccl_global Ray *LightRay_ao_coop,           /* Required for AO's shadow blocked */
-	Intersection *Intersection_coop_AO,         /* Intersection storage handed to shadow_blocked for AO rays */
-	Intersection *Intersection_coop_DL,         /* Intersection storage handed to shadow_blocked for direct lighting rays */
-	ccl_global char *ray_state,                 /* Denotes the state of each ray */
-	ccl_global int *Queue_data,                 /* Queue memory */
-	ccl_global int *Queue_index,                /* Tracks the number of elements in each queue */
-	int queuesize,                              /* Size (capacity) of each queue */
-	int total_num_rays                          /* Not referenced in this kernel body */
-	)
-{
-	/* Casts one shadow ray per thread: threads [0, ao_queue_length) serve the
-	 * AO queue, the following dl_queue_length threads serve the direct lighting
-	 * queue. The result is written back into the shadow ray itself (P and t).
-	 *
-	 * The Queue_index entries of both shadow queues are not reset here; the
-	 * next kernel sets them to '0'.
-	 */
-
-	/* Linear index of this thread inside its work-group */
-	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
-	/* Only the first thread of the work-group loads both queue lengths for the other threads */
-	ccl_local unsigned int ao_queue_length;
-	ccl_local unsigned int dl_queue_length;
-	if(lidx == 0) {
-		ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
-		dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	/* flag determining if the current ray is to process shadow ray for AO or DL */
-	char shadow_blocked_type = -1;
-	/* flag determining if we need to update L */
-	char update_path_radiance = 0;
-
-	int ray_index = QUEUE_EMPTY_SLOT;
-	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	if(thread_index < ao_queue_length + dl_queue_length) {
-		if(thread_index < ao_queue_length) {
-			ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1);
-			shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO;
-		} else {
-			ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1);
-			shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL;
-		}
-	}
-
-	if(ray_index == QUEUE_EMPTY_SLOT)
-		return;
-
-	if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
-		/* Load kernel global structure */
-		KernelGlobals *kg = (KernelGlobals *)globals;
-		ShaderData *sd_shadow  = (ShaderData *)shader_shadow;
-
-		ccl_global PathState *state = &PathState_coop[ray_index];
-		ccl_global Ray *light_ray_dl_global = &LightRay_dl_coop[ray_index];
-		ccl_global Ray *light_ray_ao_global = &LightRay_ao_coop[ray_index];
-		Intersection *isect_ao_global = &Intersection_coop_AO[ray_index];
-		Intersection *isect_dl_global = &Intersection_coop_DL[ray_index];
-		/* Select the ray and the intersection storage of the queue this thread fetched from */
-		ccl_global Ray *light_ray_global = shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO ? light_ray_ao_global : light_ray_dl_global;
-		Intersection *isect_global = shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO ? isect_ao_global : isect_dl_global;
-
-		float3 shadow;
-		update_path_radiance = !(shadow_blocked(kg, state, light_ray_global, &shadow, sd_shadow, isect_global));
-
-		/* We use light_ray_global's P and t to store shadow and update_path_radiance */
-		light_ray_global->P = shadow;
-		light_ray_global->t = update_path_radiance;
-	}
-}
diff --git a/intern/cycles/kernel/kernel_split.h b/intern/cycles/kernel/kernel_split.h
deleted file mode 100644
index d4bcb9b9d8f..00000000000
--- a/intern/cycles/kernel/kernel_split.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef  _KERNEL_SPLIT_H_
-#define  _KERNEL_SPLIT_H_
-
-#include "kernel_compat_opencl.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-
-#include "util_atomic.h"
-
-#include "kernel_random.h"
-#include "kernel_projection.h"
-#include "kernel_montecarlo.h"
-#include "kernel_differential.h"
-#include "kernel_camera.h"
-
-#include "geom/geom.h"
-
-#include "kernel_accumulate.h"
-#include "kernel_shader.h"
-#include "kernel_light.h"
-#include "kernel_passes.h"
-
-#ifdef __SUBSURFACE__
-#include "kernel_subsurface.h"
-#endif
-
-#ifdef __VOLUME__
-#include "kernel_volume.h"
-#endif
-
-#include "kernel_path_state.h"
-#include "kernel_shadow.h"
-#include "kernel_emission.h"
-#include "kernel_path_common.h"
-#include "kernel_path_surface.h"
-#include "kernel_path_volume.h"
-
-#ifdef __KERNEL_DEBUG__
-#include "kernel_debug.h"
-#endif
-
-#include "kernel_queues.h"
-#include "kernel_work_stealing.h"
-
-#endif
diff --git a/intern/cycles/kernel/kernel_sse2.cpp b/intern/cycles/kernel/kernel_sse2.cpp
deleted file mode 100644
index cc8c603e8f8..00000000000
--- a/intern/cycles/kernel/kernel_sse2.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE2
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#define __KERNEL_SSE2__
-#endif
-
-#include "util_optimization.h"
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-
-#include "kernel_compat_cpu.h"
-#include "kernel.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_film.h"
-#include "kernel_path.h"
-#include "kernel_bake.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Path Tracing */
-
-void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
-{
-#ifdef __BRANCHED_PATH__
-	if(kernel_data.integrator.branched)
-		kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
-	else
-#endif
-		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
-}
-
-/* Film */
-
-void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
-{
-	kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
-{
-	kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-/* Shader Evaluate */
-
-void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
-{
-	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
-	else
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
-}
-
-CCL_NAMESPACE_END
-
-#else
-
-/* needed for some linkers in combination with scons making empty compilation unit in a library */
-void __dummy_function_cycles_sse2(void);
-void __dummy_function_cycles_sse2(void) {}
-
-#endif
diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernel_sse3.cpp
deleted file mode 100644
index 20919a4f26e..00000000000
--- a/intern/cycles/kernel/kernel_sse3.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#define __KERNEL_SSE2__
-#define __KERNEL_SSE3__
-#define __KERNEL_SSSE3__
-#endif
-
-#include "util_optimization.h"
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-
-#include "kernel_compat_cpu.h"
-#include "kernel.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_film.h"
-#include "kernel_path.h"
-#include "kernel_bake.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Path Tracing */
-
-void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
-{
-#ifdef __BRANCHED_PATH__
-	if(kernel_data.integrator.branched)
-		kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
-	else
-#endif
-		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
-}
-
-/* Film */
-
-void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
-{
-	kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
-{
-	kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-/* Shader Evaluate */
-
-void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
-{
-	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
-	else
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
-}
-
-CCL_NAMESPACE_END
-#else
-
-/* needed for some linkers in combination with scons making empty compilation unit in a library */
-void __dummy_function_cycles_sse3(void);
-void __dummy_function_cycles_sse3(void) {}
-
-#endif
diff --git a/intern/cycles/kernel/kernel_sse41.cpp b/intern/cycles/kernel/kernel_sse41.cpp
deleted file mode 100644
index 48579d3b7e5..00000000000
--- a/intern/cycles/kernel/kernel_sse41.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#define __KERNEL_SSE2__
-#define __KERNEL_SSE3__
-#define __KERNEL_SSSE3__
-#define __KERNEL_SSE41__
-#endif
-
-#include "util_optimization.h"
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-
-#include "kernel_compat_cpu.h"
-#include "kernel.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_film.h"
-#include "kernel_path.h"
-#include "kernel_bake.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Path Tracing */
-
-void kernel_cpu_sse41_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
-{
-#ifdef __BRANCHED_PATH__
-	if(kernel_data.integrator.branched)
-		kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
-	else
-#endif
-		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
-}
-
-/* Film */
-
-void kernel_cpu_sse41_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
-{
-	kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
-{
-	kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-/* Shader Evaluate */
-
-void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
-{
-	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
-	else
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
-}
-
-CCL_NAMESPACE_END
-#else
-
-/* needed for some linkers in combination with scons making empty compilation unit in a library */
-void __dummy_function_cycles_sse41(void);
-void __dummy_function_cycles_sse41(void) {}
-
-#endif
diff --git a/intern/cycles/kernel/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernel_sum_all_radiance.cl
deleted file mode 100644
index 739a85d4cc8..00000000000
--- a/intern/cycles/kernel/kernel_sum_all_radiance.cl
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_compat_opencl.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-
-/*
-* Since we process various samples in parallel; The output radiance of different samples
-* are stored in different locations; This kernel combines the output radiance contributed
-* by all different samples and stores them in the RenderTile's output buffer.
-*/
-
-__kernel void kernel_ocl_path_trace_sum_all_radiance(
-	ccl_constant KernelData *data,               /* To get pass_stride to offet into buffer */
-	ccl_global float *buffer,                    /* Output buffer of RenderTile */
-	ccl_global float *per_sample_output_buffer,  /* Radiance contributed by all samples */
-	int parallel_samples, int sw, int sh, int stride,
-	int buffer_offset_x,
-	int buffer_offset_y,
-	int buffer_stride,
-	int start_sample)
-{
-	int x = get_global_id(0);
-	int y = get_global_id(1);
-
-	if(x < sw && y < sh) {
-		buffer += ((buffer_offset_x + x) + (buffer_offset_y + y) * buffer_stride) * (data->film.pass_stride);
-		per_sample_output_buffer += ((x + y * stride) * parallel_samples) * (data->film.pass_stride);
-
-		int sample_stride = (data->film.pass_stride);
-
-		int sample_iterator = 0;
-		int pass_stride_iterator = 0;
-		int num_floats = data->film.pass_stride;
-
-		for(sample_iterator = 0; sample_iterator < parallel_samples; sample_iterator++) {
-			for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) {
-				*(buffer + pass_stride_iterator) = (start_sample == 0 && sample_iterator == 0) ? *(per_sample_output_buffer + pass_stride_iterator)
-				: *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator);
-			}
-			per_sample_output_buffer += sample_stride;
-		}
-	}
-}
diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp
new file mode 100644
index 00000000000..a7eaa758f5d
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CPU kernel entry points */
+
+#include "kernel_compat_cpu.h"
+#include "kernel.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_bake.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Memory Copy */
+
+void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size)
+{
+	if(strcmp(name, "__data") == 0)
+		memcpy(&kg->__data, host, size);
+	else
+		assert(0);
+}
+
+void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t width, size_t height, size_t depth, InterpolationType interpolation)
+{
+	if(0) {
+	}
+
+#define KERNEL_TEX(type, ttype, tname) \
+	else if(strcmp(name, #tname) == 0) { \
+		kg->tname.data = (type*)mem; \
+		kg->tname.width = width; \
+	}
+#define KERNEL_IMAGE_TEX(type, ttype, tname)
+#include "kernel_textures.h"
+
+	else if(strstr(name, "__tex_image_float")) {
+		texture_image_float4 *tex = NULL;
+		int id = atoi(name + strlen("__tex_image_float_"));
+		int array_index = id;
+
+		if(array_index >= 0 && array_index < MAX_FLOAT_IMAGES) {
+			tex = &kg->texture_float_images[array_index];
+		}
+
+		if(tex) {
+			tex->data = (float4*)mem;
+			tex->dimensions_set(width, height, depth);
+			tex->interpolation = interpolation;
+		}
+	}
+	else if(strstr(name, "__tex_image")) {
+		texture_image_uchar4 *tex = NULL;
+		int id = atoi(name + strlen("__tex_image_"));
+		int array_index = id - MAX_FLOAT_IMAGES;
+
+		if(array_index >= 0 && array_index < MAX_BYTE_IMAGES) {
+			tex = &kg->texture_byte_images[array_index];
+		}
+
+		if(tex) {
+			tex->data = (uchar4*)mem;
+			tex->dimensions_set(width, height, depth);
+			tex->interpolation = interpolation;
+		}
+	}
+	else
+		assert(0);
+}
+
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this one with SSE2 intrinsics */
+#if defined(__x86_64__) || defined(_M_X64)
+#define __KERNEL_SSE2__
+#endif
+
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)
+	/* do nothing */
+#endif
+
+/* Path Tracing */
+
+void kernel_cpu_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
+{
+#ifdef __BRANCHED_PATH__
+	if(kernel_data.integrator.branched)
+		kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+	else
+#endif
+		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+/* Film */
+
+void kernel_cpu_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+/* Shader Evaluation */
+
+void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
+{
+	if(type >= SHADER_EVAL_BAKE)
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+	else
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
new file mode 100644
index 00000000000..f1027ad413d
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+ 
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#define __KERNEL_SSE2__
+#define __KERNEL_SSE3__
+#define __KERNEL_SSSE3__
+#define __KERNEL_SSE41__
+#define __KERNEL_AVX__
+#endif
+ 
+#include "util_optimization.h"
+ 
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+
+#include "kernel_compat_cpu.h"
+#include "kernel.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_bake.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing */
+
+void kernel_cpu_avx_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
+{
+#ifdef __BRANCHED_PATH__
+	if(kernel_data.integrator.branched)
+		kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+	else
+#endif
+		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+/* Film */
+
+void kernel_cpu_avx_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+/* Shader Evaluate */
+
+void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
+{
+	if(type >= SHADER_EVAL_BAKE)
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+	else
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+}
+
+CCL_NAMESPACE_END
+#else
+
+/* needed for some linkers in combination with scons making empty compilation unit in a library */
+void __dummy_function_cycles_avx(void);
+void __dummy_function_cycles_avx(void) {}
+
+#endif
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
new file mode 100644
index 00000000000..b2f16ff54d8
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+ 
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#define __KERNEL_SSE2__
+#define __KERNEL_SSE3__
+#define __KERNEL_SSSE3__
+#define __KERNEL_SSE41__
+#define __KERNEL_AVX__
+#define __KERNEL_AVX2__
+#endif
+ 
+#include "util_optimization.h"
+ 
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+
+#include "kernel_compat_cpu.h"
+#include "kernel.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_bake.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing */
+
+void kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
+{
+#ifdef __BRANCHED_PATH__
+	if(kernel_data.integrator.branched)
+		kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+	else
+#endif
+		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+/* Film */
+
+void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+/* Shader Evaluate */
+
+void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
+{
+	if(type >= SHADER_EVAL_BAKE)
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+	else
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+}
+
+CCL_NAMESPACE_END
+#else
+
+/* needed for some linkers in combination with scons making empty compilation unit in a library */
+void __dummy_function_cycles_avx2(void);
+void __dummy_function_cycles_avx2(void) {}
+
+#endif
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
new file mode 100644
index 00000000000..cc8c603e8f8
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#define __KERNEL_SSE2__
+#endif
+
+#include "util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+
+#include "kernel_compat_cpu.h"
+#include "kernel.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_bake.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing */
+
+void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
+{
+#ifdef __BRANCHED_PATH__
+	if(kernel_data.integrator.branched)
+		kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+	else
+#endif
+		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+/* Film */
+
+void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+/* Shader Evaluate */
+
+void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
+{
+	if(type >= SHADER_EVAL_BAKE)
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+	else
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+}
+
+CCL_NAMESPACE_END
+
+#else
+
+/* needed for some linkers in combination with scons making empty compilation unit in a library */
+void __dummy_function_cycles_sse2(void);
+void __dummy_function_cycles_sse2(void) {}
+
+#endif
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
new file mode 100644
index 00000000000..20919a4f26e
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#define __KERNEL_SSE2__
+#define __KERNEL_SSE3__
+#define __KERNEL_SSSE3__
+#endif
+
+#include "util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+
+#include "kernel_compat_cpu.h"
+#include "kernel.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_bake.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing */
+
+void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
+{
+#ifdef __BRANCHED_PATH__
+	if(kernel_data.integrator.branched)
+		kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+	else
+#endif
+		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+/* Film */
+
+void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+/* Shader Evaluate */
+
+void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
+{
+	if(type >= SHADER_EVAL_BAKE)
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+	else
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+}
+
+CCL_NAMESPACE_END
+#else
+
+/* needed for some linkers in combination with scons making empty compilation unit in a library */
+void __dummy_function_cycles_sse3(void);
+void __dummy_function_cycles_sse3(void) {}
+
+#endif
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
new file mode 100644
index 00000000000..48579d3b7e5
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#define __KERNEL_SSE2__
+#define __KERNEL_SSE3__
+#define __KERNEL_SSSE3__
+#define __KERNEL_SSE41__
+#endif
+
+#include "util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+
+#include "kernel_compat_cpu.h"
+#include "kernel.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_bake.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing */
+
+void kernel_cpu_sse41_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
+{
+#ifdef __BRANCHED_PATH__
+	if(kernel_data.integrator.branched)
+		kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+	else
+#endif
+		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+/* Film */
+
+void kernel_cpu_sse41_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+/* Shader Evaluate */
+
+void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
+{
+	if(type >= SHADER_EVAL_BAKE)
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+	else
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+}
+
+CCL_NAMESPACE_END
+#else
+
+/* needed for some linkers in combination with scons making empty compilation unit in a library */
+void __dummy_function_cycles_sse41(void);
+void __dummy_function_cycles_sse41(void) {}
+
+#endif
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
new file mode 100644
index 00000000000..29bf67d9750
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -0,0 +1,180 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CUDA kernel entry points */
+
+#include "../../kernel_compat_cuda.h"
+#include "../../kernel_math.h"
+#include "../../kernel_types.h"
+#include "../../kernel_globals.h"
+#include "../../kernel_film.h"
+#include "../../kernel_path.h"
+#include "../../kernel_bake.h"
+
+/* device data taken from CUDA occupancy calculator */
+
+#ifdef __CUDA_ARCH__
+
+/* 2.0 and 2.1 */
+#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
+#define CUDA_MULTIPROCESSOR_MAX_REGISTERS 32768
+#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
+#define CUDA_BLOCK_MAX_THREADS 1024
+#define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#define CUDA_THREADS_BLOCK_WIDTH 16
+#define CUDA_KERNEL_MAX_REGISTERS 32
+#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
+
+/* 3.0 and 3.5 */
+#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
+#define CUDA_MULTIPROCESSOR_MAX_REGISTERS 65536
+#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#define CUDA_BLOCK_MAX_THREADS 1024
+#define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#define CUDA_THREADS_BLOCK_WIDTH 16
+#define CUDA_KERNEL_MAX_REGISTERS 63
+#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 3.2 */
+#elif __CUDA_ARCH__ == 320
+#define CUDA_MULTIPROCESSOR_MAX_REGISTERS 32768
+#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#define CUDA_BLOCK_MAX_THREADS 1024
+#define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#define CUDA_THREADS_BLOCK_WIDTH 16
+#define CUDA_KERNEL_MAX_REGISTERS 63
+#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 5.0 and 5.2 */
+#elif __CUDA_ARCH__ == 500 || __CUDA_ARCH__ == 520
+#define CUDA_MULTIPROCESSOR_MAX_REGISTERS 65536
+#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
+#define CUDA_BLOCK_MAX_THREADS 1024
+#define CUDA_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+#define CUDA_THREADS_BLOCK_WIDTH 16
+#define CUDA_KERNEL_MAX_REGISTERS 40
+#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* unknown architecture */
+#else
+#error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
+#endif
+
+/* launch bounds for a square block: threads per block (width squared) and the minimum
+ * blocks per multiprocessor that fit its register file at the given registers per thread */
+
+#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
+	__launch_bounds__( \
+		(threads_block_width)*(threads_block_width), \
+		CUDA_MULTIPROCESSOR_MAX_REGISTERS/((threads_block_width)*(threads_block_width)*(thread_num_registers)) \
+		)
+
+/* sanity checks: reject tunable values that exceed the per-architecture limits above */
+
+#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
+#error "Maximum number of threads per block exceeded"
+#endif
+
+#if CUDA_MULTIPROCESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
+#error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+#error "Maximum number of registers per thread exceeded"
+#endif
+
+#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+#error "Maximum number of registers per thread exceeded"
+#endif
+
+/* kernels */
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	const int px = sx + blockDim.x*blockIdx.x + threadIdx.x;
+	const int py = sy + blockDim.y*blockIdx.y + threadIdx.y;
+	if(px >= sx + sw || py >= sy + sh)
+		return; /* thread falls outside the sw x sh rectangle that starts at (sx, sy) */
+	kernel_path_trace(NULL, buffer, rng_state, sample, px, py, offset, stride);
+}
+
+#ifdef __BRANCHED_PATH__
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
+kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	const int px = sx + blockDim.x*blockIdx.x + threadIdx.x;
+	const int py = sy + blockDim.y*blockIdx.y + threadIdx.y;
+	if(px >= sx + sw || py >= sy + sh)
+		return; /* thread falls outside the sw x sh rectangle that starts at (sx, sy) */
+	kernel_branched_path_trace(NULL, buffer, rng_state, sample, px, py, offset, stride);
+}
+#endif
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	const int px = sx + blockDim.x*blockIdx.x + threadIdx.x;
+	const int py = sy + blockDim.y*blockIdx.y + threadIdx.y;
+	if(px >= sx + sw || py >= sy + sh)
+		return; /* thread falls outside the sw x sh rectangle that starts at (sx, sy) */
+	kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, px, py, offset, stride);
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	const int px = sx + blockDim.x*blockIdx.x + threadIdx.x;
+	const int py = sy + blockDim.y*blockIdx.y + threadIdx.y;
+	if(px >= sx + sw || py >= sy + sh)
+		return; /* thread falls outside the sw x sh rectangle that starts at (sx, sy) */
+	kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, px, py, offset, stride);
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample)
+{
+	const int idx = sx + blockDim.x*blockIdx.x + threadIdx.x;
+	if(idx >= sx + sw)
+		return; /* thread index lies past the sw items that start at sx */
+	kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, idx, sample);
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_bake(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample)
+{
+	const int idx = sx + blockDim.x*blockIdx.x + threadIdx.x;
+	if(idx >= sx + sw)
+		return; /* thread index lies past the sw items that start at sx */
+	kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, idx, offset, sample);
+}
+
+#endif
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
new file mode 100644
index 00000000000..bffcd53bab3
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -0,0 +1,174 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* OpenCL kernel entry points - unfinished */
+
+#include "../../kernel_compat_opencl.h"
+#include "../../kernel_math.h"
+#include "../../kernel_types.h"
+#include "../../kernel_globals.h"
+
+#include "../../kernel_film.h"
+#include "../../kernel_path.h"
+#include "../../kernel_bake.h"
+
+#ifdef __COMPILE_ONLY_MEGAKERNEL__
+
+__kernel void kernel_ocl_path_trace(
+	ccl_constant KernelData *data,
+	ccl_global float *buffer,
+	ccl_global uint *rng_state,
+
+#define KERNEL_TEX(type, ttype, name) \
+	ccl_global type *name,
+#include "../../kernel_textures.h"
+
+	int sample,
+	int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	/* Collect the data pointer and every KERNEL_TEX argument into one local KernelGlobals. */
+	KernelGlobals kglobals, *kg = &kglobals;
+	kg->data = data;
+
+#define KERNEL_TEX(type, ttype, name) \
+	kg->name = name;
+#include "../../kernel_textures.h"
+
+	const int px = sx + get_global_id(0);
+	const int py = sy + get_global_id(1);
+	if(px >= sx + sw || py >= sy + sh)
+		return; /* work-item falls outside the sw x sh rectangle that starts at (sx, sy) */
+	kernel_path_trace(kg, buffer, rng_state, sample, px, py, offset, stride);
+}
+
+#else // __COMPILE_ONLY_MEGAKERNEL__
+
+__kernel void kernel_ocl_shader(
+	ccl_constant KernelData *data,
+	ccl_global uint4 *input,
+	ccl_global float4 *output,
+
+#define KERNEL_TEX(type, ttype, name) \
+	ccl_global type *name,
+#include "../../kernel_textures.h"
+
+	int type, int sx, int sw, int offset, int sample)
+{
+	/* Collect the data pointer and every KERNEL_TEX argument into one local KernelGlobals. */
+	KernelGlobals kglobals, *kg = &kglobals;
+	kg->data = data;
+
+#define KERNEL_TEX(type, ttype, name) \
+	kg->name = name;
+#include "../../kernel_textures.h"
+
+	const int idx = sx + get_global_id(0);
+	if(idx >= sx + sw)
+		return; /* work-item index lies past the sw items that start at sx */
+	kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, idx, sample);
+}
+
+__kernel void kernel_ocl_bake(
+	ccl_constant KernelData *data,
+	ccl_global uint4 *input,
+	ccl_global float4 *output,
+
+#define KERNEL_TEX(type, ttype, name) \
+	ccl_global type *name,
+#include "../../kernel_textures.h"
+
+	int type, int sx, int sw, int offset, int sample)
+{
+	/* Collect the data pointer and every KERNEL_TEX argument into one local KernelGlobals. */
+	KernelGlobals kglobals, *kg = &kglobals;
+	kg->data = data;
+
+#define KERNEL_TEX(type, ttype, name) \
+	kg->name = name;
+#include "../../kernel_textures.h"
+
+	const int idx = sx + get_global_id(0);
+	if(idx >= sx + sw)
+		return;
+
+#if defined(__KERNEL_OPENCL_NVIDIA__) && __COMPUTE_CAPABILITY__ < 300
+	/* The NVidia compiler spends a seemingly infinite amount of time
+	 * on kernel_bake_evaluate() for architectures prior to sm_30.
+	 * For now the baking kernel is disabled on those devices (the
+	 * output is zeroed instead), so that at least rendering with the
+	 * split kernel can be compiled.
+	 */
+	output[idx] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+#else
+	kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, idx, offset, sample);
+#endif
+}
+
+__kernel void kernel_ocl_convert_to_byte(
+	ccl_constant KernelData *data,
+	ccl_global uchar4 *rgba,
+	ccl_global float *buffer,
+
+#define KERNEL_TEX(type, ttype, name) \
+	ccl_global type *name,
+#include "../../kernel_textures.h"
+
+	float sample_scale,
+	int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	/* Collect the data pointer and every KERNEL_TEX argument into one local KernelGlobals. */
+	KernelGlobals kglobals, *kg = &kglobals;
+	kg->data = data;
+
+#define KERNEL_TEX(type, ttype, name) \
+	kg->name = name;
+#include "../../kernel_textures.h"
+
+	const int px = sx + get_global_id(0);
+	const int py = sy + get_global_id(1);
+	if(px >= sx + sw || py >= sy + sh)
+		return; /* work-item falls outside the sw x sh rectangle that starts at (sx, sy) */
+	kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, px, py, offset, stride);
+}
+
+__kernel void kernel_ocl_convert_to_half_float(
+	ccl_constant KernelData *data,
+	ccl_global uchar4 *rgba,
+	ccl_global float *buffer,
+
+#define KERNEL_TEX(type, ttype, name) \
+	ccl_global type *name,
+#include "../../kernel_textures.h"
+
+	float sample_scale,
+	int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	/* Collect the data pointer and every KERNEL_TEX argument into one local KernelGlobals. */
+	KernelGlobals kglobals, *kg = &kglobals;
+	kg->data = data;
+
+#define KERNEL_TEX(type, ttype, name) \
+	kg->name = name;
+#include "../../kernel_textures.h"
+
+	const int px = sx + get_global_id(0);
+	const int py = sy + get_global_id(1);
+	if(px >= sx + sw || py >= sy + sh)
+		return; /* work-item falls outside the sw x sh rectangle that starts at (sx, sy) */
+	kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, px, py, offset, stride);
+}
+
+#endif // __COMPILE_ONLY_MEGAKERNEL__
\ No newline at end of file
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
new file mode 100644
index 00000000000..2d1944d01e6
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
@@ -0,0 +1,81 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_background_buffer_update.h"
+/* OpenCL entry point: passes every argument, in declaration order, straight to kernel_background_buffer_update(). */
+__kernel void kernel_ocl_path_trace_background_buffer_update(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_state,
+        ccl_global uint *rng_coop,             /* Required for buffer update */
+        ccl_global float3 *throughput_coop,    /* Required for background hit processing */
+        PathRadiance *PathRadiance_coop,       /* Required for background hit processing and buffer update */
+        ccl_global Ray *Ray_coop,              /* Required for background hit processing */
+        ccl_global PathState *PathState_coop,  /* Required for background hit processing */
+        ccl_global float *L_transparent_coop,  /* Required for background hit processing and buffer update */
+        ccl_global char *ray_state,            /* Stores information on the current state of a ray */
+        int sw, int sh, int sx, int sy, int stride,
+        int rng_state_offset_x,
+        int rng_state_offset_y,
+        int rng_state_stride,
+        ccl_global unsigned int *work_array,   /* Denotes work of each ray */
+        ccl_global int *Queue_data,            /* Queues memory */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
+        int queuesize,                         /* Size (capacity) of each queue */
+        int end_sample,
+        int start_sample,
+#ifdef __WORK_STEALING__
+        ccl_global unsigned int *work_pool_wgs,
+        unsigned int num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples)                  /* Number of samples to be processed in parallel */
+{
+	kernel_background_buffer_update(globals,
+	                                data,
+	                                shader_data,
+	                                per_sample_output_buffers,
+	                                rng_state,
+	                                rng_coop,
+	                                throughput_coop,
+	                                PathRadiance_coop,
+	                                Ray_coop,
+	                                PathState_coop,
+	                                L_transparent_coop,
+	                                ray_state,
+	                                sw, sh, sx, sy, stride,
+	                                rng_state_offset_x,
+	                                rng_state_offset_y,
+	                                rng_state_stride,
+	                                work_array,
+	                                Queue_data,
+	                                Queue_index,
+	                                queuesize,
+	                                end_sample,
+	                                start_sample,
+#ifdef __WORK_STEALING__
+	                                work_pool_wgs,
+	                                num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+	                                debugdata_coop,
+#endif
+	                                parallel_samples);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
new file mode 100644
index 00000000000..015f0872413
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
@@ -0,0 +1,242 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_data_init.h"
+/* OpenCL entry point: passes every argument, in declaration order, to kernel_data_init(); texture arguments come from expanding KERNEL_TEX over kernel_textures.h. */
+__kernel void kernel_ocl_path_trace_data_init(
+	ccl_global char *globals,
+	ccl_global char *shader_data_sd,                  /* Arguments related to ShaderData */
+	ccl_global char *shader_data_sd_DL_shadow,     /* Arguments related to ShaderData */
+
+	ccl_global float3 *P_sd,
+	ccl_global float3 *P_sd_DL_shadow,
+
+	ccl_global float3 *N_sd,
+	ccl_global float3 *N_sd_DL_shadow,
+
+	ccl_global float3 *Ng_sd,
+	ccl_global float3 *Ng_sd_DL_shadow,
+
+	ccl_global float3 *I_sd,
+	ccl_global float3 *I_sd_DL_shadow,
+
+	ccl_global int *shader_sd,
+	ccl_global int *shader_sd_DL_shadow,
+
+	ccl_global int *flag_sd,
+	ccl_global int *flag_sd_DL_shadow,
+
+	ccl_global int *prim_sd,
+	ccl_global int *prim_sd_DL_shadow,
+
+	ccl_global int *type_sd,
+	ccl_global int *type_sd_DL_shadow,
+
+	ccl_global float *u_sd,
+	ccl_global float *u_sd_DL_shadow,
+
+	ccl_global float *v_sd,
+	ccl_global float *v_sd_DL_shadow,
+
+	ccl_global int *object_sd,
+	ccl_global int *object_sd_DL_shadow,
+
+	ccl_global float *time_sd,
+	ccl_global float *time_sd_DL_shadow,
+
+	ccl_global float *ray_length_sd,
+	ccl_global float *ray_length_sd_DL_shadow,
+
+	ccl_global int *ray_depth_sd,
+	ccl_global int *ray_depth_sd_DL_shadow,
+
+	ccl_global int *transparent_depth_sd,
+	ccl_global int *transparent_depth_sd_DL_shadow,
+
+	/* Ray differentials. */
+	ccl_global differential3 *dP_sd,
+	ccl_global differential3 *dP_sd_DL_shadow,
+
+	ccl_global differential3 *dI_sd,
+	ccl_global differential3 *dI_sd_DL_shadow,
+
+	ccl_global differential *du_sd,
+	ccl_global differential *du_sd_DL_shadow,
+
+	ccl_global differential *dv_sd,
+	ccl_global differential *dv_sd_DL_shadow,
+
+	/* Dp/Du */
+	ccl_global float3 *dPdu_sd,
+	ccl_global float3 *dPdu_sd_DL_shadow,
+
+	ccl_global float3 *dPdv_sd,
+	ccl_global float3 *dPdv_sd_DL_shadow,
+
+	/* Object motion. */
+	ccl_global Transform *ob_tfm_sd,
+	ccl_global Transform *ob_tfm_sd_DL_shadow,
+
+	ccl_global Transform *ob_itfm_sd,
+	ccl_global Transform *ob_itfm_sd_DL_shadow,
+
+	ShaderClosure *closure_sd,
+	ShaderClosure *closure_sd_DL_shadow,
+
+	ccl_global int *num_closure_sd,
+	ccl_global int *num_closure_sd_DL_shadow,
+
+	ccl_global float *randb_closure_sd,
+	ccl_global float *randb_closure_sd_DL_shadow,
+
+	ccl_global float3 *ray_P_sd,
+	ccl_global float3 *ray_P_sd_DL_shadow,
+
+	ccl_global differential3 *ray_dP_sd,
+	ccl_global differential3 *ray_dP_sd_DL_shadow,
+
+	ccl_constant KernelData *data,
+	ccl_global float *per_sample_output_buffers,
+	ccl_global uint *rng_state,
+	ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
+	ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
+	ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
+	PathRadiance *PathRadiance_coop,  /* PathRadiance array to store PathRadiance values for all rays */
+	ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
+	ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
+	ccl_global char *ray_state,                  /* Stores information on current state of a ray */
+
+#define KERNEL_TEX(type, ttype, name) \
+	ccl_global type *name,
+#include "../../kernel_textures.h"
+
+	int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
+	int rng_state_offset_x,
+	int rng_state_offset_y,
+	int rng_state_stride,
+	ccl_global int *Queue_data,                  /* Memory for queues */
+	ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
+	int queuesize,                               /* size (capacity) of the queue */
+	ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
+	ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
+#ifdef __WORK_STEALING__
+	ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
+	unsigned int num_samples,                    /* Total number of samples per pixel */
+#endif
+#ifdef __KERNEL_DEBUG__
+	DebugData *debugdata_coop,
+#endif
+	int parallel_samples                         /* Number of samples to be processed in parallel */
+	)
+{
+	kernel_data_init(globals,
+	                 shader_data_sd,
+	                 shader_data_sd_DL_shadow,
+	                 P_sd,
+	                 P_sd_DL_shadow,
+	                 N_sd,
+	                 N_sd_DL_shadow,
+	                 Ng_sd,
+	                 Ng_sd_DL_shadow,
+	                 I_sd,
+	                 I_sd_DL_shadow,
+	                 shader_sd,
+	                 shader_sd_DL_shadow,
+	                 flag_sd,
+	                 flag_sd_DL_shadow,
+	                 prim_sd,
+	                 prim_sd_DL_shadow,
+	                 type_sd,
+	                 type_sd_DL_shadow,
+	                 u_sd,
+	                 u_sd_DL_shadow,
+	                 v_sd,
+	                 v_sd_DL_shadow,
+	                 object_sd,
+	                 object_sd_DL_shadow,
+	                 time_sd,
+	                 time_sd_DL_shadow,
+	                 ray_length_sd,
+	                 ray_length_sd_DL_shadow,
+	                 ray_depth_sd,
+	                 ray_depth_sd_DL_shadow,
+	                 transparent_depth_sd,
+	                 transparent_depth_sd_DL_shadow,
+
+	                 /* Ray differentials. */
+	                 dP_sd,
+	                 dP_sd_DL_shadow,
+	                 dI_sd,
+	                 dI_sd_DL_shadow,
+	                 du_sd,
+	                 du_sd_DL_shadow,
+	                 dv_sd,
+	                 dv_sd_DL_shadow,
+
+	                 /* Dp/Du */
+	                 dPdu_sd,
+	                 dPdu_sd_DL_shadow,
+	                 dPdv_sd,
+	                 dPdv_sd_DL_shadow,
+
+	                 /* Object motion. */
+	                 ob_tfm_sd,
+	                 ob_tfm_sd_DL_shadow,
+	                 ob_itfm_sd,
+	                 ob_itfm_sd_DL_shadow,
+
+	                 closure_sd,
+	                 closure_sd_DL_shadow,
+	                 num_closure_sd,
+	                 num_closure_sd_DL_shadow,
+	                 randb_closure_sd,
+	                 randb_closure_sd_DL_shadow,
+	                 ray_P_sd,
+	                 ray_P_sd_DL_shadow,
+	                 ray_dP_sd,
+	                 ray_dP_sd_DL_shadow,
+	                 data,
+	                 per_sample_output_buffers,
+	                 rng_state,
+	                 rng_coop,
+	                 throughput_coop,
+	                 L_transparent_coop,
+	                 PathRadiance_coop,
+	                 Ray_coop,
+	                 PathState_coop,
+	                 ray_state,
+
+#define KERNEL_TEX(type, ttype, name) name,
+#include "../../kernel_textures.h"
+
+	                 start_sample, sx, sy, sw, sh, offset, stride,
+	                 rng_state_offset_x,
+	                 rng_state_offset_y,
+	                 rng_state_stride,
+	                 Queue_data,
+	                 Queue_index,
+	                 queuesize,
+	                 use_queues_flag,
+	                 work_array,
+#ifdef __WORK_STEALING__
+	                 work_pool_wgs,
+	                 num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+	                 debugdata_coop,
+#endif
+	                 parallel_samples);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
new file mode 100644
index 00000000000..0b22c6d0864
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_direct_lighting.h"
+/* OpenCL entry point: passes every argument, in declaration order, straight to kernel_direct_lighting(). */
+__kernel void kernel_ocl_path_trace_direct_lighting(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,           /* Required for direct lighting */
+        ccl_global char *shader_DL,             /* Required for direct lighting */
+        ccl_global uint *rng_coop,              /* Required for direct lighting */
+        ccl_global PathState *PathState_coop,   /* Required for direct lighting */
+        ccl_global int *ISLamp_coop,            /* Required for direct lighting */
+        ccl_global Ray *LightRay_coop,          /* Required for direct lighting */
+        ccl_global BsdfEval *BSDFEval_coop,     /* Required for direct lighting */
+        ccl_global char *ray_state,             /* Denotes the state of each ray */
+        ccl_global int *Queue_data,             /* Queue memory */
+        ccl_global int *Queue_index,            /* Tracks the number of elements in each queue */
+        int queuesize)                          /* Size (capacity) of each queue */
+{
+	kernel_direct_lighting(globals,
+	                       data,
+	                       shader_data,
+	                       shader_DL,
+	                       rng_coop,
+	                       PathState_coop,
+	                       ISLamp_coop,
+	                       LightRay_coop,
+	                       BSDFEval_coop,
+	                       ray_state,
+	                       Queue_data,
+	                       Queue_index,
+	                       queuesize);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
new file mode 100644
index 00000000000..502f10a7a59
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+/* OpenCL entry point: passes every argument, in declaration order, straight to kernel_holdout_emission_blurring_pathtermination_ao(). */
+__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Required throughout the kernel except probabilistic path termination and AO */
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_coop,             /* Required for "kernel_write_data_passes" and AO */
+        ccl_global float3 *throughput_coop,    /* Required for handling holdout material and AO */
+        ccl_global float *L_transparent_coop,  /* Required for handling holdout material */
+        PathRadiance *PathRadiance_coop,       /* Required for "kernel_write_data_passes" and indirect primitive emission */
+        ccl_global PathState *PathState_coop,  /* Required throughout the kernel and AO */
+        Intersection *Intersection_coop,       /* Required for indirect primitive emission */
+        ccl_global float3 *AOAlpha_coop,       /* Required for AO */
+        ccl_global float3 *AOBSDF_coop,        /* Required for AO */
+        ccl_global Ray *AOLightRay_coop,       /* Required for AO */
+        int sw, int sh, int sx, int sy, int stride,
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        ccl_global unsigned int *work_array,   /* Denotes the work that each ray belongs to */
+        ccl_global int *Queue_data,            /* Queue memory */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
+        int queuesize,                         /* Size (capacity) of each queue */
+#ifdef __WORK_STEALING__
+        unsigned int start_sample,
+#endif
+        int parallel_samples)                  /* Number of samples to be processed in parallel */
+{
+	kernel_holdout_emission_blurring_pathtermination_ao(globals,
+	                                                    data,
+	                                                    shader_data,
+	                                                    per_sample_output_buffers,
+	                                                    rng_coop,
+	                                                    throughput_coop,
+	                                                    L_transparent_coop,
+	                                                    PathRadiance_coop,
+	                                                    PathState_coop,
+	                                                    Intersection_coop,
+	                                                    AOAlpha_coop,
+	                                                    AOBSDF_coop,
+	                                                    AOLightRay_coop,
+	                                                    sw, sh, sx, sy, stride,
+	                                                    ray_state,
+	                                                    work_array,
+	                                                    Queue_data,
+	                                                    Queue_index,
+	                                                    queuesize,
+#ifdef __WORK_STEALING__
+	                                                    start_sample,
+#endif
+	                                                    parallel_samples);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
new file mode 100644
index 00000000000..af83e68b53e
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_lamp_emission.h"
+/* OpenCL entry point for lamp emission: forwards every argument, unchanged and in order, to kernel_lamp_emission() (split/kernel_lamp_emission.h). */
+__kernel void kernel_ocl_path_trace_lamp_emission(
+	ccl_global char *globals,                   /* Raw byte buffer, forwarded as-is */
+	ccl_constant KernelData *data,              /* Constant kernel data, forwarded as-is */
+	ccl_global char *shader_data,               /* Required for lamp emission */
+	ccl_global float3 *throughput_coop,         /* Required for lamp emission */
+	PathRadiance *PathRadiance_coop,            /* Required for lamp emission */
+	ccl_global Ray *Ray_coop,                   /* Required for lamp emission */
+	ccl_global PathState *PathState_coop,       /* Required for lamp emission */
+	Intersection *Intersection_coop,            /* Required for lamp emission */
+	ccl_global char *ray_state,                 /* Denotes the state of each ray */
+	int sw, int sh,                             /* NOTE(review): presumably tile width and height - confirm in the split header */
+	ccl_global int *Queue_data,                 /* Memory for queues */
+	ccl_global int *Queue_index,                /* Tracks the number of elements in queues */
+	int queuesize,                              /* Size (capacity) of queues */
+	ccl_global char *use_queues_flag,           /* Used to decide if this kernel should use queues to fetch ray index */
+	int parallel_samples                        /* Number of samples to be processed in parallel */
+	)
+{
+	kernel_lamp_emission(globals,
+	                     data,
+	                     shader_data,
+	                     throughput_coop,
+	                     PathRadiance_coop,
+	                     Ray_coop,
+	                     PathState_coop,
+	                     Intersection_coop,
+	                     ray_state,
+	                     sw, sh,
+	                     Queue_data,
+	                     Queue_index,
+	                     queuesize,
+	                     use_queues_flag,
+	                     parallel_samples);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
new file mode 100644
index 00000000000..4acd991f0b4
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_next_iteration_setup.h"
+/* OpenCL entry point for next-iteration setup: forwards every argument, unchanged and in order, to kernel_next_iteration_setup() (split/kernel_next_iteration_setup.h). */
+__kernel void kernel_ocl_path_trace_next_iteration_setup(
+        ccl_global char *globals,             /* Raw byte buffer, forwarded as-is */
+        ccl_constant KernelData *data,        /* Constant kernel data, forwarded as-is */
+        ccl_global char *shader_data,         /* Required for setting up ray for next iteration */
+        ccl_global uint *rng_coop,            /* Required for setting up ray for next iteration */
+        ccl_global float3 *throughput_coop,   /* Required for setting up ray for next iteration */
+        PathRadiance *PathRadiance_coop,      /* Required for setting up ray for next iteration */
+        ccl_global Ray *Ray_coop,             /* Required for setting up ray for next iteration */
+        ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
+        ccl_global Ray *LightRay_dl_coop,     /* Required for radiance update - direct lighting */
+        ccl_global int *ISLamp_coop,          /* Required for radiance update - direct lighting */
+        ccl_global BsdfEval *BSDFEval_coop,   /* Required for radiance update - direct lighting */
+        ccl_global Ray *LightRay_ao_coop,     /* Required for radiance update - AO */
+        ccl_global float3 *AOBSDF_coop,       /* Required for radiance update - AO */
+        ccl_global float3 *AOAlpha_coop,      /* Required for radiance update - AO */
+        ccl_global char *ray_state,           /* Denotes the state of each ray */
+        ccl_global int *Queue_data,           /* Queue memory */
+        ccl_global int *Queue_index,          /* Tracks the number of elements in each queue */
+        int queuesize,                        /* Size (capacity) of each queue */
+        ccl_global char *use_queues_flag)     /* Flag to decide if scene_intersect kernel should use queues to fetch ray index */
+{
+	kernel_next_iteration_setup(globals,
+	                            data,
+	                            shader_data,
+	                            rng_coop,
+	                            throughput_coop,
+	                            PathRadiance_coop,
+	                            Ray_coop,
+	                            PathState_coop,
+	                            LightRay_dl_coop,
+	                            ISLamp_coop,
+	                            BSDFEval_coop,
+	                            LightRay_ao_coop,
+	                            AOBSDF_coop,
+	                            AOAlpha_coop,
+	                            ray_state,
+	                            Queue_data,
+	                            Queue_index,
+	                            queuesize,
+	                            use_queues_flag);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
new file mode 100644
index 00000000000..62cf08c387d
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_queue_enqueue.h"
+/* OpenCL entry point for queue enqueue: forwards its four arguments, unchanged and in order, to kernel_queue_enqueue() (split/kernel_queue_enqueue.h). */
+__kernel void kernel_ocl_path_trace_queue_enqueue(
+        ccl_global int *Queue_data,   /* Queue memory */
+        ccl_global int *Queue_index,  /* Tracks the number of elements in each queue */
+        ccl_global char *ray_state,   /* Denotes the state of each ray */
+        int queuesize)                /* Size (capacity) of each queue */
+{
+	kernel_queue_enqueue(Queue_data,
+	                     Queue_index,
+	                     ray_state,
+	                     queuesize);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
new file mode 100644
index 00000000000..d219874d391
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_scene_intersect.h"
+/* OpenCL entry point for scene intersection: forwards every argument, unchanged and in order, to kernel_scene_intersect() (split/kernel_scene_intersect.h). */
+__kernel void kernel_ocl_path_trace_scene_intersect(
+        ccl_global char *globals,              /* Raw byte buffer, forwarded as-is */
+        ccl_constant KernelData *data,         /* Constant kernel data, forwarded as-is */
+        ccl_global uint *rng_coop,             /* Forwarded as-is to kernel_scene_intersect() */
+        ccl_global Ray *Ray_coop,              /* Required for scene_intersect */
+        ccl_global PathState *PathState_coop,  /* Required for scene_intersect */
+        Intersection *Intersection_coop,       /* Required for scene_intersect */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        int sw, int sh,                        /* NOTE(review): presumably tile width and height - confirm in the split header */
+        ccl_global int *Queue_data,            /* Memory for queues */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in queues */
+        int queuesize,                         /* Size (capacity) of queues */
+        ccl_global char *use_queues_flag,      /* Used to decide if this kernel should use queues to fetch ray index */
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,             /* Only part of the signature in __KERNEL_DEBUG__ builds */
+#endif
+        int parallel_samples)                  /* Number of samples to be processed in parallel */
+{
+	kernel_scene_intersect(globals,
+	                       data,
+	                       rng_coop,
+	                       Ray_coop,
+	                       PathState_coop,
+	                       Intersection_coop,
+	                       ray_state,
+	                       sw, sh,
+	                       Queue_data,
+	                       Queue_index,
+	                       queuesize,
+	                       use_queues_flag,
+#ifdef __KERNEL_DEBUG__
+	                       debugdata_coop,
+#endif
+	                       parallel_samples);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
new file mode 100644
index 00000000000..04769d7d792
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_shader_eval.h"
+/* OpenCL entry point for shader evaluation: forwards every argument, unchanged and in order, to kernel_shader_eval() (split/kernel_shader_eval.h). */
+__kernel void kernel_ocl_path_trace_shader_eval(
+        ccl_global char *globals,              /* Raw byte buffer, forwarded as-is */
+        ccl_constant KernelData *data,         /* Constant kernel data, forwarded as-is */
+        ccl_global char *shader_data,          /* Output ShaderData structure to be filled */
+        ccl_global uint *rng_coop,             /* Required for rbsdf calculation */
+        ccl_global Ray *Ray_coop,              /* Required for setting up shader from ray */
+        ccl_global PathState *PathState_coop,  /* Required for all functions in this kernel */
+        Intersection *Intersection_coop,       /* Required for setting up shader from ray */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        ccl_global int *Queue_data,            /* Queue memory */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
+        int queuesize)                         /* Size (capacity) of each queue */
+{
+	kernel_shader_eval(globals,
+	                   data,
+	                   shader_data,
+	                   rng_coop,
+	                   Ray_coop,
+	                   PathState_coop,
+	                   Intersection_coop,
+	                   ray_state,
+	                   Queue_data,
+	                   Queue_index,
+	                   queuesize);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
new file mode 100644
index 00000000000..9d57364c8d6
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_shadow_blocked.h"
+/* OpenCL entry point for the shadow-blocked test: forwards every argument, unchanged and in order, to kernel_shadow_blocked() (split/kernel_shadow_blocked.h). */
+__kernel void kernel_ocl_path_trace_shadow_blocked(
+        ccl_global char *globals,              /* Raw byte buffer, forwarded as-is */
+        ccl_constant KernelData *data,         /* Constant kernel data, forwarded as-is */
+        ccl_global char *shader_shadow,        /* Required for shadow blocked */
+        ccl_global PathState *PathState_coop,  /* Required for shadow blocked */
+        ccl_global Ray *LightRay_dl_coop,      /* Required for direct lighting's shadow blocked */
+        ccl_global Ray *LightRay_ao_coop,      /* Required for AO's shadow blocked */
+        Intersection *Intersection_coop_AO,    /* NOTE(review): presumably intersection storage for the AO rays - confirm */
+        Intersection *Intersection_coop_DL,    /* NOTE(review): presumably intersection storage for the direct-lighting rays - confirm */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        ccl_global int *Queue_data,            /* Queue memory */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
+        int queuesize,                         /* Size (capacity) of each queue */
+        int total_num_rays)                    /* Forwarded as-is; meaning is defined by kernel_shadow_blocked() */
+{
+	kernel_shadow_blocked(globals,
+	                      data,
+	                      shader_shadow,
+	                      PathState_coop,
+	                      LightRay_dl_coop,
+	                      LightRay_ao_coop,
+	                      Intersection_coop_AO,
+	                      Intersection_coop_DL,
+	                      ray_state,
+	                      Queue_data,
+	                      Queue_index,
+	                      queuesize,
+	                      total_num_rays);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
new file mode 100644
index 00000000000..88a1ed830af
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_sum_all_radiance.h"
+/* OpenCL entry point for summing per-sample radiance: forwards every argument, unchanged and in order, to kernel_sum_all_radiance() (split/kernel_sum_all_radiance.h). */
+__kernel void kernel_ocl_path_trace_sum_all_radiance(
+        ccl_constant KernelData *data,               /* To get pass_stride to offset into buffer */
+        ccl_global float *buffer,                    /* Output buffer of RenderTile */
+        ccl_global float *per_sample_output_buffer,  /* Radiance contributed by all samples */
+        int parallel_samples, int sw, int sh, int stride,
+        int buffer_offset_x,
+        int buffer_offset_y,
+        int buffer_stride,
+        int start_sample)
+{
+	kernel_sum_all_radiance(data,
+	                        buffer,
+	                        per_sample_output_buffer,
+	                        parallel_samples,
+	                        sw, sh, stride,
+	                        buffer_offset_x,
+	                        buffer_offset_y,
+	                        buffer_stride,
+	                        start_sample);
+}
diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h
new file mode 100644
index 00000000000..95de1a4b2a9
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_background_buffer_update.h
@@ -0,0 +1,282 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/*
+ * Note on kernel_background_buffer_update kernel.
+ * This is the fourth kernel in the ray tracing logic, and the third
+ * of the path iteration kernels. This kernel takes care of rays that hit
+ * the background (sceneintersect kernel), and for the rays of
+ * state RAY_UPDATE_BUFFER it updates the ray's accumulated radiance in
+ * the output buffer. This kernel also takes care of rays that have been determined
+ * to-be-regenerated.
+ *
+ * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel
+ *
+ * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER
+ * will be eventually set to RAY_TO_REGENERATE state in this kernel. Finally all rays of ray_state
+ * RAY_TO_REGENERATE will be regenerated and put in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_background_buffer_update --|--- PathRadiance_coop
+ * throughput_coop --------------------------------------|                                      |--- L_transparent_coop
+ * per_sample_output_buffers ----------------------------|                                      |--- per_sample_output_buffers
+ * Ray_coop ---------------------------------------------|                                      |--- ray_state
+ * PathState_coop ---------------------------------------|                                      |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * L_transparent_coop -----------------------------------|                                      |--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * ray_state --------------------------------------------|                                      |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ----|                                      |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                                      |--- work_array
+ * parallel_samples -------------------------------------|                                      |--- PathState_coop
+ * end_sample -------------------------------------------|                                      |--- throughput_coop
+ * kg (globals + data) ----------------------------------|                                      |--- rng_coop
+ * rng_state --------------------------------------------|                                      |--- Ray
+ * PathRadiance_coop ------------------------------------|                                      |
+ * sw ---------------------------------------------------|                                      |
+ * sh ---------------------------------------------------|                                      |
+ * sx ---------------------------------------------------|                                      |
+ * sy ---------------------------------------------------|                                      |
+ * stride -----------------------------------------------|                                      |
+ * work_array -------------------------------------------|                                      |--- work_array
+ * queuesize --------------------------------------------|                                      |
+ * start_sample -----------------------------------------|                                      |--- work_pool_wgs
+ * work_pool_wgs ----------------------------------------|                                      |
+ * num_samples ------------------------------------------|                                      |
+ *
+ * Note on shader_data: the shader_data argument is neither an input nor an output of this kernel; it is filled and consumed within the kernel itself.
+ * Note on Queues :
+ * This kernel fetches rays from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+ *
+ * State of queues when this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty
+ */
+ccl_device void kernel_background_buffer_update(
+	ccl_global char *globals,                    /* Cast to KernelGlobals inside the kernel */
+	ccl_constant KernelData *data,
+	ccl_global char *shader_data,                /* Cast to ShaderData; used for background shading */
+	ccl_global float *per_sample_output_buffers, /* Per-sample pass output, offset per ray below */
+	ccl_global uint *rng_state,                  /* Per-pixel RNG state, offset per ray below */
+	ccl_global uint *rng_coop,                   /* Required for buffer Update */
+	ccl_global float3 *throughput_coop,          /* Required for background hit processing */
+	PathRadiance *PathRadiance_coop,             /* Required for background hit processing and buffer Update */
+	ccl_global Ray *Ray_coop,                    /* Required for background hit processing */
+	ccl_global PathState *PathState_coop,        /* Required for background hit processing */
+	ccl_global float *L_transparent_coop,        /* Required for background hit processing and buffer Update */
+	ccl_global char *ray_state,                  /* Stores information on the current state of a ray */
+	int sw, int sh, int sx, int sy, int stride,  /* sw: rays' tiles per row; sx, sy: pixel origin; stride: output row stride */
+	int rng_state_offset_x,
+	int rng_state_offset_y,
+	int rng_state_stride,
+	ccl_global unsigned int *work_array,         /* Denotes work of each ray */
+	ccl_global int *Queue_data,                  /* Queues memory */
+	ccl_global int *Queue_index,                 /* Tracks the number of elements in each queue */
+	int queuesize,                               /* Size (capacity) of each queue */
+	int end_sample,                              /* Without work stealing: rays stop regenerating at this sample */
+	int start_sample,                            /* With work stealing: added to the sample derived from the work item */
+#ifdef __WORK_STEALING__
+	ccl_global unsigned int *work_pool_wgs,
+	unsigned int num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+	DebugData *debugdata_coop,
+#endif
+	int parallel_samples                         /* Number of samples to be processed in parallel */
+	)
+{
+	ccl_local unsigned int local_queue_atomics;  /* Work-group counter handed to enqueue_ray_index_local() */
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);                /* Counter must be zeroed before any work-item uses it */
+
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	if(ray_index == 0) {
+		/* We will empty this queue in this kernel */
+		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+	}
+	char enqueue_flag = 0;                       /* Set to 1 only when this ray gets regenerated below */
+	ray_index = get_ray_index(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, Queue_data, queuesize, 1);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not required.
+	 * If we are executing on a CPU device, then we need to keep all threads active
+	 * since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT)
+		return;
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		/* Load kernel globals structure and ShaderData structure */
+		KernelGlobals *kg = (KernelGlobals *)globals;
+		ShaderData *sd = (ShaderData *)shader_data;
+
+#ifdef __KERNEL_DEBUG__
+		DebugData *debug_data = &debugdata_coop[ray_index];
+#endif
+		ccl_global PathState *state = &PathState_coop[ray_index];
+		PathRadiance *L = &PathRadiance_coop[ray_index];
+		ccl_global Ray *ray = &Ray_coop[ray_index];
+		ccl_global float3 *throughput = &throughput_coop[ray_index];
+		ccl_global float *L_transparent = &L_transparent_coop[ray_index];
+		ccl_global uint *rng = &rng_coop[ray_index];
+
+#ifdef __WORK_STEALING__
+		unsigned int my_work;
+		ccl_global float *initial_per_sample_output_buffers;
+		ccl_global uint *initial_rng;
+#endif
+		unsigned int sample;
+		unsigned int tile_x;
+		unsigned int tile_y;
+		unsigned int pixel_x;
+		unsigned int pixel_y;
+		unsigned int my_sample_tile;
+
+#ifdef __WORK_STEALING__
+		my_work = work_array[ray_index];
+		sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+		get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
+		my_sample_tile = 0;
+		initial_per_sample_output_buffers = per_sample_output_buffers;
+		initial_rng = rng_state;
+#else // __WORK_STEALING__
+		sample = work_array[ray_index];
+		int tile_index = ray_index / parallel_samples;
+		/* buffer and rng_state's stride is "stride". Find x and y using ray_index */
+		tile_x = tile_index % sw;
+		tile_y = tile_index / sw;
+		my_sample_tile = ray_index - (tile_index * parallel_samples);
+#endif
+		rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
+		per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
+
+		if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+			/* eval background shader if nothing hit */
+			if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
+				*L_transparent = (*L_transparent) + average((*throughput));
+#ifdef __PASSES__
+			if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
+#endif
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+			}
+
+			if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND))
+			{
+#ifdef __BACKGROUND__
+				/* sample background shader */
+				float3 L_background = indirect_background(kg, state, ray, sd);
+				path_radiance_accum_background(L, (*throughput), L_background, state->bounce);
+#endif
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+			}
+		}
+
+		if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+			float3 L_sum = path_radiance_clamp_and_sum(kg, L);
+			kernel_write_light_passes(kg, per_sample_output_buffers, L, sample);
+#ifdef __KERNEL_DEBUG__
+			kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample);
+#endif
+			float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
+
+			/* accumulate result in output buffer */
+			kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+			path_rng_end(kg, rng_state, *rng);
+
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+		}
+
+		if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+#ifdef __WORK_STEALING__
+			/* We have completed current work; So get next work */
+			int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
+			if(!valid_work) {
+				/* If work is invalid, this means no more work is available and the thread may exit */
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+			}
+#else
+			if((sample + parallel_samples) >= end_sample) {
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+			}
+#endif
+			if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+#ifdef __WORK_STEALING__
+				work_array[ray_index] = my_work;
+				/* Get the sample associated with the current work */
+				sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+				/* Get pixel and tile position associated with current work */
+				get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
+				my_sample_tile = 0;
+
+				/* Remap rng_state according to the current work */
+				rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride);
+				/* Remap per_sample_output_buffers according to the current work */
+				per_sample_output_buffers = initial_per_sample_output_buffers
+											+ (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
+#else
+				work_array[ray_index] = sample + parallel_samples;
+				sample = work_array[ray_index];
+
+				/* Get ray position from ray index */
+				pixel_x = sx + ((ray_index / parallel_samples) % sw);
+				pixel_y = sy + ((ray_index / parallel_samples) / sw);
+#endif
+
+				/* initialize random numbers and ray */
+				kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray);
+
+				if(ray->t != 0.0f) {
+					/* Initialize throughput, L_transparent, Ray, PathState; These rays proceed with path-iteration */
+					*throughput = make_float3(1.0f, 1.0f, 1.0f);
+					*L_transparent = 0.0f;
+					path_radiance_init(L, kernel_data.film.use_light_pass);
+					path_state_init(kg, state, rng, sample, ray);
+#ifdef __KERNEL_DEBUG__
+					debug_data_init(debug_data);
+#endif
+					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+					enqueue_flag = 1;
+				} else {
+					/* These rays do not participate in path-iteration */
+					float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					/* accumulate result in output buffer */
+					kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+					path_rng_end(kg, rng_state, *rng);
+
+					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+				}
+			}
+		}
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; These rays
+	 * will be made active during the next scene intersect kernel
+	 */
+	enqueue_ray_index_local(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
+}
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
new file mode 100644
index 00000000000..b7a4d847d03
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -0,0 +1,400 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/*
+ * Note on kernel_data_initialization kernel
+ * This kernel initializes structures needed in path-iteration kernels.
+ * This is the first kernel in ray-tracing logic.
+ *
+ * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE
+ *
+ * Its input and output are as follows,
+ *
+ * Un-initialized rng---------------|--- kernel_data_initialization ---|--- Initialized rng
+ * Un-initialized throughput -------|                                  |--- Initialized throughput
+ * Un-initialized L_transparent ----|                                  |--- Initialized L_transparent
+ * Un-initialized PathRadiance -----|                                  |--- Initialized PathRadiance
+ * Un-initialized Ray --------------|                                  |--- Initialized Ray
+ * Un-initialized PathState --------|                                  |--- Initialized PathState
+ * Un-initialized QueueData --------|                                  |--- Initialized QueueData (to QUEUE_EMPTY_SLOT)
+ * Un-initialized QueueIndex -------|                                  |--- Initialized QueueIndex (to 0)
+ * Un-initialized use_queues_flag---|                                  |--- Initialized use_queues_flag (to false)
+ * Un-initialized ray_state --------|                                  |--- Initialized ray_state
+ * parallel_samples ----------------|                                  |--- Initialized per_sample_output_buffers
+ * rng_state -----------------------|                                  |--- Initialized work_array
+ * data ----------------------------|                                  |--- Initialized work_pool_wgs
+ * start_sample --------------------|                                  |
+ * sx ------------------------------|                                  |
+ * sy ------------------------------|                                  |
+ * sw ------------------------------|                                  |
+ * sh ------------------------------|                                  |
+ * stride --------------------------|                                  |
+ * queuesize -----------------------|                                  |
+ * num_samples ---------------------|                                  |
+ *
+ * Note on Queues :
+ * All slots in queues are initialized to queue empty slot;
+ * The number of elements in the queues is initialized to 0;
+ */
+ccl_device void kernel_data_init(
+	ccl_global char *globals,
+	ccl_global char *shader_data_sd,                  /* Arguments related to ShaderData */
+	ccl_global char *shader_data_sd_DL_shadow,        /* Arguments related to ShaderData */
+
+	ccl_global float3 *P_sd,
+	ccl_global float3 *P_sd_DL_shadow,
+
+	ccl_global float3 *N_sd,
+	ccl_global float3 *N_sd_DL_shadow,
+
+	ccl_global float3 *Ng_sd,
+	ccl_global float3 *Ng_sd_DL_shadow,
+
+	ccl_global float3 *I_sd,
+	ccl_global float3 *I_sd_DL_shadow,
+
+	ccl_global int *shader_sd,
+	ccl_global int *shader_sd_DL_shadow,
+
+	ccl_global int *flag_sd,
+	ccl_global int *flag_sd_DL_shadow,
+
+	ccl_global int *prim_sd,
+	ccl_global int *prim_sd_DL_shadow,
+
+	ccl_global int *type_sd,
+	ccl_global int *type_sd_DL_shadow,
+
+	ccl_global float *u_sd,
+	ccl_global float *u_sd_DL_shadow,
+
+	ccl_global float *v_sd,
+	ccl_global float *v_sd_DL_shadow,
+
+	ccl_global int *object_sd,
+	ccl_global int *object_sd_DL_shadow,
+
+	ccl_global float *time_sd,
+	ccl_global float *time_sd_DL_shadow,
+
+	ccl_global float *ray_length_sd,
+	ccl_global float *ray_length_sd_DL_shadow,
+
+	ccl_global int *ray_depth_sd,
+	ccl_global int *ray_depth_sd_DL_shadow,
+
+	ccl_global int *transparent_depth_sd,
+	ccl_global int *transparent_depth_sd_DL_shadow,
+
+	/* Ray differentials. */
+	ccl_global differential3 *dP_sd,
+	ccl_global differential3 *dP_sd_DL_shadow,
+
+	ccl_global differential3 *dI_sd,
+	ccl_global differential3 *dI_sd_DL_shadow,
+
+	ccl_global differential *du_sd,
+	ccl_global differential *du_sd_DL_shadow,
+
+	ccl_global differential *dv_sd,
+	ccl_global differential *dv_sd_DL_shadow,
+
+	/* Dp/Du */
+	ccl_global float3 *dPdu_sd,
+	ccl_global float3 *dPdu_sd_DL_shadow,
+
+	ccl_global float3 *dPdv_sd,
+	ccl_global float3 *dPdv_sd_DL_shadow,
+
+	/* Object motion. */
+	ccl_global Transform *ob_tfm_sd,
+	ccl_global Transform *ob_tfm_sd_DL_shadow,
+
+	ccl_global Transform *ob_itfm_sd,
+	ccl_global Transform *ob_itfm_sd_DL_shadow,
+
+	ShaderClosure *closure_sd,
+	ShaderClosure *closure_sd_DL_shadow,
+
+	ccl_global int *num_closure_sd,
+	ccl_global int *num_closure_sd_DL_shadow,
+
+	ccl_global float *randb_closure_sd,
+	ccl_global float *randb_closure_sd_DL_shadow,
+
+	ccl_global float3 *ray_P_sd,
+	ccl_global float3 *ray_P_sd_DL_shadow,
+
+	ccl_global differential3 *ray_dP_sd,
+	ccl_global differential3 *ray_dP_sd_DL_shadow,
+
+	ccl_constant KernelData *data,
+	ccl_global float *per_sample_output_buffers,
+	ccl_global uint *rng_state,
+	ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
+	ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
+	ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
+	PathRadiance *PathRadiance_coop,             /* PathRadiance array to store PathRadiance values for all rays */
+	ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
+	ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
+	ccl_global char *ray_state,                  /* Stores information on current state of a ray */
+
+#define KERNEL_TEX(type, ttype, name) \
+	ccl_global type *name,
+#include "../kernel_textures.h"
+
+	int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
+	int rng_state_offset_x,
+	int rng_state_offset_y,
+	int rng_state_stride,
+	ccl_global int *Queue_data,                  /* Memory for queues */
+	ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
+	int queuesize,                               /* size (capacity) of the queue */
+	ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
+	ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
+#ifdef __WORK_STEALING__
+	ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
+	unsigned int num_samples,                    /* Total number of samples per pixel */
+#endif
+#ifdef __KERNEL_DEBUG__
+	DebugData *debugdata_coop,
+#endif
+	int parallel_samples                         /* Number of samples to be processed in parallel */
+	)
+{
+	/* Binds the passed buffers into KernelGlobals/ShaderData, resets the queues and sets up rng, ray and path state for every in-tile work-item. */
+	/* Load kernel globals structure */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+
+	kg->data = data;
+#define KERNEL_TEX(type, ttype, name) \
+	kg->name = name;
+#include "../kernel_textures.h"
+
+	/* Load ShaderData structure */
+	ShaderData *sd = (ShaderData *)shader_data_sd;
+	ShaderData *sd_DL_shadow = (ShaderData *)shader_data_sd_DL_shadow;
+
+	sd->P = P_sd;
+	sd_DL_shadow->P = P_sd_DL_shadow;
+
+	sd->N = N_sd;
+	sd_DL_shadow->N = N_sd_DL_shadow;
+
+	sd->Ng = Ng_sd;
+	sd_DL_shadow->Ng = Ng_sd_DL_shadow;
+
+	sd->I = I_sd;
+	sd_DL_shadow->I = I_sd_DL_shadow;
+
+	sd->shader = shader_sd;
+	sd_DL_shadow->shader = shader_sd_DL_shadow;
+
+	sd->flag = flag_sd;
+	sd_DL_shadow->flag = flag_sd_DL_shadow;
+
+	sd->prim = prim_sd;
+	sd_DL_shadow->prim = prim_sd_DL_shadow;
+
+	sd->type = type_sd;
+	sd_DL_shadow->type = type_sd_DL_shadow;
+
+	sd->u = u_sd;
+	sd_DL_shadow->u = u_sd_DL_shadow;
+
+	sd->v = v_sd;
+	sd_DL_shadow->v = v_sd_DL_shadow;
+
+	sd->object = object_sd;
+	sd_DL_shadow->object = object_sd_DL_shadow;
+
+	sd->time = time_sd;
+	sd_DL_shadow->time = time_sd_DL_shadow;
+
+	sd->ray_length = ray_length_sd;
+	sd_DL_shadow->ray_length = ray_length_sd_DL_shadow;
+
+	sd->ray_depth = ray_depth_sd;
+	sd_DL_shadow->ray_depth = ray_depth_sd_DL_shadow;
+
+	sd->transparent_depth = transparent_depth_sd;
+	sd_DL_shadow->transparent_depth = transparent_depth_sd_DL_shadow;
+
+#ifdef __RAY_DIFFERENTIALS__
+	sd->dP = dP_sd;
+	sd_DL_shadow->dP = dP_sd_DL_shadow;
+
+	sd->dI = dI_sd;
+	sd_DL_shadow->dI = dI_sd_DL_shadow;
+
+	sd->du = du_sd;
+	sd_DL_shadow->du = du_sd_DL_shadow;
+
+	sd->dv = dv_sd;
+	sd_DL_shadow->dv = dv_sd_DL_shadow;
+#ifdef __DPDU__
+	sd->dPdu = dPdu_sd;
+	sd_DL_shadow->dPdu = dPdu_sd_DL_shadow;
+
+	sd->dPdv = dPdv_sd;
+	sd_DL_shadow->dPdv = dPdv_sd_DL_shadow;
+#endif  /* __DPDU__ */
+#endif  /* __RAY_DIFFERENTIALS__ */
+
+#ifdef __OBJECT_MOTION__
+	sd->ob_tfm = ob_tfm_sd;
+	sd_DL_shadow->ob_tfm = ob_tfm_sd_DL_shadow;
+
+	sd->ob_itfm = ob_itfm_sd;
+	sd_DL_shadow->ob_itfm = ob_itfm_sd_DL_shadow;
+#endif  /* __OBJECT_MOTION__ */
+
+	sd->closure = closure_sd;
+	sd_DL_shadow->closure = closure_sd_DL_shadow;
+
+	sd->num_closure = num_closure_sd;
+	sd_DL_shadow->num_closure = num_closure_sd_DL_shadow;
+
+	sd->randb_closure = randb_closure_sd;
+	sd_DL_shadow->randb_closure = randb_closure_sd_DL_shadow;
+
+	sd->ray_P = ray_P_sd;
+	sd_DL_shadow->ray_P = ray_P_sd_DL_shadow;
+
+	sd->ray_dP = ray_dP_sd;
+	sd_DL_shadow->ray_dP = ray_dP_sd_DL_shadow;
+
+	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);  /* flattened 2D global work-item id */
+
+#ifdef __WORK_STEALING__
+	int lid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+	/* Initialize work_pool_wgs: the first work-item of each group resets the group's entry, which lives in global memory. */
+	if(lid == 0) {
+		int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0);
+		work_pool_wgs[group_index] = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);  /* global fence: the reset above must be ordered before the group's get_next_work() calls below */
+#endif  /* __WORK_STEALING__ */
+
+	/* Initialize queue data and queue index */
+	if(thread_index < queuesize) {
+		/* Initialize active ray queue */
+		Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		/* Initialize background and buffer update queue */
+		Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		/* Initialize shadow ray cast of AO queue */
+		Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		/* Initialize shadow ray cast of direct lighting queue */
+		Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+	}
+
+	if(thread_index == 0) {
+		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+		/* The scene-intersect kernel should not use the queues the very first time,
+		 * since the queues would be empty.
+		 */
+		use_queues_flag[0] = 0;
+	}
+
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+
+	if(x < (sw * parallel_samples) && y < sh) {
+
+		int ray_index = x + y * (sw * parallel_samples);
+
+		/* This is the first assignment to ray_state, so we don't use the ASSIGN_RAY_STATE macro */
+		ray_state[ray_index] = RAY_ACTIVE;
+
+		unsigned int my_sample;
+		unsigned int pixel_x;
+		unsigned int pixel_y;
+		unsigned int tile_x;
+		unsigned int tile_y;
+		unsigned int my_sample_tile;
+
+#ifdef __WORK_STEALING__
+		unsigned int my_work = 0;
+		/* get work */
+		get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
+		/* Get the sample associated with the work */
+		my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+
+		my_sample_tile = 0;
+
+		/* Get pixel and tile position associated with the work */
+		get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
+		work_array[ray_index] = my_work;
+#else  /* __WORK_STEALING__ */
+
+		unsigned int tile_index = ray_index / parallel_samples;
+		tile_x = tile_index % sw;
+		tile_y = tile_index / sw;
+		my_sample_tile = ray_index - (tile_index * parallel_samples);
+		my_sample = my_sample_tile + start_sample;
+
+		/* Initialize work array */
+		work_array[ray_index] = my_sample;
+
+		/* Calculate pixel position of this ray */
+		pixel_x = sx + tile_x;
+		pixel_y = sy + tile_y;
+#endif  /* __WORK_STEALING__ */
+
+		rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
+
+		/* Initialize this ray's slice of per_sample_output_buffers (pass_stride floats) to all zeros */
+		per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride;
+		int per_sample_output_buffers_iterator = 0;
+		for(per_sample_output_buffers_iterator = 0; per_sample_output_buffers_iterator < kernel_data.film.pass_stride; per_sample_output_buffers_iterator++) {
+			per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f;
+		}
+
+		/* initialize random numbers and ray */
+		kernel_path_trace_setup(kg, rng_state, my_sample, pixel_x, pixel_y, &rng_coop[ray_index], &Ray_coop[ray_index]);
+
+		if(Ray_coop[ray_index].t != 0.0f) {
+			/* Initialize throughput, L_transparent, Ray, PathState; these rays proceed with path-iteration */
+			throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
+			L_transparent_coop[ray_index] = 0.0f;
+			path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass);
+			path_state_init(kg, &PathState_coop[ray_index], &rng_coop[ray_index], my_sample, &Ray_coop[ray_index]);
+#ifdef __KERNEL_DEBUG__
+			debug_data_init(&debugdata_coop[ray_index]);
+#endif
+		} else {
+			/* These rays do not participate in path-iteration */
+
+			float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			/* accumulate result in output buffer */
+			kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad);
+			path_rng_end(kg, rng_state, rng_coop[ray_index]);
+
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+		}
+	}
+
+	/* Mark the rest of the ray-state indices (beyond sw * parallel_samples * sh) as RAY_INACTIVE */
+	if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) {
+		/* First assignment, hence we don't use the ASSIGN_RAY_STATE macro */
+		ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE;
+	}
+}
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
new file mode 100644
index 00000000000..6b83d892057
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_direct_lighting.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/*
+ * Note on kernel_direct_lighting kernel.
+ * This is the eighth kernel in the ray tracing logic. This is the seventh
+ * of the path iteration kernels. This kernel takes care of direct lighting
+ * logic. However, the "shadow ray cast" part of direct lighting is handled
+ * in the next kernel.
+ *
+ * This kernel determines the rays for which a shadow_blocked() function associated with direct lighting should be executed.
+ * Those rays for which a shadow_blocked() function for direct lighting must be executed are marked with flag RAY_SHADOW_RAY_CAST_DL and
+ * enqueued into the queue QUEUE_SHADOW_RAY_CAST_DL_RAYS.
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop -----------------------------------------|--- kernel_direct_lighting --|--- BSDFEval_coop
+ * PathState_coop -----------------------------------|                             |--- ISLamp_coop
+ * shader_data --------------------------------------|                             |--- LightRay_coop
+ * ray_state ----------------------------------------|                             |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                             |
+ * kg (globals + data) ------------------------------|                             |
+ * queuesize ----------------------------------------|                             |
+ *
+ * Note on shader_DL : shader_DL is neither input nor output to this kernel; shader_DL is filled and consumed in this kernel itself.
+ * Note on Queues :
+ * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
+ * only the rays of state RAY_ACTIVE; If a ray needs to execute the corresponding shadow_blocked
+ * part, after direct lighting, the ray is marked with RAY_SHADOW_RAY_CAST_DL flag.
+ *
+ * State of queues when this kernel is called :
+ * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
+ * before and after this kernel call.
+ * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this
+ * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
+ */
+ccl_device void kernel_direct_lighting(
+	ccl_global char *globals,
+	ccl_constant KernelData *data,
+	ccl_global char *shader_data,           /* Required for direct lighting */
+	ccl_global char *shader_DL,             /* Required for direct lighting */
+	ccl_global uint *rng_coop,              /* Required for direct lighting */
+	ccl_global PathState *PathState_coop,   /* Required for direct lighting */
+	ccl_global int *ISLamp_coop,            /* Required for direct lighting */
+	ccl_global Ray *LightRay_coop,          /* Required for direct lighting */
+	ccl_global BsdfEval *BSDFEval_coop,     /* Required for direct lighting */
+	ccl_global char *ray_state,             /* Denotes the state of each ray */
+	ccl_global int *Queue_data,             /* Queue memory */
+	ccl_global int *Queue_index,            /* Tracks the number of elements in each queue */
+	int queuesize                           /* Size (capacity) of each queue */
+	)
+{
+	ccl_local unsigned int local_queue_atomics;  /* counter zeroed by work-item (0, 0) and passed to enqueue_ray_index_local() below */
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);  /* wait until the counter has been zeroed */
+
+	char enqueue_flag = 0;  /* set to 1 when this ray gets the RAY_SHADOW_RAY_CAST_DL flag */
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);  /* flattened 2D global work-item id */
+	ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);  /* NOTE(review): presumably the ray stored at this queue slot; may be QUEUE_EMPTY_SLOT (checked below) */
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not required.
+	 * If we are executing on a CPU device, then we need to keep all threads active
+	 * since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT)
+		return;
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+			/* Load kernel globals structure and ShaderData structure */
+			KernelGlobals *kg = (KernelGlobals *)globals;
+			ShaderData *sd = (ShaderData *)shader_data;
+			ShaderData *sd_DL  = (ShaderData *)shader_DL;
+
+			ccl_global PathState *state = &PathState_coop[ray_index];
+
+			/* direct lighting */
+#ifdef __EMISSION__
+			if((kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) {
+				/* sample illumination from lights to find path contribution */
+				ccl_global RNG* rng = &rng_coop[ray_index];
+				float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+				float light_u, light_v;
+				path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+
+				LightSample ls;
+				light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
+
+				Ray light_ray;
+#ifdef __OBJECT_MOTION__
+				light_ray.time = ccl_fetch(sd, time);
+#endif  /* __OBJECT_MOTION__ */
+
+				BsdfEval L_light;
+				bool is_lamp;
+				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce, sd_DL)) {
+					/* write intermediate data to global memory to access from the next kernel */
+					LightRay_coop[ray_index] = light_ray;
+					BSDFEval_coop[ray_index] = L_light;
+					ISLamp_coop[ray_index] = is_lamp;
+					/* mark ray state for the next shadow kernel */
+					ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
+					enqueue_flag = 1;
+				}
+			}
+#endif  /* __EMISSION__ */
+		}
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+#ifdef __EMISSION__
+	/* Enqueue RAY_SHADOW_RAY_CAST_DL rays */
+	enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_DL_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
+#endif  /* __EMISSION__ */
+}
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
new file mode 100644
index 00000000000..393ea4bcadc
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -0,0 +1,283 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/*
+ * Note on kernel_holdout_emission_blurring_pathtermination_ao kernel.
+ * This is the sixth kernel in the ray tracing logic. This is the fifth
+ * of the path iteration kernels. This kernel takes care of the logic to process
+ * "material of type holdout", indirect primitive emission, bsdf blurring,
+ * probabilistic path termination and AO.
+ *
+ * This kernel determines the rays for which a shadow_blocked() function associated with AO should be executed.
+ * Those rays for which a shadow_blocked() function for AO must be executed are marked with flag RAY_SHADOW_RAY_CAST_AO and
+ * enqueued into the queue QUEUE_SHADOW_RAY_CAST_AO_RAYS.
+ *
+ * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_holdout_emission_blurring_pathtermination_ao ---|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * throughput_coop --------------------------------------|                                                           |--- PathState_coop
+ * PathRadiance_coop ------------------------------------|                                                           |--- throughput_coop
+ * Intersection_coop ------------------------------------|                                                           |--- L_transparent_coop
+ * PathState_coop ---------------------------------------|                                                           |--- per_sample_output_buffers
+ * L_transparent_coop -----------------------------------|                                                           |--- PathRadiance_coop
+ * shader_data ------------------------------------------|                                                           |--- ShaderData
+ * ray_state --------------------------------------------|                                                           |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------|                                                           |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                                           |--- AOAlpha_coop
+ * kg (globals + data) ----------------------------------|                                                           |--- AOBSDF_coop
+ * parallel_samples -------------------------------------|                                                           |--- AOLightRay_coop
+ * per_sample_output_buffers ----------------------------|                                                           |
+ * sw ---------------------------------------------------|                                                           |
+ * sh ---------------------------------------------------|                                                           |
+ * sx ---------------------------------------------------|                                                           |
+ * sy ---------------------------------------------------|                                                           |
+ * stride -----------------------------------------------|                                                           |
+ * work_array -------------------------------------------|                                                           |
+ * queuesize --------------------------------------------|                                                           |
+ * start_sample -----------------------------------------|                                                           |
+ *
+ * Note on Queues :
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
+ * the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER
+ * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
+ * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
+ * changed to RAY_UPDATE_BUFFER, there is no problem.
+ *
+ * State of queues when this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays.
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO
+ */
+
+ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
+	ccl_global char *globals,
+	ccl_constant KernelData *data,
+	ccl_global char *shader_data,               /* Required throughout the kernel except probabilistic path termination and AO */
+	ccl_global float *per_sample_output_buffers,
+	ccl_global uint *rng_coop,                  /* Required for "kernel_write_data_passes" and AO */
+	ccl_global float3 *throughput_coop,         /* Required for handling holdout material and AO */
+	ccl_global float *L_transparent_coop,       /* Required for handling holdout material */
+	PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */
+	ccl_global PathState *PathState_coop,       /* Required throughout the kernel and AO */
+	Intersection *Intersection_coop, /* Required for indirect primitive emission */
+	ccl_global float3 *AOAlpha_coop,            /* Required for AO */
+	ccl_global float3 *AOBSDF_coop,             /* Required for AO */
+	ccl_global Ray *AOLightRay_coop,            /* Required for AO */
+	int sw, int sh, int sx, int sy, int stride,
+	ccl_global char *ray_state,                /* Denotes the state of each ray */
+	ccl_global unsigned int *work_array,       /* Denotes the work that each ray belongs to */
+	ccl_global int *Queue_data,                /* Queue memory */
+	ccl_global int *Queue_index,               /* Tracks the number of elements in each queue */
+	int queuesize,                             /* Size (capacity) of each queue */
+#ifdef __WORK_STEALING__
+	unsigned int start_sample,
+#endif
+	int parallel_samples                       /* Number of samples to be processed in parallel */
+	)
+{
+	ccl_local unsigned int local_queue_atomics_bg;  /* counter used when enqueueing into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS */
+	ccl_local unsigned int local_queue_atomics_ao;  /* counter used when enqueueing into QUEUE_SHADOW_RAY_CAST_AO_RAYS */
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics_bg = 0;
+		local_queue_atomics_ao = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);  /* wait until both counters have been zeroed */
+
+	char enqueue_flag = 0;  /* set to 1 whenever the ray is moved to RAY_UPDATE_BUFFER */
+	char enqueue_flag_AO_SHADOW_RAY_CAST = 0;  /* set to 1 when the ray gets the RAY_SHADOW_RAY_CAST_AO flag */
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);  /* flattened 2D global work-item id */
+	ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);  /* NOTE(review): presumably the ray stored at this queue slot; may be QUEUE_EMPTY_SLOT (checked below) */
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not required.
+	 * If we are executing on a CPU device, then we need to keep all threads active
+	 * since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT)
+		return;
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		/* Load kernel globals structure and ShaderData structure */
+		KernelGlobals *kg = (KernelGlobals *)globals;
+		ShaderData *sd = (ShaderData *)shader_data;
+
+#ifdef __WORK_STEALING__
+		unsigned int my_work;
+		unsigned int pixel_x;
+		unsigned int pixel_y;
+#endif
+		unsigned int tile_x;
+		unsigned int tile_y;
+		int my_sample_tile;
+		unsigned int sample;
+
+		ccl_global RNG *rng = 0x0;
+		ccl_global PathState *state = 0x0;
+		float3 throughput;  /* only assigned (and only read) for rays that are RAY_ACTIVE */
+
+		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+
+			throughput = throughput_coop[ray_index];
+			state = &PathState_coop[ray_index];
+			rng = &rng_coop[ray_index];
+#ifdef __WORK_STEALING__
+			my_work = work_array[ray_index];
+			sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+			get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
+			my_sample_tile = 0;
+#else  /* __WORK_STEALING__ */
+			sample = work_array[ray_index];
+			/* buffer's stride is "stride"; Find x and y using ray_index */
+			int tile_index = ray_index / parallel_samples;
+			tile_x = tile_index % sw;
+			tile_y = tile_index / sw;
+			my_sample_tile = ray_index - (tile_index * parallel_samples);
+#endif  /* __WORK_STEALING__ */
+			per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
+
+			/* holdout */
+#ifdef __HOLDOUT__
+			if((ccl_fetch(sd, flag) & (SD_HOLDOUT|SD_HOLDOUT_MASK)) && (state->flag & PATH_RAY_CAMERA)) {
+				if(kernel_data.background.transparent) {
+					float3 holdout_weight;
+
+					if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK)
+						holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
+					else
+						holdout_weight = shader_holdout_eval(kg, sd);
+
+					/* any throughput is ok, should all be identical here */
+					L_transparent_coop[ray_index] += average(holdout_weight*throughput);
+				}
+
+				if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) {
+					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+					enqueue_flag = 1;
+				}
+			}
+#endif  /* __HOLDOUT__ */
+		}
+
+		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {  /* re-check: the holdout branch above may have changed the ray state */
+
+			PathRadiance *L = &PathRadiance_coop[ray_index];
+			/* holdout mask objects do not write data passes */
+			kernel_write_data_passes(kg, per_sample_output_buffers, L, sd, sample, state, throughput);
+
+			/* blurring of bsdf after bounces, for rays that have a small likelihood
+			 * of following this particular path (diffuse, rough glossy) */
+			if(kernel_data.integrator.filter_glossy != FLT_MAX) {
+				float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
+
+				if(blur_pdf < 1.0f) {
+					float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
+					shader_bsdf_blur(kg, sd, blur_roughness);
+				}
+			}
+
+#ifdef __EMISSION__
+			/* emission */
+			if(ccl_fetch(sd, flag) & SD_EMISSION) {
+				/* todo: is isect.t wrong here for transparent surfaces? */
+				float3 emission = indirect_primitive_emission(kg, sd, Intersection_coop[ray_index].t, state->flag, state->ray_pdf);
+				path_radiance_accum_emission(L, throughput, emission, state->bounce);
+			}
+#endif  /* __EMISSION__ */
+
+			/* path termination. this is a strange place to put the termination, it's
+			 * mainly due to the mixed in MIS that we use. gives too many unneeded
+			 * shader evaluations, only need emission if we are going to terminate */
+			float probability = path_state_terminate_probability(kg, state, throughput);
+
+			if(probability == 0.0f) {
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+				enqueue_flag = 1;
+			}
+
+			if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+				if(probability != 1.0f) {
+					float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
+
+					if(terminate >= probability) {
+						ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+						enqueue_flag = 1;
+					} else {
+						throughput_coop[ray_index] = throughput/probability;
+					}
+				}
+			}
+		}
+
+#ifdef __AO__
+		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {  /* only rays not moved to RAY_UPDATE_BUFFER above */
+			/* ambient occlusion */
+			if(kernel_data.integrator.use_ambient_occlusion || (ccl_fetch(sd, flag) & SD_AO)) {
+				/* todo: solve correlation */
+				float bsdf_u, bsdf_v;
+				path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+
+				float ao_factor = kernel_data.background.ao_factor;
+				float3 ao_N;
+				AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
+				AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd);
+
+				float3 ao_D;
+				float ao_pdf;
+				sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
+
+				if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+					Ray _ray;
+					_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+					_ray.D = ao_D;
+					_ray.t = kernel_data.background.ao_distance;
+#ifdef __OBJECT_MOTION__
+					_ray.time = ccl_fetch(sd, time);
+#endif  /* __OBJECT_MOTION__ */
+					_ray.dP = ccl_fetch(sd, dP);
+					_ray.dD = differential3_zero();
+					AOLightRay_coop[ray_index] = _ray;  /* store the AO ray in the per-ray AOLightRay_coop array */
+
+					ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+					enqueue_flag_AO_SHADOW_RAY_CAST = 1;
+				}
+			}
+		}
+#endif  /* __AO__ */
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_UPDATE_BUFFER rays */
+	enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics_bg, Queue_data, Queue_index);
+#ifdef __AO__
+	/* Enqueue to-shadow-ray-cast rays */
+	enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, enqueue_flag_AO_SHADOW_RAY_CAST, queuesize, &local_queue_atomics_ao, Queue_data, Queue_index);
+#endif  /* __AO__ */
+}
diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h
new file mode 100644
index 00000000000..f400a99e229
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_lamp_emission.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/*
+ * Note on kernel_lamp_emission
+ * This is the 3rd kernel in the ray-tracing logic. This is the second of the
+ * path-iteration kernels. This kernel takes care of the indirect lamp emission logic.
+ * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE
+ * and RAY_HIT_BACKGROUND.
+ * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel.
+ * The input/output of the kernel is as follows,
+ * Throughput_coop ------------------------------------|--- kernel_lamp_emission --|--- PathRadiance_coop
+ * Ray_coop -------------------------------------------|                           |--- Queue_data(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * PathState_coop -------------------------------------|                           |--- Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * kg (globals + data) --------------------------------|                           |
+ * Intersection_coop ----------------------------------|                           |
+ * ray_state ------------------------------------------|                           |
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -----|                           |
+ * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ----|                           |
+ * queuesize ------------------------------------------|                           |
+ * use_queues_flag ------------------------------------|                           |
+ * sw -------------------------------------------------|                           |
+ * sh -------------------------------------------------|                           |
+ * parallel_samples -----------------------------------|                           |
+ *
+ * Note: shader_data is neither an input nor an output. It is just filled and consumed within this same kernel (kernel_lamp_emission).
+ */
+ccl_device void kernel_lamp_emission(
+	ccl_global char *globals, /* Opaque buffer; cast to KernelGlobals* in the body */
+	ccl_constant KernelData *data,
+	ccl_global char *shader_data,               /* Scratch storage; cast to ShaderData* and passed to indirect_lamp_emission() */
+	ccl_global float3 *throughput_coop,         /* Per-ray throughput; only read by this kernel */
+	PathRadiance *PathRadiance_coop, /* Per-ray radiance; lamp emission is accumulated into it */
+	ccl_global Ray *Ray_coop,                   /* Per-ray current ray; only read by this kernel */
+	ccl_global PathState *PathState_coop,       /* Per-ray path state; read into a private copy */
+	Intersection *Intersection_coop, /* Per-ray intersection; only its t member is read here */
+	ccl_global char *ray_state,                 /* Denotes the state of each ray */
+	int sw, int sh, /* Extents for the direct thread -> ray mapping used when queues are not in use */
+	ccl_global int *Queue_data,                 /* Memory for queues */
+	ccl_global int *Queue_index,                /* Tracks the number of elements in queues */
+	int queuesize,                              /* Size (capacity) of queues */
+	ccl_global char *use_queues_flag,           /* Element 0 decides if this kernel uses queues to fetch the ray index */
+	int parallel_samples                        /* Number of samples to be processed in parallel */
+	)
+{
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+
+	/* This kernel empties the queue: global work-item (0, 0) resets its element count. */
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+	}
+
+	/* Fetch use_queues_flag once per work-group into local memory. */
+	ccl_local char local_use_queues_flag;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_use_queues_flag = use_queues_flag[0];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE); /* every work-item must see the flag before branching on it */
+
+	int ray_index;
+	if(local_use_queues_flag) {
+		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+		ray_index = get_ray_index(thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 1); /* final argument 1: presumably clears the fetched slot -- confirm in kernel_queues.h */
+
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	} else {
+		if(x < (sw * parallel_samples) && y < sh){
+			ray_index = x + y * (sw * parallel_samples); /* row-major index over a (sw * parallel_samples) wide grid */
+		} else {
+			return; /* work-item lies outside the grid */
+		}
+	}
+
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+		KernelGlobals *kg = (KernelGlobals *)globals;
+		ShaderData *sd = (ShaderData *)shader_data;
+		PathRadiance *L = &PathRadiance_coop[ray_index];
+
+		float3 throughput = throughput_coop[ray_index];
+		Ray ray = Ray_coop[ray_index];
+		PathState state = PathState_coop[ray_index]; /* private copy; never stored back in this function */
+
+#ifdef __LAMP_MIS__
+		if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
+			/* ray starting from previous non-transparent bounce */
+			Ray light_ray;
+
+			light_ray.P = ray.P - state.ray_t*ray.D;
+			state.ray_t += Intersection_coop[ray_index].t; /* NOTE(review): updates only the private copy, PathState_coop keeps the old ray_t -- confirm this is intended */
+			light_ray.D = ray.D;
+			light_ray.t = state.ray_t;
+			light_ray.time = ray.time;
+			light_ray.dD = ray.dD;
+			light_ray.dP = ray.dP;
+			/* intersect with lamp */
+			float3 emission;
+
+			if(indirect_lamp_emission(kg, &state, &light_ray, &emission, sd)) {
+				path_radiance_accum_emission(L, throughput, emission, state.bounce);
+			}
+		}
+#endif
+		/* __VOLUME__ feature is disabled. NOTE(review): the block below uses hit, isect, rng and continue/break, none of which are declared in this function, and passes &L although L is already a pointer; it needs porting before it can be enabled. */
+#if 0
+#ifdef __VOLUME__
+		/* volume attenuation, emission, scatter */
+		if(state.volume_stack[0].shader != SHADER_NONE) {
+			Ray volume_ray = ray;
+			volume_ray.t = (hit)? isect.t: FLT_MAX;
+
+			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
+
+#ifdef __VOLUME_DECOUPLED__
+			int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
+			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method);
+
+			if(decoupled) {
+				/* cache steps along volume for repeated sampling */
+				VolumeSegment volume_segment;
+				ShaderData volume_sd;
+
+				shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
+				kernel_volume_decoupled_record(kg, &state,
+					&volume_ray, &volume_sd, &volume_segment, heterogeneous);
+
+				volume_segment.sampling_method = sampling_method;
+
+				/* emission */
+				if(volume_segment.closure_flag & SD_EMISSION)
+					path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+
+				/* scattering */
+				VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
+
+				if(volume_segment.closure_flag & SD_SCATTER) {
+					bool all = false;
+
+					/* direct light sampling */
+					kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
+						throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment);
+
+					/* indirect sample. if we use distance sampling and take just
+					 * one sample for direct and indirect light, we could share
+					 * this computation, but makes code a bit complex */
+					float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
+					float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
+
+					result = kernel_volume_decoupled_scatter(kg,
+						&state, &volume_ray, &volume_sd, &throughput,
+						rphase, rscatter, &volume_segment, NULL, true);
+				}
+
+				if(result != VOLUME_PATH_SCATTERED)
+					throughput *= volume_segment.accum_transmittance;
+
+				/* free cached steps */
+				kernel_volume_decoupled_free(kg, &volume_segment);
+
+				if(result == VOLUME_PATH_SCATTERED) {
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
+						continue;
+					else
+						break;
+				}
+			}
+			else
+#endif
+			{
+				/* integrate along volume segment with distance sampling */
+				ShaderData volume_sd;
+				VolumeIntegrateResult result = kernel_volume_integrate(
+					kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng, heterogeneous);
+
+#ifdef __VOLUME_SCATTER__
+				if(result == VOLUME_PATH_SCATTERED) {
+					/* direct lighting */
+					kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L);
+
+					/* indirect light bounce */
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
+						continue;
+					else
+						break;
+				}
+#endif
+			}
+		}
+#endif
+#endif
+	}
+}
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
new file mode 100644
index 00000000000..343dbb06e99
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -0,0 +1,176 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/*
+ * Note on kernel_next_iteration_setup kernel.
+ * This is the tenth kernel in the ray tracing logic. This is the ninth
+ * of the path iteration kernels. This kernel takes care of setting up
+ * Ray for the next iteration of path-iteration and accumulating radiance
+ * corresponding to AO and direct-lighting
+ *
+ * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_next_iteration_setup -|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * throughput_coop --------------------------------------|                                 |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * PathRadiance_coop ------------------------------------|                                 |--- throughput_coop
+ * PathState_coop ---------------------------------------|                                 |--- PathRadiance_coop
+ * shader_data ------------------------------------------|                                 |--- PathState_coop
+ * ray_state --------------------------------------------|                                 |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------|                                 |--- Ray_coop
+ * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                 |--- use_queues_flag
+ * Ray_coop ---------------------------------------------|                                 |
+ * kg (globals + data) ----------------------------------|                                 |
+ * LightRay_dl_coop -------------------------------------|
+ * ISLamp_coop ------------------------------------------|
+ * BSDFEval_coop ----------------------------------------|
+ * LightRay_ao_coop -------------------------------------|
+ * AOBSDF_coop ------------------------------------------|
+ * AOAlpha_coop -----------------------------------------|
+ *
+ * Note on queues,
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
+ * the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER
+ * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
+ * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
+ * changed to RAY_UPDATE_BUFFER, there is no problem.
+ *
+ * State of queues when this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays
+ */
+
+ccl_device void kernel_next_iteration_setup(
+	ccl_global char *globals, /* Opaque buffer; cast to KernelGlobals* in the body */
+	ccl_constant KernelData *data,
+	ccl_global char *shader_data,               /* Cast to ShaderData* and passed to kernel_path_surface_bounce() */
+	ccl_global uint *rng_coop,                  /* Required for setting up ray for next iteration */
+	ccl_global float3 *throughput_coop,         /* Required for setting up ray for next iteration */
+	PathRadiance *PathRadiance_coop, /* Per-ray radiance; AO and direct-light contributions are accumulated into it */
+	ccl_global Ray *Ray_coop,                   /* Required for setting up ray for next iteration */
+	ccl_global PathState *PathState_coop,       /* Required for setting up ray for next iteration */
+	ccl_global Ray *LightRay_dl_coop,           /* Direct lighting: .P is read as the shadow value, .t as the "update radiance" flag */
+	ccl_global int *ISLamp_coop,                /* Required for radiance update - direct lighting */
+	ccl_global BsdfEval *BSDFEval_coop,         /* Required for radiance update - direct lighting */
+	ccl_global Ray *LightRay_ao_coop,           /* AO: .P is read as the shadow value, .t as the "update radiance" flag */
+	ccl_global float3 *AOBSDF_coop,             /* Required for radiance update - AO */
+	ccl_global float3 *AOAlpha_coop,            /* Required for radiance update - AO */
+	ccl_global char *ray_state,                 /* Denotes the state of each ray */
+	ccl_global int *Queue_data,                 /* Queue memory */
+	ccl_global int *Queue_index,                /* Tracks the number of elements in each queue */
+	int queuesize,                              /* Size (capacity) of each queue */
+	ccl_global char *use_queues_flag            /* Element 0 is set to 1 here so that later kernels fetch ray indices from queues */
+	)
+{
+
+	ccl_local unsigned int local_queue_atomics; /* per-work-group counter handed to enqueue_ray_index_local() */
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE); /* counter must be zeroed before any work-item uses it */
+
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		/* If we are here, then the scene-intersect kernel has already
+		 * been executed at least once. From the next time onward, the
+		 * scene-intersect kernel may operate on queues to fetch ray index.
+		 */
+		use_queues_flag[0] = 1;
+
+		/* Reset the element counts of the QUEUE_SHADOW_RAY_CAST_AO_RAYS and
+		 * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues, which were emptied by the previous kernel.
+		 */
+		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+	}
+
+	char enqueue_flag = 0; /* set to 1 when this work-item's ray must go to the buffer-update queue */
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0); /* final argument 0: presumably leaves the slot in place -- confirm in kernel_queues.h */
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not required.
+	 * If we are executing on a CPU device, then we need to keep all threads active
+	 * since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT)
+		return;
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		/* Load kernel globals structure and ShaderData structure */
+		KernelGlobals *kg = (KernelGlobals *)globals;
+		ShaderData *sd = (ShaderData *)shader_data;
+		PathRadiance *L = 0x0;
+		ccl_global PathState *state = 0x0;
+
+		/* Path radiance update for rays that carry an AO and/or direct-lighting shadow-ray flag */
+		if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
+			state = &PathState_coop[ray_index];
+			L = &PathRadiance_coop[ray_index];
+			float3 _throughput = throughput_coop[ray_index];
+
+			if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
+				float3 shadow = LightRay_ao_coop[ray_index].P; /* the ray's P field carries the shadow value here (presumably stored by the shadow kernel -- confirm) */
+				char update_path_radiance = LightRay_ao_coop[ray_index].t; /* the ray's t field carries a boolean-like flag here */
+				if(update_path_radiance) {
+					path_radiance_accum_ao(L, _throughput, AOAlpha_coop[ray_index], AOBSDF_coop[ray_index], shadow, state->bounce);
+				}
+				REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+			}
+
+			if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
+				float3 shadow = LightRay_dl_coop[ray_index].P; /* same convention as the AO branch above */
+				char update_path_radiance = LightRay_dl_coop[ray_index].t;
+				if(update_path_radiance) {
+					BsdfEval L_light = BSDFEval_coop[ray_index];
+					path_radiance_accum_light(L, _throughput, &L_light, shadow, 1.0f, state->bounce, ISLamp_coop[ray_index]);
+				}
+				REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
+			}
+		}
+
+		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+
+			ccl_global float3 *throughput = &throughput_coop[ray_index];
+			ccl_global Ray *ray = &Ray_coop[ray_index];
+			ccl_global RNG* rng = &rng_coop[ray_index];
+			state = &PathState_coop[ray_index];
+			L = &PathRadiance_coop[ray_index];
+
+			/* Set up the next bounce; if the helper returns false the path ends and the ray is marked for buffer update */
+			if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) {
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+				enqueue_flag = 1;
+			}
+		}
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_UPDATE_BUFFER rays */
+	enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
+}
diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h
new file mode 100644
index 00000000000..9bcf8f540b4
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../kernel_compat_opencl.h"
+#include "../kernel_math.h"
+#include "../kernel_types.h"
+#include "../kernel_globals.h"
+#include "../kernel_queues.h"
+
+/*
+ * The kernel "kernel_queue_enqueue" enqueues rays of
+ * different ray state into their appropriate Queues;
+ * 1. Rays that have been determined to hit the background from the
+ * "kernel_scene_intersect" kernel
+ * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+ * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * The input and output of the kernel is as follows,
+ *
+ * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                           |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                           |
+ * queuesize -------------------------------------------|                           |
+ *
+ * Note on Queues :
+ * State of queues during the first time this kernel is called :
+ * At entry,
+ * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays.
+ *
+ * State of queue during other times this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
+ */
+
+ccl_device void kernel_queue_enqueue(
+	ccl_global int *Queue_data,   /* Queue memory; receives the ray indices */
+	ccl_global int *Queue_index,  /* Tracks the number of elements in each queue */
+	ccl_global char *ray_state,   /* Denotes the state of each ray */
+	int queuesize                 /* Size (capacity) of each queue */
+	)
+{
+	/* One work-group-local counter per target queue; we have only 2 cases (Hit/Not-Hit) */
+	ccl_local unsigned int local_queue_atomics[2];
+
+	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); /* flattened index inside the work-group */
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); /* one ray per global work-item */
+
+	if(lidx < 2 ) { /* the first two work-items zero one counter each */
+		local_queue_atomics[lidx] = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int queue_number = -1; /* -1: this ray goes into neither queue */
+
+	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+		queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+	} else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	}
+
+	unsigned int my_lqidx; /* only assigned, and only read, when queue_number != -1 */
+	if(queue_number != -1) {
+		my_lqidx = get_local_queue_index(queue_number, local_queue_atomics); /* presumably reserves a slot within the work-group via the local counter -- confirm in kernel_queues.h */
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(lidx == 0) { /* NOTE(review): local_queue_atomics[2] is indexed by the queue ids, so both ids must be 0 or 1 -- confirm against the queue enum */
+		local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS, local_queue_atomics, Queue_index);
+		local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, local_queue_atomics, Queue_index);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE); /* the counters now hold the values returned by get_global_per_queue_offset() */
+
+	unsigned int my_gqidx;
+	if(queue_number != -1) {
+		my_gqidx = get_global_queue_index(queue_number, queuesize, my_lqidx, local_queue_atomics);
+		Queue_data[my_gqidx] = ray_index; /* publish this ray in its queue */
+	}
+}
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
new file mode 100644
index 00000000000..01e0b1fd19e
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_scene_intersect.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/*
+ * Note on kernel_scene_intersect kernel.
+ * This is the second kernel in the ray tracing logic. This is the first
+ * of the path iteration kernels. This kernel takes care of scene_intersect function.
+ *
+ * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE.
+ * This kernel processes rays of ray state RAY_ACTIVE
+ * This kernel determines the rays that have hit the background and changes their ray state to RAY_HIT_BACKGROUND.
+ *
+ * The input and output are as follows,
+ *
+ * Ray_coop ---------------------------------------|--------- kernel_scene_intersect----------|--- PathState
+ * PathState_coop ---------------------------------|                                          |--- Intersection
+ * ray_state --------------------------------------|                                          |--- ray_state
+ * use_queues_flag --------------------------------|                                          |
+ * parallel_samples -------------------------------|                                          |
+ * QueueData(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                                          |
+ * kg (data + globals) ----------------------------|                                          |
+ * rng_coop ---------------------------------------|                                          |
+ * sw ---------------------------------------------|                                          |
+ * sh ---------------------------------------------|                                          |
+ * queuesize --------------------------------------|                                          |
+ *
+ * Note on Queues :
+ * Ideally we would want kernel_scene_intersect to work on queues.
+ * But during the very first time, the queues will be empty and hence we perform a direct mapping
+ * between ray-index and thread-index; From the next time onward, the queue will be filled and
+ * we may start operating on queues.
+ *
+ * State of queue during the first time this kernel is called :
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty before and after this kernel.
+ *
+ * State of queues during other times this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will have a mix of RAY_ACTIVE, RAY_UPDATE_BUFFER and RAY_REGENERATED rays;
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays ;
+ * (The rays that are in the state RAY_UPDATE_BUFFER in both the queues are actually the same rays; these
+ * are the rays that were in RAY_ACTIVE state during the initial enqueue but, on further processing
+ * by different kernels, have turned into RAY_UPDATE_BUFFER rays. Since all kernels, even after fetching from
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS, proceed further based on ray state information, RAY_UPDATE_BUFFER rays
+ * being present in QUEUE_ACTIVE_AND_REGENERATED_RAYS does not cause any logical issues.)
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS - All RAY_REGENERATED rays will have been converted to RAY_ACTIVE and
+ * Some rays in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue will move to state RAY_HIT_BACKGROUND
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS - no change
+ */
+
+ccl_device void kernel_scene_intersect(
+	ccl_global char *globals, /* Opaque buffer; cast to KernelGlobals* in the body */
+	ccl_constant KernelData *data,
+	ccl_global uint *rng_coop, /* Per-ray RNG; only read (into a private copy) when __HAIR__ is defined */
+	ccl_global Ray *Ray_coop,                   /* Required for scene_intersect */
+	ccl_global PathState *PathState_coop,       /* Required for scene_intersect */
+	Intersection *Intersection_coop,            /* Per-ray slot that receives the scene_intersect() result */
+	ccl_global char *ray_state,                 /* Denotes the state of each ray */
+	int sw, int sh, /* Extents for the direct thread -> ray mapping used when queues are not in use */
+	ccl_global int *Queue_data,                 /* Memory for queues */
+	ccl_global int *Queue_index,                /* Tracks the number of elements in queues */
+	int queuesize,                              /* Size (capacity) of queues */
+	ccl_global char *use_queues_flag,           /* Element 0 decides if this kernel uses queues to fetch the ray index */
+#ifdef __KERNEL_DEBUG__
+	DebugData *debugdata_coop,
+#endif
+	int parallel_samples                        /* Number of samples to be processed in parallel */
+	)
+{
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+
+	/* Fetch use_queues_flag once per work-group into local memory. */
+	ccl_local char local_use_queues_flag;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_use_queues_flag = use_queues_flag[0];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE); /* every work-item must see the flag before branching on it */
+
+	int ray_index;
+	if(local_use_queues_flag) {
+		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+		ray_index = get_ray_index(thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0); /* final argument 0: presumably leaves the slot in place -- confirm in kernel_queues.h */
+
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	} else {
+		if(x < (sw * parallel_samples) && y < sh){
+			ray_index = x + y * (sw * parallel_samples); /* row-major index over a (sw * parallel_samples) wide grid */
+		} else {
+			return; /* work-item lies outside the grid */
+		}
+	}
+
+	/* All regenerated rays become active here */
+	if(IS_STATE(ray_state, ray_index, RAY_REGENERATED))
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE);
+
+	if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE)) /* only active rays are intersected */
+		return;
+
+	/* Load kernel globals structure */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+
+#ifdef __KERNEL_DEBUG__
+	DebugData *debug_data = &debugdata_coop[ray_index];
+#endif
+	Intersection *isect = &Intersection_coop[ray_index]; /* result is written in place */
+	PathState state = PathState_coop[ray_index]; /* private copy; PathState_coop is not written by this kernel */
+	Ray ray = Ray_coop[ray_index]; /* private copy */
+
+	/* intersect scene */
+	uint visibility = path_state_ray_visibility(kg, &state);
+
+#ifdef __HAIR__
+	float difl = 0.0f, extmax = 0.0f;
+	uint lcg_state = 0;
+	RNG rng = rng_coop[ray_index]; /* private copy; rng_coop is not written by this kernel */
+
+	if(kernel_data.bvh.have_curves) {
+		if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {
+			float3 pixdiff = ray.dD.dx + ray.dD.dy;
+			/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
+			difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
+		}
+
+		extmax = kernel_data.curve.maximum_width;
+		lcg_state = lcg_state_init(&rng, &state, 0x51633e2d);
+	}
+
+	bool hit = scene_intersect(kg, &ray, visibility, isect, &lcg_state, difl, extmax);
+#else
+	bool hit = scene_intersect(kg, &ray, visibility, isect, NULL, 0.0f, 0.0f);
+#endif
+
+#ifdef __KERNEL_DEBUG__
+	if(state.flag & PATH_RAY_CAMERA) { /* traversal steps are counted for camera rays only */
+		debug_data->num_bvh_traversal_steps += isect->num_traversal_steps;
+	}
+#endif
+
+	if(!hit) {
+		/* Change the state of rays that hit the background;
+		 * these rays undergo special processing in the
+		 * background_buffer_update kernel. */
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
+	}
+}
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
new file mode 100644
index 00000000000..0a8d77f52b0
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shader_eval.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/*
+ * Note on kernel_shader_eval kernel
+ * This kernel is the 5th kernel in the ray tracing logic. This is
+ * the 4th kernel in path iteration. This kernel sets up the ShaderData
+ * structure from the values computed by the previous kernels. It also identifies
+ * the rays of state RAY_TO_REGENERATE and enqueues them in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+ *
+ * The input and output of the kernel is as follows,
+ * rng_coop -------------------------------------------|--- kernel_shader_eval --|--- shader_data
+ * Ray_coop -------------------------------------------|                         |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * PathState_coop -------------------------------------|                         |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Intersection_coop ----------------------------------|                         |
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)------|                         |
+ * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)---|                         |
+ * ray_state ------------------------------------------|                         |
+ * kg (globals + data) --------------------------------|                         |
+ * queuesize ------------------------------------------|                         |
+ *
+ * Note on Queues :
+ * This kernel reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
+ * only the rays of state RAY_ACTIVE;
+ * State of queues when this kernel is called,
+ * at entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ * at exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays
+ */
+
+ccl_device void kernel_shader_eval(
+	ccl_global char *globals,
+	ccl_constant KernelData *data,
+	ccl_global char *shader_data,               /* ShaderData storage filled in by this kernel */
+	ccl_global uint *rng_coop,                  /* Per-ray RNG state, used to draw rbsdf */
+	ccl_global Ray *Ray_coop,                   /* Per-ray Ray, input to the shader setup */
+	ccl_global PathState *PathState_coop,       /* Per-ray PathState */
+	Intersection *Intersection_coop, /* Per-ray Intersection, input to the shader setup */
+	ccl_global char *ray_state,                 /* State of each ray */
+	ccl_global int *Queue_data,                 /* Queue memory */
+	ccl_global int *Queue_index,                /* Number of elements in each queue */
+	int queuesize                               /* Size (capacity) of each queue */
+	)
+{
+	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+
+	/* Work-group local counter handed to enqueue_ray_index_local(); zeroed by work-item (0, 0). */
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* Enqueue RAY_TO_REGENERATE rays into the QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
+	char enqueue_flag = 0;
+	if(IS_STATE(ray_state, thread_index, RAY_TO_REGENERATE)) {
+		enqueue_flag = 1;
+	}
+	enqueue_ray_index_local(thread_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
+
+	/* Fetch the ray to shade; empty queue slots and rays that are not RAY_ACTIVE are skipped. */
+	int ray_index = get_ray_index(thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
+	if(ray_index == QUEUE_EMPTY_SLOT || !IS_STATE(ray_state, ray_index, RAY_ACTIVE))
+		return;
+
+	KernelGlobals *kg = (KernelGlobals *)globals;
+	ShaderData *sd = (ShaderData *)shader_data;
+	ccl_global PathState *state = &PathState_coop[ray_index];
+	Ray ray = Ray_coop[ray_index];
+
+	/* Set up ShaderData from the ray and its intersection, then evaluate the surface shader. */
+	shader_setup_from_ray(kg, sd, &Intersection_coop[ray_index], &ray, state->bounce, state->transparent_bounce);
+	float rbsdf = path_state_rng_1D_for_decision(kg, &rng_coop[ray_index], state, PRNG_BSDF);
+	shader_eval_surface(kg, sd, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
+}
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked.h b/intern/cycles/kernel/split/kernel_shadow_blocked.h
new file mode 100644
index 00000000000..71fab19518c
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/*
+ * Note on kernel_shadow_blocked kernel.
+ * This is the ninth kernel in the ray tracing logic. This is the eighth
+ * of the path iteration kernels. This kernel takes care of "shadow ray cast"
+ * logic of the direct lighting and AO  part of ray tracing.
+ *
+ * The input and output are as follows,
+ *
+ * PathState_coop ----------------------------------|--- kernel_shadow_blocked --|
+ * LightRay_dl_coop --------------------------------|                            |--- LightRay_dl_coop
+ * LightRay_ao_coop --------------------------------|                            |--- LightRay_ao_coop
+ * ray_state ---------------------------------------|                            |--- ray_state
+ * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS &       |                            |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS)
+ *            QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                            |
+ * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS &      |                            |
+ *             QUEUE_SHADOW_RAY_CAST_DL_RAYS) ------|                            |
+ * kg (globals + data) -----------------------------|                            |
+ * queuesize ---------------------------------------|                            |
+ *
+ * Note on shader_shadow : shader_shadow is neither input nor output to this kernel. shader_shadow is filled and consumed in this kernel itself.
+ * Note on queues :
+ * The kernel fetches from QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS queues. We will empty
+ * these queues in this kernel.
+ * State of queues when this kernel is called :
+ * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
+ * before and after this kernel call.
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS will be filled with rays marked with flags RAY_SHADOW_RAY_CAST_AO
+ * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry.
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit.
+ */
+
+ccl_device void kernel_shadow_blocked(
+	ccl_global char *globals,
+	ccl_constant KernelData *data,              /* Not referenced directly in this function body */
+	ccl_global char *shader_shadow,             /* Required for shadow blocked */
+	ccl_global PathState *PathState_coop,       /* Required for shadow blocked */
+	ccl_global Ray *LightRay_dl_coop,           /* Required for direct lighting's shadow blocked */
+	ccl_global Ray *LightRay_ao_coop,           /* Required for AO's shadow blocked */
+	Intersection *Intersection_coop_AO,         /* Per-ray Intersection handed to shadow_blocked for AO rays */
+	Intersection *Intersection_coop_DL,         /* Per-ray Intersection handed to shadow_blocked for DL rays */
+	ccl_global char *ray_state,                 /* Per-ray state and flags */
+	ccl_global int *Queue_data,                 /* Queue memory */
+	ccl_global int *Queue_index,                /* Tracks the number of elements in each queue */
+	int queuesize,                              /* Size (capacity) of each queue */
+	int total_num_rays                          /* Not referenced in this function body */
+	)
+{
+	/* The Queue_index entries of the AO and DL shadow queues are not reset here;
+	 * per the original note they are set to 0 in the next kernel. */
+
+	/* Linear index of this work-item inside its work-group. The row stride is
+	 * get_local_size(0), so only work-item (0, 0) maps to index 0. */
+	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+
+	/* Queue lengths are read from Queue_index once per work-group and shared through local memory. */
+	ccl_local unsigned int ao_queue_length;
+	ccl_local unsigned int dl_queue_length;
+	if(lidx == 0) {
+		ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
+		dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* flag determining if the current ray is to process shadow ray for AO or DL */
+	char shadow_blocked_type = -1;
+	/* flag determining if we need to update L */
+	char update_path_radiance = 0;
+
+	/* Threads [0, ao_queue_length) fetch from the AO queue; the following
+	 * dl_queue_length threads fetch from the DL queue. All others stay idle. */
+	int ray_index = QUEUE_EMPTY_SLOT;
+	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	if(thread_index < ao_queue_length + dl_queue_length) {
+		if(thread_index < ao_queue_length) {
+			ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1);
+			shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO;
+		} else {
+			ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1);
+			shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL;
+		}
+	}
+
+	if(ray_index == QUEUE_EMPTY_SLOT)
+		return;
+
+	if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
+		/* Load kernel global structure */
+		KernelGlobals *kg = (KernelGlobals *)globals;
+		ShaderData *sd_shadow  = (ShaderData *)shader_shadow;
+
+		ccl_global PathState *state = &PathState_coop[ray_index];
+		ccl_global Ray *light_ray_dl_global = &LightRay_dl_coop[ray_index];
+		ccl_global Ray *light_ray_ao_global = &LightRay_ao_coop[ray_index];
+		Intersection *isect_ao_global = &Intersection_coop_AO[ray_index];
+		Intersection *isect_dl_global = &Intersection_coop_DL[ray_index];
+
+		/* Both the light ray and the Intersection slot follow the queue this work-item fetched from. */
+		ccl_global Ray *light_ray_global = shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO ? light_ray_ao_global : light_ray_dl_global;
+		Intersection *isect_global = shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO ? isect_ao_global : isect_dl_global;
+
+		float3 shadow;
+		update_path_radiance = !(shadow_blocked(kg, state, light_ray_global, &shadow, sd_shadow, isect_global));
+
+		/* We use light_ray_global's P and t to store shadow and update_path_radiance */
+		light_ray_global->P = shadow;
+		light_ray_global->t = update_path_radiance;
+	}
+}
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
new file mode 100644
index 00000000000..f5830be6bb1
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef  _KERNEL_SPLIT_H_
+#define  _KERNEL_SPLIT_H_
+
+#include "../kernel_compat_opencl.h"
+#include "../kernel_math.h"
+#include "../kernel_types.h"
+#include "../kernel_globals.h"
+
+#include "../util_atomic.h"
+
+#include "../kernel_random.h"
+#include "../kernel_projection.h"
+#include "../kernel_montecarlo.h"
+#include "../kernel_differential.h"
+#include "../kernel_camera.h"
+
+#include "../geom/geom.h"
+
+#include "../kernel_accumulate.h"
+#include "../kernel_shader.h"
+#include "../kernel_light.h"
+#include "../kernel_passes.h"
+
+#ifdef __SUBSURFACE__
+#include "../kernel_subsurface.h"
+#endif
+
+#ifdef __VOLUME__
+#include "../kernel_volume.h"
+#endif
+
+#include "../kernel_path_state.h"
+#include "../kernel_shadow.h"
+#include "../kernel_emission.h"
+#include "../kernel_path_common.h"
+#include "../kernel_path_surface.h"
+#include "../kernel_path_volume.h"
+
+#ifdef __KERNEL_DEBUG__
+#include "../kernel_debug.h"
+#endif
+
+#include "../kernel_queues.h"
+#include "../kernel_work_stealing.h"
+
+#endif
diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h
new file mode 100644
index 00000000000..eeb7da76e73
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_sum_all_radiance.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../kernel_compat_opencl.h"
+#include "../kernel_math.h"
+#include "../kernel_types.h"
+#include "../kernel_globals.h"
+
+/*
+ * Since various samples are processed in parallel, the output radiance of different
+ * samples is stored in different locations. This kernel combines the output radiance
+ * contributed by all those samples and stores it in the RenderTile's output buffer.
+ */
+
+ccl_device void kernel_sum_all_radiance(
+	ccl_constant KernelData *data,               /* Supplies film.pass_stride, the float count per pixel */
+	ccl_global float *buffer,                    /* Output buffer of RenderTile */
+	ccl_global float *per_sample_output_buffer,  /* Radiance contributed by all samples */
+	int parallel_samples, int sw, int sh, int stride,
+	int buffer_offset_x,
+	int buffer_offset_y,
+	int buffer_stride,
+	int start_sample)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+
+	/* Work-items outside the sw x sh region have nothing to sum. */
+	if(x >= sw || y >= sh)
+		return;
+
+	const int pass_stride = data->film.pass_stride;
+	/* Move both pointers to the first float belonging to pixel (x, y). */
+	buffer += ((buffer_offset_x + x) + (buffer_offset_y + y) * buffer_stride) * pass_stride;
+	per_sample_output_buffer += ((x + y * stride) * parallel_samples) * pass_stride;
+
+	for(int sample = 0; sample < parallel_samples; sample++) {
+		/* Sample 0 of a run starting at start_sample == 0 overwrites; every other sample accumulates. */
+		const bool overwrite = (start_sample == 0 && sample == 0);
+		for(int i = 0; i < pass_stride; i++) {
+			const float sample_value = per_sample_output_buffer[i];
+			buffer[i] = overwrite ? sample_value : buffer[i] + sample_value;
+		}
+		per_sample_output_buffer += pass_stride;
+	}
+}
-- 
cgit v1.2.3