19 files changed, 2079 insertions, 0 deletions
diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp
new file mode 100644
index 00000000000..37a73ab2f04
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CPU kernel entry points */
+
+#include "kernel_compat_cpu.h"
+#include "kernel.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_path_branched.h"
+#include "kernel_bake.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Memory Copy */
+
+void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size)
+{
+	if(strcmp(name, "__data") == 0)
+		memcpy(&kg->__data, host, size);
+	else
+		assert(0);
+}
+
+void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t width, size_t height, size_t depth, InterpolationType interpolation)
+{
+	if(0) {
+	}
+
+#define KERNEL_TEX(type, ttype, tname) \
+	else if(strcmp(name, #tname) == 0) { \
+		kg->tname.data = (type*)mem; \
+		kg->tname.width = width; \
+	}
+#define KERNEL_IMAGE_TEX(type, ttype, tname)
+#include "kernel_textures.h"
+
+	else if(strstr(name, "__tex_image_float")) {
+		texture_image_float4 *tex = NULL;
+		int id = atoi(name + strlen("__tex_image_float_"));
+		int array_index = id;
+
+		if(array_index >= 0 && array_index < MAX_FLOAT_IMAGES) {
+			tex = &kg->texture_float_images[array_index];
+		}
+
+		if(tex) {
+			tex->data = (float4*)mem;
+			tex->dimensions_set(width, height, depth);
+			tex->interpolation = interpolation;
+		}
+	}
+	else if(strstr(name, "__tex_image")) {
+		texture_image_uchar4 *tex = NULL;
+		int id = atoi(name + strlen("__tex_image_"));
+		int array_index = id - MAX_FLOAT_IMAGES;
+
+		if(array_index >= 0 && array_index < MAX_BYTE_IMAGES) {
+			tex = &kg->texture_byte_images[array_index];
+		}
+
+		if(tex) {
+			tex->data = (uchar4*)mem;
+			tex->dimensions_set(width, height, depth);
+			tex->interpolation = interpolation;
+		}
+	}
+	else
+		assert(0);
+}
+
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this one with SSE2 intrinsics */
+#if defined(__x86_64__) || defined(_M_X64)
+#define __KERNEL_SSE2__
+#endif
+
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)
+	/* do nothing */
+#endif
+
+/* Path Tracing */
+
+void kernel_cpu_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
+{
+#ifdef __BRANCHED_PATH__
+	if(kernel_data.integrator.branched)
+		kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+	else
+#endif
+		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+/* Film */
+
+void kernel_cpu_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+/* Shader Evaluation */
+
+void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
+{
+	if(type >= SHADER_EVAL_BAKE)
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+	else
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
new file mode 100644
index 00000000000..df77bedc729
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+ 
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#define __KERNEL_SSE2__
+#define __KERNEL_SSE3__
+#define __KERNEL_SSSE3__
+#define __KERNEL_SSE41__
+#define __KERNEL_AVX__
+#endif
+ 
+#include "util_optimization.h"
+ 
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+
+#include "kernel_compat_cpu.h"
+#include "kernel.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_path_branched.h"
+#include "kernel_bake.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing */
+
+void kernel_cpu_avx_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
+{
+#ifdef __BRANCHED_PATH__
+	if(kernel_data.integrator.branched)
+		kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+	else
+#endif
+		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+/* Film */
+
+void kernel_cpu_avx_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+/* Shader Evaluate */
+
+void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
+{
+	if(type >= SHADER_EVAL_BAKE)
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+	else
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+}
+
+CCL_NAMESPACE_END
+#else
+
+/* needed for some linkers in combination with scons making empty compilation unit in a library */
+void __dummy_function_cycles_avx(void);
+void __dummy_function_cycles_avx(void) {}
+
+#endif
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
new file mode 100644
index 00000000000..b3192369794
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+ 
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#define __KERNEL_SSE2__
+#define __KERNEL_SSE3__
+#define __KERNEL_SSSE3__
+#define __KERNEL_SSE41__
+#define __KERNEL_AVX__
+#define __KERNEL_AVX2__
+#endif
+ 
+#include "util_optimization.h"
+ 
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+
+#include "kernel_compat_cpu.h"
+#include "kernel.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_path_branched.h"
+#include "kernel_bake.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing */
+
+void kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
+{
+#ifdef __BRANCHED_PATH__
+	if(kernel_data.integrator.branched)
+		kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+	else
+#endif
+		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+/* Film */
+
+void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+/* Shader Evaluate */
+
+void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
+{
+	if(type >= SHADER_EVAL_BAKE)
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+	else
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+}
+
+CCL_NAMESPACE_END
+#else
+
+/* needed for some linkers in combination with scons making empty compilation unit in a library */
+void __dummy_function_cycles_avx2(void);
+void __dummy_function_cycles_avx2(void) {}
+
+#endif
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
new file mode 100644
index 00000000000..f9c5134e442
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#define __KERNEL_SSE2__
+#endif
+
+#include "util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+
+#include "kernel_compat_cpu.h"
+#include "kernel.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_path_branched.h"
+#include "kernel_bake.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing */
+
+void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
+{
+#ifdef __BRANCHED_PATH__
+	if(kernel_data.integrator.branched)
+		kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+	else
+#endif
+		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+/* Film */
+
+void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+/* Shader Evaluate */
+
+void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
+{
+	if(type >= SHADER_EVAL_BAKE)
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+	else
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+}
+
+CCL_NAMESPACE_END
+
+#else
+
+/* needed for some linkers in combination with scons making empty compilation unit in a library */
+void __dummy_function_cycles_sse2(void);
+void __dummy_function_cycles_sse2(void) {}
+
+#endif
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
new file mode 100644
index 00000000000..2dbe4b81821
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#define __KERNEL_SSE2__
+#define __KERNEL_SSE3__
+#define __KERNEL_SSSE3__
+#endif
+
+#include "util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+
+#include "kernel_compat_cpu.h"
+#include "kernel.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_path_branched.h"
+#include "kernel_bake.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing */
+
+void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
+{
+#ifdef __BRANCHED_PATH__
+	if(kernel_data.integrator.branched)
+		kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+	else
+#endif
+		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+/* Film */
+
+void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+/* Shader Evaluate */
+
+void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
+{
+	if(type >= SHADER_EVAL_BAKE)
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+	else
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+}
+
+CCL_NAMESPACE_END
+#else
+
+/* needed for some linkers in combination with scons making empty compilation unit in a library */
+void __dummy_function_cycles_sse3(void);
+void __dummy_function_cycles_sse3(void) {}
+
+#endif
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
new file mode 100644
index 00000000000..5c57ad01181
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#define __KERNEL_SSE2__
+#define __KERNEL_SSE3__
+#define __KERNEL_SSSE3__
+#define __KERNEL_SSE41__
+#endif
+
+#include "util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+
+#include "kernel_compat_cpu.h"
+#include "kernel.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_path_branched.h"
+#include "kernel_bake.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing */
+
+void kernel_cpu_sse41_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
+{
+#ifdef __BRANCHED_PATH__
+	if(kernel_data.integrator.branched)
+		kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+	else
+#endif
+		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+/* Film */
+
+void kernel_cpu_sse41_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+/* Shader Evaluate */
+
+void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
+{
+	if(type >= SHADER_EVAL_BAKE)
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+	else
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+}
+
+CCL_NAMESPACE_END
+#else
+
+/* needed for some linkers in combination with scons making empty compilation unit in a library */
+void __dummy_function_cycles_sse41(void);
+void __dummy_function_cycles_sse41(void) {}
+
+#endif
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
new file mode 100644
index 00000000000..bcd55b8c676
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -0,0 +1,181 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CUDA kernel entry points */
+
+#include "../../kernel_compat_cuda.h"
+#include "../../kernel_math.h"
+#include "../../kernel_types.h"
+#include "../../kernel_globals.h"
+#include "../../kernel_film.h"
+#include "../../kernel_path.h"
+#include "../../kernel_path_branched.h"
+#include "../../kernel_bake.h"
+
+/* device data taken from CUDA occupancy calculator */
+
+#ifdef __CUDA_ARCH__
+
+/* 2.0 and 2.1 */
+#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
+#define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
+#define CUDA_BLOCK_MAX_THREADS 1024
+#define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#define CUDA_THREADS_BLOCK_WIDTH 16
+#define CUDA_KERNEL_MAX_REGISTERS 32
+#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
+
+/* 3.0 and 3.5 */
+#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
+#define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#define CUDA_BLOCK_MAX_THREADS 1024
+#define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#define CUDA_THREADS_BLOCK_WIDTH 16
+#define CUDA_KERNEL_MAX_REGISTERS 63
+#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 3.2 */
+#elif __CUDA_ARCH__ == 320
+#define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#define CUDA_BLOCK_MAX_THREADS 1024
+#define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#define CUDA_THREADS_BLOCK_WIDTH 16
+#define CUDA_KERNEL_MAX_REGISTERS 63
+#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 5.0 and 5.2 */
+#elif __CUDA_ARCH__ == 500 || __CUDA_ARCH__ == 520
+#define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
+#define CUDA_BLOCK_MAX_THREADS 1024
+#define CUDA_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+#define CUDA_THREADS_BLOCK_WIDTH 16
+#define CUDA_KERNEL_MAX_REGISTERS 40
+#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* unknown architecture */
+#else
+#error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
+#endif
+
+/* compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread */
+
+#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
+	__launch_bounds__( \
+		threads_block_width*threads_block_width, \
+		CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
+		)
+
+/* sanity checks */
+
+#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
+#error "Maximum number of threads per block exceeded"
+#endif
+
+#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
+#error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+#error "Maximum number of registers per thread exceeded"
+#endif
+
+#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+#error "Maximum number of registers per thread exceeded"
+#endif
+
+/* kernels */
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+
+	if(x < sx + sw && y < sy + sh)
+		kernel_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+#ifdef __BRANCHED_PATH__
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
+kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+
+	if(x < sx + sw && y < sy + sh)
+		kernel_branched_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride);
+}
+#endif
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+
+	if(x < sx + sw && y < sy + sh)
+		kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+
+	if(x < sx + sw && y < sy + sh)
+		kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample)
+{
+	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+
+	if(x < sx + sw)
+		kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x, sample);
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_bake(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample)
+{
+	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+
+	if(x < sx + sw)
+		kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, x, offset, sample);
+}
+
+#endif
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
new file mode 100644
index 00000000000..15fb34cfe3b
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -0,0 +1,175 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* OpenCL kernel entry points - unfinished */
+
+#include "../../kernel_compat_opencl.h"
+#include "../../kernel_math.h"
+#include "../../kernel_types.h"
+#include "../../kernel_globals.h"
+
+#include "../../kernel_film.h"
+#include "../../kernel_path.h"
+#include "../../kernel_path_branched.h"
+#include "../../kernel_bake.h"
+
+#ifdef __COMPILE_ONLY_MEGAKERNEL__
+
+__kernel void kernel_ocl_path_trace(
+	ccl_constant KernelData *data,
+	ccl_global float *buffer,
+	ccl_global uint *rng_state,
+
+#define KERNEL_TEX(type, ttype, name) \
+	ccl_global type *name,
+#include "../../kernel_textures.h"
+
+	int sample,
+	int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	KernelGlobals kglobals, *kg = &kglobals;
+
+	kg->data = data;
+
+#define KERNEL_TEX(type, ttype, name) \
+	kg->name = name;
+#include "../../kernel_textures.h"
+
+	int x = sx + get_global_id(0);
+	int y = sy + get_global_id(1);
+
+	if(x < sx + sw && y < sy + sh)
+		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+#else // __COMPILE_ONLY_MEGAKERNEL__
+
+__kernel void kernel_ocl_shader(
+	ccl_constant KernelData *data,
+	ccl_global uint4 *input,
+	ccl_global float4 *output,
+
+#define KERNEL_TEX(type, ttype, name) \
+	ccl_global type *name,
+#include "../../kernel_textures.h"
+
+	int type, int sx, int sw, int offset, int sample)
+{
+	KernelGlobals kglobals, *kg = &kglobals;
+
+	kg->data = data;
+
+#define KERNEL_TEX(type, ttype, name) \
+	kg->name = name;
+#include "../../kernel_textures.h"
+
+	int x = sx + get_global_id(0);
+
+	if(x < sx + sw)
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
+}
+
+__kernel void kernel_ocl_bake(
+	ccl_constant KernelData *data,
+	ccl_global uint4 *input,
+	ccl_global float4 *output,
+
+#define KERNEL_TEX(type, ttype, name) \
+	ccl_global type *name,
+#include "../../kernel_textures.h"
+
+	int type, int sx, int sw, int offset, int sample)
+{
+	KernelGlobals kglobals, *kg = &kglobals;
+
+	kg->data = data;
+
+#define KERNEL_TEX(type, ttype, name) \
+	kg->name = name;
+#include "../../kernel_textures.h"
+
+	int x = sx + get_global_id(0);
+
+	if(x < sx + sw) {
+#if defined(__KERNEL_OPENCL_NVIDIA__) && __COMPUTE_CAPABILITY__ < 300
+		/* NVidia compiler is spending infinite amount of time trying
+		 * to deal with kernel_bake_evaluate() on architectures prior
+		 * to sm_30.
+		 * For now we disable baking kernel for those devices, so at
+		 * least rendering with split kernel could be compiled.
+		 */
+		output[x] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+#else
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample);
+#endif
+	}
+}
+
+__kernel void kernel_ocl_convert_to_byte(
+	ccl_constant KernelData *data,
+	ccl_global uchar4 *rgba,
+	ccl_global float *buffer,
+
+#define KERNEL_TEX(type, ttype, name) \
+	ccl_global type *name,
+#include "../../kernel_textures.h"
+
+	float sample_scale,
+	int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	KernelGlobals kglobals, *kg = &kglobals;
+
+	kg->data = data;
+
+#define KERNEL_TEX(type, ttype, name) \
+	kg->name = name;
+#include "../../kernel_textures.h"
+
+	int x = sx + get_global_id(0);
+	int y = sy + get_global_id(1);
+
+	if(x < sx + sw && y < sy + sh)
+		kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+__kernel void kernel_ocl_convert_to_half_float(
+	ccl_constant KernelData *data,
+	ccl_global uchar4 *rgba,
+	ccl_global float *buffer,
+
+#define KERNEL_TEX(type, ttype, name) \
+	ccl_global type *name,
+#include "../../kernel_textures.h"
+
+	float sample_scale,
+	int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	KernelGlobals kglobals, *kg = &kglobals;
+
+	kg->data = data;
+
+#define KERNEL_TEX(type, ttype, name) \
+	kg->name = name;
+#include "../../kernel_textures.h"
+
+	int x = sx + get_global_id(0);
+	int y = sy + get_global_id(1);
+
+	if(x < sx + sw && y < sy + sh)
+		kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+#endif // __COMPILE_ONLY_MEGAKERNEL__
+\ No newline at end of file
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
new file mode 100644
index 00000000000..eff77b89a0a
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
@@ -0,0 +1,128 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_background_buffer_update.h"
+
+__kernel void kernel_ocl_path_trace_background_buffer_update(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_state,
+        ccl_global uint *rng_coop,             /* Required for buffer Update */
+        ccl_global float3 *throughput_coop,    /* Required for background hit processing */
+        PathRadiance *PathRadiance_coop,       /* Required for background hit processing and buffer Update */
+        ccl_global Ray *Ray_coop,              /* Required for background hit processing */
+        ccl_global PathState *PathState_coop,  /* Required for background hit processing */
+        ccl_global float *L_transparent_coop,  /* Required for background hit processing and buffer Update */
+        ccl_global char *ray_state,            /* Stores information on the current state of a ray */
+        int sw, int sh, int sx, int sy, int stride,
+        int rng_state_offset_x,
+        int rng_state_offset_y,
+        int rng_state_stride,
+        ccl_global unsigned int *work_array,   /* Denotes work of each ray */
+        ccl_global int *Queue_data,            /* Queues memory */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
+        int queuesize,                         /* Size (capacity) of each queue */
+        int end_sample,
+        int start_sample,
+#ifdef __WORK_STEALING__
+        ccl_global unsigned int *work_pool_wgs,
+        unsigned int num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples)                  /* Number of samples to be processed in parallel */
+{
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	if(ray_index == 0) {
+		/* We will empty this queue in this kernel. */
+		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+	}
+	char enqueue_flag = 0;
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          1);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices,
+	 * expect all threads to execute barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		enqueue_flag =
+			kernel_background_buffer_update(globals,
+			                                data,
+			                                shader_data,
+			                                per_sample_output_buffers,
+			                                rng_state,
+			                                rng_coop,
+			                                throughput_coop,
+			                                PathRadiance_coop,
+			                                Ray_coop,
+			                                PathState_coop,
+			                                L_transparent_coop,
+			                                ray_state,
+			                                sw, sh, sx, sy, stride,
+			                                rng_state_offset_x,
+			                                rng_state_offset_y,
+			                                rng_state_stride,
+			                                work_array,
+			                                end_sample,
+			                                start_sample,
+#ifdef __WORK_STEALING__
+			                                work_pool_wgs,
+			                                num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+			                                debugdata_coop,
+#endif
+			                                parallel_samples,
+			                                ray_index);
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	 * These rays will be made active during next SceneIntersectkernel.
+	 */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
new file mode 100644
index 00000000000..c3277676029
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
@@ -0,0 +1,241 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_data_init.h"
+
+__kernel void kernel_ocl_path_trace_data_init(
+        ccl_global char *globals,
+        ccl_global char *shader_data_sd,                  /* Arguments related to ShaderData */
+        ccl_global char *shader_data_sd_DL_shadow,        /* Arguments related to ShaderData */
+
+        ccl_global float3 *P_sd,
+        ccl_global float3 *P_sd_DL_shadow,
+
+        ccl_global float3 *N_sd,
+        ccl_global float3 *N_sd_DL_shadow,
+
+        ccl_global float3 *Ng_sd,
+        ccl_global float3 *Ng_sd_DL_shadow,
+
+        ccl_global float3 *I_sd,
+        ccl_global float3 *I_sd_DL_shadow,
+
+        ccl_global int *shader_sd,
+        ccl_global int *shader_sd_DL_shadow,
+
+        ccl_global int *flag_sd,
+        ccl_global int *flag_sd_DL_shadow,
+
+        ccl_global int *prim_sd,
+        ccl_global int *prim_sd_DL_shadow,
+
+        ccl_global int *type_sd,
+        ccl_global int *type_sd_DL_shadow,
+
+        ccl_global float *u_sd,
+        ccl_global float *u_sd_DL_shadow,
+
+        ccl_global float *v_sd,
+        ccl_global float *v_sd_DL_shadow,
+
+        ccl_global int *object_sd,
+        ccl_global int *object_sd_DL_shadow,
+
+        ccl_global float *time_sd,
+        ccl_global float *time_sd_DL_shadow,
+
+        ccl_global float *ray_length_sd,
+        ccl_global float *ray_length_sd_DL_shadow,
+
+        ccl_global int *ray_depth_sd,
+        ccl_global int *ray_depth_sd_DL_shadow,
+
+        ccl_global int *transparent_depth_sd,
+        ccl_global int *transparent_depth_sd_DL_shadow,
+
+        /* Ray differentials. */
+        ccl_global differential3 *dP_sd,
+        ccl_global differential3 *dP_sd_DL_shadow,
+
+        ccl_global differential3 *dI_sd,
+        ccl_global differential3 *dI_sd_DL_shadow,
+
+        ccl_global differential *du_sd,
+        ccl_global differential *du_sd_DL_shadow,
+
+        ccl_global differential *dv_sd,
+        ccl_global differential *dv_sd_DL_shadow,
+
+        /* Dp/Du */
+        ccl_global float3 *dPdu_sd,
+        ccl_global float3 *dPdu_sd_DL_shadow,
+
+        ccl_global float3 *dPdv_sd,
+        ccl_global float3 *dPdv_sd_DL_shadow,
+
+        /* Object motion. */
+        ccl_global Transform *ob_tfm_sd,
+        ccl_global Transform *ob_tfm_sd_DL_shadow,
+
+        ccl_global Transform *ob_itfm_sd,
+        ccl_global Transform *ob_itfm_sd_DL_shadow,
+
+        ShaderClosure *closure_sd,
+        ShaderClosure *closure_sd_DL_shadow,
+
+        ccl_global int *num_closure_sd,
+        ccl_global int *num_closure_sd_DL_shadow,
+
+        ccl_global float *randb_closure_sd,
+        ccl_global float *randb_closure_sd_DL_shadow,
+
+        ccl_global float3 *ray_P_sd,
+        ccl_global float3 *ray_P_sd_DL_shadow,
+
+        ccl_global differential3 *ray_dP_sd,
+        ccl_global differential3 *ray_dP_sd_DL_shadow,
+
+        ccl_constant KernelData *data,
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_state,
+        ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
+        ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
+        ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
+        PathRadiance *PathRadiance_coop,             /* PathRadiance array to store PathRadiance values for all rays */
+        ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
+        ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
+        ccl_global char *ray_state,                  /* Stores information on current state of a ray */
+
+#define KERNEL_TEX(type, ttype, name)                                   \
+        ccl_global type *name,
+#include "../../kernel_textures.h"
+
+        int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
+        int rng_state_offset_x,
+        int rng_state_offset_y,
+        int rng_state_stride,
+        ccl_global int *Queue_data,                  /* Memory for queues */
+        ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
+        int queuesize,                               /* size (capacity) of the queue */
+        ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
+        ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
+#ifdef __WORK_STEALING__
+        ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
+        unsigned int num_samples,                    /* Total number of samples per pixel */
+#endif
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples)                        /* Number of samples to be processed in parallel */
+{
+	kernel_data_init(globals,
+	                 shader_data_sd,
+	                 shader_data_sd_DL_shadow,
+	                 P_sd,
+	                 P_sd_DL_shadow,
+	                 N_sd,
+	                 N_sd_DL_shadow,
+	                 Ng_sd,
+	                 Ng_sd_DL_shadow,
+	                 I_sd,
+	                 I_sd_DL_shadow,
+	                 shader_sd,
+	                 shader_sd_DL_shadow,
+	                 flag_sd,
+	                 flag_sd_DL_shadow,
+	                 prim_sd,
+	                 prim_sd_DL_shadow,
+	                 type_sd,
+	                 type_sd_DL_shadow,
+	                 u_sd,
+	                 u_sd_DL_shadow,
+	                 v_sd,
+	                 v_sd_DL_shadow,
+	                 object_sd,
+	                 object_sd_DL_shadow,
+	                 time_sd,
+	                 time_sd_DL_shadow,
+	                 ray_length_sd,
+	                 ray_length_sd_DL_shadow,
+	                 ray_depth_sd,
+	                 ray_depth_sd_DL_shadow,
+	                 transparent_depth_sd,
+	                 transparent_depth_sd_DL_shadow,
+
+	                 /* Ray differentials. */
+	                 dP_sd,
+	                 dP_sd_DL_shadow,
+	                 dI_sd,
+	                 dI_sd_DL_shadow,
+	                 du_sd,
+	                 du_sd_DL_shadow,
+	                 dv_sd,
+	                 dv_sd_DL_shadow,
+
+	                 /* Dp/Du */
+	                 dPdu_sd,
+	                 dPdu_sd_DL_shadow,
+	                 dPdv_sd,
+	                 dPdv_sd_DL_shadow,
+
+	                 /* Object motion. */
+	                 ob_tfm_sd,
+	                 ob_tfm_sd_DL_shadow,
+	                 ob_itfm_sd,
+	                 ob_itfm_sd_DL_shadow,
+
+	                 closure_sd,
+	                 closure_sd_DL_shadow,
+	                 num_closure_sd,
+	                 num_closure_sd_DL_shadow,
+	                 randb_closure_sd,
+	                 randb_closure_sd_DL_shadow,
+	                 ray_P_sd,
+	                 ray_P_sd_DL_shadow,
+	                 ray_dP_sd,
+	                 ray_dP_sd_DL_shadow,
+	                 data,
+	                 per_sample_output_buffers,
+	                 rng_state,
+	                 rng_coop,
+	                 throughput_coop,
+	                 L_transparent_coop,
+	                 PathRadiance_coop,
+	                 Ray_coop,
+	                 PathState_coop,
+	                 ray_state,
+
+#define KERNEL_TEX(type, ttype, name) name,
+#include "../../kernel_textures.h"
+
+	                 start_sample, sx, sy, sw, sh, offset, stride,
+	                 rng_state_offset_x,
+	                 rng_state_offset_y,
+	                 rng_state_stride,
+	                 Queue_data,
+	                 Queue_index,
+	                 queuesize,
+	                 use_queues_flag,
+	                 work_array,
+#ifdef __WORK_STEALING__
+	                 work_pool_wgs,
+	                 num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+	                 debugdata_coop,
+#endif
+	                 parallel_samples);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
new file mode 100644
index 00000000000..6ec75013b3a
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_direct_lighting.h"
+
+__kernel void kernel_ocl_path_trace_direct_lighting(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,           /* Required for direct lighting */
+        ccl_global char *shader_DL,             /* Required for direct lighting */
+        ccl_global uint *rng_coop,              /* Required for direct lighting */
+        ccl_global PathState *PathState_coop,   /* Required for direct lighting */
+        ccl_global int *ISLamp_coop,            /* Required for direct lighting */
+        ccl_global Ray *LightRay_coop,          /* Required for direct lighting */
+        ccl_global BsdfEval *BSDFEval_coop,     /* Required for direct lighting */
+        ccl_global char *ray_state,             /* Denotes the state of each ray */
+        ccl_global int *Queue_data,             /* Queue memory */
+        ccl_global int *Queue_index,            /* Tracks the number of elements in each queue */
+        int queuesize)                          /* Size (capacity) of each queue */
+{
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	char enqueue_flag = 0;
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices,
+	 * expect all threads to execute barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		enqueue_flag = kernel_direct_lighting(globals,
+		                                      data,
+		                                      shader_data,
+		                                      shader_DL,
+		                                      rng_coop,
+		                                      PathState_coop,
+		                                      ISLamp_coop,
+		                                      LightRay_coop,
+		                                      BSDFEval_coop,
+		                                      ray_state,
+		                                      ray_index);
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+#ifdef __EMISSION__
+	/* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
+#endif
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
new file mode 100644
index 00000000000..ae5f5cd1b3b
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+
+__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Required throughout the kernel except probabilistic path termination and AO */
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_coop,             /* Required for "kernel_write_data_passes" and AO */
+        ccl_global float3 *throughput_coop,    /* Required for handling holdout material and AO */
+        ccl_global float *L_transparent_coop,  /* Required for handling holdout material */
+        PathRadiance *PathRadiance_coop,       /* Required for "kernel_write_data_passes" and indirect primitive emission */
+        ccl_global PathState *PathState_coop,  /* Required throughout the kernel and AO */
+        Intersection *Intersection_coop,       /* Required for indirect primitive emission */
+        ccl_global float3 *AOAlpha_coop,       /* Required for AO */
+        ccl_global float3 *AOBSDF_coop,        /* Required for AO */
+        ccl_global Ray *AOLightRay_coop,       /* Required for AO */
+        int sw, int sh, int sx, int sy, int stride,
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        ccl_global unsigned int *work_array,   /* Denotes the work that each ray belongs to */
+        ccl_global int *Queue_data,            /* Queue memory */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
+        int queuesize,                         /* Size (capacity) of each queue */
+#ifdef __WORK_STEALING__
+        unsigned int start_sample,
+#endif
+        int parallel_samples)                  /* Number of samples to be processed in parallel */
+{
+	ccl_local unsigned int local_queue_atomics_bg;
+	ccl_local unsigned int local_queue_atomics_ao;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics_bg = 0;
+		local_queue_atomics_ao = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	char enqueue_flag = 0;
+	char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices,
+	 * expect all threads to execute barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif  /* __COMPUTE_DEVICE_GPU__ */
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		kernel_holdout_emission_blurring_pathtermination_ao(
+		        globals,
+		        data,
+		        shader_data,
+		        per_sample_output_buffers,
+		        rng_coop,
+		        throughput_coop,
+		        L_transparent_coop,
+		        PathRadiance_coop,
+		        PathState_coop,
+		        Intersection_coop,
+		        AOAlpha_coop,
+		        AOBSDF_coop,
+		        AOLightRay_coop,
+		        sw, sh, sx, sy, stride,
+		        ray_state,
+		        work_array,
+#ifdef __WORK_STEALING__
+		        start_sample,
+#endif
+		        parallel_samples,
+		        ray_index,
+		        &enqueue_flag,
+		        &enqueue_flag_AO_SHADOW_RAY_CAST);
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_UPDATE_BUFFER rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics_bg,
+	                        Queue_data,
+	                        Queue_index);
+
+#ifdef __AO__
+	/* Enqueue to-shadow-ray-cast rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_SHADOW_RAY_CAST_AO_RAYS,
+	                        enqueue_flag_AO_SHADOW_RAY_CAST,
+	                        queuesize,
+	                        &local_queue_atomics_ao,
+	                        Queue_data,
+	                        Queue_index);
+#endif
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
new file mode 100644
index 00000000000..1bc7808d834
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_lamp_emission.h"
+
+__kernel void kernel_ocl_path_trace_lamp_emission(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Required for lamp emission */
+        ccl_global float3 *throughput_coop,    /* Required for lamp emission */
+        PathRadiance *PathRadiance_coop,       /* Required for lamp emission */
+        ccl_global Ray *Ray_coop,              /* Required for lamp emission */
+        ccl_global PathState *PathState_coop,  /* Required for lamp emission */
+        Intersection *Intersection_coop,       /* Required for lamp emission */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        int sw, int sh,
+        ccl_global int *Queue_data,            /* Memory for queues */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in queues */
+        int queuesize,                         /* Size (capacity) of queues */
+        ccl_global char *use_queues_flag,      /* Used to decide if this kernel should use
+                                                * queues to fetch ray index
+                                                */
+        int parallel_samples)                  /* Number of samples to be processed in parallel */
+{
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+
+	/* We will empty this queue in this kernel. */
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+	}
+	/* Fetch use_queues_flag. */
+	ccl_local char local_use_queues_flag;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_use_queues_flag = use_queues_flag[0];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index;
+	if(local_use_queues_flag) {
+		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+		ray_index = get_ray_index(thread_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          Queue_data,
+		                          queuesize,
+		                          1);
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	} else {
+		if(x < (sw * parallel_samples) && y < sh){
+			ray_index = x + y * (sw * parallel_samples);
+		} else {
+			return;
+		}
+	}
+
+	kernel_lamp_emission(globals,
+	                     data,
+	                     shader_data,
+	                     throughput_coop,
+	                     PathRadiance_coop,
+	                     Ray_coop,
+	                     PathState_coop,
+	                     Intersection_coop,
+	                     ray_state,
+	                     sw, sh,
+	                     use_queues_flag,
+	                     parallel_samples,
+	                     ray_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
new file mode 100644
index 00000000000..dcf4db40411
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
@@ -0,0 +1,115 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_next_iteration_setup.h"
+
+__kernel void kernel_ocl_path_trace_next_iteration_setup(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,         /* Required for setting up ray for next iteration */
+        ccl_global uint *rng_coop,            /* Required for setting up ray for next iteration */
+        ccl_global float3 *throughput_coop,   /* Required for setting up ray for next iteration */
+        PathRadiance *PathRadiance_coop,      /* Required for setting up ray for next iteration */
+        ccl_global Ray *Ray_coop,             /* Required for setting up ray for next iteration */
+        ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
+        ccl_global Ray *LightRay_dl_coop,     /* Required for radiance update - direct lighting */
+        ccl_global int *ISLamp_coop,          /* Required for radiance update - direct lighting */
+        ccl_global BsdfEval *BSDFEval_coop,   /* Required for radiance update - direct lighting */
+        ccl_global Ray *LightRay_ao_coop,     /* Required for radiance update - AO */
+        ccl_global float3 *AOBSDF_coop,       /* Required for radiance update - AO */
+        ccl_global float3 *AOAlpha_coop,      /* Required for radiance update - AO */
+        ccl_global char *ray_state,           /* Denotes the state of each ray */
+        ccl_global int *Queue_data,           /* Queue memory */
+        ccl_global int *Queue_index,          /* Tracks the number of elements in each queue */
+        int queuesize,                        /* Size (capacity) of each queue */
+        ccl_global char *use_queues_flag)     /* flag to decide if scene_intersect kernel should
+                                               * use queues to fetch ray index */
+{
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		/* If we are here, then it means that scene-intersect kernel
+		* has already been executed atleast once. From the next time,
+		* scene-intersect kernel may operate on queues to fetch ray index
+		*/
+		use_queues_flag[0] = 1;
+
+		/* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and
+		 * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the
+		 * previous kernel.
+		 */
+		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+	}
+
+	char enqueue_flag = 0;
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices,
+	 * expect all threads to execute barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+		enqueue_flag = kernel_next_iteration_setup(globals,
+		                                           data,
+		                                           shader_data,
+		                                           rng_coop,
+		                                           throughput_coop,
+		                                           PathRadiance_coop,
+		                                           Ray_coop,
+		                                           PathState_coop,
+		                                           LightRay_dl_coop,
+		                                           ISLamp_coop,
+		                                           BSDFEval_coop,
+		                                           LightRay_ao_coop,
+		                                           AOBSDF_coop,
+		                                           AOAlpha_coop,
+		                                           ray_state,
+		                                           use_queues_flag,
+		                                           ray_index);
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_UPDATE_BUFFER rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
new file mode 100644
index 00000000000..3156dc255fb
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../../kernel_compat_opencl.h"
+#include "../../kernel_math.h"
+#include "../../kernel_types.h"
+#include "../../kernel_globals.h"
+#include "../../kernel_queues.h"
+
+/*
+ * The kernel "kernel_queue_enqueue" enqueues rays of
+ * different ray state into their appropriate Queues;
+ * 1. Rays that have been determined to hit the background from the
+ * "kernel_scene_intersect" kernel
+ * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+ * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * The input and output of the kernel is as follows,
+ *
+ * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                           |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                           |
+ * queuesize -------------------------------------------|                           |
+ *
+ * Note on Queues :
+ * State of queues during the first time this kernel is called :
+ * At entry,
+ * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
+ * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays.
+ *
+ * State of queue during other times this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
+ */
+__kernel void kernel_ocl_path_trace_queue_enqueue(
+        ccl_global int *Queue_data,   /* Queue memory */
+        ccl_global int *Queue_index,  /* Tracks the number of elements in each queue */
+        ccl_global char *ray_state,   /* Denotes the state of each ray */
+        int queuesize)                /* Size (capacity) of each queue */
+{
+	/* We have only 2 cases (Hit/Not-Hit) */
+	ccl_local unsigned int local_queue_atomics[2];
+
+	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+
+	if(lidx < 2 ) {
+		local_queue_atomics[lidx] = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int queue_number = -1;
+
+	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+		queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+	}
+	else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	}
+
+	unsigned int my_lqidx;
+	if(queue_number != -1) {
+		my_lqidx = get_local_queue_index(queue_number, local_queue_atomics);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(lidx == 0) {
+		local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] =
+		        get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                                    local_queue_atomics,
+		                                    Queue_index);
+		local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] =
+		        get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+		                                    local_queue_atomics,
+		                                    Queue_index);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	unsigned int my_gqidx;
+	if(queue_number != -1) {
+		my_gqidx = get_global_queue_index(queue_number,
+		                                  queuesize,
+		                                  my_lqidx,
+		                                  local_queue_atomics);
+		Queue_data[my_gqidx] = ray_index;
+	}
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
new file mode 100644
index 00000000000..e5fad7bce50
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_scene_intersect.h"
+
+__kernel void kernel_ocl_path_trace_scene_intersect(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global uint *rng_coop,
+        ccl_global Ray *Ray_coop,              /* Required for scene_intersect */
+        ccl_global PathState *PathState_coop,  /* Required for scene_intersect */
+        Intersection *Intersection_coop,       /* Required for scene_intersect */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        int sw, int sh,
+        ccl_global int *Queue_data,            /* Memory for queues */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in queues */
+        int queuesize,                         /* Size (capacity) of queues */
+        ccl_global char *use_queues_flag,      /* used to decide if this kernel should use
+                                                * queues to fetch ray index */
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples)                  /* Number of samples to be processed in parallel */
+{
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+
+	/* Fetch use_queues_flag */
+	ccl_local char local_use_queues_flag;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_use_queues_flag = use_queues_flag[0];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index;
+	if(local_use_queues_flag) {
+		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+		ray_index = get_ray_index(thread_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          Queue_data,
+		                          queuesize,
+		                          0);
+
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	} else {
+		if(x < (sw * parallel_samples) && y < sh){
+			ray_index = x + y * (sw * parallel_samples);
+		} else {
+			return;
+		}
+	}
+
+	kernel_scene_intersect(globals,
+	                       data,
+	                       rng_coop,
+	                       Ray_coop,
+	                       PathState_coop,
+	                       Intersection_coop,
+	                       ray_state,
+	                       sw, sh,
+	                       use_queues_flag,
+#ifdef __KERNEL_DEBUG__
+	                       debugdata_coop,
+#endif
+	                       parallel_samples,
+	                       ray_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
new file mode 100644
index 00000000000..b9f616e6bdf
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_shader_eval.h"
+
+__kernel void kernel_ocl_path_trace_shader_eval(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Output ShaderData structure to be filled */
+        ccl_global uint *rng_coop,             /* Required for rbsdf calculation */
+        ccl_global Ray *Ray_coop,              /* Required for setting up shader from ray */
+        ccl_global PathState *PathState_coop,  /* Required for all functions in this kernel */
+        Intersection *Intersection_coop,       /* Required for setting up shader from ray */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        ccl_global int *Queue_data,            /* queue memory */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
+        int queuesize)                         /* Size (capacity) of each queue */
+{
+	/* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          0);
+
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+
+	char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
+
+	/* Continue on with shader evaluation. */
+	kernel_shader_eval(globals,
+	                   data,
+	                   shader_data,
+	                   rng_coop,
+	                   Ray_coop,
+	                   PathState_coop,
+	                   Intersection_coop,
+	                   ray_state,
+	                   ray_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
new file mode 100644
index 00000000000..03886c0a030
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_shadow_blocked.h"
+
+__kernel void kernel_ocl_path_trace_shadow_blocked(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_shadow,        /* Required for shadow blocked */
+        ccl_global PathState *PathState_coop,  /* Required for shadow blocked */
+        ccl_global Ray *LightRay_dl_coop,      /* Required for direct lighting's shadow blocked */
+        ccl_global Ray *LightRay_ao_coop,      /* Required for AO's shadow blocked */
+        Intersection *Intersection_coop_AO,
+        Intersection *Intersection_coop_DL,
+        ccl_global char *ray_state,
+        ccl_global int *Queue_data,            /* Queue memory */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
+        int queuesize,                         /* Size (capacity) of each queue */
+        int total_num_rays)
+{
+#if 0
+	/* We will make the Queue_index entries '0' in the next kernel. */
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		/* We empty this queue here */
+		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+	}
+#endif
+
+	int lidx = get_local_id(1) * get_local_id(0) + get_local_id(0);
+
+	ccl_local unsigned int ao_queue_length;
+	ccl_local unsigned int dl_queue_length;
+	if(lidx == 0) {
+		ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
+		dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* flag determining if the current ray is to process shadow ray for AO or DL */
+	char shadow_blocked_type = -1;
+
+	int ray_index = QUEUE_EMPTY_SLOT;
+	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	if(thread_index < ao_queue_length + dl_queue_length) {
+		if(thread_index < ao_queue_length) {
+			ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1);
+			shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO;
+		} else {
+			ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1);
+			shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL;
+		}
+	}
+
+	if(ray_index == QUEUE_EMPTY_SLOT)
+		return;
+
+	kernel_shadow_blocked(globals,
+	                      data,
+	                      shader_shadow,
+	                      PathState_coop,
+	                      LightRay_dl_coop,
+	                      LightRay_ao_coop,
+	                      Intersection_coop_AO,
+	                      Intersection_coop_DL,
+	                      ray_state,
+	                      total_num_rays,
+	                      shadow_blocked_type,
+	                      ray_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
new file mode 100644
index 00000000000..88a1ed830af
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_sum_all_radiance.h"
+
+__kernel void kernel_ocl_path_trace_sum_all_radiance(
+        ccl_constant KernelData *data,               /* To get pass_stride to offet into buffer */
+        ccl_global float *buffer,                    /* Output buffer of RenderTile */
+        ccl_global float *per_sample_output_buffer,  /* Radiance contributed by all samples */
+        int parallel_samples, int sw, int sh, int stride,
+        int buffer_offset_x,
+        int buffer_offset_y,
+        int buffer_stride,
+        int start_sample)
+{
+	kernel_sum_all_radiance(data,
+	                        buffer,
+	                        per_sample_output_buffer,
+	                        parallel_samples,
+	                        sw, sh, stride,
+	                        buffer_offset_x,
+	                        buffer_offset_y,
+	                        buffer_stride,
+	                        start_sample);
+}