5 files changed, 0 insertions, 1187 deletions
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
deleted file mode 100644
index 6c9642d1f03..00000000000
--- a/intern/cycles/kernel/kernels/cuda/filter.cu
+++ /dev/null
@@ -1,413 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CUDA kernel entry points */
-
-#ifdef __CUDA_ARCH__
-
-#include "kernel_config.h"
-
-#include "kernel/kernel_compat_cuda.h"
-
-#include "kernel/filter/filter_kernel.h"
-
-/* kernels */
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_copy_input(float *buffer,
-                              CCL_FILTER_TILE_INFO,
-                              int4 prefilter_rect,
-                              int buffer_pass_stride)
-{
-	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
-	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
-	if(x < prefilter_rect.z && y < prefilter_rect.w) {
-		int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
-		int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
-		int itile = ytile * 3 + xtile;
-		float *const in = ((float *)ccl_get_tile_buffer(itile)) +
-			(tile_info->offsets[itile] + y * tile_info->strides[itile] + x) * buffer_pass_stride;
-		buffer += ((y - prefilter_rect.y) * (prefilter_rect.z - prefilter_rect.x) + (x - prefilter_rect.x)) * buffer_pass_stride;
-		for (int i = 0; i < buffer_pass_stride; ++i)
-			buffer[i] = in[i];
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_convert_to_rgb(float *rgb, float *buf, int sw, int sh, int stride, int pass_stride, int3 pass_offset, int num_inputs, int num_samples)
-{
-	int x = blockDim.x*blockIdx.x + threadIdx.x;
-	int y = blockDim.y*blockIdx.y + threadIdx.y;
-	if(x < sw && y < sh) {
-		if (num_inputs > 0) {
-			float *in = buf + x * pass_stride + (y * stride + pass_offset.x) / sizeof(float);
-			float *out = rgb + (x + y * sw) * 3;
-			out[0] = clamp(in[0] / num_samples, 0.0f, 10000.0f);
-			out[1] = clamp(in[1] / num_samples, 0.0f, 10000.0f);
-			out[2] = clamp(in[2] / num_samples, 0.0f, 10000.0f);
-		}
-		if (num_inputs > 1) {
-			float *in = buf + x * pass_stride + (y * stride + pass_offset.y) / sizeof(float);
-			float *out = rgb + (x + y * sw) * 3 + (sw * sh) * 3;
-			out[0] = in[0] / num_samples;
-			out[1] = in[1] / num_samples;
-			out[2] = in[2] / num_samples;
-		}
-		if (num_inputs > 2) {
-			float *in = buf + x * pass_stride + (y * stride + pass_offset.z) / sizeof(float);
-			float *out = rgb + (x + y * sw) * 3 + (sw * sh * 2) * 3;
-			out[0] = in[0] / num_samples;
-			out[1] = in[1] / num_samples;
-			out[2] = in[2] / num_samples;
-		}
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_convert_from_rgb(float *rgb, float *buf, int ix, int iy, int iw, int ih, int sx, int sy, int sw, int sh, int offset, int stride, int pass_stride, int num_samples)
-{
-	int x = blockDim.x*blockIdx.x + threadIdx.x;
-	int y = blockDim.y*blockIdx.y + threadIdx.y;
-	if(x < sw && y < sh) {
-		float *in = rgb + ((ix + x) + (iy + y) * iw) * 3;
-		float *out = buf + (offset + (sx + x) + (sy + y) * stride) * pass_stride;
-		out[0] = in[0] * num_samples;
-		out[1] = in[1] * num_samples;
-		out[2] = in[2] * num_samples;
-	}
-}
-
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_divide_shadow(int sample,
-                                 CCL_FILTER_TILE_INFO,
-                                 float *unfilteredA,
-                                 float *unfilteredB,
-                                 float *sampleVariance,
-                                 float *sampleVarianceV,
-                                 float *bufferVariance,
-                                 int4 prefilter_rect,
-                                 int buffer_pass_stride,
-                                 int buffer_denoising_offset)
-{
-	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
-	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
-	if(x < prefilter_rect.z && y < prefilter_rect.w) {
-		kernel_filter_divide_shadow(sample,
-		                            tile_info,
-		                            x, y,
-		                            unfilteredA,
-		                            unfilteredB,
-		                            sampleVariance,
-		                            sampleVarianceV,
-		                            bufferVariance,
-		                            prefilter_rect,
-		                            buffer_pass_stride,
-		                            buffer_denoising_offset);
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_get_feature(int sample,
-                               CCL_FILTER_TILE_INFO,
-                               int m_offset,
-                               int v_offset,
-                               float *mean,
-                               float *variance,
-                               float scale,
-                               int4 prefilter_rect,
-                               int buffer_pass_stride,
-                               int buffer_denoising_offset)
-{
-	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
-	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
-	if(x < prefilter_rect.z && y < prefilter_rect.w) {
-		kernel_filter_get_feature(sample,
-		                          tile_info,
-		                          m_offset, v_offset,
-		                          x, y,
-		                          mean, variance,
-		                          scale,
-		                          prefilter_rect,
-		                          buffer_pass_stride,
-		                          buffer_denoising_offset);
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_write_feature(int sample,
-                                 int4 buffer_params,
-                                 int4 filter_area,
-                                 float *from,
-                                 float *buffer,
-                                 int out_offset,
-                                 int4 prefilter_rect)
-{
-	int x = blockDim.x*blockIdx.x + threadIdx.x;
-	int y = blockDim.y*blockIdx.y + threadIdx.y;
-	if(x < filter_area.z && y < filter_area.w) {
-		kernel_filter_write_feature(sample,
-	                                x + filter_area.x,
-	                                y + filter_area.y,
-	                                buffer_params,
-	                                from,
-	                                buffer,
-	                                out_offset,
-	                                prefilter_rect);
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_detect_outliers(float *image,
-                                   float *variance,
-                                   float *depth,
-                                   float *output,
-                                   int4 prefilter_rect,
-                                   int pass_stride)
-{
-	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
-	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
-	if(x < prefilter_rect.z && y < prefilter_rect.w) {
-		kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
-{
-	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
-	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
-	if(x < prefilter_rect.z && y < prefilter_rect.w) {
-		kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
-                                       CCL_FILTER_TILE_INFO,
-                                       float *transform, int *rank,
-                                       int4 filter_area, int4 rect,
-                                       int radius, float pca_threshold,
-                                       int pass_stride, int frame_stride,
-                                       bool use_time)
-{
-	int x = blockDim.x*blockIdx.x + threadIdx.x;
-	int y = blockDim.y*blockIdx.y + threadIdx.y;
-	if(x < filter_area.z && y < filter_area.w) {
-		int *l_rank = rank + y*filter_area.z + x;
-		float *l_transform = transform + y*filter_area.z + x;
-		kernel_filter_construct_transform(buffer,
-		                                  tile_info,
-		                                  x + filter_area.x, y + filter_area.y,
-		                                  rect,
-		                                  pass_stride, frame_stride,
-		                                  use_time,
-		                                  l_transform, l_rank,
-		                                  radius, pca_threshold,
-		                                  filter_area.z*filter_area.w,
-		                                  threadIdx.y*blockDim.x + threadIdx.x);
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
-                                       const float *ccl_restrict variance_image,
-                                       const float *ccl_restrict scale_image,
-                                       float *difference_image,
-                                       int w,
-                                       int h,
-                                       int stride,
-                                       int pass_stride,
-                                       int r,
-                                       int channel_offset,
-                                       int frame_offset,
-                                       float a,
-                                       float k_2)
-{
-	int4 co, rect;
-	int ofs;
-	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
-		kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
-		                                  weight_image,
-		                                  variance_image,
-		                                  scale_image,
-		                                  difference_image + ofs,
-		                                  rect, stride,
-		                                  channel_offset,
-		                                  frame_offset,
-		                                  a, k_2);
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image,
-                            float *out_image,
-                            int w,
-                            int h,
-                            int stride,
-                            int pass_stride,
-                            int r,
-                            int f)
-{
-	int4 co, rect;
-	int ofs;
-	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
-		kernel_filter_nlm_blur(co.x, co.y,
-		                       difference_image + ofs,
-		                       out_image + ofs,
-		                       rect, stride, f);
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
-                                   float *out_image,
-                                   int w,
-                                   int h,
-                                   int stride,
-                                   int pass_stride,
-                                   int r,
-                                   int f)
-{
-	int4 co, rect;
-	int ofs;
-	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
-		kernel_filter_nlm_calc_weight(co.x, co.y,
-		                              difference_image + ofs,
-		                              out_image + ofs,
-		                              rect, stride, f);
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
-                                     const float *ccl_restrict image,
-                                     float *out_image,
-                                     float *accum_image,
-                                     int w,
-                                     int h,
-                                     int stride,
-                                     int pass_stride,
-                                     int channel_offset,
-                                     int r,
-                                     int f)
-{
-	int4 co, rect;
-	int ofs;
-	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
-		kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
-		                                difference_image + ofs,
-		                                image,
-		                                out_image,
-		                                accum_image,
-		                                rect,
-		                                channel_offset,
-		                                stride, f);
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_normalize(float *out_image,
-                                 const float *ccl_restrict accum_image,
-                                 int w,
-                                 int h,
-                                 int stride)
-{
-	int x = blockDim.x*blockIdx.x + threadIdx.x;
-	int y = blockDim.y*blockIdx.y + threadIdx.y;
-	if(x < w && y < h) {
-		kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride);
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_construct_gramian(int t,
-                                         const float *ccl_restrict difference_image,
-                                         const float *ccl_restrict buffer,
-                                         float const* __restrict__ transform,
-                                         int *rank,
-                                         float *XtWX,
-                                         float3 *XtWY,
-                                         int4 filter_window,
-                                         int w,
-                                         int h,
-                                         int stride,
-                                         int pass_stride,
-                                         int r,
-                                         int f,
-                                         int frame_offset,
-                                         bool use_time)
-{
-	int4 co, rect;
-	int ofs;
-	if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
-		kernel_filter_nlm_construct_gramian(co.x, co.y,
-		                                    co.z, co.w,
-		                                    t,
-		                                    difference_image + ofs,
-		                                    buffer,
-		                                    transform, rank,
-		                                    XtWX, XtWY,
-		                                    rect, filter_window,
-		                                    stride, f,
-		                                    pass_stride,
-		                                    frame_offset,
-		                                    use_time,
-		                                    threadIdx.y*blockDim.x + threadIdx.x);
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_finalize(float *buffer,
-                            int *rank,
-                            float *XtWX,
-                            float3 *XtWY,
-                            int4 filter_area,
-                            int4 buffer_params,
-                            int sample)
-{
-	int x = blockDim.x*blockIdx.x + threadIdx.x;
-	int y = blockDim.y*blockIdx.y + threadIdx.y;
-	if(x < filter_area.z && y < filter_area.w) {
-		int storage_ofs = y*filter_area.z+x;
-		rank += storage_ofs;
-		XtWX += storage_ofs;
-		XtWY += storage_ofs;
-		kernel_filter_finalize(x, y, buffer, rank,
-		                       filter_area.z*filter_area.w,
-		                       XtWX, XtWY,
-		                       buffer_params, sample);
-	}
-}
-
-#endif
-
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
deleted file mode 100644
index cf62b6e781e..00000000000
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CUDA kernel entry points */
-
-#ifdef __CUDA_ARCH__
-
-#include "kernel/kernel_compat_cuda.h"
-#include "kernel_config.h"
-
-#include "util/util_atomic.h"
-
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernels/cuda/kernel_cuda_image.h"
-#include "kernel/kernel_film.h"
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_path_branched.h"
-#include "kernel/kernel_bake.h"
-#include "kernel/kernel_work_stealing.h"
-#include "kernel/kernel_adaptive_sampling.h"
-
-/* kernels */
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_path_trace(WorkTile *tile, uint total_work_size)
-{
-	int work_index = ccl_global_id(0);
-	bool thread_is_active = work_index < total_work_size;
-	uint x, y, sample;
-	KernelGlobals kg;
-	if(thread_is_active) {
-		get_work_pixel(tile, work_index, &x, &y, &sample);
-
-		kernel_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
-	}
-
-	if(kernel_data.film.cryptomatte_passes) {
-		__syncthreads();
-		if(thread_is_active) {
-			kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
-		}
-	}
-}
-
-#ifdef __BRANCHED_PATH__
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
-kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size)
-{
-	int work_index = ccl_global_id(0);
-	bool thread_is_active = work_index < total_work_size;
-	uint x, y, sample;
-	KernelGlobals kg;
-	if(thread_is_active) {
-		get_work_pixel(tile, work_index, &x, &y, &sample);
-
-		kernel_branched_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
-	}
-	
-	if(kernel_data.film.cryptomatte_passes) {
-		__syncthreads();
-		if(thread_is_active) {
-			kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
-		}
-	}
-}
-#endif
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_adaptive_stopping(WorkTile *tile, int sample, uint total_work_size)
-{
-	int work_index = ccl_global_id(0);
-	bool thread_is_active = work_index < total_work_size;
-	KernelGlobals kg;
-	if(thread_is_active && kernel_data.film.pass_adaptive_aux_buffer) {
-		uint x = tile->x + work_index % tile->w;
-		uint y = tile->y + work_index / tile->w;
-		int index = tile->offset + x + y * tile->stride;
-		ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
-		kernel_do_adaptive_stopping(&kg, buffer, sample);
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_adaptive_filter_x(WorkTile *tile, int sample, uint)
-{
-	KernelGlobals kg;
-	if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) {
-		if(ccl_global_id(0) < tile->h) {
-			int y = tile->y + ccl_global_id(0);
-			kernel_do_adaptive_filter_x(&kg, y, tile);
-		}
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_adaptive_filter_y(WorkTile *tile, int sample, uint)
-{
-	KernelGlobals kg;
-	if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) {
-		if(ccl_global_id(0) < tile->w) {
-			int x = tile->x + ccl_global_id(0);
-			kernel_do_adaptive_filter_y(&kg, x, tile);
-		}
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_adaptive_scale_samples(WorkTile *tile, int start_sample, int sample, uint total_work_size)
-{
-	if(kernel_data.film.pass_adaptive_aux_buffer) {
-		int work_index = ccl_global_id(0);
-		bool thread_is_active = work_index < total_work_size;
-		KernelGlobals kg;
-		if(thread_is_active) {
-			uint x = tile->x + work_index % tile->w;
-			uint y = tile->y + work_index / tile->w;
-			int index = tile->offset + x + y * tile->stride;
-			ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
-			if(buffer[kernel_data.film.pass_sample_count] < 0.0f) {
-				buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
-				float sample_multiplier = sample / buffer[kernel_data.film.pass_sample_count];
-				if(sample_multiplier != 1.0f) {
-					kernel_adaptive_post_adjust(&kg, buffer, sample_multiplier);
-				}
-			}
-			else {
-				kernel_adaptive_post_adjust(&kg, buffer, sample / (sample - 1.0f));
-			}
-		}
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
-	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
-	if(x < sx + sw && y < sy + sh) {
-		kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
-	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
-	if(x < sx + sw && y < sy + sh) {
-		kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_displace(uint4 *input,
-                     float4 *output,
-                     int type,
-                     int sx,
-                     int sw,
-                     int offset,
-                     int sample)
-{
-	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-
-	if(x < sx + sw) {
-		KernelGlobals kg;
-		kernel_displace_evaluate(&kg, input, output, x);
-	}
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_background(uint4 *input,
-                       float4 *output,
-                       int type,
-                       int sx,
-                       int sw,
-                       int offset,
-                       int sample)
-{
-	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-
-	if(x < sx + sw) {
-		KernelGlobals kg;
-		kernel_background_evaluate(&kg, input, output, x);
-	}
-}
-
-#ifdef __BAKING__
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_bake(WorkTile *tile, uint total_work_size)
-{
-	int work_index = ccl_global_id(0);
-
-	if(work_index < total_work_size) {
-		uint x, y, sample;
-		get_work_pixel(tile, work_index, &x, &y, &sample);
-
-		KernelGlobals kg;
-		kernel_bake_evaluate(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
-	}
-}
-#endif
-
-#endif
-
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h
deleted file mode 100644
index 2e47ce2de6c..00000000000
--- a/intern/cycles/kernel/kernels/cuda/kernel_config.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* device data taken from CUDA occupancy calculator */
-
-/* 3.0 and 3.5 */
-#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 63
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.2 */
-#elif __CUDA_ARCH__ == 320
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 63
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.7 */
-#elif __CUDA_ARCH__ == 370
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 63
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 5.x, 6.x */
-#elif __CUDA_ARCH__ <= 699
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of
- * registers */
-#  if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600
-#    define CUDA_KERNEL_MAX_REGISTERS 64
-#  else
-#    define CUDA_KERNEL_MAX_REGISTERS 48
-#  endif
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 7.x, 8.x */
-#elif __CUDA_ARCH__ <= 899
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 64
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 72
-
-/* unknown architecture */
-#else
-#  error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
-#endif
-
-/* For split kernel using all registers seems fastest for now, but this
- * is unlikely to be optimal once we resolve other bottlenecks. */
-
-#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS
-
-/* Compute number of threads per block and minimum blocks per multiprocessor
- * given the maximum number of registers per thread. */
-
-#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
-  __launch_bounds__(threads_block_width *threads_block_width, \
-                    CUDA_MULTIPRESSOR_MAX_REGISTERS / \
-                        (threads_block_width * threads_block_width * thread_num_registers))
-
-/* sanity checks */
-
-#if CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
-#  error "Maximum number of threads per block exceeded"
-#endif
-
-#if CUDA_MULTIPRESSOR_MAX_REGISTERS / \
-        (CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH * CUDA_KERNEL_MAX_REGISTERS) > \
-    CUDA_MULTIPROCESSOR_MAX_BLOCKS
-#  error "Maximum number of blocks per multiprocessor exceeded"
-#endif
-
-#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-#  error "Maximum number of registers per thread exceeded"
-#endif
-
-#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-#  error "Maximum number of registers per thread exceeded"
-#endif
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
deleted file mode 100644
index 132653fa7ca..00000000000
--- a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
- * Copyright 2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_NANOVDB
-#  define NDEBUG /* Disable "assert" in device code */
-#  define NANOVDB_USE_INTRINSICS
-#  include "nanovdb/NanoVDB.h"
-#  include "nanovdb/util/SampleFromVoxels.h"
-#endif
-
-/* w0, w1, w2, and w3 are the four cubic B-spline basis functions. */
-ccl_device float cubic_w0(float a)
-{
-  return (1.0f / 6.0f) * (a * (a * (-a + 3.0f) - 3.0f) + 1.0f);
-}
-ccl_device float cubic_w1(float a)
-{
-  return (1.0f / 6.0f) * (a * a * (3.0f * a - 6.0f) + 4.0f);
-}
-ccl_device float cubic_w2(float a)
-{
-  return (1.0f / 6.0f) * (a * (a * (-3.0f * a + 3.0f) + 3.0f) + 1.0f);
-}
-ccl_device float cubic_w3(float a)
-{
-  return (1.0f / 6.0f) * (a * a * a);
-}
-
-/* g0 and g1 are the two amplitude functions. */
-ccl_device float cubic_g0(float a)
-{
-  return cubic_w0(a) + cubic_w1(a);
-}
-ccl_device float cubic_g1(float a)
-{
-  return cubic_w2(a) + cubic_w3(a);
-}
-
-/* h0 and h1 are the two offset functions */
-ccl_device float cubic_h0(float a)
-{
-  return (cubic_w1(a) / cubic_g0(a)) - 1.0f;
-}
-ccl_device float cubic_h1(float a)
-{
-  return (cubic_w3(a) / cubic_g1(a)) + 1.0f;
-}
-
-/* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */
-template<typename T>
-ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
-{
-  CUtexObject tex = (CUtexObject)info.data;
-
-  x = (x * info.width) - 0.5f;
-  y = (y * info.height) - 0.5f;
-
-  float px = floorf(x);
-  float py = floorf(y);
-  float fx = x - px;
-  float fy = y - py;
-
-  float g0x = cubic_g0(fx);
-  float g1x = cubic_g1(fx);
-  /* Note +0.5 offset to compensate for CUDA linear filtering convention. */
-  float x0 = (px + cubic_h0(fx) + 0.5f) / info.width;
-  float x1 = (px + cubic_h1(fx) + 0.5f) / info.width;
-  float y0 = (py + cubic_h0(fy) + 0.5f) / info.height;
-  float y1 = (py + cubic_h1(fy) + 0.5f) / info.height;
-
-  return cubic_g0(fy) * (g0x * tex2D<T>(tex, x0, y0) + g1x * tex2D<T>(tex, x1, y0)) +
-         cubic_g1(fy) * (g0x * tex2D<T>(tex, x0, y1) + g1x * tex2D<T>(tex, x1, y1));
-}
-
-/* Fast tricubic texture lookup using 8 trilinear lookups. */
-template<typename T>
-ccl_device T kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
-{
-  CUtexObject tex = (CUtexObject)info.data;
-
-  x = (x * info.width) - 0.5f;
-  y = (y * info.height) - 0.5f;
-  z = (z * info.depth) - 0.5f;
-
-  float px = floorf(x);
-  float py = floorf(y);
-  float pz = floorf(z);
-  float fx = x - px;
-  float fy = y - py;
-  float fz = z - pz;
-
-  float g0x = cubic_g0(fx);
-  float g1x = cubic_g1(fx);
-  float g0y = cubic_g0(fy);
-  float g1y = cubic_g1(fy);
-  float g0z = cubic_g0(fz);
-  float g1z = cubic_g1(fz);
-
-  /* Note +0.5 offset to compensate for CUDA linear filtering convention. */
-  float x0 = (px + cubic_h0(fx) + 0.5f) / info.width;
-  float x1 = (px + cubic_h1(fx) + 0.5f) / info.width;
-  float y0 = (py + cubic_h0(fy) + 0.5f) / info.height;
-  float y1 = (py + cubic_h1(fy) + 0.5f) / info.height;
-  float z0 = (pz + cubic_h0(fz) + 0.5f) / info.depth;
-  float z1 = (pz + cubic_h1(fz) + 0.5f) / info.depth;
-
-  return g0z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z0) + g1x * tex3D<T>(tex, x1, y0, z0)) +
-                g1y * (g0x * tex3D<T>(tex, x0, y1, z0) + g1x * tex3D<T>(tex, x1, y1, z0))) +
-         g1z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z1) + g1x * tex3D<T>(tex, x1, y0, z1)) +
-                g1y * (g0x * tex3D<T>(tex, x0, y1, z1) + g1x * tex3D<T>(tex, x1, y1, z1)));
-}
-
-#ifdef WITH_NANOVDB
-template<typename T, typename S>
-ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, float z)
-{
-  float px = floorf(x);
-  float py = floorf(y);
-  float pz = floorf(z);
-  float fx = x - px;
-  float fy = y - py;
-  float fz = z - pz;
-
-  float g0x = cubic_g0(fx);
-  float g1x = cubic_g1(fx);
-  float g0y = cubic_g0(fy);
-  float g1y = cubic_g1(fy);
-  float g0z = cubic_g0(fz);
-  float g1z = cubic_g1(fz);
-
-  float x0 = px + cubic_h0(fx);
-  float x1 = px + cubic_h1(fx);
-  float y0 = py + cubic_h0(fy);
-  float y1 = py + cubic_h1(fy);
-  float z0 = pz + cubic_h0(fz);
-  float z1 = pz + cubic_h1(fz);
-
-  using namespace nanovdb;
-
-  return g0z * (g0y * (g0x * s(Vec3f(x0, y0, z0)) + g1x * s(Vec3f(x1, y0, z0))) +
-                g1y * (g0x * s(Vec3f(x0, y1, z0)) + g1x * s(Vec3f(x1, y1, z0)))) +
-         g1z * (g0y * (g0x * s(Vec3f(x0, y0, z1)) + g1x * s(Vec3f(x1, y0, z1))) +
-                g1y * (g0x * s(Vec3f(x0, y1, z1)) + g1x * s(Vec3f(x1, y1, z1))));
-}
-
-template<typename T>
-ccl_device_inline T kernel_tex_image_interp_nanovdb(
-    const TextureInfo &info, float x, float y, float z, uint interpolation)
-{
-  using namespace nanovdb;
-
-  NanoGrid<T> *const grid = (NanoGrid<T> *)info.data;
-  typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType;
-  AccessorType acc = grid->getAccessor();
-
-  switch (interpolation) {
-    case INTERPOLATION_CLOSEST:
-      return SampleFromVoxels<AccessorType, 0, false>(acc)(Vec3f(x, y, z));
-    case INTERPOLATION_LINEAR:
-      return SampleFromVoxels<AccessorType, 1, false>(acc)(Vec3f(x - 0.5f, y - 0.5f, z - 0.5f));
-    default:
-      SampleFromVoxels<AccessorType, 1, false> s(acc);
-      return kernel_tex_image_interp_tricubic_nanovdb<T>(s, x - 0.5f, y - 0.5f, z - 0.5f);
-  }
-}
-#endif
-
-ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
-{
-  const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
-
-  /* float4, byte4, ushort4 and half4 */
-  const int texture_type = info.data_type;
-  if (texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 ||
-      texture_type == IMAGE_DATA_TYPE_HALF4 || texture_type == IMAGE_DATA_TYPE_USHORT4) {
-    if (info.interpolation == INTERPOLATION_CUBIC) {
-      return kernel_tex_image_interp_bicubic<float4>(info, x, y);
-    }
-    else {
-      CUtexObject tex = (CUtexObject)info.data;
-      return tex2D<float4>(tex, x, y);
-    }
-  }
-  /* float, byte and half */
-  else {
-    float f;
-
-    if (info.interpolation == INTERPOLATION_CUBIC) {
-      f = kernel_tex_image_interp_bicubic<float>(info, x, y);
-    }
-    else {
-      CUtexObject tex = (CUtexObject)info.data;
-      f = tex2D<float>(tex, x, y);
-    }
-
-    return make_float4(f, f, f, 1.0f);
-  }
-}
-
-ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
-                                             int id,
-                                             float3 P,
-                                             InterpolationType interp)
-{
-  const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
-
-  if (info.use_transform_3d) {
-    P = transform_point(&info.transform_3d, P);
-  }
-
-  const float x = P.x;
-  const float y = P.y;
-  const float z = P.z;
-
-  uint interpolation = (interp == INTERPOLATION_NONE) ? info.interpolation : interp;
-  const int texture_type = info.data_type;
-
-#ifdef WITH_NANOVDB
-  if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT) {
-    float f = kernel_tex_image_interp_nanovdb<float>(info, x, y, z, interpolation);
-    return make_float4(f, f, f, 1.0f);
-  }
-  if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
-    nanovdb::Vec3f f = kernel_tex_image_interp_nanovdb<nanovdb::Vec3f>(
-        info, x, y, z, interpolation);
-    return make_float4(f[0], f[1], f[2], 1.0f);
-  }
-#endif
-  if (texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 ||
-      texture_type == IMAGE_DATA_TYPE_HALF4 || texture_type == IMAGE_DATA_TYPE_USHORT4) {
-    if (interpolation == INTERPOLATION_CUBIC) {
-      return kernel_tex_image_interp_tricubic<float4>(info, x, y, z);
-    }
-    else {
-      CUtexObject tex = (CUtexObject)info.data;
-      return tex3D<float4>(tex, x, y, z);
-    }
-  }
-  else {
-    float f;
-
-    if (interpolation == INTERPOLATION_CUBIC) {
-      f = kernel_tex_image_interp_tricubic<float>(info, x, y, z);
-    }
-    else {
-      CUtexObject tex = (CUtexObject)info.data;
-      f = tex3D<float>(tex, x, y, z);
-    }
-
-    return make_float4(f, f, f, 1.0f);
-  }
-}
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
deleted file mode 100644
index 95ad7599cf1..00000000000
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CUDA split kernel entry points */
-
-#ifdef __CUDA_ARCH__
-
-#define __SPLIT_KERNEL__
-
-#include "kernel/kernel_compat_cuda.h"
-#include "kernel_config.h"
-
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_data_init.h"
-#include "kernel/split/kernel_path_init.h"
-#include "kernel/split/kernel_scene_intersect.h"
-#include "kernel/split/kernel_lamp_emission.h"
-#include "kernel/split/kernel_do_volume.h"
-#include "kernel/split/kernel_queue_enqueue.h"
-#include "kernel/split/kernel_indirect_background.h"
-#include "kernel/split/kernel_shader_setup.h"
-#include "kernel/split/kernel_shader_sort.h"
-#include "kernel/split/kernel_shader_eval.h"
-#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
-#include "kernel/split/kernel_subsurface_scatter.h"
-#include "kernel/split/kernel_direct_lighting.h"
-#include "kernel/split/kernel_shadow_blocked_ao.h"
-#include "kernel/split/kernel_shadow_blocked_dl.h"
-#include "kernel/split/kernel_enqueue_inactive.h"
-#include "kernel/split/kernel_next_iteration_setup.h"
-#include "kernel/split/kernel_indirect_subsurface.h"
-#include "kernel/split/kernel_buffer_update.h"
-#include "kernel/split/kernel_adaptive_stopping.h"
-#include "kernel/split/kernel_adaptive_filter_x.h"
-#include "kernel/split/kernel_adaptive_filter_y.h"
-#include "kernel/split/kernel_adaptive_adjust_samples.h"
-
-#include "kernel/kernel_film.h"
-
-/* kernels */
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_state_buffer_size(uint num_threads, uint64_t *size)
-{
-	*size = split_data_buffer_size(NULL, num_threads);
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_path_trace_data_init(
-        ccl_global void *split_data_buffer,
-        int num_elements,
-        ccl_global char *ray_state,
-        int start_sample,
-        int end_sample,
-        int sx, int sy, int sw, int sh, int offset, int stride,
-        ccl_global int *Queue_index,
-        int queuesize,
-        ccl_global char *use_queues_flag,
-        ccl_global unsigned int *work_pool_wgs,
-        unsigned int num_samples,
-        ccl_global float *buffer)
-{
-	kernel_data_init(NULL,
-	                 NULL,
-	                 split_data_buffer,
-	                 num_elements,
-	                 ray_state,
-	                 start_sample,
-	                 end_sample,
-	                 sx, sy, sw, sh, offset, stride,
-	                 Queue_index,
-	                 queuesize,
-	                 use_queues_flag,
-	                 work_pool_wgs,
-	                 num_samples,
-	                 buffer);
-}
-
-#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
-	extern "C" __global__ void \
-	CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
-	kernel_cuda_##name() \
-	{ \
-		kernel_##name(NULL); \
-	}
-
-#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
-	extern "C" __global__ void \
-	CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
-	kernel_cuda_##name() \
-	{ \
-		ccl_local type locals; \
-		kernel_##name(NULL, &locals); \
-	}
-
-DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
-DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
-DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
-DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
-DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
-	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
-	if(x < sx + sw && y < sy + sh)
-		kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
-	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
-	if(x < sx + sw && y < sy + sh)
-		kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-#endif
-