4 files changed, 325 insertions, 0 deletions
diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h
new file mode 100644
index 00000000000..665da43e1a1
--- /dev/null
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define __KERNEL_GPU__
+#define __KERNEL_CUDA__
+#define CCL_NAMESPACE_BEGIN
+#define CCL_NAMESPACE_END
+
+#ifndef ATTR_FALLTHROUGH
+#  define ATTR_FALLTHROUGH
+#endif
+
+/* Manual definitions so we can compile without CUDA toolkit. */
+
+#ifdef __CUDACC_RTC__
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+#else
+#  include <stdint.h>
+#endif
+
+#ifdef CYCLES_CUBIN_CC
+#  define FLT_MIN 1.175494350822287507969e-38f
+#  define FLT_MAX 340282346638528859811704183484516925440.0f
+#  define FLT_EPSILON 1.192092896e-07F
+#endif
+
+/* Qualifiers */
+
+#define ccl_device __device__ __inline__
+#if __CUDA_ARCH__ < 500
+#  define ccl_device_inline __device__ __forceinline__
+#  define ccl_device_forceinline __device__ __forceinline__
+#else
+#  define ccl_device_inline __device__ __inline__
+#  define ccl_device_forceinline __device__ __forceinline__
+#endif
+#define ccl_device_noinline __device__ __noinline__
+#define ccl_device_noinline_cpu ccl_device
+#define ccl_global
+#define ccl_static_constant __constant__
+#define ccl_device_constant __constant__ __device__
+#define ccl_constant const
+#define ccl_gpu_shared __shared__
+#define ccl_private
+#define ccl_may_alias
+#define ccl_addr_space
+#define ccl_restrict __restrict__
+#define ccl_loop_no_unroll
+#define ccl_align(n) __align__(n)
+#define ccl_optional_struct_init
+
+/* No assert supported for CUDA */
+
+#define kernel_assert(cond)
+
+/* GPU thread, block, grid size and index */
+
+#define ccl_gpu_thread_idx_x (threadIdx.x)
+#define ccl_gpu_block_dim_x (blockDim.x)
+#define ccl_gpu_block_idx_x (blockIdx.x)
+#define ccl_gpu_grid_dim_x (gridDim.x)
+#define ccl_gpu_warp_size (warpSize)
+
+#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
+#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
+
+/* GPU warp synchronizaton */
+
+#define ccl_gpu_syncthreads() __syncthreads()
+#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
+#define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla)
+#define ccl_gpu_popc(x) __popc(x)
+
+/* GPU texture objects */
+
+typedef unsigned long long CUtexObject;
+typedef CUtexObject ccl_gpu_tex_object;
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
+                                                    const float x,
+                                                    const float y)
+{
+  return tex2D<T>(texobj, x, y);
+}
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
+                                                    const float x,
+                                                    const float y,
+                                                    const float z)
+{
+  return tex3D<T>(texobj, x, y, z);
+}
+
+/* Use fast math functions */
+
+#define cosf(x) __cosf(((float)(x)))
+#define sinf(x) __sinf(((float)(x)))
+#define powf(x, y) __powf(((float)(x)), ((float)(y)))
+#define tanf(x) __tanf(((float)(x)))
+#define logf(x) __logf(((float)(x)))
+#define expf(x) __expf(((float)(x)))
+
+/* Half */
+
+typedef unsigned short half;
+
+__device__ half __float2half(const float f)
+{
+  half val;
+  asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
+  return val;
+}
+
+/* Types */
+
+#include "util/util_half.h"
+#include "util/util_types.h"
diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h
new file mode 100644
index 00000000000..46196dcdb51
--- /dev/null
+++ b/intern/cycles/kernel/device/cuda/config.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Device data taken from CUDA occupancy calculator.
+ *
+ * Terminology
+ * - CUDA GPUs have multiple streaming multiprocessors
+ * - Each multiprocessor executes multiple thread blocks
+ * - Each thread block contains a number of threads, also known as the block size
+ * - Multiprocessors have a fixed number of registers, and the amount of registers
+ *   used by each threads limits the number of threads per block.
+ */
+
+/* 3.0 and 3.5 */
+#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
+#  define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+#  define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
+#  define GPU_BLOCK_MAX_THREADS 1024
+#  define GPU_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#  define GPU_KERNEL_BLOCK_NUM_THREADS 256
+#  define GPU_KERNEL_MAX_REGISTERS 63
+
+/* 3.2 */
+#elif __CUDA_ARCH__ == 320
+#  define GPU_MULTIPRESSOR_MAX_REGISTERS 32768
+#  define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
+#  define GPU_BLOCK_MAX_THREADS 1024
+#  define GPU_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#  define GPU_KERNEL_BLOCK_NUM_THREADS 256
+#  define GPU_KERNEL_MAX_REGISTERS 63
+
+/* 3.7 */
+#elif __CUDA_ARCH__ == 370
+#  define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+#  define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
+#  define GPU_BLOCK_MAX_THREADS 1024
+#  define GPU_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+#  define GPU_KERNEL_BLOCK_NUM_THREADS 256
+#  define GPU_KERNEL_MAX_REGISTERS 63
+
+/* 5.x, 6.x */
+#elif __CUDA_ARCH__ <= 699
+#  define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+#  define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
+#  define GPU_BLOCK_MAX_THREADS 1024
+#  define GPU_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+#  define GPU_KERNEL_BLOCK_NUM_THREADS 256
+/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of
+ * registers */
+#  if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600
+#    define GPU_KERNEL_MAX_REGISTERS 64
+#  else
+#    define GPU_KERNEL_MAX_REGISTERS 48
+#  endif
+
+/* 7.x, 8.x */
+#elif __CUDA_ARCH__ <= 899
+#  define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+#  define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
+#  define GPU_BLOCK_MAX_THREADS 1024
+#  define GPU_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+#  define GPU_KERNEL_BLOCK_NUM_THREADS 512
+#  define GPU_KERNEL_MAX_REGISTERS 96
+
+/* unknown architecture */
+#else
+#  error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
+#endif
+
+/* Compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread. */
+
+#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
+  extern "C" __global__ void __launch_bounds__(block_num_threads, \
+                                               GPU_MULTIPRESSOR_MAX_REGISTERS / \
+                                                   (block_num_threads * thread_num_registers))
+
+/* sanity checks */
+
+#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
+#  error "Maximum number of threads per block exceeded"
+#endif
+
+#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \
+    GPU_MULTIPROCESSOR_MAX_BLOCKS
+#  error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS
+#  error "Maximum number of registers per thread exceeded"
+#endif
diff --git a/intern/cycles/kernel/device/cuda/globals.h b/intern/cycles/kernel/device/cuda/globals.h
new file mode 100644
index 00000000000..169047175f5
--- /dev/null
+++ b/intern/cycles/kernel/device/cuda/globals.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/integrator/integrator_state.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Not actually used, just a NULL pointer that gets passed everywhere, which we
+ * hope gets optimized out by the compiler. */
+struct KernelGlobals {
+  int unused[1];
+};
+
+/* Global scene data and textures */
+__constant__ KernelData __data;
+#define KERNEL_TEX(type, name) const __constant__ __device__ type *name;
+#include "kernel/kernel_textures.h"
+
+/* Integrator state */
+__constant__ IntegratorStateGPU __integrator_state;
+
+/* Abstraction macros */
+#define kernel_data __data
+#define kernel_tex_fetch(t, index) t[(index)]
+#define kernel_tex_array(t) (t)
+#define kernel_integrator_state __integrator_state
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cuda/kernel.cu b/intern/cycles/kernel/device/cuda/kernel.cu
new file mode 100644
index 00000000000..e26fe243642
--- /dev/null
+++ b/intern/cycles/kernel/device/cuda/kernel.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CUDA kernel entry points */
+
+#ifdef __CUDA_ARCH__
+
+#  include "kernel/device/cuda/compat.h"
+#  include "kernel/device/cuda/config.h"
+#  include "kernel/device/cuda/globals.h"
+
+#  include "kernel/device/gpu/image.h"
+#  include "kernel/device/gpu/kernel.h"
+
+#endif