diff options
Diffstat (limited to 'intern/cycles/kernel/kernel_compat_cuda.h')
-rw-r--r-- | intern/cycles/kernel/kernel_compat_cuda.h | 129 |
1 files changed, 84 insertions, 45 deletions
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index e0c7b17c6a0..ac63bcf7ac9 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -30,75 +30,114 @@ # define __NODES_FEATURES__ NODE_FEATURE_ALL #endif -#include <cuda.h> -#include <cuda_fp16.h> -#include <float.h> +/* Manual definitions so we can compile without CUDA toolkit. */ + +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; +typedef unsigned short half; +typedef unsigned long long CUtexObject; + +#define FLT_MIN 1.175494350822287507969e-38f +#define FLT_MAX 340282346638528859811704183484516925440.0f + +__device__ half __float2half(const float f) +{ + half val; + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f)); + return val; +} /* Qualifier wrappers for different names on different devices */ #define ccl_device __device__ __inline__ -# define ccl_device_forceinline __device__ __forceinline__ -#if (__KERNEL_CUDA_VERSION__ == 80) && (__CUDA_ARCH__ < 500) +#if __CUDA_ARCH__ < 500 # define ccl_device_inline __device__ __forceinline__ +# define ccl_device_forceinline __device__ __forceinline__ #else # define ccl_device_inline __device__ __inline__ +# define ccl_device_forceinline __device__ __forceinline__ #endif #define ccl_device_noinline __device__ __noinline__ #define ccl_global -#define ccl_constant +#define ccl_static_constant __constant__ +#define ccl_constant const +#define ccl_local __shared__ +#define ccl_local_param +#define ccl_private #define ccl_may_alias #define ccl_addr_space #define ccl_restrict __restrict__ +/* TODO(sergey): In theory we might use references with CUDA, however + * performance impact yet to be investigated. + */ +#define ccl_ref #define ccl_align(n) __align__(n) +#define ATTR_FALLTHROUGH + +#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH) + + /* No assert supported for CUDA */ #define kernel_assert(cond) /* Types */ -#include "util_half.h" -#include "util_types.h" +#include "util/util_half.h" +#include "util/util_types.h" + +/* Work item functions */ + +ccl_device_inline uint ccl_local_id(uint d) +{ + switch(d) { + case 0: return threadIdx.x; + case 1: return threadIdx.y; + case 2: return threadIdx.z; + default: return 0; + } +} + +#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d)) + +ccl_device_inline uint ccl_local_size(uint d) +{ + switch(d) { + case 0: return blockDim.x; + case 1: return blockDim.y; + case 2: return blockDim.z; + default: return 0; + } +} + +#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d)) + +ccl_device_inline uint ccl_group_id(uint d) +{ + switch(d) { + case 0: return blockIdx.x; + case 1: return blockIdx.y; + case 2: return blockIdx.z; + default: return 0; + } +} + +ccl_device_inline uint ccl_num_groups(uint d) +{ + switch(d) { + case 0: return gridDim.x; + case 1: return gridDim.y; + case 2: return gridDim.z; + default: return 0; + } +} /* Textures */ -typedef texture<float4, 1> texture_float4; -typedef texture<float2, 1> texture_float2; -typedef texture<float, 1> texture_float; -typedef texture<uint, 1> texture_uint; -typedef texture<int, 1> texture_int; -typedef texture<uint4, 1> texture_uint4; -typedef texture<uchar, 1> texture_uchar; -typedef texture<uchar4, 1> texture_uchar4; -typedef texture<float4, 2> texture_image_float4; -typedef texture<float4, 3> texture_image3d_float4; -typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4; - -/* Macros to handle different memory storage on different devices */ - -/* On Fermi cards (4xx and 5xx), we use regular textures for both data and images. - * On Kepler (6xx) and above, we use Bindless Textures for images and arrays for data. - * - * Arrays are necessary in order to use the full VRAM on newer cards, and it's slightly faster. - * Using Arrays on Fermi turned out to be slower.*/ - -/* Fermi */ -#if __CUDA_ARCH__ < 300 -# define __KERNEL_CUDA_TEX_STORAGE__ -# define kernel_tex_fetch(t, index) tex1Dfetch(t, index) - -# define kernel_tex_image_interp(t, x, y) tex2D(t, x, y) -# define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z) - -/* Kepler */ -#else -# define kernel_tex_fetch(t, index) t[(index)] - -# define kernel_tex_image_interp_float4(t, x, y) tex2D<float4>(t, x, y) -# define kernel_tex_image_interp_float(t, x, y) tex2D<float>(t, x, y) -# define kernel_tex_image_interp_3d_float4(t, x, y, z) tex3D<float4>(t, x, y, z) -# define kernel_tex_image_interp_3d_float(t, x, y, z) tex3D<float>(t, x, y, z) -#endif +/* Use arrays for regular data. */ +#define kernel_tex_fetch(t, index) t[(index)] +#define kernel_tex_array(t) (t) #define kernel_data __data |