diff options
author | Lukas Tönne <lukas.toenne@gmail.com> | 2017-10-16 12:16:13 +0300 |
---|---|---|
committer | Lukas Tönne <lukas.toenne@gmail.com> | 2017-10-16 12:22:35 +0300 |
commit | a78b3ee53aa53020b086a6df25c0e28491223dcc (patch) | |
tree | bd883e95580f5777f7eae7cac4e47f182ac9fc00 /intern/cycles/kernel/kernel_compat_cuda.h | |
parent | 4842cc017c3bb7df2070c2f96605190ff88e6a2e (diff) | |
parent | 49f4ac17bf704614de59a4db7a65c205c085d694 (diff) |
Merge remote-tracking branch 'origin/master' into openvdbopenvdb
Diffstat (limited to 'intern/cycles/kernel/kernel_compat_cuda.h')
-rw-r--r-- | intern/cycles/kernel/kernel_compat_cuda.h | 112 |
1 files changed, 75 insertions, 37 deletions
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index e0c7b17c6a0..fa512f80e41 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -33,71 +33,109 @@ #include <cuda.h> #include <cuda_fp16.h> #include <float.h> +#include <stdint.h> /* Qualifier wrappers for different names on different devices */ #define ccl_device __device__ __inline__ +#if __CUDA_ARCH__ < 300 +# define ccl_device_inline __device__ __inline__ # define ccl_device_forceinline __device__ __forceinline__ -#if (__KERNEL_CUDA_VERSION__ == 80) && (__CUDA_ARCH__ < 500) +#elif __CUDA_ARCH__ < 500 # define ccl_device_inline __device__ __forceinline__ +# define ccl_device_forceinline __device__ __forceinline__ #else # define ccl_device_inline __device__ __inline__ +# define ccl_device_forceinline __device__ __forceinline__ #endif #define ccl_device_noinline __device__ __noinline__ #define ccl_global -#define ccl_constant +#define ccl_static_constant __constant__ +#define ccl_constant const +#define ccl_local __shared__ +#define ccl_local_param +#define ccl_private #define ccl_may_alias #define ccl_addr_space #define ccl_restrict __restrict__ +/* TODO(sergey): In theory we might use references with CUDA, however + * performance impact yet to be investigated. + */ +#define ccl_ref #define ccl_align(n) __align__(n) +#define ATTR_FALLTHROUGH + +#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH) + + /* No assert supported for CUDA */ #define kernel_assert(cond) /* Types */ -#include "util_half.h" -#include "util_types.h" +#include "util/util_half.h" +#include "util/util_types.h" + +/* Work item functions */ + +ccl_device_inline uint ccl_local_id(uint d) +{ + switch(d) { + case 0: return threadIdx.x; + case 1: return threadIdx.y; + case 2: return threadIdx.z; + default: return 0; + } +} + +#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d)) + +ccl_device_inline uint ccl_local_size(uint d) +{ + switch(d) { + case 0: return blockDim.x; + case 1: return blockDim.y; + case 2: return blockDim.z; + default: return 0; + } +} + +#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d)) + +ccl_device_inline uint ccl_group_id(uint d) +{ + switch(d) { + case 0: return blockIdx.x; + case 1: return blockIdx.y; + case 2: return blockIdx.z; + default: return 0; + } +} + +ccl_device_inline uint ccl_num_groups(uint d) +{ + switch(d) { + case 0: return gridDim.x; + case 1: return gridDim.y; + case 2: return gridDim.z; + default: return 0; + } +} /* Textures */ -typedef texture<float4, 1> texture_float4; -typedef texture<float2, 1> texture_float2; -typedef texture<float, 1> texture_float; -typedef texture<uint, 1> texture_uint; -typedef texture<int, 1> texture_int; -typedef texture<uint4, 1> texture_uint4; -typedef texture<uchar, 1> texture_uchar; -typedef texture<uchar4, 1> texture_uchar4; +/* Use arrays for regular data. This is a little slower than textures on Fermi, + * but allows for cleaner code and we will stop supporting Fermi soon. */ +#define kernel_tex_fetch(t, index) t[(index)] + +/* On Kepler (6xx) and above, we use Bindless Textures for images. + * On Fermi cards (4xx and 5xx), we have to use regular textures. */ +#if __CUDA_ARCH__ < 300 typedef texture<float4, 2> texture_image_float4; typedef texture<float4, 3> texture_image3d_float4; typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4; - -/* Macros to handle different memory storage on different devices */ - -/* On Fermi cards (4xx and 5xx), we use regular textures for both data and images. - * On Kepler (6xx) and above, we use Bindless Textures for images and arrays for data. - * - * Arrays are necessary in order to use the full VRAM on newer cards, and it's slightly faster. - * Using Arrays on Fermi turned out to be slower.*/ - -/* Fermi */ -#if __CUDA_ARCH__ < 300 -# define __KERNEL_CUDA_TEX_STORAGE__ -# define kernel_tex_fetch(t, index) tex1Dfetch(t, index) - -# define kernel_tex_image_interp(t, x, y) tex2D(t, x, y) -# define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z) - -/* Kepler */ -#else -# define kernel_tex_fetch(t, index) t[(index)] - -# define kernel_tex_image_interp_float4(t, x, y) tex2D<float4>(t, x, y) -# define kernel_tex_image_interp_float(t, x, y) tex2D<float>(t, x, y) -# define kernel_tex_image_interp_3d_float4(t, x, y, z) tex3D<float4>(t, x, y, z) -# define kernel_tex_image_interp_3d_float(t, x, y, z) tex3D<float>(t, x, y, z) #endif #define kernel_data __data |