Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLukas Tönne <lukas.toenne@gmail.com>2017-10-16 12:16:13 +0300
committerLukas Tönne <lukas.toenne@gmail.com>2017-10-16 12:22:35 +0300
commita78b3ee53aa53020b086a6df25c0e28491223dcc (patch)
treebd883e95580f5777f7eae7cac4e47f182ac9fc00 /intern/cycles/kernel/kernel_compat_cuda.h
parent4842cc017c3bb7df2070c2f96605190ff88e6a2e (diff)
parent49f4ac17bf704614de59a4db7a65c205c085d694 (diff)
Merge remote-tracking branch 'origin/master' into openvdbopenvdb
Diffstat (limited to 'intern/cycles/kernel/kernel_compat_cuda.h')
-rw-r--r--intern/cycles/kernel/kernel_compat_cuda.h112
1 files changed, 75 insertions, 37 deletions
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index e0c7b17c6a0..fa512f80e41 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -33,71 +33,109 @@
#include <cuda.h>
#include <cuda_fp16.h>
#include <float.h>
+#include <stdint.h>
/* Qualifier wrappers for different names on different devices */
#define ccl_device __device__ __inline__
+#if __CUDA_ARCH__ < 300
+# define ccl_device_inline __device__ __inline__
# define ccl_device_forceinline __device__ __forceinline__
-#if (__KERNEL_CUDA_VERSION__ == 80) && (__CUDA_ARCH__ < 500)
+#elif __CUDA_ARCH__ < 500
# define ccl_device_inline __device__ __forceinline__
+# define ccl_device_forceinline __device__ __forceinline__
#else
# define ccl_device_inline __device__ __inline__
+# define ccl_device_forceinline __device__ __forceinline__
#endif
#define ccl_device_noinline __device__ __noinline__
#define ccl_global
-#define ccl_constant
+#define ccl_static_constant __constant__
+#define ccl_constant const
+#define ccl_local __shared__
+#define ccl_local_param
+#define ccl_private
#define ccl_may_alias
#define ccl_addr_space
#define ccl_restrict __restrict__
+/* TODO(sergey): In theory we might use references with CUDA, however
+ * performance impact yet to be investigated.
+ */
+#define ccl_ref
#define ccl_align(n) __align__(n)
+#define ATTR_FALLTHROUGH
+
+#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH)
+
+
/* No assert supported for CUDA */
#define kernel_assert(cond)
/* Types */
-#include "util_half.h"
-#include "util_types.h"
+#include "util/util_half.h"
+#include "util/util_types.h"
+
+/* Work item functions */
+
+ccl_device_inline uint ccl_local_id(uint d)
+{
+ switch(d) {
+ case 0: return threadIdx.x;
+ case 1: return threadIdx.y;
+ case 2: return threadIdx.z;
+ default: return 0;
+ }
+}
+
+#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d))
+
+ccl_device_inline uint ccl_local_size(uint d)
+{
+ switch(d) {
+ case 0: return blockDim.x;
+ case 1: return blockDim.y;
+ case 2: return blockDim.z;
+ default: return 0;
+ }
+}
+
+#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d))
+
+ccl_device_inline uint ccl_group_id(uint d)
+{
+ switch(d) {
+ case 0: return blockIdx.x;
+ case 1: return blockIdx.y;
+ case 2: return blockIdx.z;
+ default: return 0;
+ }
+}
+
+ccl_device_inline uint ccl_num_groups(uint d)
+{
+ switch(d) {
+ case 0: return gridDim.x;
+ case 1: return gridDim.y;
+ case 2: return gridDim.z;
+ default: return 0;
+ }
+}
/* Textures */
-typedef texture<float4, 1> texture_float4;
-typedef texture<float2, 1> texture_float2;
-typedef texture<float, 1> texture_float;
-typedef texture<uint, 1> texture_uint;
-typedef texture<int, 1> texture_int;
-typedef texture<uint4, 1> texture_uint4;
-typedef texture<uchar, 1> texture_uchar;
-typedef texture<uchar4, 1> texture_uchar4;
+/* Use arrays for regular data. This is a little slower than textures on Fermi,
+ * but allows for cleaner code and we will stop supporting Fermi soon. */
+#define kernel_tex_fetch(t, index) t[(index)]
+
+/* On Kepler (6xx) and above, we use Bindless Textures for images.
+ * On Fermi cards (4xx and 5xx), we have to use regular textures. */
+#if __CUDA_ARCH__ < 300
typedef texture<float4, 2> texture_image_float4;
typedef texture<float4, 3> texture_image3d_float4;
typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
-
-/* Macros to handle different memory storage on different devices */
-
-/* On Fermi cards (4xx and 5xx), we use regular textures for both data and images.
- * On Kepler (6xx) and above, we use Bindless Textures for images and arrays for data.
- *
- * Arrays are necessary in order to use the full VRAM on newer cards, and it's slightly faster.
- * Using Arrays on Fermi turned out to be slower.*/
-
-/* Fermi */
-#if __CUDA_ARCH__ < 300
-# define __KERNEL_CUDA_TEX_STORAGE__
-# define kernel_tex_fetch(t, index) tex1Dfetch(t, index)
-
-# define kernel_tex_image_interp(t, x, y) tex2D(t, x, y)
-# define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z)
-
-/* Kepler */
-#else
-# define kernel_tex_fetch(t, index) t[(index)]
-
-# define kernel_tex_image_interp_float4(t, x, y) tex2D<float4>(t, x, y)
-# define kernel_tex_image_interp_float(t, x, y) tex2D<float>(t, x, y)
-# define kernel_tex_image_interp_3d_float4(t, x, y, z) tex3D<float4>(t, x, y, z)
-# define kernel_tex_image_interp_3d_float(t, x, y, z) tex3D<float>(t, x, y, z)
#endif
#define kernel_data __data