Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/kernel/kernel_compat_cuda.h')
-rw-r--r--  intern/cycles/kernel/kernel_compat_cuda.h  129
1 file changed, 84 insertions, 45 deletions
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index e0c7b17c6a0..ac63bcf7ac9 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -30,75 +30,114 @@
# define __NODES_FEATURES__ NODE_FEATURE_ALL
#endif
-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <float.h>
+/* Manual definitions so we can compile without CUDA toolkit. */
+
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+typedef unsigned short half;
+typedef unsigned long long CUtexObject;
+
+#define FLT_MIN 1.175494350822287507969e-38f
+#define FLT_MAX 340282346638528859811704183484516925440.0f
+
+__device__ half __float2half(const float f)
+{
+ half val;
+ asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
+ return val;
+}
/* Qualifier wrappers for different names on different devices */
#define ccl_device __device__ __inline__
-# define ccl_device_forceinline __device__ __forceinline__
-#if (__KERNEL_CUDA_VERSION__ == 80) && (__CUDA_ARCH__ < 500)
+#if __CUDA_ARCH__ < 500
# define ccl_device_inline __device__ __forceinline__
+# define ccl_device_forceinline __device__ __forceinline__
#else
# define ccl_device_inline __device__ __inline__
+# define ccl_device_forceinline __device__ __forceinline__
#endif
#define ccl_device_noinline __device__ __noinline__
#define ccl_global
-#define ccl_constant
+#define ccl_static_constant __constant__
+#define ccl_constant const
+#define ccl_local __shared__
+#define ccl_local_param
+#define ccl_private
#define ccl_may_alias
#define ccl_addr_space
#define ccl_restrict __restrict__
+/* TODO(sergey): In theory we might use references with CUDA, however
+ * performance impact yet to be investigated.
+ */
+#define ccl_ref
#define ccl_align(n) __align__(n)
+#define ATTR_FALLTHROUGH
+
+#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH)
+
+
/* No assert supported for CUDA */
#define kernel_assert(cond)
/* Types */
-#include "util_half.h"
-#include "util_types.h"
+#include "util/util_half.h"
+#include "util/util_types.h"
+
+/* Work item functions */
+
+ccl_device_inline uint ccl_local_id(uint d)
+{
+ switch(d) {
+ case 0: return threadIdx.x;
+ case 1: return threadIdx.y;
+ case 2: return threadIdx.z;
+ default: return 0;
+ }
+}
+
+#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d))
+
+ccl_device_inline uint ccl_local_size(uint d)
+{
+ switch(d) {
+ case 0: return blockDim.x;
+ case 1: return blockDim.y;
+ case 2: return blockDim.z;
+ default: return 0;
+ }
+}
+
+#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d))
+
+ccl_device_inline uint ccl_group_id(uint d)
+{
+ switch(d) {
+ case 0: return blockIdx.x;
+ case 1: return blockIdx.y;
+ case 2: return blockIdx.z;
+ default: return 0;
+ }
+}
+
+ccl_device_inline uint ccl_num_groups(uint d)
+{
+ switch(d) {
+ case 0: return gridDim.x;
+ case 1: return gridDim.y;
+ case 2: return gridDim.z;
+ default: return 0;
+ }
+}
/* Textures */
-typedef texture<float4, 1> texture_float4;
-typedef texture<float2, 1> texture_float2;
-typedef texture<float, 1> texture_float;
-typedef texture<uint, 1> texture_uint;
-typedef texture<int, 1> texture_int;
-typedef texture<uint4, 1> texture_uint4;
-typedef texture<uchar, 1> texture_uchar;
-typedef texture<uchar4, 1> texture_uchar4;
-typedef texture<float4, 2> texture_image_float4;
-typedef texture<float4, 3> texture_image3d_float4;
-typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
-
-/* Macros to handle different memory storage on different devices */
-
-/* On Fermi cards (4xx and 5xx), we use regular textures for both data and images.
- * On Kepler (6xx) and above, we use Bindless Textures for images and arrays for data.
- *
- * Arrays are necessary in order to use the full VRAM on newer cards, and it's slightly faster.
- * Using Arrays on Fermi turned out to be slower.*/
-
-/* Fermi */
-#if __CUDA_ARCH__ < 300
-# define __KERNEL_CUDA_TEX_STORAGE__
-# define kernel_tex_fetch(t, index) tex1Dfetch(t, index)
-
-# define kernel_tex_image_interp(t, x, y) tex2D(t, x, y)
-# define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z)
-
-/* Kepler */
-#else
-# define kernel_tex_fetch(t, index) t[(index)]
-
-# define kernel_tex_image_interp_float4(t, x, y) tex2D<float4>(t, x, y)
-# define kernel_tex_image_interp_float(t, x, y) tex2D<float>(t, x, y)
-# define kernel_tex_image_interp_3d_float4(t, x, y, z) tex3D<float4>(t, x, y, z)
-# define kernel_tex_image_interp_3d_float(t, x, y, z) tex3D<float>(t, x, y, z)
-#endif
+/* Use arrays for regular data. */
+#define kernel_tex_fetch(t, index) t[(index)]
+#define kernel_tex_array(t) (t)
#define kernel_data __data