intern/cycles/kernel/kernel_compat_cuda.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __KERNEL_COMPAT_CUDA_H__
#define __KERNEL_COMPAT_CUDA_H__

#define __KERNEL_GPU__
#define __KERNEL_CUDA__
#define CCL_NAMESPACE_BEGIN
#define CCL_NAMESPACE_END

/* Selective nodes compilation. */
#ifndef __NODES_MAX_GROUP__
#  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
#endif
#ifndef __NODES_FEATURES__
#  define __NODES_FEATURES__ NODE_FEATURE_ALL
#endif

/* Manual definitions so we can compile without CUDA toolkit. */

typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef unsigned short half;
typedef unsigned long long CUtexObject;

#define FLT_MIN 1.175494350822287507969e-38f
#define FLT_MAX 340282346638528859811704183484516925440.0f

__device__ half __float2half(const float f)
{
       half val;
       asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
       return val;
}

/* Qualifier wrappers for different names on different devices */

#define ccl_device  __device__ __inline__
#if __CUDA_ARCH__ < 500
#  define ccl_device_inline  __device__ __forceinline__
#  define ccl_device_forceinline  __device__ __forceinline__
#else
#  define ccl_device_inline  __device__ __inline__
#  define ccl_device_forceinline  __device__ __forceinline__
#endif
#define ccl_device_noinline  __device__ __noinline__
#define ccl_global
#define ccl_static_constant __constant__
#define ccl_constant const
#define ccl_local __shared__
#define ccl_local_param
#define ccl_private
#define ccl_may_alias
#define ccl_addr_space
#define ccl_restrict __restrict__
/* TODO(sergey): In theory we might use references with CUDA, however
 * performance impact yet to be investigated.
 */
#define ccl_ref
#define ccl_align(n) __align__(n)

#define ATTR_FALLTHROUGH

#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH)


/* No assert supported for CUDA */

#define kernel_assert(cond)

/* Types */

#include "util/util_half.h"
#include "util/util_types.h"

/* Work item functions */

ccl_device_inline uint ccl_local_id(uint d)
{
	switch(d) {
		case 0: return threadIdx.x;
		case 1: return threadIdx.y;
		case 2: return threadIdx.z;
		default: return 0;
	}
}

#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d))

ccl_device_inline uint ccl_local_size(uint d)
{
	switch(d) {
		case 0: return blockDim.x;
		case 1: return blockDim.y;
		case 2: return blockDim.z;
		default: return 0;
	}
}

#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d))

ccl_device_inline uint ccl_group_id(uint d)
{
	switch(d) {
		case 0: return blockIdx.x;
		case 1: return blockIdx.y;
		case 2: return blockIdx.z;
		default: return 0;
	}
}

ccl_device_inline uint ccl_num_groups(uint d)
{
	switch(d) {
		case 0: return gridDim.x;
		case 1: return gridDim.y;
		case 2: return gridDim.z;
		default: return 0;
	}
}

/* Textures */

/* Use arrays for regular data. */
#define kernel_tex_fetch(t, index) t[(index)]
#define kernel_tex_array(t) (t)

#define kernel_data __data

/* Use fast math functions */

#define cosf(x) __cosf(((float)(x)))
#define sinf(x) __sinf(((float)(x)))
#define powf(x, y) __powf(((float)(x)), ((float)(y)))
#define tanf(x) __tanf(((float)(x)))
#define logf(x) __logf(((float)(x)))
#define expf(x) __expf(((float)(x)))

#endif /* __KERNEL_COMPAT_CUDA_H__ */