Cycles: Kernel address space changes for MSL

This is the first of a sequence of changes to support compiling Cycles kernels as MSL (Metal Shading Language), in preparation for a Metal GPU device implementation.

MSL requires that all pointer types be declared with explicit address space attributes (device, thread, etc.). There is already precedent for this in Cycles' address space macros (ccl_global, ccl_private, etc.), so the first step of MSL enablement is to apply these macros consistently. Line for line, this is the largest change required to enable MSL. Applying it first will simplify future patches, and it has the added benefit of making the kernel code more self-descriptive.

The vast majority of deltas in this patch fall into one of two cases:

- Ensuring ccl_private is specified for thread-local pointer types
- Ensuring ccl_global is specified for device-wide pointer types

Additionally, the ccl_addr_space qualifier can be removed. Prior to Cycles X, ccl_addr_space was used as a context-dependent address space qualifier, but now it is either redundant (e.g. in struct typedefs) or can be replaced by ccl_global in the case of pointer types. Associated function variants (e.g. lcg_step_float_addrspace) are also redundant.

In cases where address space qualifiers are chained with "const", this patch places the address space qualifier first. The rationale is that the choice of address space is likely to have the greater impact on runtime performance and overall architecture.

The final part of this patch is the addition of a metal/compat.h header. This header is only partially complete and will be extended in future patches, paving the way for the full Metal implementation.

Ref T92212

Reviewed By: brecht

Maniphest Tasks: T92212

Differential Revision: https://developer.blender.org/D12864
author: Michael Jones <michael_p_jones@apple.com> 2021-10-14 15:53:40 +0300
committer: Michael Jones <michael_p_jones@apple.com> 2021-10-14 18:14:43 +0300
commit: a0f269f682dab848afc80cd322d04a0c4a815cae (patch)
tree: 0978b1888273fbaa2d14550bde484c5247fa89ff /intern/cycles/util
parent: 47caeb8c26686e24ea7e694f94fabee44f3d2dca (diff)
12 files changed, 64 insertions, 48 deletions
diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h
index 7b67b90e44d..361c36d9061 100644
--- a/intern/cycles/util/util_color.h
+++ b/intern/cycles/util/util_color.h
@@ -277,7 +277,7 @@ ccl_device float4 color_srgb_to_linear_v4(float4 c)
 #endif
 }
 
-ccl_device float3 color_highlight_compress(float3 color, float3 *variance)
+ccl_device float3 color_highlight_compress(float3 color, ccl_private float3 *variance)
 {
   color += one_float3();
   if (variance) {
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index f36a492a1b0..81723abe1e2 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -61,7 +61,7 @@ struct half4 {
 
 #if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
 
-ccl_device_inline void float4_store_half(half *h, float4 f)
+ccl_device_inline void float4_store_half(ccl_private half *h, float4 f)
 {
   h[0] = __float2half(f.x);
   h[1] = __float2half(f.y);
@@ -71,7 +71,7 @@ ccl_device_inline void float4_store_half(half *h, float4 f)
 
 #else
 
-ccl_device_inline void float4_store_half(half *h, float4 f)
+ccl_device_inline void float4_store_half(ccl_private half *h, float4 f)
 {
 
 #  ifndef __KERNEL_SSE2__
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index cb1e94c838c..f834011a032 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -338,7 +338,7 @@ ccl_device_inline int quick_floor_to_int(float x)
   return float_to_int(x) - ((x < 0) ? 1 : 0);
 }
 
-ccl_device_inline float floorfrac(float x, int *i)
+ccl_device_inline float floorfrac(float x, ccl_private int *i)
 {
   *i = quick_floor_to_int(x);
   return x - *i;
@@ -465,14 +465,18 @@ template<class A, class B> A lerp(const A &a, const A &b, const B &t)
 
 /* Triangle */
 
-ccl_device_inline float triangle_area(const float3 &v1, const float3 &v2, const float3 &v3)
+ccl_device_inline float triangle_area(ccl_private const float3 &v1,
+                                      ccl_private const float3 &v2,
+                                      ccl_private const float3 &v3)
 {
   return len(cross(v3 - v2, v1 - v2)) * 0.5f;
 }
 
 /* Orthonormal vectors */
 
-ccl_device_inline void make_orthonormals(const float3 N, float3 *a, float3 *b)
+ccl_device_inline void make_orthonormals(const float3 N,
+                                         ccl_private float3 *a,
+                                         ccl_private float3 *b)
 {
 #if 0
   if (fabsf(N.y) >= 0.999f) {
diff --git a/intern/cycles/util/util_math_fast.h b/intern/cycles/util/util_math_fast.h
index 38afa163db5..cc924f36a71 100644
--- a/intern/cycles/util/util_math_fast.h
+++ b/intern/cycles/util/util_math_fast.h
@@ -156,7 +156,7 @@ ccl_device float fast_cosf(float x)
   return u;
 }
 
-ccl_device void fast_sincosf(float x, float *sine, float *cosine)
+ccl_device void fast_sincosf(float x, ccl_private float *sine, ccl_private float *cosine)
 {
   /* Same argument reduction as fast_sin. */
   int q = fast_rint(x * M_1_PI_F);
diff --git a/intern/cycles/util/util_math_float2.h b/intern/cycles/util/util_math_float2.h
index 70b80c33544..25eda840214 100644
--- a/intern/cycles/util/util_math_float2.h
+++ b/intern/cycles/util/util_math_float2.h
@@ -207,7 +207,7 @@ ccl_device_inline float2 normalize(const float2 &a)
   return a / len(a);
 }
 
-ccl_device_inline float2 normalize_len(const float2 &a, float *t)
+ccl_device_inline float2 normalize_len(const float2 &a, ccl_private float *t)
 {
   *t = len(a);
   return a / (*t);
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
index 30a1b4c3f77..c3230a8068c 100644
--- a/intern/cycles/util/util_math_float3.h
+++ b/intern/cycles/util/util_math_float3.h
@@ -411,7 +411,7 @@ ccl_device_inline float3 saturate3(float3 a)
   return make_float3(saturate(a.x), saturate(a.y), saturate(a.z));
 }
 
-ccl_device_inline float3 normalize_len(const float3 a, float *t)
+ccl_device_inline float3 normalize_len(const float3 a, ccl_private float *t)
 {
   *t = len(a);
   float x = 1.0f / *t;
@@ -424,7 +424,7 @@ ccl_device_inline float3 safe_normalize(const float3 a)
   return (t != 0.0f) ? a * (1.0f / t) : a;
 }
 
-ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)
+ccl_device_inline float3 safe_normalize_len(const float3 a, ccl_private float *t)
 {
   *t = len(a);
   return (*t != 0.0f) ? a / (*t) : a;
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
index 19af5c8c638..f30a78cfc69 100644
--- a/intern/cycles/util/util_math_float4.h
+++ b/intern/cycles/util/util_math_float4.h
@@ -497,7 +497,7 @@ ccl_device_inline float4 reduce_max(const float4 &a)
 #  endif
 }
 
-ccl_device_inline float4 load_float4(const float *v)
+ccl_device_inline float4 load_float4(ccl_private const float *v)
 {
 #  ifdef __KERNEL_SSE__
   return float4(_mm_loadu_ps(v));
diff --git a/intern/cycles/util/util_math_intersect.h b/intern/cycles/util/util_math_intersect.h
index fd0c9124345..0c431a36afb 100644
--- a/intern/cycles/util/util_math_intersect.h
+++ b/intern/cycles/util/util_math_intersect.h
@@ -26,8 +26,8 @@ ccl_device bool ray_sphere_intersect(float3 ray_P,
                                      float ray_t,
                                      float3 sphere_P,
                                      float sphere_radius,
-                                     float3 *isect_P,
-                                     float *isect_t)
+                                     ccl_private float3 *isect_P,
+                                     ccl_private float *isect_t)
 {
   const float3 d = sphere_P - ray_P;
   const float radiussq = sphere_radius * sphere_radius;
@@ -60,8 +60,8 @@ ccl_device bool ray_aligned_disk_intersect(float3 ray_P,
                                            float ray_t,
                                            float3 disk_P,
                                            float disk_radius,
-                                           float3 *isect_P,
-                                           float *isect_t)
+                                           ccl_private float3 *isect_P,
+                                           ccl_private float *isect_t)
 {
   /* Aligned disk normal. */
   float disk_t;
@@ -95,9 +95,9 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
                                                    const float3 tri_b,
                                                    const float3 tri_c,
 #endif
-                                                   float *isect_u,
-                                                   float *isect_v,
-                                                   float *isect_t)
+                                                   ccl_private float *isect_u,
+                                                   ccl_private float *isect_v,
+                                                   ccl_private float *isect_t)
 {
 #if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
   typedef ssef float3;
@@ -207,10 +207,10 @@ ccl_device bool ray_quad_intersect(float3 ray_P,
                                    float3 quad_u,
                                    float3 quad_v,
                                    float3 quad_n,
-                                   float3 *isect_P,
-                                   float *isect_t,
-                                   float *isect_u,
-                                   float *isect_v,
+                                   ccl_private float3 *isect_P,
+                                   ccl_private float *isect_t,
+                                   ccl_private float *isect_u,
+                                   ccl_private float *isect_v,
                                    bool ellipse)
 {
   /* Perform intersection test. */
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
index 123736f75a6..bff7ddb4cee 100644
--- a/intern/cycles/util/util_math_matrix.h
+++ b/intern/cycles/util/util_math_matrix.h
@@ -35,14 +35,14 @@ CCL_NAMESPACE_BEGIN
 
 /* Zeroing helpers. */
 
-ccl_device_inline void math_vector_zero(float *v, int n)
+ccl_device_inline void math_vector_zero(ccl_private float *v, int n)
 {
   for (int i = 0; i < n; i++) {
     v[i] = 0.0f;
   }
 }
 
-ccl_device_inline void math_matrix_zero(float *A, int n)
+ccl_device_inline void math_matrix_zero(ccl_private float *A, int n)
 {
   for (int row = 0; row < n; row++) {
     for (int col = 0; col <= row; col++) {
@@ -53,14 +53,18 @@ ccl_device_inline void math_matrix_zero(float *A, int n)
 
 /* Elementary vector operations. */
 
-ccl_device_inline void math_vector_add(float *a, const float *ccl_restrict b, int n)
+ccl_device_inline void math_vector_add(ccl_private float *a,
+                                       ccl_private const float *ccl_restrict b,
+                                       int n)
 {
   for (int i = 0; i < n; i++) {
     a[i] += b[i];
   }
 }
 
-ccl_device_inline void math_vector_mul(float *a, const float *ccl_restrict b, int n)
+ccl_device_inline void math_vector_mul(ccl_private float *a,
+                                       ccl_private const float *ccl_restrict b,
+                                       int n)
 {
   for (int i = 0; i < n; i++) {
     a[i] *= b[i];
@@ -68,7 +72,7 @@ ccl_device_inline void math_vector_mul(float *a, const float *ccl_restrict b, in
 }
 
 ccl_device_inline void math_vector_mul_strided(ccl_global float *a,
-                                               const float *ccl_restrict b,
+                                               ccl_private const float *ccl_restrict b,
                                                int astride,
                                                int n)
 {
@@ -77,21 +81,23 @@ ccl_device_inline void math_vector_mul_strided(ccl_global float *a,
   }
 }
 
-ccl_device_inline void math_vector_scale(float *a, float b, int n)
+ccl_device_inline void math_vector_scale(ccl_private float *a, float b, int n)
 {
   for (int i = 0; i < n; i++) {
     a[i] *= b;
   }
 }
 
-ccl_device_inline void math_vector_max(float *a, const float *ccl_restrict b, int n)
+ccl_device_inline void math_vector_max(ccl_private float *a,
+                                       ccl_private const float *ccl_restrict b,
+                                       int n)
 {
   for (int i = 0; i < n; i++) {
     a[i] = max(a[i], b[i]);
   }
 }
 
-ccl_device_inline void math_vec3_add(float3 *v, int n, float *x, float3 w)
+ccl_device_inline void math_vec3_add(ccl_private float3 *v, int n, ccl_private float *x, float3 w)
 {
   for (int i = 0; i < n; i++) {
     v[i] += w * x[i];
@@ -99,7 +105,7 @@ ccl_device_inline void math_vec3_add(float3 *v, int n, float *x, float3 w)
 }
 
 ccl_device_inline void math_vec3_add_strided(
-    ccl_global float3 *v, int n, float *x, float3 w, int stride)
+    ccl_global float3 *v, int n, ccl_private float *x, float3 w, int stride)
 {
   for (int i = 0; i < n; i++) {
     ccl_global float *elem = (ccl_global float *)(v + i * stride);
@@ -125,9 +131,9 @@ ccl_device_inline void math_trimatrix_add_diagonal(ccl_global float *A,
 
 /* Add Gramian matrix of v to A.
  * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
-ccl_device_inline void math_matrix_add_gramian(float *A,
+ccl_device_inline void math_matrix_add_gramian(ccl_private float *A,
                                                int n,
-                                               const float *ccl_restrict v,
+                                               ccl_private const float *ccl_restrict v,
                                                float weight)
 {
   for (int row = 0; row < n; row++) {
@@ -140,7 +146,7 @@ ccl_device_inline void math_matrix_add_gramian(float *A,
 /* Add Gramian matrix of v to A.
  * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
 ccl_device_inline void math_trimatrix_add_gramian_strided(
-    ccl_global float *A, int n, const float *ccl_restrict v, float weight, int stride)
+    ccl_global float *A, int n, ccl_private const float *ccl_restrict v, float weight, int stride)
 {
   for (int row = 0; row < n; row++) {
     for (int col = 0; col <= row; col++) {
@@ -151,7 +157,7 @@ ccl_device_inline void math_trimatrix_add_gramian_strided(
 
 ccl_device_inline void math_trimatrix_add_gramian(ccl_global float *A,
                                                   int n,
-                                                  const float *ccl_restrict v,
+                                                  ccl_private const float *ccl_restrict v,
                                                   float weight)
 {
   for (int row = 0; row < n; row++) {
@@ -244,7 +250,7 @@ ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A,
  * and V will contain the eigenvectors of the original A in its rows (!),
  * so that A = V^T*D*V. Therefore, the diagonal elements of D are the (sorted) eigenvalues of A.
  */
-ccl_device void math_matrix_jacobi_eigendecomposition(float *A,
+ccl_device void math_matrix_jacobi_eigendecomposition(ccl_private float *A,
                                                       ccl_global float *V,
                                                       int n,
                                                       int v_stride)
diff --git a/intern/cycles/util/util_projection.h b/intern/cycles/util/util_projection.h
index 9c7e0061c82..04b4574d75b 100644
--- a/intern/cycles/util/util_projection.h
+++ b/intern/cycles/util/util_projection.h
@@ -45,7 +45,8 @@ typedef struct PerspectiveMotionTransform {
 
 /* Functions */
 
-ccl_device_inline float3 transform_perspective(const ProjectionTransform *t, const float3 a)
+ccl_device_inline float3 transform_perspective(ccl_private const ProjectionTransform *t,
+                                               const float3 a)
 {
   float4 b = make_float4(a.x, a.y, a.z, 1.0f);
   float3 c = make_float3(dot(t->x, b), dot(t->y, b), dot(t->z, b));
@@ -54,7 +55,7 @@ ccl_device_inline float3 transform_perspective(const ProjectionTransform *t, con
   return (w != 0.0f) ? c / w : zero_float3();
 }
 
-ccl_device_inline float3 transform_perspective_direction(const ProjectionTransform *t,
+ccl_device_inline float3 transform_perspective_direction(ccl_private const ProjectionTransform *t,
                                                          const float3 a)
 {
   float3 c = make_float3(a.x * t->x.x + a.y * t->x.y + a.z * t->x.z,
diff --git a/intern/cycles/util/util_rect.h b/intern/cycles/util/util_rect.h
index 36f02a01f7b..32df9327cbd 100644
--- a/intern/cycles/util/util_rect.h
+++ b/intern/cycles/util/util_rect.h
@@ -54,7 +54,10 @@ ccl_device_inline int coord_to_local_index(int4 rect, int x, int y)
 
 /* Finds the coordinates of a pixel given by its row-major index in the rect,
  * and returns whether the pixel is inside it. */
-ccl_device_inline bool local_index_to_coord(int4 rect, int idx, int *x, int *y)
+ccl_device_inline bool local_index_to_coord(int4 rect,
+                                            int idx,
+                                            ccl_private int *x,
+                                            ccl_private int *y)
 {
   int w = rect.z - rect.x;
   *x = (idx % w) + rect.x;
diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h
index e9cd3b0b483..fc04f9aab46 100644
--- a/intern/cycles/util/util_transform.h
+++ b/intern/cycles/util/util_transform.h
@@ -53,7 +53,7 @@ typedef struct DecomposedTransform {
 
 /* Functions */
 
-ccl_device_inline float3 transform_point(const Transform *t, const float3 a)
+ccl_device_inline float3 transform_point(ccl_private const Transform *t, const float3 a)
 {
   /* TODO(sergey): Disabled for now, causes crashes in certain cases. */
 #if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__)
@@ -82,7 +82,7 @@ ccl_device_inline float3 transform_point(const Transform *t, const float3 a)
 #endif
 }
 
-ccl_device_inline float3 transform_direction(const Transform *t, const float3 a)
+ccl_device_inline float3 transform_direction(ccl_private const Transform *t, const float3 a)
 {
 #if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__)
   ssef x, y, z, w, aa;
@@ -108,7 +108,8 @@ ccl_device_inline float3 transform_direction(const Transform *t, const float3 a)
 #endif
 }
 
-ccl_device_inline float3 transform_direction_transposed(const Transform *t, const float3 a)
+ccl_device_inline float3 transform_direction_transposed(ccl_private const Transform *t,
+                                                        const float3 a)
 {
   float3 x = make_float3(t->x.x, t->y.x, t->z.x);
   float3 y = make_float3(t->x.y, t->y.y, t->z.y);
@@ -409,7 +410,8 @@ ccl_device_inline Transform transform_quick_inverse(Transform M)
   return R;
 }
 
-ccl_device_inline void transform_compose(Transform *tfm, const DecomposedTransform *decomp)
+ccl_device_inline void transform_compose(ccl_private Transform *tfm,
+                                         ccl_private const DecomposedTransform *decomp)
 {
   /* rotation */
   float q0, q1, q2, q3, qda, qdb, qdc, qaa, qab, qac, qbb, qbc, qcc;
@@ -449,7 +451,7 @@ ccl_device_inline void transform_compose(Transform *tfm, const DecomposedTransfo
 
 /* Interpolate from array of decomposed transforms. */
 ccl_device void transform_motion_array_interpolate(Transform *tfm,
-                                                   const ccl_global DecomposedTransform *motion,
+                                                   const DecomposedTransform *motion,
                                                    uint numsteps,
                                                    float time)
 {
@@ -458,8 +460,8 @@ ccl_device void transform_motion_array_interpolate(Transform *tfm,
   int step = min((int)(time * maxstep), maxstep - 1);
   float t = time * maxstep - step;
 
-  const ccl_global DecomposedTransform *a = motion + step;
-  const ccl_global DecomposedTransform *b = motion + step + 1;
+  const DecomposedTransform *a = motion + step;
+  const DecomposedTransform *b = motion + step + 1;
 
   /* Interpolate rotation, translation and scale. */
   DecomposedTransform decomp;
@@ -472,12 +474,12 @@ ccl_device void transform_motion_array_interpolate(Transform *tfm,
   transform_compose(tfm, &decomp);
 }
 
-ccl_device_inline bool transform_isfinite_safe(Transform *tfm)
+ccl_device_inline bool transform_isfinite_safe(ccl_private Transform *tfm)
 {
   return isfinite4_safe(tfm->x) && isfinite4_safe(tfm->y) && isfinite4_safe(tfm->z);
 }
 
-ccl_device_inline bool transform_decomposed_isfinite_safe(DecomposedTransform *decomp)
+ccl_device_inline bool transform_decomposed_isfinite_safe(ccl_private DecomposedTransform *decomp)
 {
   return isfinite4_safe(decomp->x) && isfinite4_safe(decomp->y) && isfinite4_safe(decomp->z) &&
          isfinite4_safe(decomp->w);
author	Michael Jones <michael_p_jones@apple.com>	2021-10-14 15:53:40 +0300
committer	Michael Jones <michael_p_jones@apple.com>	2021-10-14 18:14:43 +0300
commit	a0f269f682dab848afc80cd322d04a0c4a815cae (patch)
tree	0978b1888273fbaa2d14550bde484c5247fa89ff /intern/cycles/util
parent	47caeb8c26686e24ea7e694f94fabee44f3d2dca (diff)