Cycles: several small fixes and additions for MSL

This patch contains many small leftover fixes and additions that are required for Metal-enablement: - Address space fixes and a few other small compile fixes - Addition of missing functionality to the Metal adapter headers - Addition of various scattered `__KERNEL_METAL__` blocks (e.g. for atomic support & maths functions) Ref T92212 Differential Revision: https://developer.blender.org/D13263
author: Michael Jones <michael_p_jones@apple.com> 2021-11-18 16:25:05 +0300
committer: Brecht Van Lommel <brecht@blender.org> 2021-11-18 16:38:02 +0300
commit: d19e35873f67c90b251ca38e007a83aa1eada211 (patch)
tree: 7e8659acea7f12b188077ada225e113b5df35e60 /intern/cycles/kernel
parent: c0d52db783eb3a6288c9af04298b2358fec76357 (diff)
14 files changed, 28 insertions, 35 deletions
diff --git a/intern/cycles/kernel/bvh/util.h b/intern/cycles/kernel/bvh/util.h
index 8686f887021..26ba136dd79 100644
--- a/intern/cycles/kernel/bvh/util.h
+++ b/intern/cycles/kernel/bvh/util.h
@@ -97,7 +97,7 @@ ccl_device_inline void sort_intersections_and_normals(ccl_private Intersection *
     swapped = false;
     for (int j = 0; j < num_hits - 1; ++j) {
       if (hits[j].t > hits[j + 1].t) {
-        struct Intersection tmp_hit = hits[j];
+        Intersection tmp_hit = hits[j];
         float3 tmp_Ng = Ng[j];
         hits[j] = hits[j + 1];
         Ng[j] = Ng[j + 1];
diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h
index ba3aefa43bf..7f901510329 100644
--- a/intern/cycles/kernel/device/cuda/compat.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -86,7 +86,6 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_syncthreads() __syncthreads()
 #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
 #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla)
-#define ccl_gpu_popc(x) __popc(x)
 
 /* GPU texture objects */
 
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index dd0c6dd6893..60332af752c 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -464,7 +464,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   const auto num_active_pixels_mask = ccl_gpu_ballot(!converged);
   const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
   if (lane_id == 0) {
-    atomic_fetch_and_add_uint32(num_active_pixels, ccl_gpu_popc(num_active_pixels_mask));
+    atomic_fetch_and_add_uint32(num_active_pixels, popcount(num_active_pixels_mask));
   }
 }
 
@@ -892,6 +892,6 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   const auto can_split_mask = ccl_gpu_ballot(can_split);
   const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
   if (lane_id == 0) {
-    atomic_fetch_and_add_uint32(num_possible_splits, ccl_gpu_popc(can_split_mask));
+    atomic_fetch_and_add_uint32(num_possible_splits, popcount(can_split_mask));
   }
 }
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index f667ede2712..a5320edcb3c 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -85,8 +85,8 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
     const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
 
     /* For each thread within a warp compute how many other active states precede it. */
-    const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) &
-                                            ccl_gpu_thread_mask(thread_warp));
+    const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
+                                        ccl_gpu_thread_mask(thread_warp));
 
     /* Last thread in warp stores number of active states for each warp. */
     if (thread_warp == ccl_gpu_warp_size - 1) {
diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h
index b58179e12ff..39bf2131c22 100644
--- a/intern/cycles/kernel/device/hip/compat.h
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -85,7 +85,6 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_syncthreads() __syncthreads()
 #define ccl_gpu_ballot(predicate) __ballot(predicate)
 #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down(var, detla)
-#define ccl_gpu_popc(x) __popc(x)
 
 /* GPU texture objects */
 typedef hipTextureObject_t ccl_gpu_tex_object;
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h
index 19358e063d8..080109e3b83 100644
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -34,6 +34,7 @@ using namespace metal;
 
 #pragma clang diagnostic ignored "-Wunused-variable"
 #pragma clang diagnostic ignored "-Wsign-compare"
+#pragma clang diagnostic ignored "-Wuninitialized"
 
 /* Qualifiers */
 
@@ -65,7 +66,7 @@ using namespace metal;
 #define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << thread_warp) - 1)
 
 #define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate)))
-#define ccl_gpu_popc(x) popcount(x)
+#define ccl_gpu_syncthreads() threadgroup_barrier(mem_flags::mem_threadgroup);
 
 // clang-format off
 
@@ -124,7 +125,6 @@ kernel void kernel_metal_##name(device const kernel_gpu_##name *params_struct, \
                                 uint simd_group_index [[simdgroup_index_in_threadgroup]], \
                                 uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
   MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
-  INIT_DEBUG_BUFFER \
   params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
 } \
 void kernel_gpu_##name::run(thread MetalKernelContext& context, \
@@ -230,6 +230,7 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \
 #define sinhf(x) sinh(float(x))
 #define coshf(x) cosh(float(x))
 #define tanhf(x) tanh(float(x))
+#define saturatef(x) saturate(float(x))
 
 /* Use native functions with possibly lower precision for performance,
  * no issues found so far. */
@@ -243,6 +244,8 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \
 
 #define NULL 0
 
+#define __device__
+
 /* texture bindings and sampler setup */
 
 struct Texture2DParamsMetal {
@@ -257,6 +260,9 @@ struct MetalAncillaries {
   device Texture3DParamsMetal *textures_3d;
 };
 
+#include "util/half.h"
+#include "util/types.h"
+
 enum SamplerType {
   SamplerFilterNearest_AddressRepeat,
   SamplerFilterNearest_AddressClampEdge,
diff --git a/intern/cycles/kernel/device/metal/globals.h b/intern/cycles/kernel/device/metal/globals.h
index b4963518b63..1aea36589d0 100644
--- a/intern/cycles/kernel/device/metal/globals.h
+++ b/intern/cycles/kernel/device/metal/globals.h
@@ -25,7 +25,7 @@ CCL_NAMESPACE_BEGIN
 
 typedef struct KernelParamsMetal {
 
-#define KERNEL_TEX(type, name) ccl_constant type *name;
+#define KERNEL_TEX(type, name) ccl_global const type *name;
 #include "kernel/textures.h"
 #undef KERNEL_TEX
 
diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h
index c7a7be7309a..bebb1e458eb 100644
--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -87,7 +87,6 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_syncthreads() __syncthreads()
 #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
 #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla)
-#define ccl_gpu_popc(x) __popc(x)
 
 /* GPU texture objects */
 
diff --git a/intern/cycles/kernel/film/accumulate.h b/intern/cycles/kernel/film/accumulate.h
index c9303088e3f..9ee0d27cc8c 100644
--- a/intern/cycles/kernel/film/accumulate.h
+++ b/intern/cycles/kernel/film/accumulate.h
@@ -160,7 +160,8 @@ ccl_device_inline int kernel_accum_sample(KernelGlobals kg,
 
   ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
 
-  return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1) +
+  return atomic_fetch_and_add_uint32(
+             (ccl_global uint *)(buffer) + kernel_data.film.pass_sample_count, 1) +
          sample_offset;
 }
 
diff --git a/intern/cycles/kernel/geom/attribute.h b/intern/cycles/kernel/geom/attribute.h
index ae96e7b76ef..a7ac2bd926f 100644
--- a/intern/cycles/kernel/geom/attribute.h
+++ b/intern/cycles/kernel/geom/attribute.h
@@ -27,7 +27,12 @@ CCL_NAMESPACE_BEGIN
  * Lookup of attributes is different between OSL and SVM, as OSL is ustring
  * based while for SVM we use integer ids. */
 
-ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd);
+/* Patch index for triangle, -1 if not subdivision triangle */
+
+ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
+}
 
 ccl_device_inline uint attribute_primitive_type(KernelGlobals kg, ccl_private const ShaderData *sd)
 {
diff --git a/intern/cycles/kernel/geom/subd_triangle.h b/intern/cycles/kernel/geom/subd_triangle.h
index e3b5c9afb91..1b693a915bf 100644
--- a/intern/cycles/kernel/geom/subd_triangle.h
+++ b/intern/cycles/kernel/geom/subd_triangle.h
@@ -20,13 +20,6 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Patch index for triangle, -1 if not subdivision triangle */
-
-ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd)
-{
-  return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
-}
-
 /* UV coords of triangle within patch */
 
 ccl_device_inline void subd_triangle_patch_uv(KernelGlobals kg,
diff --git a/intern/cycles/kernel/sample/lcg.h b/intern/cycles/kernel/sample/lcg.h
index 92cfff639b4..e8c4915813e 100644
--- a/intern/cycles/kernel/sample/lcg.h
+++ b/intern/cycles/kernel/sample/lcg.h
@@ -19,14 +19,16 @@ CCL_NAMESPACE_BEGIN
 
 /* Linear Congruential Generator */
 
-ccl_device uint lcg_step_uint(uint *rng)
+/* This is templated to handle multiple address spaces on Metal. */
+template<class T> ccl_device uint lcg_step_uint(T rng)
 {
   /* implicit mod 2^32 */
   *rng = (1103515245 * (*rng) + 12345);
   return *rng;
 }
 
-ccl_device float lcg_step_float(uint *rng)
+/* This is templated to handle multiple address spaces on Metal. */
+template<class T> ccl_device float lcg_step_float(T rng)
 {
   /* implicit mod 2^32 */
   *rng = (1103515245 * (*rng) + 12345);
diff --git a/intern/cycles/kernel/sample/pattern.h b/intern/cycles/kernel/sample/pattern.h
index 0c27992c7f6..adc8493badd 100644
--- a/intern/cycles/kernel/sample/pattern.h
+++ b/intern/cycles/kernel/sample/pattern.h
@@ -163,18 +163,7 @@ ccl_device_inline bool sample_is_even(int pattern, int sample)
     /* See Section 10.2.1, "Progressive Multi-Jittered Sample Sequences", Christensen et al.
      * We can use this to get divide sample sequence into two classes for easier variance
      * estimation. */
-#if defined(__GNUC__) && !defined(__KERNEL_GPU__)
-    return __builtin_popcount(sample & 0xaaaaaaaa) & 1;
-#elif defined(__NVCC__)
-    return __popc(sample & 0xaaaaaaaa) & 1;
-#else
-    /* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */
-    int i = sample & 0xaaaaaaaa;
-    i = i - ((i >> 1) & 0x55555555);
-    i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
-    i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
-    return i & 1;
-#endif
+    return popcount(uint(sample) & 0xaaaaaaaa) & 1;
   }
   else {
     /* TODO(Stefan): Are there reliable ways of dividing CMJ and Sobol into two classes? */
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 62ba5bf04e3..ce32e1a520f 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -220,7 +220,7 @@ CCL_NAMESPACE_BEGIN
 template<uint node_feature_mask, ShaderType type, typename ConstIntegratorGenericState>
 ccl_device void svm_eval_nodes(KernelGlobals kg,
                                ConstIntegratorGenericState state,
-                               ShaderData *sd,
+                               ccl_private ShaderData *sd,
                                ccl_global float *render_buffer,
                                uint32_t path_flag)
 {
author	Michael Jones <michael_p_jones@apple.com>	2021-11-18 16:25:05 +0300
committer	Brecht Van Lommel <brecht@blender.org>	2021-11-18 16:38:02 +0300
commit	d19e35873f67c90b251ca38e007a83aa1eada211 (patch)
tree	7e8659acea7f12b188077ada225e113b5df35e60 /intern/cycles/kernel
parent	c0d52db783eb3a6288c9af04298b2358fec76357 (diff)