2 files changed, 1174 insertions, 0 deletions
diff --git a/source/blender/gpu/shaders/metal/mtl_shader_common.msl b/source/blender/gpu/shaders/metal/mtl_shader_common.msl
new file mode 100644
index 00000000000..c504cdbacb1
--- /dev/null
+++ b/source/blender/gpu/shaders/metal/mtl_shader_common.msl
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+/* Common Metal header to be included in all compiled Metal shaders.
+ * Both native MSL shaders and GLSL shaders. */
+
+using namespace metal;
+
+/* Should match GPUVertFetchMode (enumerator values are spelled out explicitly). */
+typedef enum {
+  GPU_FETCH_FLOAT = 0,
+  GPU_FETCH_INT = 1,
+  GPU_FETCH_INT_TO_FLOAT_UNIT = 2,
+  GPU_FETCH_INT_TO_FLOAT = 3,
+} GPUVertFetchMode;
+
+/* Constant to flag base binding index of uniform buffers. */
+constant int MTL_uniform_buffer_base_index [[function_constant(0)]];
+
+/* Default Point Size.
+ * Unused if function constant not set. */
+constant float MTL_global_pointsize [[function_constant(1)]];
+
+/* Attribute conversion flags, one per attribute (up to 16 attributes supported in Blender). */
+constant int MTL_AttributeConvert0 [[function_constant(2)]];
+constant int MTL_AttributeConvert1 [[function_constant(3)]];
+constant int MTL_AttributeConvert2 [[function_constant(4)]];
+constant int MTL_AttributeConvert3 [[function_constant(5)]];
+constant int MTL_AttributeConvert4 [[function_constant(6)]];
+constant int MTL_AttributeConvert5 [[function_constant(7)]];
+constant int MTL_AttributeConvert6 [[function_constant(8)]];
+constant int MTL_AttributeConvert7 [[function_constant(9)]];
+constant int MTL_AttributeConvert8 [[function_constant(10)]];
+constant int MTL_AttributeConvert9 [[function_constant(11)]];
+constant int MTL_AttributeConvert10 [[function_constant(12)]];
+constant int MTL_AttributeConvert11 [[function_constant(13)]];
+constant int MTL_AttributeConvert12 [[function_constant(14)]];
+constant int MTL_AttributeConvert13 [[function_constant(15)]];
+constant int MTL_AttributeConvert14 [[function_constant(16)]];
+constant int MTL_AttributeConvert15 [[function_constant(17)]];
+
+/* Constant to flag binding index of transform feedback buffer.
+ * Unused if function constant not set. */
+constant int MTL_transform_feedback_buffer_index [[function_constant(18)]];
+
+/** Internal attribute conversion functionality. */
+/* Following descriptions in mtl_shader.hh, Metal only supports some implicit
+ * attribute type conversions. These conversions occur when there is a difference
+ * between the type specified in the vertex descriptor (In the input vertex buffers),
+ * and the attribute type in the shader's VertexIn struct (ShaderInterface).
+ *
+ * The supported implicit conversions are described here:
+ * https://developer.apple.com/documentation/metal/mtlvertexattributedescriptor/1516081-format?language=objc
+ *
+ * For unsupported conversions, the mtl_shader_generator will create an attribute reading function
+ * which performs this conversion manually upon read, depending on the requested fetchmode.
+ *
+ * These conversions use the function constants above, so any branching is optimized out during
+ * backend shader compilation (PSO creation).
+ *
+ * NOTE: Not all possibilities have been covered here. Any additional conversion routines should
+ * be added as needed, and mtl_shader_generator should also be updated with any newly required
+ * read functions.
+ *
+ * These paths are only needed for cases where implicit conversion will not happen, in which
+ * case the value will be read as the type in the shader. Any other fetch mode copies `v_in`.
+ */
+#define internal_vertex_attribute_convert_read_float(ATTR, v_in, v_out) \
+  if (ATTR == GPU_FETCH_INT_TO_FLOAT) { \
+    v_out = float(as_type<int>(v_in)); \
+  } \
+  else if (ATTR == GPU_FETCH_INT_TO_FLOAT_UNIT) { \
+    v_out = float(as_type<int>(v_in)) / float(__INT_MAX__); \
+  } \
+  else { \
+    v_out = v_in; \
+  }
+/* float2 variant: reinterprets `v_in` as int2 for the INT_TO_FLOAT(_UNIT) fetch modes. */
+#define internal_vertex_attribute_convert_read_float2(ATTR, v_in, v_out) \
+  if (ATTR == GPU_FETCH_INT_TO_FLOAT) { \
+    v_out = float2(as_type<int2>(v_in)); \
+  } \
+  else if (ATTR == GPU_FETCH_INT_TO_FLOAT_UNIT) { \
+    v_out = float2(as_type<int2>(v_in)) / float2(__INT_MAX__); \
+  } \
+  else { \
+    v_out = v_in; \
+  }
+/* float3 variant: reinterprets `v_in` as int3 for the INT_TO_FLOAT(_UNIT) fetch modes. */
+#define internal_vertex_attribute_convert_read_float3(ATTR, v_in, v_out) \
+  if (ATTR == GPU_FETCH_INT_TO_FLOAT) { \
+    v_out = float3(as_type<int3>(v_in)); \
+  } \
+  else if (ATTR == GPU_FETCH_INT_TO_FLOAT_UNIT) { \
+    v_out = float3(as_type<int3>(v_in)) / float3(__INT_MAX__); \
+  } \
+  else { \
+    v_out = v_in; \
+  }
+/* float4 variant: reinterprets `v_in` as int4 for the INT_TO_FLOAT(_UNIT) fetch modes. */
+#define internal_vertex_attribute_convert_read_float4(ATTR, v_in, v_out) \
+  if (ATTR == GPU_FETCH_INT_TO_FLOAT) { \
+    v_out = float4(as_type<int4>(v_in)); \
+  } \
+  else if (ATTR == GPU_FETCH_INT_TO_FLOAT_UNIT) { \
+    v_out = float4(as_type<int4>(v_in)) / float4(__INT_MAX__); \
+  } \
+  else { \
+    v_out = v_in; \
+  }
diff --git a/source/blender/gpu/shaders/metal/mtl_shader_defines.msl b/source/blender/gpu/shaders/metal/mtl_shader_defines.msl
new file mode 100644
index 00000000000..3b32783620d
--- /dev/null
+++ b/source/blender/gpu/shaders/metal/mtl_shader_defines.msl
@@ -0,0 +1,1065 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+/** Special header for mapping commonly defined tokens to API-specific variations.
+ * Where possible, this will adhere closely to base GLSL, where semantics are the same.
+ * However, host or shader code may need modifying to support types where necessary variations
+ * exist between APIs but are not expressed through the source (e.g. the distinction between
+ * depth2d and texture2d types in Metal).
+ */
+
+/* Base instance with offsets: gpu_InstanceIndex adds the base instance to gl_InstanceID. */
+#define gpu_BaseInstance gl_BaseInstanceARB
+#define gpu_InstanceIndex (gl_InstanceID + gpu_BaseInstance)
+
+/* Derivative signs (both positive here). */
+#define DFDX_SIGN 1.0
+#define DFDY_SIGN 1.0
+
+/* Type definitions: GLSL type names mapped onto Metal types. */
+#define vec2 float2
+#define vec3 float3
+#define vec4 float4
+#define mat2 float2x2
+#define mat2x2 float2x2
+#define mat3 float3x3
+#define mat4 float4x4
+#define ivec2 int2
+#define ivec3 int3
+#define ivec4 int4
+#define uvec2 uint2
+#define uvec3 uint3
+#define uvec4 uint4
+/* MTLBOOL is meant for native booleans generated by the Metal backend (GLSL bools are treated
+ * as int). NOTE(review): macro rescanning expands MTLBOOL -> bool -> int here; confirm intent. */
+#define MTLBOOL bool
+#define bool int
+#define bvec2 bool2
+#define bvec3 bool3
+#define bvec4 bool4
+#define vec3_1010102_Unorm uint
+#define vec3_1010102_Inorm int
+
+/* Strip GLSL decorators (they expand to nothing); `layout(...)` is replaced by `struct`. */
+#define in
+#define flat
+#define smooth
+#define noperspective
+#define layout(std140) struct
+#define uniform
+
+/* Used to replace 'out' in function parameters with a thread-local reference.
+ * The name is shortened to avoid expanding the GLSL source string. */
+#define THD thread
+
+/* Wrapper structs pairing texture + sampler pointers (sampler is `constant` with arg buffers). */
+#ifdef USE_ARGUMENT_BUFFER_FOR_SAMPLERS
+#  define COMBINED_SAMPLER_TYPE(STRUCT_NAME, TEX_TYPE) \
+    template<typename T, access A = access::sample> struct STRUCT_NAME { \
+      thread TEX_TYPE<T, A> *texture; \
+      constant sampler *samp; \
+    }
+#else
+#  define COMBINED_SAMPLER_TYPE(STRUCT_NAME, TEX_TYPE) \
+    template<typename T, access A = access::sample> struct STRUCT_NAME { \
+      thread TEX_TYPE<T, A> *texture; \
+      thread sampler *samp; \
+    }
+#endif
+
+/* Add any types as needed. NOTE(review): both depth_cube wrappers use texturecube_array. */
+COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_1d, texture1d);
+COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_1d_array, texture1d_array);
+COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_2d, texture2d);
+COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_depth_2d, depth2d);
+COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_2d_array, texture2d_array);
+COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_depth_2d_array, depth2d_array);
+COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_3d, texture3d);
+COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_buffer, texture_buffer);
+COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_cube, texturecube);
+COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_cube_array, texturecube_array);
+COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_depth_cube, texturecube_array);
+COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_depth_cube_array, texturecube_array);
+
+/* Sampler struct for argument buffer: an array of ARGUMENT_BUFFER_NUM_SAMPLERS samplers. */
+#ifdef USE_ARGUMENT_BUFFER_FOR_SAMPLERS
+struct SStruct {
+  array<sampler, ARGUMENT_BUFFER_NUM_SAMPLERS> sampler_args [[id(0)]];
+};
+#endif
+
+/* Samplers as function parameters: GLSL sampler names map to `thread` wrapper structs. */
+#define sampler1D thread _mtl_combined_image_sampler_1d<float>
+#define sampler1DArray thread _mtl_combined_image_sampler_1d_array<float>
+#define sampler2D thread _mtl_combined_image_sampler_2d<float>
+#define depth2D thread _mtl_combined_image_sampler_depth_2d<float>
+#define sampler2DArray thread _mtl_combined_image_sampler_2d_array<float>
+#define sampler2DArrayShadow thread _mtl_combined_image_sampler_depth_2d_array<float>
+#define depth2DArrayShadow thread _mtl_combined_image_sampler_depth_2d_array<float>
+#define sampler3D thread _mtl_combined_image_sampler_3d<float>
+#define samplerBuffer thread _mtl_combined_image_sampler_buffer<float, access::read>
+#define samplerCube thread _mtl_combined_image_sampler_cube<float>
+#define samplerCubeArray thread _mtl_combined_image_sampler_cube_array<float>
+/* Unsigned-integer (uint) texel variants. */
+#define usampler1D thread _mtl_combined_image_sampler_1d<uint>
+#define usampler1DArray thread _mtl_combined_image_sampler_1d_array<uint>
+#define usampler2D thread _mtl_combined_image_sampler_2d<uint>
+#define udepth2D thread _mtl_combined_image_sampler_depth_2d<uint>
+#define usampler2DArray thread _mtl_combined_image_sampler_2d_array<uint>
+#define usampler2DArrayShadow thread _mtl_combined_image_sampler_depth_2d_array<uint>
+#define udepth2DArrayShadow thread _mtl_combined_image_sampler_depth_2d_array<uint>
+#define usampler3D thread _mtl_combined_image_sampler_3d<uint>
+#define usamplerBuffer thread _mtl_combined_image_sampler_buffer<uint, access::read>
+#define usamplerCube thread _mtl_combined_image_sampler_cube<uint>
+#define usamplerCubeArray thread _mtl_combined_image_sampler_cube_array<uint>
+/* Signed-integer (int) texel variants. */
+#define isampler1D thread _mtl_combined_image_sampler_1d<int>
+#define isampler1DArray thread _mtl_combined_image_sampler_1d_array<int>
+#define isampler2D thread _mtl_combined_image_sampler_2d<int>
+#define idepth2D thread _mtl_combined_image_sampler_depth_2d<int>
+#define isampler2DArray thread _mtl_combined_image_sampler_2d_array<int>
+#define isampler2DArrayShadow thread _mtl_combined_image_sampler_depth_2d_array<int>
+#define idepth2DArrayShadow thread _mtl_combined_image_sampler_depth_2d_array<int>
+#define isampler3D thread _mtl_combined_image_sampler_3d<int>
+#define isamplerBuffer thread _mtl_combined_image_sampler_buffer<int, access::read>
+#define isamplerCube thread _mtl_combined_image_sampler_cube<int>
+#define isamplerCubeArray thread _mtl_combined_image_sampler_cube_array<int>
+
+/* Vector accessor aliases: every `st` token is replaced by `xy`. */
+#define st xy
+
+/* Texture functions. Macro parameters follow the GLSL argument order of each built-in. */
+#define texelFetch _texelFetch_internal
+#define texelFetchOffset(__tex, __texel, __lod, __offset) \
+  _texelFetch_internal(__tex, __texel, __lod, __offset)
+#define texture2(__tex, __uv) _texture_internal_samp(__tex, __uv)
+#define texture3(__tex, __uv, _bias) _texture_internal_bias(__tex, __uv, bias(float(_bias)))
+#define textureLod(__tex, __uv, __lod) _texture_internal_level(__tex, __uv, level(float(__lod)))
+#define textureLodOffset(__tex, __uv, __lod, __offset) \
+  _texture_internal_level(__tex, __uv, level(float(__lod)), __offset)
+#define textureGather2(__tex, __uv) _texture_gather_internal(__tex, __uv, 0)
+#define textureGather3(__tex, __uv, __comp) _texture_gather_internal(__tex, __uv, __comp)
+#define textureGatherOffset(__tex, __uv, __offset, __comp) \
+  _texture_gather_internal(__tex, __uv, __comp, __offset)
+/* Arity dispatch: two arguments select the *2 macro, three arguments select the *3 macro. */
+#define TEXURE_MACRO(_1, _2, _3, TEXNAME, ...) TEXNAME
+#define texture(...) TEXURE_MACRO(__VA_ARGS__, texture3, texture2)(__VA_ARGS__)
+#define textureGather(...) TEXURE_MACRO(__VA_ARGS__, textureGather3, textureGather2)(__VA_ARGS__)
+
+/* Texture-write functions: imageStore forwards to _texture_write_internal unchanged. */
+#define imageStore(_tex, _coord, _value) _texture_write_internal(_tex, _coord, _value)
+
+/* Singular return values from texture functions of type DEPTH are often indexed with either .r
+ * or .x. This lightweight wrapper exposes both names and converts implicitly to float. */
+union _msl_return_float {
+  float r;
+  float x;
+  inline operator float() const
+  {
+    return r;
+  }
+};
+
+/* Add custom texture sampling/reading routines for each type to account for special return cases,
+ * e.g. returning a float with an `r` member. NOTE: Cannot use template specialization for input
+ * type, as return types are specific to the signature of 'tex'. */
+/* Texture Read. */
+template<typename S, typename T, access A>
+inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d<S, A> tex, T texel)
+{
+  /* 1D fetch without LOD: texels in [0, width) are read directly. */
+  const float width = tex.texture->get_width();
+  if (texel >= 0 && texel < width) {
+    return tex.texture->read(uint(texel));
+  }
+  /* Anything outside the texture yields zero. */
+  return vec<S, 4>(0);
+}
+
+template<typename S, typename T>
+inline vec<S, 4> _texelFetch_internal(
+    const thread _mtl_combined_image_sampler_buffer<S, access::read> tex, T texel)
+{
+  /* Buffer-texture fetch: elements in [0, width) are read directly. */
+  const float width = tex.texture->get_width();
+  if (texel >= 0 && texel < width) {
+    return tex.texture->read(uint(texel));
+  }
+  /* Anything outside the buffer yields zero. */
+  return vec<S, 4>(0);
+}
+
+template<typename S, typename T, access A>
+inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d<S, A> tex,
+                                      T texel,
+                                      uint lod,
+                                      T offset = 0)
+{
+  /* Fetch position is `texel + offset`; positions outside [0, width) yield zero. */
+  const auto coord = texel + offset;
+  const float width = tex.texture->get_width();
+  if (coord >= 0 && coord < width) {
+    /* LODs not supported for 1d textures, so `lod` is ignored and level zero is read. */
+    return tex.texture->read(uint(coord), 0);
+  }
+  return vec<S, 4>(0);
+}
+
+template<typename S, typename T, access A>
+inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d<S, A> tex,
+                                      vec<T, 1> texel,
+                                      uint lod,
+                                      vec<T, 1> offset = 0)
+{
+  /* One-component vector form: position is `texel + offset`, zero outside [0, width). */
+  const auto coord = texel + offset;
+  const float width = tex.texture->get_width();
+  if (coord >= 0 && coord < width) {
+    /* LODs not supported for 1d textures, so `lod` is ignored and level zero is read. */
+    return tex.texture->read(uint(coord), 0);
+  }
+  return vec<S, 4>(0);
+}
+
+template<typename S, typename T, int n, access A>
+inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d<S, A> tex,
+                                      vec<T, n> texel,
+                                      uint lod,
+                                      vec<T, n> offset = vec<T, n>(0))
+{
+  /* Only the x components of `texel` and `offset` address a 1D texture. */
+  const auto coord = texel.x + offset.x;
+  const float width = tex.texture->get_width();
+  if (coord >= 0 && coord < width) {
+    /* LODs not supported for 1d textures, so `lod` is ignored and level zero is read. */
+    return tex.texture->read(uint(coord), 0);
+  }
+  return vec<S, 4>(0);
+}
+
+template<typename S, typename T, access A>
+inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d_array<S, A> tex,
+                                      vec<T, 2> texel,
+                                      uint lod,
+                                      vec<T, 2> offset = vec<T, 2>(0, 0))
+{
+  /* `texel.x` addresses the texel and `texel.y` the array index; `offset` is added to both. */
+  const auto x = texel.x + offset.x;
+  const auto layer = texel.y + offset.y;
+  const float width = tex.texture->get_width();
+  const float layer_count = tex.texture->get_array_size();
+  if (x >= 0 && x < width && layer >= 0 && layer < layer_count) {
+    /* LODs not supported for 1d textures, so `lod` is ignored and level zero is read. */
+    return tex.texture->read(uint(x), uint(layer), 0);
+  }
+  /* Out-of-range fetches return zero. */
+  return vec<S, 4>(0);
+}
+
+template<typename S, typename T, access A>
+inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_2d<S, A> tex,
+                                      vec<T, 2> texel,
+                                      uint lod,
+                                      vec<T, 2> offset = vec<T, 2>(0))
+{
+  /* Bounds come from the mip level's real extent. `size >> lod` drops to 0 on levels that are
+   * still one texel wide (non-square textures), which rejected valid fetches. */
+  float w = tex.texture->get_width(lod);
+  float h = tex.texture->get_height(lod);
+  if ((texel.x + offset.x) >= 0 && (texel.x + offset.x) < w && (texel.y + offset.y) >= 0 &&
+      (texel.y + offset.y) < h) {
+    return tex.texture->read(uint2(texel + offset), lod);
+  }
+  /* Out-of-range fetches return zero instead of reading. */
+  return vec<S, 4>(0);
+}
+
+template<typename S, typename T, access A>
+inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_2d_array<S, A> tex,
+                                      vec<T, 3> texel,
+                                      uint lod,
+                                      vec<T, 3> offset = vec<T, 3>(0))
+{
+  /* Use the real mip extent; `size >> lod` can hit 0 on levels that are still 1 texel wide. */
+  float w = tex.texture->get_width(lod);
+  float h = tex.texture->get_height(lod);
+  float d = tex.texture->get_array_size();
+  if ((texel.x + offset.x) >= 0 && (texel.x + offset.x) < w && (texel.y + offset.y) >= 0 &&
+      (texel.y + offset.y) < h && (texel.z + offset.z) >= 0 && (texel.z + offset.z) < d) {
+    return tex.texture->read(uint2(texel.xy + offset.xy), uint(texel.z + offset.z), lod);
+  }
+  /* Out-of-range fetches (including the array index in z) return zero. */
+  return vec<S, 4>(0);
+}
+
+template<typename S, typename T, access A>
+inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_3d<S, A> tex,
+                                      vec<T, 3> texel,
+                                      uint lod,
+                                      vec<T, 3> offset = vec<T, 3>(0))
+{
+  /* Bounds come from the mip level's real extent. `size >> lod` drops to 0 on levels that are
+   * still one texel wide along an axis, which rejected valid fetches. */
+  float w = tex.texture->get_width(lod);
+  float h = tex.texture->get_height(lod);
+  float d = tex.texture->get_depth(lod);
+  if ((texel.x + offset.x) >= 0 && (texel.x + offset.x) < w && (texel.y + offset.y) >= 0 &&
+      (texel.y + offset.y) < h && (texel.z + offset.z) >= 0 && (texel.z + offset.z) < d) {
+    return tex.texture->read(uint3(texel + offset), lod);
+  }
+  /* Out-of-range fetches return zero instead of reading. */
+  return vec<S, 4>(0);
+}
+
+template<typename T, access A>
+inline _msl_return_float _texelFetch_internal(
+    thread _mtl_combined_image_sampler_depth_2d<float, A> tex,
+    vec<T, 2> texel,
+    uint lod,
+    vec<T, 2> offset = vec<T, 2>(0))
+{
+  /* Bounds come from the mip level's real extent. `size >> lod` drops to 0 on levels that are
+   * still one texel wide (non-square textures), which rejected valid fetches. */
+  float w = tex.texture->get_width(lod);
+  float h = tex.texture->get_height(lod);
+  if ((texel.x + offset.x) >= 0 && (texel.x + offset.x) < w && (texel.y + offset.y) >= 0 &&
+      (texel.y + offset.y) < h) {
+    _msl_return_float fl = {tex.texture->read(uint2(texel + offset), lod)};
+    return fl;
+  }
+  /* Out-of-range fetches return a zero depth value. */
+  _msl_return_float fl = {0};
+  return fl;
+}
+
+template<typename S, typename T, access A>
+inline vec<S, 4> _texture_internal_samp(thread _mtl_combined_image_sampler_2d_array<S, A> tex,
+                                        vec<T, 3> texel,
+                                        uint lod,
+                                        vec<T, 3> offset = vec<T, 3>(0))
+{
+  /* NOTE(review): despite the `samp` name this is a bounds-checked texel read. Bounds use the
+   * mip level's real extent, since `size >> lod` reaches 0 on levels still one texel wide. */
+  float w = tex.texture->get_width(lod);
+  float h = tex.texture->get_height(lod);
+  float d = tex.texture->get_array_size();
+  if ((texel.x + offset.x) >= 0 && (texel.x + offset.x) < w && (texel.y + offset.y) >= 0 &&
+      (texel.y + offset.y) < h && (texel.z + offset.z) >= 0 && (texel.z + offset.z) < d) {
+    return tex.texture->read(uint2(texel.xy + offset.xy), uint(texel.z + offset.z), lod);
+  }
+  /* Out-of-range fetches (including the array index in z) return zero. */
+  return vec<S, 4>(0);
+}
+
+/* Sample. 1D: sample at coordinate `u` with the wrapper's sampler. */
+template<typename T>
+inline vec<T, 4> _texture_internal_samp(
+    thread _mtl_combined_image_sampler_1d<T, access::sample> tex, float u)
+{
+  return tex.texture->sample(*tex.samp, u);
+}
+/* 1D array: `ua.x` is the coordinate; `ua.y` is converted to uint as the array index. */
+inline float4 _texture_internal_samp(
+    thread _mtl_combined_image_sampler_1d_array<float, access::sample> tex, float2 ua)
+{
+  return tex.texture->sample(*tex.samp, ua.x, uint(ua.y));
+}
+/* 2D integer texture: sample at `uv`, returning int4. */
+inline int4 _texture_internal_samp(thread _mtl_combined_image_sampler_2d<int, access::sample> tex,
+                                   float2 uv)
+{
+  return tex.texture->sample(*tex.samp, uv);
+}
+/* 2D unsigned-integer texture: sample at `uv`, returning uint4. */
+inline uint4 _texture_internal_samp(
+    thread _mtl_combined_image_sampler_2d<uint, access::sample> tex, float2 uv)
+{
+  return tex.texture->sample(*tex.samp, uv);
+}
+/* 2D float texture: sample at `uv`, returning float4. */
+inline float4 _texture_internal_samp(
+    thread _mtl_combined_image_sampler_2d<float, access::sample> tex, float2 uv)
+{
+  return tex.texture->sample(*tex.samp, uv);
+}
+/* 2D depth texture: the sampled value is wrapped in _msl_return_float. */
+inline _msl_return_float _texture_internal_samp(
+    thread _mtl_combined_image_sampler_depth_2d<float, access::sample> tex, float2 uv)
+{
+  _msl_return_float fl = {tex.texture->sample(*tex.samp, uv)};
+  return fl;
+}
+/* 3D texture: sample at `uvw`. */
+template<typename T>
+inline vec<T, 4> _texture_internal_samp(
+    thread _mtl_combined_image_sampler_3d<T, access::sample> tex, float3 uvw)
+{
+  return tex.texture->sample(*tex.samp, uvw);
+}
+/* 2D array: `uva.xy` is the coordinate; `uva.z` is converted to uint as the array index. */
+template<typename T>
+inline vec<T, 4> _texture_internal_samp(
+    thread _mtl_combined_image_sampler_2d_array<T, access::sample> tex, float3 uva)
+{
+  return tex.texture->sample(*tex.samp, uva.xy, uint(uva.z));
+}
+/* 2D depth array: as for 2D arrays, with the result wrapped in _msl_return_float. */
+inline _msl_return_float _texture_internal_samp(
+    thread _mtl_combined_image_sampler_depth_2d_array<float, access::sample> tex, float3 uva)
+{
+  _msl_return_float fl = {tex.texture->sample(*tex.samp, uva.xy, uint(uva.z))};
+  return fl;
+}
+/* 2D depth array compare: `uvac.w` is passed to sample_compare(); always uses level(0). */
+inline _msl_return_float _texture_internal_samp(
+    thread _mtl_combined_image_sampler_depth_2d_array<float, access::sample> tex, float4 uvac)
+{
+  _msl_return_float fl = {
+      tex.texture->sample_compare(*tex.samp, uvac.xy, uint(uvac.z), uvac.w, level(0))};
+  return fl;
+}
+/* Cube: sample at coordinate `uvs.xyz`. */
+template<typename T>
+inline vec<T, 4> _texture_internal_samp(
+    thread _mtl_combined_image_sampler_cube<T, access::sample> tex, float3 uvs)
+{
+  return tex.texture->sample(*tex.samp, uvs.xyz);
+}
+/* Cube array: `coord_a.xyz` is the coordinate; `coord_a.w` is converted to uint array index. */
+template<typename T>
+inline vec<T, 4> _texture_internal_samp(
+    thread _mtl_combined_image_sampler_cube_array<T, access::sample> tex, float4 coord_a)
+{
+  return tex.texture->sample(*tex.samp, coord_a.xyz, uint(coord_a.w));
+}
+
+/* Sample Level. 1D: `options` and `offset` are accepted but not forwarded to sample(). */
+template<typename T>
+inline vec<T, 4> _texture_internal_level(
+    thread _mtl_combined_image_sampler_1d<T, access::sample> tex,
+    float u,
+    level options,
+    int offset = 0)
+{
+  /* LODs not supported for 1d textures. This must be zero. */
+  return tex.texture->sample(*tex.samp, u);
+}
+/* 1D array: `options` and `offset` are not forwarded; `ua.y` becomes the uint array index. */
+inline float4 _texture_internal_level(
+    thread _mtl_combined_image_sampler_1d_array<float, access::sample> tex,
+    float2 ua,
+    level options,
+    int offset = 0)
+{
+  /* LODs not supported for 1d textures. This must be zero. */
+  return tex.texture->sample(*tex.samp, ua.x, uint(ua.y));
+}
+/* 2D integer texture: sample at `uv` with level `options` and texel `offset`. */
+inline int4 _texture_internal_level(thread _mtl_combined_image_sampler_2d<int, access::sample> tex,
+                                    float2 uv,
+                                    level options,
+                                    int2 offset = int2(0))
+{
+  return tex.texture->sample(*tex.samp, uv, options, offset);
+}
+/* 2D unsigned-integer texture: sample at `uv` with level `options` and texel `offset`. */
+inline uint4 _texture_internal_level(
+    thread _mtl_combined_image_sampler_2d<uint, access::sample> tex,
+    float2 uv,
+    level options,
+    int2 offset = int2(0))
+{
+  return tex.texture->sample(*tex.samp, uv, options, offset);
+}
+/* 2D float texture: sample at `uv` with level `options` and texel `offset`. */
+inline float4 _texture_internal_level(
+    thread _mtl_combined_image_sampler_2d<float, access::sample> tex,
+    float2 uv,
+    level options,
+    int2 offset = int2(0))
+{
+  return tex.texture->sample(*tex.samp, uv, options, offset);
+}
+/* 2D depth texture: level sample at `uv`, with the result wrapped in _msl_return_float. */
+inline _msl_return_float _texture_internal_level(
+    thread _mtl_combined_image_sampler_depth_2d<float, access::sample> tex,
+    float2 uv,
+    level options,
+    int2 offset = int2(0))
+{
+  _msl_return_float fl = {tex.texture->sample(*tex.samp, uv, options, offset)};
+  return fl;
+}
+/* 3D texture: sample at `uvw`; `options` defaults to level(0) and `offset` to int3(0). */
+template<typename T>
+inline vec<T, 4> _texture_internal_level(
+    thread _mtl_combined_image_sampler_3d<T, access::sample> tex,
+    float3 uvw,
+    level options = level(0),
+    int3 offset = int3(0))
+{
+  return tex.texture->sample(*tex.samp, uvw, options, offset);
+}
+/* 2D array: `uva.xy` is the coordinate and `uva.z` the uint array index. */
+template<typename T>
+inline vec<T, 4> _texture_internal_level(
+    thread _mtl_combined_image_sampler_2d_array<T, access::sample> tex,
+    float3 uva,
+    level options = level(0),
+    int2 offset = int2(0))
+{
+  return tex.texture->sample(*tex.samp, uva.xy, uint(uva.z), options, offset);
+}
+/* 2D depth array: as for 2D arrays, with the result wrapped in _msl_return_float. */
+inline _msl_return_float _texture_internal_level(
+    thread _mtl_combined_image_sampler_depth_2d_array<float, access::sample> tex,
+    float3 uva,
+    level options = level(0),
+    int2 offset = int2(0))
+{
+  _msl_return_float fl = {tex.texture->sample(*tex.samp, uva.xy, uint(uva.z), options, offset)};
+  return fl;
+}
+/* 2D depth array compare: `uvac.w` goes to sample_compare(); `options` is unused (level(0)). */
+inline _msl_return_float _texture_internal_level(
+    thread _mtl_combined_image_sampler_depth_2d_array<float, access::sample> tex,
+    float4 uvac,
+    level options = level(0),
+    int2 offset = int2(0))
+{
+  _msl_return_float fl = {
+      tex.texture->sample_compare(*tex.samp, uvac.xy, uint(uvac.z), uvac.w, level(0), offset)};
+  return fl;
+}
+/* Cube: sampled at `uvs.xyz` with `options`; the `offset` parameter is not forwarded. */
+template<typename T>
+inline vec<T, 4> _texture_internal_level(
+    thread _mtl_combined_image_sampler_cube<T, access::sample> tex,
+    float3 uvs,
+    level options = level(0),
+    int2 offset = int2(0))
+{
+  return tex.texture->sample(*tex.samp, uvs.xyz, options);
+}
+/* Cube array: `coord_a.w` becomes the uint array index; `offset` is not forwarded. */
+template<typename T>
+inline vec<T, 4> _texture_internal_level(
+    thread _mtl_combined_image_sampler_cube_array<T, access::sample> tex,
+    float4 coord_a,
+    level options = level(0),
+    int3 offset = int3(0))
+{
+  return tex.texture->sample(*tex.samp, coord_a.xyz, uint(coord_a.w), options);
+}
+
+/* Sample Bias. 1D: `options` and `offset` are accepted but not forwarded to sample(). */
+template<typename T>
+inline vec<T, 4> _texture_internal_bias(
+    thread _mtl_combined_image_sampler_1d<T, access::sample> tex,
+    float u,
+    bias options = bias(0.0),
+    int offset = 0)
+{
+  return tex.texture->sample(*tex.samp, u);
+}
+/* 2D float texture: sample at `uv` with bias `options` and texel `offset`. */
+inline float4 _texture_internal_bias(
+    thread _mtl_combined_image_sampler_2d<float, access::sample> tex,
+    float2 uv,
+    bias options = bias(0.0),
+    int2 offset = int2(0))
+{
+  return tex.texture->sample(*tex.samp, uv, options, offset);
+}
+/* 2D depth texture: biased sample at `uv`, with the result wrapped in _msl_return_float. */
+inline _msl_return_float _texture_internal_bias(
+    thread _mtl_combined_image_sampler_depth_2d<float, access::sample> tex,
+    float2 uv,
+    bias options = bias(0),
+    int2 offset = int2(0))
+{
+  _msl_return_float fl = {tex.texture->sample(*tex.samp, uv, options, offset)};
+  return fl;
+}
+
+/* Texture Gather. Maps an integer component index (0-3) onto the `component` enum. */
+component int_to_component(const int comp)
+{
+  switch (comp) {
+    case 1:
+      return component::y;
+    case 2:
+      return component::z;
+    case 3:
+      return component::w;
+    default:
+      break;
+  }
+  /* Index 0 and any unrecognized value select the x component. */
+  return component::x;
+}
+/* Depth 2D gather at `uv` with `offset`; `comp` is not forwarded to gather(). */
+inline float4 _texture_gather_internal(
+    thread _mtl_combined_image_sampler_depth_2d<float, access::sample> tex,
+    float2 uv,
+    const int comp = 0,
+    int2 offset = int2(0))
+{
+  return tex.texture->gather(*tex.samp, uv, offset);
+}
+
+template<typename T>
+inline vec<T, 4> _texture_gather_internal(
+    thread _mtl_combined_image_sampler_2d<T, access::sample> tex,
+    float2 uv,
+    const int comp = 0,
+    int2 offset = int2(0))
+{
+  /* `comp` selects which channel is gathered, as in GLSL textureGather(). It was previously
+   * ignored, so every gather returned the x channel. Each case passes a literal `component`
+   * so the call never depends on a run-time component value. */
+  switch (comp) {
+    case 1:
+      return tex.texture->gather(*tex.samp, uv, offset, component::y);
+    case 2:
+      return tex.texture->gather(*tex.samp, uv, offset, component::z);
+    case 3:
+      return tex.texture->gather(*tex.samp, uv, offset, component::w);
+    default:
+      break;
+  }
+  /* Component 0 and any unrecognized index gather the x channel. */
+  return tex.texture->gather(*tex.samp, uv, offset, component::x);
+}
+/* NOTE(review): `comp` is unused and no array index reaches gather(); confirm intent. */
+template<typename T>
+inline vec<T, 4> _texture_gather_internal(
+    thread _mtl_combined_image_sampler_2d_array<T, access::sample> tex,
+    float2 uv,
+    const int comp = 0,
+    int2 offset = int2(0))
+{
+  return tex.texture->gather(*tex.samp, uv, offset);
+}
+
+/* Texture write support. Writes outside the texture bounds are skipped. */
+template<typename S, typename T, access A>
+inline void _texture_write_internal(thread _mtl_combined_image_sampler_2d<S, A> tex,
+                                    T _coord,
+                                    vec<S, 4> value)
+{
+  const float width = tex.texture->get_width();
+  const float height = tex.texture->get_height();
+  if (_coord.x >= 0 && _coord.x < width && _coord.y >= 0 && _coord.y < height) {
+    tex.texture->write(value, uint2(_coord.xy));
+  }
+}
+/* 3D variant: writes `value` only when `_coord.xyz` lies inside the texture. */
+template<typename S, typename T, access A>
+inline void _texture_write_internal(thread _mtl_combined_image_sampler_3d<S, A> tex,
+                                    T _coord,
+                                    vec<S, 4> value)
+{
+  const float width = tex.texture->get_width();
+  const float height = tex.texture->get_height();
+  const float depth = tex.texture->get_depth();
+  if (_coord.x >= 0 && _coord.x < width && _coord.y >= 0 && _coord.y < height &&
+      _coord.z >= 0 && _coord.z < depth) {
+    tex.texture->write(value, uint3(_coord.xyz));
+  }
+}
+
+/* SSBO Vertex Fetch Mode. */
+#ifdef MTL_SSBO_VERTEX_FETCH
+/* Enabled when geometry is passed via raw buffer bindings, rather than using
+ * vertex assembly in the vertex-descriptor.
+ *
+ *  To describe the layout of input attribute data, we will generate uniforms (defaulting to 0)
+ * with the names per unique input attribute with name `attr`:
+ *
+ * - uniform_ssbo_stride_##attr  -- Representing the stride between element.
+ * - uniform_ssbo_offset_##attr  -- Representing the base offset within the vertex.
+ * - uniform_ssbo_fetchmode_##attr - Whether using per-vertex (=0) or per-instance fetch (=1).
+ * - uniform_ssbo_vbo_id_##attr - buffer binding index for VBO with data for this attribute.
+ * - uniform_ssbo_type_##attr - The type of data in the currently bound buffer.
+ *
+ * If the uniform_ssbo_type_* does not match with the desired type, then it is the responsibility
+ * of the shader to perform the conversion. Types should always be read as the raw attribute type,
+ * and then converted. e.g. If the uniform_ssbo_type_* is `int`, but we want to read it to be
+ * normalized to a float.
+ * The implementation should query the attribute type using vertex_fetch_get_attr_type(attr_name):
+ *
+ * float fweight = 0.0;
+ * if(vertex_fetch_get_attr_type(in_weight) == GPU_SHADER_ATTR_TYPE_INT) {
+ *   int iweight = vertex_fetch_attribute(gl_VertexID, in_weight, int);
+ *   fweight = (float)iweight/(float)INT32_MAX;
+ * } else {
+ *   fweight = vertex_fetch_attribute(gl_VertexID, in_weight, float);
+ * }
+ *
+ * Note: These uniforms are generated as part of the same data block used for regular uniforms
+ * and attribute data is written prior to each draw call, depending on the configuration of
+ * the vertex descriptor for an MTLBatch or MTLImmediate call. */
+#  define PPCAT_NX(A, B) A##B
+#  define PPCAT(A, B) PPCAT_NX(A, B)
+/* RESOLVE_VERTEX maps `v_id` through the U16/U32 index data when indexed rendering is on. */
+#  define RESOLVE_VERTEX(v_id) \
+    ((UNIFORM_SSBO_USES_INDEXED_RENDERING_STR > 0) ? \
+         ((UNIFORM_SSBO_INDEX_MODE_U16_STR > 0) ? MTL_INDEX_DATA_U16[v_id] : \
+                                                  MTL_INDEX_DATA_U32[v_id]) : \
+         v_id)
+#  define ATTR_TYPE(attr) PPCAT(SSBO_ATTR_TYPE_, attr)
+#  define vertex_fetch_attribute_raw(n, attr, type) \
+    (reinterpret_cast<constant type *>( \
+        &MTL_VERTEX_DATA[PPCAT(UNIFORM_SSBO_VBO_ID_STR, attr)] \
+                        [(PPCAT(UNIFORM_SSBO_STRIDE_STR, attr) * \
+                          ((PPCAT(UNIFORM_SSBO_FETCHMODE_STR, attr)) ? gl_InstanceID : n)) + \
+                         PPCAT(UNIFORM_SSBO_OFFSET_STR, attr)]))[0]
+#  define vertex_fetch_attribute(n, attr, type) \
+    vertex_fetch_attribute_raw(RESOLVE_VERTEX(n), attr, type)
+#  define vertex_id_from_index_id(n) RESOLVE_VERTEX(n)
+#  define vertex_fetch_get_input_prim_type() UNIFORM_SSBO_INPUT_PRIM_TYPE_STR
+#  define vertex_fetch_get_input_vert_count() UNIFORM_SSBO_INPUT_VERT_COUNT_STR
+#  define vertex_fetch_get_attr_type(attr) PPCAT(UNIFORM_SSBO_TYPE_STR, attr)
+
+/* Primitive type identifiers. Must mirror GPU_primitive.h. */
+#  define GPU_PRIM_POINTS 0
+#  define GPU_PRIM_LINES 1
+#  define GPU_PRIM_TRIS 2
+#  define GPU_PRIM_LINE_STRIP 3
+#  define GPU_PRIM_LINE_LOOP 4
+#  define GPU_PRIM_TRI_STRIP 5
+#  define GPU_PRIM_TRI_FAN 6
+#  define GPU_PRIM_LINES_ADJ 7
+#  define GPU_PRIM_TRIS_ADJ 8
+#  define GPU_PRIM_LINE_STRIP_ADJ 9
+#endif
+
+/* Common functions: GLSL names redirected to the names used on the Metal side. */
+#define dFdx(x) dfdx(x)
+#define dFdy(x) dfdy(x)
+#define mod(x, y) _mtlmod(x, y)
+#define discard discard_fragment()
+#define inversesqrt rsqrt
+
+/* Converts an angle given in degrees to radians. */
+inline float radians(float deg)
+{
+  /* Pre-computed M_PI_F/180.0. */
+  const float deg_to_rad = 0.01745329251f;
+  return deg_to_rad * deg;
+}
+
+/* Converts an angle given in radians to degrees. */
+inline float degrees(float rad)
+{
+  /* Constant factor: 180.0/M_PI_F.
+   * The `f` suffix keeps the literal single precision, matching `radians`. */
+  return rad * 57.2957795131f;
+}
+
+/* Routes select(A, B, C) to mix(A, B, C), keeping the argument order. */
+#define select(A, B, C) mix(A, B, C)
+
+/* Type conversions and type truncations. */
+/* Extends a float3 to a float4, setting the W component to one. */
+inline float4 to_float4(float3 val)
+{
+  return float4(val.x, val.y, val.z, 1.0);
+}
+
+/* Type conversions and type truncations (Utility Functions). */
+/* Builds a 3x3 matrix from the XYZ components of the first three columns of a 4x4 matrix. */
+inline float3x3 mat4_to_mat3(float4x4 matrix)
+{
+  const float3 col0 = matrix[0].xyz;
+  const float3 col1 = matrix[1].xyz;
+  const float3 col2 = matrix[2].xyz;
+  return float3x3(col0, col1, col2);
+}
+
+/* GLSL `floatBitsToInt`: reinterprets each float component as a signed integer
+ * using `as_type`, i.e. without any numeric conversion. */
+inline int floatBitsToInt(float f)
+{
+  const int bits = as_type<int>(f);
+  return bits;
+}
+
+inline int2 floatBitsToInt(float2 f)
+{
+  const int2 bits = as_type<int2>(f);
+  return bits;
+}
+
+inline int3 floatBitsToInt(float3 f)
+{
+  const int3 bits = as_type<int3>(f);
+  return bits;
+}
+
+inline int4 floatBitsToInt(float4 f)
+{
+  const int4 bits = as_type<int4>(f);
+  return bits;
+}
+
+/* GLSL `floatBitsToUint`: reinterprets each float component as an unsigned integer
+ * using `as_type`, i.e. without any numeric conversion. */
+inline uint floatBitsToUint(float f)
+{
+  const uint bits = as_type<uint>(f);
+  return bits;
+}
+
+inline uint2 floatBitsToUint(float2 f)
+{
+  const uint2 bits = as_type<uint2>(f);
+  return bits;
+}
+
+inline uint3 floatBitsToUint(float3 f)
+{
+  const uint3 bits = as_type<uint3>(f);
+  return bits;
+}
+
+inline uint4 floatBitsToUint(float4 f)
+{
+  const uint4 bits = as_type<uint4>(f);
+  return bits;
+}
+
+/* GLSL `intBitsToFloat`: reinterprets each signed integer component as a float
+ * using `as_type`, i.e. without any numeric conversion. */
+inline float intBitsToFloat(int f)
+{
+  const float value = as_type<float>(f);
+  return value;
+}
+
+inline float2 intBitsToFloat(int2 f)
+{
+  const float2 value = as_type<float2>(f);
+  return value;
+}
+
+inline float3 intBitsToFloat(int3 f)
+{
+  const float3 value = as_type<float3>(f);
+  return value;
+}
+
+inline float4 intBitsToFloat(int4 f)
+{
+  const float4 value = as_type<float4>(f);
+  return value;
+}
+
+/* Texture size functions. Add texture types as needed.
+ * Each overload queries the dimensions from the `texture` member of the combined
+ * image-sampler wrapper and returns them as signed integers. */
+
+/* 1D: width only. `lod` is accepted but not forwarded to the query. */
+template<typename T, access A>
+int textureSize(thread _mtl_combined_image_sampler_1d<T, A> image, uint lod)
+{
+  const auto width = image.texture->get_width();
+  return int(width);
+}
+
+/* 1D array: (width, layer count). `lod` is accepted but not forwarded to the query. */
+template<typename T, access A>
+int2 textureSize(thread _mtl_combined_image_sampler_1d_array<T, A> image, uint lod)
+{
+  const auto width = image.texture->get_width();
+  const auto layers = image.texture->get_array_size();
+  return int2(width, layers);
+}
+
+/* 2D: (width, height) at mip level `lod`. */
+template<typename T, access A>
+int2 textureSize(thread _mtl_combined_image_sampler_2d<T, A> image, uint lod)
+{
+  const auto width = image.texture->get_width(lod);
+  const auto height = image.texture->get_height(lod);
+  return int2(width, height);
+}
+
+/* 2D depth: (width, height) at mip level `lod`. */
+template<typename T, access A>
+int2 textureSize(thread _mtl_combined_image_sampler_depth_2d<T, A> image, uint lod)
+{
+  const auto width = image.texture->get_width(lod);
+  const auto height = image.texture->get_height(lod);
+  return int2(width, height);
+}
+
+/* 2D array: (width, height, layer count); width and height at mip level `lod`. */
+template<typename T, access A>
+int3 textureSize(thread _mtl_combined_image_sampler_2d_array<T, A> image, uint lod)
+{
+  const auto width = image.texture->get_width(lod);
+  const auto height = image.texture->get_height(lod);
+  const auto layers = image.texture->get_array_size();
+  return int3(width, height, layers);
+}
+
+/* 2D depth array: (width, height, layer count); width and height at mip level `lod`. */
+template<typename T, access A>
+int3 textureSize(thread _mtl_combined_image_sampler_depth_2d_array<T, A> image, uint lod)
+{
+  const auto width = image.texture->get_width(lod);
+  const auto height = image.texture->get_height(lod);
+  const auto layers = image.texture->get_array_size();
+  return int3(width, height, layers);
+}
+
+/* Cube: (width, height) at mip level `lod`. */
+template<typename T, access A>
+int2 textureSize(thread _mtl_combined_image_sampler_cube<T, A> image, uint lod)
+{
+  const auto width = image.texture->get_width(lod);
+  const auto height = image.texture->get_height(lod);
+  return int2(width, height);
+}
+
+/* 3D: (width, height, depth) at mip level `lod`. */
+template<typename T, access A>
+int3 textureSize(thread _mtl_combined_image_sampler_3d<T, A> image, uint lod)
+{
+  const auto width = image.texture->get_width(lod);
+  const auto height = image.texture->get_height(lod);
+  const auto depth = image.texture->get_depth(lod);
+  return int3(width, height, depth);
+}
+
+/* Equality and comparison functions.
+ * GLSL relational function names expressed with the corresponding operators.
+ * Both arguments are parenthesized so that compound expressions can be passed. */
+#define lessThan(a, b) ((a) < (b))
+#define lessThanEqual(a, b) ((a) <= (b))
+#define greaterThan(a, b) ((a) > (b))
+#define greaterThanEqual(a, b) ((a) >= (b))
+#define equal(a, b) ((a) == (b))
+#define notEqual(a, b) ((a) != (b))
+
+/* Returns true when every component of `x` compares greater than zero. */
+template<typename T, int n> bool all(vec<T, n> x)
+{
+  for (int lane = 0; lane < n; lane++) {
+    if (!(x[lane] > 0)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+/* Returns true when at least one component of `x` compares greater than zero. */
+template<typename T, int n> bool any(vec<T, n> x)
+{
+  for (int lane = 0; lane < n; lane++) {
+    if (x[lane] > 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/* Modulo functionality. */
+/* Integer overload: subtracts `b` times the integer quotient `a / b` from `a`. */
+int _mtlmod(int a, int b)
+{
+  const int quotient = a / b;
+  return a - b * quotient;
+}
+
+/* Vector / vector overload: component-wise `x - y * floor(x / y)`. */
+template<typename T, int n> vec<T, n> _mtlmod(vec<T, n> x, vec<T, n> y)
+{
+  const vec<T, n> whole = floor(x / y);
+  return x - y * whole;
+}
+
+/* Vector / scalar overload: `y` is first converted to a vector of the type of `x`. */
+template<typename T, int n, typename U> vec<T, n> _mtlmod(vec<T, n> x, U y)
+{
+  const vec<T, n> divisor = vec<T, n>(y);
+  return x - divisor * floor(x / divisor);
+}
+
+/* Scalar / vector overload: `x` is first converted to a vector of the type of `y`. */
+template<typename T, typename U, int n> vec<U, n> _mtlmod(T x, vec<U, n> y)
+{
+  const vec<U, n> dividend = vec<U, n>(x);
+  return dividend - y * floor(dividend / y);
+}
+
+/* Mathematical functions. */
+/* Two-argument `atan(y, x)`, forwarded to `atan2`. */
+template<typename T> T atan(T y, T x)
+{
+  const T angle = atan2(y, x);
+  return angle;
+}
+
+/* Matrix Inverse. */
+/* Inverse of a 4x4 matrix.
+ * `b00`..`b05` are the 2x2 sub-determinants built from `a[0]` and `a[1]`, and
+ * `b06`..`b11` those built from `a[2]` and `a[3]`. The determinant is combined from
+ * these terms and each output element is scaled by its reciprocal.
+ * The determinant is not checked against zero, so a singular input divides by zero. */
+float4x4 inverse(float4x4 a)
+{
+  float b00 = a[0][0] * a[1][1] - a[0][1] * a[1][0];
+  float b01 = a[0][0] * a[1][2] - a[0][2] * a[1][0];
+  float b02 = a[0][0] * a[1][3] - a[0][3] * a[1][0];
+  float b03 = a[0][1] * a[1][2] - a[0][2] * a[1][1];
+  float b04 = a[0][1] * a[1][3] - a[0][3] * a[1][1];
+  float b05 = a[0][2] * a[1][3] - a[0][3] * a[1][2];
+  float b06 = a[2][0] * a[3][1] - a[2][1] * a[3][0];
+  float b07 = a[2][0] * a[3][2] - a[2][2] * a[3][0];
+  float b08 = a[2][0] * a[3][3] - a[2][3] * a[3][0];
+  float b09 = a[2][1] * a[3][2] - a[2][2] * a[3][1];
+  float b10 = a[2][1] * a[3][3] - a[2][3] * a[3][1];
+  float b11 = a[2][2] * a[3][3] - a[2][3] * a[3][2];
+
+  /* Reciprocal of the determinant. */
+  float invdet = 1.0 / (b00 * b11 - b01 * b10 + b02 * b09 + b03 * b08 - b04 * b07 + b05 * b06);
+
+  /* Unscaled inverse elements, multiplied by the reciprocal determinant at the end. */
+  return float4x4(a[1][1] * b11 - a[1][2] * b10 + a[1][3] * b09,
+                  a[0][2] * b10 - a[0][1] * b11 - a[0][3] * b09,
+                  a[3][1] * b05 - a[3][2] * b04 + a[3][3] * b03,
+                  a[2][2] * b04 - a[2][1] * b05 - a[2][3] * b03,
+                  a[1][2] * b08 - a[1][0] * b11 - a[1][3] * b07,
+                  a[0][0] * b11 - a[0][2] * b08 + a[0][3] * b07,
+                  a[3][2] * b02 - a[3][0] * b05 - a[3][3] * b01,
+                  a[2][0] * b05 - a[2][2] * b02 + a[2][3] * b01,
+                  a[1][0] * b10 - a[1][1] * b08 + a[1][3] * b06,
+                  a[0][1] * b08 - a[0][0] * b10 - a[0][3] * b06,
+                  a[3][0] * b04 - a[3][1] * b02 + a[3][3] * b00,
+                  a[2][1] * b02 - a[2][0] * b04 - a[2][3] * b00,
+                  a[1][1] * b07 - a[1][0] * b09 - a[1][2] * b06,
+                  a[0][0] * b09 - a[0][1] * b07 + a[0][2] * b06,
+                  a[3][1] * b01 - a[3][0] * b03 - a[3][2] * b00,
+                  a[2][0] * b03 - a[2][1] * b01 + a[2][2] * b00) *
+         invdet;
+}
+
+/* Inverse of a 3x3 matrix: the signed 2x2 minors are gathered into three columns and the
+ * resulting matrix is scaled by the reciprocal of the determinant.
+ * The determinant is not checked against zero, so a singular input divides by zero. */
+float3x3 inverse(float3x3 m)
+{
+  const float det = m[0][0] * (m[1][1] * m[2][2] - m[2][1] * m[1][2]) -
+                    m[1][0] * (m[0][1] * m[2][2] - m[2][1] * m[0][2]) +
+                    m[2][0] * (m[0][1] * m[1][2] - m[1][1] * m[0][2]);
+  const float invdet = 1.0 / det;
+
+  /* Column 0 of the unscaled result. */
+  const float3 col0 = float3(+(m[1][1] * m[2][2] - m[2][1] * m[1][2]),
+                             -(m[0][1] * m[2][2] - m[2][1] * m[0][2]),
+                             +(m[0][1] * m[1][2] - m[1][1] * m[0][2]));
+  /* Column 1 of the unscaled result. */
+  const float3 col1 = float3(-(m[1][0] * m[2][2] - m[2][0] * m[1][2]),
+                             +(m[0][0] * m[2][2] - m[2][0] * m[0][2]),
+                             -(m[0][0] * m[1][2] - m[1][0] * m[0][2]));
+  /* Column 2 of the unscaled result. */
+  const float3 col2 = float3(+(m[1][0] * m[2][1] - m[2][0] * m[1][1]),
+                             -(m[0][0] * m[2][1] - m[2][0] * m[0][1]),
+                             +(m[0][0] * m[1][1] - m[1][0] * m[0][1]));
+
+  const float3x3 unscaled = float3x3(col0, col1, col2);
+  return unscaled * invdet;
+}
+
+/* Additional overloads for builtin functions. */
+/* Scalar `distance`: absolute difference of the two values. */
+float distance(float x, float y)
+{
+  const float delta = y - x;
+  return abs(delta);
+}
+
+/* Overload for mix(A, B, float ratio).
+ * The scalar ratio is converted to a vector of the operand type before blending. */
+template<typename T, int Size> vec<T, Size> mix(vec<T, Size> a, vec<T, Size> b, float val)
+{
+  const vec<T, Size> ratio = vec<T, Size>(val);
+  return mix(a, b, ratio);
+}
+
+/* Overload for mix(A, B, mask) with an integer vector mask.
+ * Each component takes the value from `b` where the mask component is non-zero and
+ * from `a` otherwise. */
+template<typename T, int Size>
+vec<T, Size> mix(vec<T, Size> a, vec<T, Size> b, vec<int, Size> mask)
+{
+  vec<T, Size> blended = a;
+  for (int lane = 0; lane < Size; lane++) {
+    if (mask[lane]) {
+      blended[lane] = b[lane];
+    }
+  }
+  return blended;
+}
+
+/* Using vec<bool, S> does not appear to work, splitting cases. */
+/* Overloads for mix(A, B, bvec<N>): each component takes the value from `b` where the mask
+ * component is true and from `a` otherwise. */
+
+/* Four-component variant. */
+template<typename T> vec<T, 4> mix(vec<T, 4> a, vec<T, 4> b, bvec4 mask)
+{
+  vec<T, 4> blended = a;
+  for (int lane = 0; lane < 4; lane++) {
+    if (mask[lane]) {
+      blended[lane] = b[lane];
+    }
+  }
+  return blended;
+}
+
+/* Three-component variant. */
+template<typename T> vec<T, 3> mix(vec<T, 3> a, vec<T, 3> b, bvec3 mask)
+{
+  vec<T, 3> blended = a;
+  for (int lane = 0; lane < 3; lane++) {
+    if (mask[lane]) {
+      blended[lane] = b[lane];
+    }
+  }
+  return blended;
+}
+
+/* Two-component variant. */
+template<typename T> vec<T, 2> mix(vec<T, 2> a, vec<T, 2> b, bvec2 mask)
+{
+  vec<T, 2> blended = a;
+  for (int lane = 0; lane < 2; lane++) {
+    if (mask[lane]) {
+      blended[lane] = b[lane];
+    }
+  }
+  return blended;
+}
+
+/* Overload for mix(A, B, MTLBOOL): returns `b` when the mask is set and `a` otherwise. */
+template<typename T> T mix(T a, T b, MTLBOOL mask)
+{
+  if (mask) {
+    return b;
+  }
+  return a;
+}
+
+/* Returns true when every component of `a` compares equal to zero. */
+template<typename T, unsigned int Size> bool is_zero(vec<T, Size> a)
+{
+  bool all_zero = true;
+  for (int lane = 0; lane < Size; lane++) {
+    if (a[lane] != T(0)) {
+      all_zero = false;
+    }
+  }
+  return all_zero;
+}
+
+/* Matrix conversion fallback.
+ * `MAT3` overloads that build a mat3 from three columns, from a single scalar, or from
+ * a mat4 (through `mat4_to_mat3`). */
+mat3 MAT3(vec3 a, vec3 b, vec3 c)
+{
+  const mat3 from_columns = mat3(a, b, c);
+  return from_columns;
+}
+mat3 MAT3(float f)
+{
+  const mat3 from_scalar = mat3(f);
+  return from_scalar;
+}
+mat3 MAT3(mat4 m)
+{
+  const mat3 truncated = mat4_to_mat3(m);
+  return truncated;
+}
+\ No newline at end of file