From 23da445bd44afe588d6ccf61dc871b6ddac21867 Mon Sep 17 00:00:00 2001
From: Hans-Kristian Arntzen <post@arntzen-software.no>
Date: Wed, 14 Apr 2021 13:13:13 +0200
Subject: MSL: Emit multiple threadgroup slices for multi-patch.

Multiple patches can run in the same workgroup when using multi-patch
mode, so we need to allocate enough storage to avoid false sharing.
---
 ...uts-block.mask-location-0.multi-patch.msl2.tesc | 44 +++++++++++
 ...uts-block.mask-location-1.multi-patch.msl2.tesc | 44 +++++++++++
 .../write-outputs.mask-location-0.multi-patch.tesc | 81 ++++++++++++++++++++
 .../write-outputs.mask-location-1.multi-patch.tesc | 40 ++++++++++
 .../write-outputs.mask-point-size.multi-patch.tesc | 89 ++++++++++++++++++++++
 .../write-outputs.mask-position.multi-patch.tesc   | 89 ++++++++++++++++++++++
 6 files changed, 387 insertions(+)
 create mode 100644 reference/opt/shaders-msl/masking/write-outputs-block.mask-location-0.multi-patch.msl2.tesc
 create mode 100644 reference/opt/shaders-msl/masking/write-outputs-block.mask-location-1.multi-patch.msl2.tesc
 create mode 100644 reference/opt/shaders-msl/masking/write-outputs.mask-location-0.multi-patch.tesc
 create mode 100644 reference/opt/shaders-msl/masking/write-outputs.mask-location-1.multi-patch.tesc
 create mode 100644 reference/opt/shaders-msl/masking/write-outputs.mask-point-size.multi-patch.tesc
 create mode 100644 reference/opt/shaders-msl/masking/write-outputs.mask-position.multi-patch.tesc

(limited to 'reference/opt/shaders-msl')

diff --git a/reference/opt/shaders-msl/masking/write-outputs-block.mask-location-0.multi-patch.msl2.tesc b/reference/opt/shaders-msl/masking/write-outputs-block.mask-location-0.multi-patch.msl2.tesc
new file mode 100644
index 00000000..ca025cdb
--- /dev/null
+++ b/reference/opt/shaders-msl/masking/write-outputs-block.mask-location-0.multi-patch.msl2.tesc
@@ -0,0 +1,44 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct P
+{
+    float a;
+    float b;
+};
+
+struct C
+{
+    float a;
+    float b;
+};
+
+struct main0_out
+{
+    float C_a;
+    float C_b;
+    float4 gl_Position;
+};
+
+struct main0_patchOut
+{
+    float P_b;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
+    threadgroup P spvStorage_11[8];
+    threadgroup P (&_11) = spvStorage_11[(gl_GlobalInvocationID.x / 4) % 8];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    _11.a = 1.0;
+    patchOut.P_b = 2.0;
+    gl_out[gl_InvocationID].C_a = 3.0;
+    gl_out[gl_InvocationID].C_b = 4.0;
+    gl_out[gl_InvocationID].gl_Position = float4(1.0);
+}
+
diff --git a/reference/opt/shaders-msl/masking/write-outputs-block.mask-location-1.multi-patch.msl2.tesc b/reference/opt/shaders-msl/masking/write-outputs-block.mask-location-1.multi-patch.msl2.tesc
new file mode 100644
index 00000000..700e3fc5
--- /dev/null
+++ b/reference/opt/shaders-msl/masking/write-outputs-block.mask-location-1.multi-patch.msl2.tesc
@@ -0,0 +1,44 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct P
+{
+    float a;
+    float b;
+};
+
+struct C
+{
+    float a;
+    float b;
+};
+
+struct main0_out
+{
+    float C_b;
+    float4 gl_Position;
+};
+
+struct main0_patchOut
+{
+    float P_a;
+    float P_b;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    threadgroup C spvStoragec[8][4];
+    threadgroup C (&c)[4] = spvStoragec[(gl_GlobalInvocationID.x / 4) % 8];
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    patchOut.P_a = 1.0;
+    patchOut.P_b = 2.0;
+    c[gl_InvocationID].a = 3.0;
+    gl_out[gl_InvocationID].C_b = 4.0;
+    gl_out[gl_InvocationID].gl_Position = float4(1.0);
+}
+
diff --git a/reference/opt/shaders-msl/masking/write-outputs.mask-location-0.multi-patch.tesc b/reference/opt/shaders-msl/masking/write-outputs.mask-location-0.multi-patch.tesc
new file mode 100644
index 00000000..d20b7d78
--- /dev/null
+++ b/reference/opt/shaders-msl/masking/write-outputs.mask-location-0.multi-patch.tesc
@@ -0,0 +1,81 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct main0_out
+{
+    float4 gl_Position;
+    float gl_PointSize;
+};
+
+struct main0_patchOut
+{
+    spvUnsafeArray<float4, 2> v1;
+    float4 v3;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    threadgroup float4 spvStoragev0[8][4];
+    threadgroup float4 (&v0)[4] = spvStoragev0[(gl_GlobalInvocationID.x / 4) % 8];
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    v0[gl_InvocationID] = float4(1.0);
+    v0[gl_InvocationID].z = 3.0;
+    if (gl_InvocationID == 0)
+    {
+        patchOut.v1[0] = float4(2.0);
+        ((device float*)&patchOut.v1[0])[0u] = 3.0;
+        patchOut.v1[1] = float4(2.0);
+        ((device float*)&patchOut.v1[1])[0u] = 5.0;
+    }
+    patchOut.v3 = float4(5.0);
+    gl_out[gl_InvocationID].gl_Position = float4(10.0);
+    gl_out[gl_InvocationID].gl_Position.z = 20.0;
+    gl_out[gl_InvocationID].gl_PointSize = 40.0;
+}
+
diff --git a/reference/opt/shaders-msl/masking/write-outputs.mask-location-1.multi-patch.tesc b/reference/opt/shaders-msl/masking/write-outputs.mask-location-1.multi-patch.tesc
new file mode 100644
index 00000000..2831008f
--- /dev/null
+++ b/reference/opt/shaders-msl/masking/write-outputs.mask-location-1.multi-patch.tesc
@@ -0,0 +1,40 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct main0_out
+{
+    float4 v0;
+    float4 gl_Position;
+    float gl_PointSize;
+};
+
+struct main0_patchOut
+{
+    float4 v3;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
+    threadgroup float4 spvStoragev1[8][2];
+    threadgroup float4 (&v1)[2] = spvStoragev1[(gl_GlobalInvocationID.x / 4) % 8];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    gl_out[gl_InvocationID].v0 = float4(1.0);
+    gl_out[gl_InvocationID].v0.z = 3.0;
+    if (gl_InvocationID == 0)
+    {
+        v1[0] = float4(2.0);
+        ((threadgroup float*)&v1[0])[0u] = 3.0;
+        v1[1] = float4(2.0);
+        ((threadgroup float*)&v1[1])[0u] = 5.0;
+    }
+    patchOut.v3 = float4(5.0);
+    gl_out[gl_InvocationID].gl_Position = float4(10.0);
+    gl_out[gl_InvocationID].gl_Position.z = 20.0;
+    gl_out[gl_InvocationID].gl_PointSize = 40.0;
+}
+
diff --git a/reference/opt/shaders-msl/masking/write-outputs.mask-point-size.multi-patch.tesc b/reference/opt/shaders-msl/masking/write-outputs.mask-point-size.multi-patch.tesc
new file mode 100644
index 00000000..21360341
--- /dev/null
+++ b/reference/opt/shaders-msl/masking/write-outputs.mask-point-size.multi-patch.tesc
@@ -0,0 +1,89 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct gl_PerVertex
+{
+    float4 gl_Position;
+    float gl_PointSize;
+    float gl_ClipDistance[1];
+    float gl_CullDistance[1];
+};
+
+struct main0_out
+{
+    float4 v0;
+    float4 gl_Position;
+};
+
+struct main0_patchOut
+{
+    spvUnsafeArray<float4, 2> v1;
+    float4 v3;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    threadgroup gl_PerVertex spvStoragegl_out_masked[8][4];
+    threadgroup gl_PerVertex (&gl_out_masked)[4] = spvStoragegl_out_masked[(gl_GlobalInvocationID.x / 4) % 8];
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    gl_out[gl_InvocationID].v0 = float4(1.0);
+    gl_out[gl_InvocationID].v0.z = 3.0;
+    if (gl_InvocationID == 0)
+    {
+        patchOut.v1[0] = float4(2.0);
+        ((device float*)&patchOut.v1[0])[0u] = 3.0;
+        patchOut.v1[1] = float4(2.0);
+        ((device float*)&patchOut.v1[1])[0u] = 5.0;
+    }
+    patchOut.v3 = float4(5.0);
+    gl_out[gl_InvocationID].gl_Position = float4(10.0);
+    gl_out[gl_InvocationID].gl_Position.z = 20.0;
+    gl_out_masked[gl_InvocationID].gl_PointSize = 40.0;
+}
+
diff --git a/reference/opt/shaders-msl/masking/write-outputs.mask-position.multi-patch.tesc b/reference/opt/shaders-msl/masking/write-outputs.mask-position.multi-patch.tesc
new file mode 100644
index 00000000..3aea5798
--- /dev/null
+++ b/reference/opt/shaders-msl/masking/write-outputs.mask-position.multi-patch.tesc
@@ -0,0 +1,89 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct gl_PerVertex
+{
+    float4 gl_Position;
+    float gl_PointSize;
+    float gl_ClipDistance[1];
+    float gl_CullDistance[1];
+};
+
+struct main0_out
+{
+    float4 v0;
+    float gl_PointSize;
+};
+
+struct main0_patchOut
+{
+    spvUnsafeArray<float4, 2> v1;
+    float4 v3;
+};
+
+kernel void main0(uint3 gl_GlobalInvocationID [[thread_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device main0_patchOut* spvPatchOut [[buffer(27)]], device MTLQuadTessellationFactorsHalf* spvTessLevel [[buffer(26)]])
+{
+    device main0_out* gl_out = &spvOut[gl_GlobalInvocationID.x - gl_GlobalInvocationID.x % 4];
+    threadgroup gl_PerVertex spvStoragegl_out_masked[8][4];
+    threadgroup gl_PerVertex (&gl_out_masked)[4] = spvStoragegl_out_masked[(gl_GlobalInvocationID.x / 4) % 8];
+    device main0_patchOut& patchOut = spvPatchOut[gl_GlobalInvocationID.x / 4];
+    uint gl_InvocationID = gl_GlobalInvocationID.x % 4;
+    uint gl_PrimitiveID = min(gl_GlobalInvocationID.x / 4, spvIndirectParams[1]);
+    gl_out[gl_InvocationID].v0 = float4(1.0);
+    gl_out[gl_InvocationID].v0.z = 3.0;
+    if (gl_InvocationID == 0)
+    {
+        patchOut.v1[0] = float4(2.0);
+        ((device float*)&patchOut.v1[0])[0u] = 3.0;
+        patchOut.v1[1] = float4(2.0);
+        ((device float*)&patchOut.v1[1])[0u] = 5.0;
+    }
+    patchOut.v3 = float4(5.0);
+    gl_out_masked[gl_InvocationID].gl_Position = float4(10.0);
+    gl_out_masked[gl_InvocationID].gl_Position.z = 20.0;
+    gl_out[gl_InvocationID].gl_PointSize = 40.0;
+}
+
-- 
cgit v1.2.3