diff options
-rw-r--r-- reference/opt/shaders-msl/masking/write-outputs.mask-location-0.msl2.tesc | 2 +-
-rw-r--r-- reference/shaders-msl/masking/write-outputs.mask-location-0.msl2.tesc | 2 +-
-rw-r--r-- spirv_glsl.cpp | 19 +++++++++++++++++--
3 files changed, 19 insertions(+), 4 deletions(-)
diff --git a/reference/opt/shaders-msl/masking/write-outputs.mask-location-0.msl2.tesc b/reference/opt/shaders-msl/masking/write-outputs.mask-location-0.msl2.tesc index 3c2e5a02..7c8e3878 100644 --- a/reference/opt/shaders-msl/masking/write-outputs.mask-location-0.msl2.tesc +++ b/reference/opt/shaders-msl/masking/write-outputs.mask-location-0.msl2.tesc @@ -20,7 +20,7 @@ kernel void main0(uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_ device main0_out* gl_out = &spvOut[gl_PrimitiveID * 4]; device main0_patchOut& patchOut = spvPatchOut[gl_PrimitiveID]; v0[gl_InvocationID] = float4(1.0); - ((threadgroup float*)&v0[gl_InvocationID])[0u] = 2.0; + v0[gl_InvocationID].x = 2.0; if (gl_InvocationID == 0) { patchOut.v1 = float4(2.0); diff --git a/reference/shaders-msl/masking/write-outputs.mask-location-0.msl2.tesc b/reference/shaders-msl/masking/write-outputs.mask-location-0.msl2.tesc index 0948ad9d..e4f047d3 100644 --- a/reference/shaders-msl/masking/write-outputs.mask-location-0.msl2.tesc +++ b/reference/shaders-msl/masking/write-outputs.mask-location-0.msl2.tesc @@ -20,7 +20,7 @@ static inline __attribute__((always_inline)) void write_in_func(threadgroup float4 (&v0)[4], thread uint& gl_InvocationID, device float4& v1, device main0_out* thread & gl_out) { v0[gl_InvocationID] = float4(1.0); - ((threadgroup float*)&v0[gl_InvocationID])[0u] = 2.0; + v0[gl_InvocationID].x = 2.0; if (gl_InvocationID == 0) { v1 = float4(2.0); diff --git a/spirv_glsl.cpp b/spirv_glsl.cpp index 1c4b5e9a..3421524b 100644 --- a/spirv_glsl.cpp +++ b/spirv_glsl.cpp @@ -8653,6 +8653,7 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice // Internally, access chain implementation can also be used on composites, // ignore scalar access workarounds in this case. 
StorageClass effective_storage = StorageClassGeneric; + bool ignore_potential_sliced_writes = false; if ((flags & ACCESS_CHAIN_FORCE_COMPOSITE_BIT) == 0) { auto *var = maybe_get_backing_variable(base); @@ -8662,9 +8663,23 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice effective_storage = StorageClassStorageBuffer; else if (expression_type(base).pointer) effective_storage = get_expression_effective_storage_class(base); + + // Special consideration for control points. + // Control points can only be written by InvocationID, so there is no need + // to consider scalar access chains here. + // Cleans up some cases where it's very painful to determine the accurate storage class + // since blocks can be partially masked ... + if (var && var->storage == StorageClassOutput && + get_execution_model() == ExecutionModelTessellationControl && + !has_decoration(var->self, DecorationPatch)) + { + ignore_potential_sliced_writes = true; + } } + else + ignore_potential_sliced_writes = true; - if (!row_major_matrix_needs_conversion) + if (!row_major_matrix_needs_conversion && !ignore_potential_sliced_writes) { // On some backends, we might not be able to safely access individual scalars in a vector. // To work around this, we might have to cast the access chain reference to something which can, @@ -8704,7 +8719,7 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice expr += "]"; } - if (row_major_matrix_needs_conversion) + if (row_major_matrix_needs_conversion && !ignore_potential_sliced_writes) { prepare_access_chain_for_scalar_access(expr, get<SPIRType>(type->parent_type), effective_storage, is_packed);