Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/HansKristian-Work/dxil-spirv.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHans-Kristian Arntzen <post@arntzen-software.no>2022-08-08 17:53:41 +0300
committerHans-Kristian Arntzen <post@arntzen-software.no>2022-08-08 19:59:54 +0300
commit199eba4b501ae02363a41e40d448ea53aa488f82 (patch)
tree582f8fc48ba1bfbe80e64bd811dd06ddc37ea283
parent9f2fd6356c14376ab5b88518d6dd4e6787084525 (diff)
Serialize interleaved merge scopes.cfg-fixes
In heavily unrolled loops with many different breaks, we can end up with code that is impossible to express with normal structured control flow without deinterleaving the breaks. The strategy is to pull all break blocks into a dispatcher block placed just before the merge block; the dispatcher switch-dispatches to the appropriate break construct, and control then reconvenes at the merge block.
-rw-r--r--cfg_structurizer.cpp193
-rw-r--r--cfg_structurizer.hpp2
-rw-r--r--reference/shaders/control-flow/interleaved-unrolled-loop-breaks.comp403
-rw-r--r--shaders/control-flow/interleaved-unrolled-loop-breaks.comp44
4 files changed, 638 insertions, 4 deletions
diff --git a/cfg_structurizer.cpp b/cfg_structurizer.cpp
index e72927e..41ee5f8 100644
--- a/cfg_structurizer.cpp
+++ b/cfg_structurizer.cpp
@@ -357,6 +357,12 @@ bool CFGStructurizer::run()
create_continue_block_ladders();
+ while (serialize_interleaved_merge_scopes())
+ {
+ auto graphviz_split = graphviz_path + ".serialize";
+ log_cfg_graphviz(graphviz_split.c_str());
+ }
+
split_merge_scopes();
recompute_cfg();
@@ -2333,6 +2339,11 @@ void CFGStructurizer::rewrite_selection_breaks(CFGNode *header, CFGNode *ladder_
}
}
+bool CFGStructurizer::is_ordered(const CFGNode *a, const CFGNode *b, const CFGNode *c)
+{
+ return a != b && a->dominates(b) && b != c && b->dominates(c);
+}
+
bool CFGStructurizer::header_and_merge_block_have_entry_exit_relationship(const CFGNode *header, const CFGNode *merge) const
{
if (!merge->post_dominates(header))
@@ -2448,10 +2459,6 @@ bool CFGStructurizer::header_and_merge_block_have_entry_exit_relationship(const
return false;
}
- const auto is_ordered = [](const CFGNode *a, const CFGNode *b, const CFGNode *c) {
- return a != b && a->dominates(b) && b != c && b->dominates(c);
- };
-
// Crossing break scenario.
if (is_ordered(first_natural_breaks_to_inner, first_natural_breaks_to_outer, last_natural_breaks_to_inner))
return true;
@@ -2461,6 +2468,179 @@ bool CFGStructurizer::header_and_merge_block_have_entry_exit_relationship(const
return false;
}
+bool CFGStructurizer::serialize_interleaved_merge_scopes()
+{
+ // Try to fixup scenarios which arise from unrolled loops with multiple break blocks.
+ // DXC will emit maximal convergence and force all dynamic instances of a given break to branch to the same
+ // block, which then breaks, e.g.:
+ // for (int i = 0; i < CONSTANT; i++) { cond_break_construct1(); cond_break_construct2(); cond_break_construct3(); }
+ // When this unrolls we can end up with merge blocks which are entangled. Only sane way to make this work
+ // is to serialize the breaks to after the merge block.
+ UnorderedSet<CFGNode *> potential_merge_nodes;
+
+ for (auto *node : forward_post_visit_order)
+ if (node->num_forward_preds() >= 2 && !block_is_plain_continue(node))
+ potential_merge_nodes.insert(node);
+
+ UnorderedSet<const CFGNode *> visited;
+
+ for (auto *node : forward_post_visit_order)
+ {
+ if (node->num_forward_preds() <= 1)
+ continue;
+ if (block_is_plain_continue(node))
+ continue;
+
+ auto *idom = node->immediate_dominator;
+
+ Vector<CFGNode *> inner_constructs;
+ Vector<CFGNode *> valid_constructs;
+
+ // Find merge block candidates that are strictly dominated by idom and immediately post-dominated by node.
+ // They also must not be good merge candidates on their own.
+ // Also, we're not interested in any loop merge candidates.
+ for (auto *candidate : potential_merge_nodes)
+ {
+ if (candidate != idom && idom->dominates(candidate) &&
+ candidate->immediate_post_dominator == node &&
+ !candidate->post_dominates_perfect_structured_construct() &&
+ get_innermost_loop_header_for(idom, node) == idom)
+ {
+ bool direct_dominance_frontier = candidate->dominance_frontier.size() == 1 &&
+ candidate->dominance_frontier.front() == node;
+ // The candidate must not try to merge to other code since we might end up introducing loops that way.
+ // All code reachable by candidate must cleanly break to node.
+ if (direct_dominance_frontier)
+ inner_constructs.push_back(candidate);
+ }
+ }
+
+ // Ensure stable order.
+ std::sort(inner_constructs.begin(), inner_constructs.end(), [](const CFGNode *a, const CFGNode *b) {
+ return a->forward_post_visit_order < b->forward_post_visit_order;
+ });
+
+ // Prune any candidate that can reach another candidate. The sort ensures that candidate to be removed comes last.
+ size_t count = inner_constructs.size();
+ for (size_t i = 0; i < count; i++)
+ {
+ bool valid = true;
+ for (size_t j = 0; j < i; j++)
+ {
+ if (query_reachability(*inner_constructs[j], *inner_constructs[i]))
+ {
+ valid = false;
+ break;
+ }
+ }
+
+ if (valid)
+ valid_constructs.push_back(inner_constructs[i]);
+ }
+
+ if (valid_constructs.size() < 2)
+ continue;
+
+ Vector<std::pair<CFGNode *, CFGNode *>> pdf_ranges;
+ pdf_ranges.reserve(inner_constructs.size());
+
+ // If breaking merge constructs are entangled, their PDFs will overlap.
+ for (auto *candidate : valid_constructs)
+ {
+ auto &pdf = candidate->post_dominance_frontier;
+ assert(!pdf.empty());
+ CFGNode *first = pdf.front();
+ CFGNode *last = first;
+
+ for (auto *n : pdf)
+ {
+ if (n->forward_post_visit_order > first->forward_post_visit_order)
+ first = n;
+ if (n->forward_post_visit_order < last->forward_post_visit_order)
+ last = n;
+ }
+
+ pdf_ranges.push_back({ first, last });
+ }
+
+ bool need_deinterleave = false;
+ count = valid_constructs.size();
+ for (size_t i = 0; i < count && !need_deinterleave; i++)
+ for (size_t j = 0; j < count && !need_deinterleave; j++)
+ if (i != j)
+ need_deinterleave = is_ordered(pdf_ranges[i].first, pdf_ranges[j].first, pdf_ranges[i].second);
+
+ if (need_deinterleave)
+ {
+ // Rewrite the control flow to serialize execution of the candidate blocks.
+ auto *dispatcher = create_helper_pred_block(node);
+
+ auto &builder = module.get_builder();
+ PHI phi;
+ phi.id = module.allocate_id();
+ phi.type_id = builder.makeIntType(32);
+
+ for (auto *candidate : valid_constructs)
+ traverse_dominated_blocks_and_rewrite_branch(candidate, dispatcher, node);
+
+ size_t cutoff_index = dispatcher->pred.size();
+
+ // If there is no direct branch intended for node, the default case label will never be reached,
+ // so just pilfer one of the cases as default.
+ bool need_default_case = !dispatcher->pred.empty();
+
+ for (size_t i = 0; i < cutoff_index; i++)
+ phi.incoming.push_back({ dispatcher->pred[i], builder.makeIntConstant(-1) });
+
+ for (size_t i = 0; i < count; i++)
+ {
+ auto *candidate = valid_constructs[i];
+ traverse_dominated_blocks_and_rewrite_branch(idom, candidate, dispatcher);
+ size_t next_cutoff_index = dispatcher->pred.size();
+ for (size_t j = cutoff_index; j < next_cutoff_index; j++)
+ phi.incoming.push_back({ dispatcher->pred[j], builder.makeIntConstant(int32_t(i)) });
+ cutoff_index = next_cutoff_index;
+ }
+
+ idom->freeze_structured_analysis = true;
+ idom->merge = MergeType::Loop;
+ idom->loop_merge_block = dispatcher;
+
+ dispatcher->ir.terminator.conditional_id = phi.id;
+ dispatcher->ir.phi.push_back(std::move(phi));
+ builder.addName(phi.id, String("selector_" + node->name).c_str());
+
+ Terminator::Case default_case;
+ dispatcher->ir.terminator.type = Terminator::Type::Switch;
+ dispatcher->ir.terminator.direct_block = nullptr;
+ default_case.node = need_default_case ? node : valid_constructs[0];
+ default_case.is_default = true;
+ dispatcher->ir.terminator.cases.push_back(default_case);
+
+ for (size_t i = 0; i < count; i++)
+ {
+ auto *candidate = valid_constructs[i];
+ assert(candidate->pred.empty());
+ dispatcher->add_branch(candidate);
+
+ if (need_default_case || i)
+ {
+ Terminator::Case break_case;
+ break_case.node = candidate;
+ break_case.value = uint32_t(i);
+ dispatcher->ir.terminator.cases.push_back(break_case);
+ }
+ }
+
+ // This completely transposes the CFG, so need to recompute CFG to keep going.
+ recompute_cfg();
+ return true;
+ }
+ }
+
+ return false;
+}
+
void CFGStructurizer::split_merge_scopes()
{
for (auto *node : forward_post_visit_order)
@@ -2498,6 +2678,11 @@ void CFGStructurizer::split_merge_scopes()
auto *idom = node->immediate_dominator;
assert(idom->succ.size() >= 2);
+ // We already rewrote this selection construct in serialize_interleaved_merge_scopes.
+ // Don't try to introduce unnecessary ladders.
+ if (idom->merge == MergeType::Loop && idom->loop_merge_block == node)
+ continue;
+
// If we find a construct which is a typical entry <-> exit scenario, do not attempt to rewrite
// any branches. The real merge block might be contained inside this construct, and this block merely
// serves as the exit merge point. It should generally turn into a loop merge later.
diff --git a/cfg_structurizer.hpp b/cfg_structurizer.hpp
index 9b8d5e4..b105a4b 100644
--- a/cfg_structurizer.hpp
+++ b/cfg_structurizer.hpp
@@ -103,6 +103,8 @@ private:
void rewrite_transposed_loop_outer(CFGNode *node, CFGNode *impossible_merge_target,
const LoopMergeAnalysis &analysis);
+ static bool is_ordered(const CFGNode *a, const CFGNode *b, const CFGNode *c);
+ bool serialize_interleaved_merge_scopes();
void split_merge_scopes();
void eliminate_degenerate_blocks();
static bool ladder_chain_has_phi_dependencies(const CFGNode *chain, const CFGNode *incoming);
diff --git a/reference/shaders/control-flow/interleaved-unrolled-loop-breaks.comp b/reference/shaders/control-flow/interleaved-unrolled-loop-breaks.comp
new file mode 100644
index 0000000..3c8f6a7
--- /dev/null
+++ b/reference/shaders/control-flow/interleaved-unrolled-loop-breaks.comp
@@ -0,0 +1,403 @@
+#version 460
+layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+
+layout(set = 0, binding = 0, r32ui) uniform uimageBuffer _8;
+
+uint _75;
+uint _77;
+uint _78;
+uint _80;
+uint _81;
+uint _82;
+uint _83;
+uint _84;
+uint _85;
+uint _86;
+uint _87;
+uint _88;
+uint _89;
+uint _90;
+uint _92;
+uint _93;
+uint _94;
+uint _95;
+uint _96;
+uint _97;
+uint _98;
+uint _99;
+uint _100;
+uint _101;
+uint _102;
+
+void main()
+{
+ int selector_5;
+ uint frontier_phi_5_pred;
+ uint frontier_phi_5_pred_1;
+ uint frontier_phi_5_pred_2;
+ uint frontier_phi_5_pred_3;
+ uint frontier_phi_5_pred_4;
+ uint _13;
+ for (;;)
+ {
+ _13 = imageAtomicAdd(_8, int(0u), 1u);
+ if ((_13 & 13u) == 0u)
+ {
+ if (!((_13 & 1u) == 0u))
+ {
+ selector_5 = 1;
+ frontier_phi_5_pred = _13;
+ frontier_phi_5_pred_1 = _13;
+ frontier_phi_5_pred_2 = _77;
+ frontier_phi_5_pred_3 = _86;
+ frontier_phi_5_pred_4 = _98;
+ break;
+ }
+ if (!((_13 & 2u) == 0u))
+ {
+ selector_5 = 0;
+ frontier_phi_5_pred = _13;
+ frontier_phi_5_pred_1 = _13;
+ frontier_phi_5_pred_2 = _13;
+ frontier_phi_5_pred_3 = _82;
+ frontier_phi_5_pred_4 = _94;
+ break;
+ }
+ uint _22 = imageAtomicAdd(_8, int(0u), _13);
+ if (!((_22 & 13u) == 0u))
+ {
+ selector_5 = 2;
+ frontier_phi_5_pred = _22;
+ frontier_phi_5_pred_1 = _13;
+ frontier_phi_5_pred_2 = _13;
+ frontier_phi_5_pred_3 = _89;
+ frontier_phi_5_pred_4 = _101;
+ break;
+ }
+ if (!((_22 & 1u) == 0u))
+ {
+ selector_5 = 1;
+ frontier_phi_5_pred = _22;
+ frontier_phi_5_pred_1 = _22;
+ frontier_phi_5_pred_2 = _13;
+ frontier_phi_5_pred_3 = _85;
+ frontier_phi_5_pred_4 = _97;
+ break;
+ }
+ if (!((_22 & 2u) == 0u))
+ {
+ selector_5 = 0;
+ frontier_phi_5_pred = _22;
+ frontier_phi_5_pred_1 = _22;
+ frontier_phi_5_pred_2 = _22;
+ frontier_phi_5_pred_3 = _81;
+ frontier_phi_5_pred_4 = _93;
+ break;
+ }
+ uint _23 = imageAtomicAdd(_8, int(0u), _22);
+ if (!((_23 & 13u) == 0u))
+ {
+ selector_5 = 2;
+ frontier_phi_5_pred = _23;
+ frontier_phi_5_pred_1 = _22;
+ frontier_phi_5_pred_2 = _22;
+ frontier_phi_5_pred_3 = _88;
+ frontier_phi_5_pred_4 = _100;
+ break;
+ }
+ if (!((_23 & 1u) == 0u))
+ {
+ selector_5 = 1;
+ frontier_phi_5_pred = _23;
+ frontier_phi_5_pred_1 = _23;
+ frontier_phi_5_pred_2 = _22;
+ frontier_phi_5_pred_3 = _84;
+ frontier_phi_5_pred_4 = _96;
+ break;
+ }
+ if (!((_23 & 2u) == 0u))
+ {
+ selector_5 = 0;
+ frontier_phi_5_pred = _23;
+ frontier_phi_5_pred_1 = _23;
+ frontier_phi_5_pred_2 = _23;
+ frontier_phi_5_pred_3 = _80;
+ frontier_phi_5_pred_4 = _92;
+ break;
+ }
+ uint _24 = imageAtomicAdd(_8, int(0u), _23);
+ if (!((_24 & 13u) == 0u))
+ {
+ selector_5 = 2;
+ frontier_phi_5_pred = _24;
+ frontier_phi_5_pred_1 = _23;
+ frontier_phi_5_pred_2 = _23;
+ frontier_phi_5_pred_3 = _87;
+ frontier_phi_5_pred_4 = _99;
+ break;
+ }
+ if (!((_24 & 1u) == 0u))
+ {
+ selector_5 = 1;
+ frontier_phi_5_pred = _24;
+ frontier_phi_5_pred_1 = _24;
+ frontier_phi_5_pred_2 = _23;
+ frontier_phi_5_pred_3 = _83;
+ frontier_phi_5_pred_4 = _95;
+ break;
+ }
+ if ((_24 & 2u) == 0u)
+ {
+ selector_5 = -1;
+ frontier_phi_5_pred = _24;
+ frontier_phi_5_pred_1 = _24;
+ frontier_phi_5_pred_2 = _24;
+ frontier_phi_5_pred_3 = 0u;
+ frontier_phi_5_pred_4 = _24;
+ break;
+ }
+ selector_5 = 0;
+ frontier_phi_5_pred = _24;
+ frontier_phi_5_pred_1 = _24;
+ frontier_phi_5_pred_2 = _24;
+ frontier_phi_5_pred_3 = 0u;
+ frontier_phi_5_pred_4 = _24;
+ break;
+ }
+ else
+ {
+ selector_5 = 2;
+ frontier_phi_5_pred = _13;
+ frontier_phi_5_pred_1 = _75;
+ frontier_phi_5_pred_2 = _78;
+ frontier_phi_5_pred_3 = _90;
+ frontier_phi_5_pred_4 = _102;
+ break;
+ }
+ }
+ uint _21 = frontier_phi_5_pred;
+ uint _30 = frontier_phi_5_pred_1;
+ uint _46 = frontier_phi_5_pred_2;
+ uint _34;
+ uint _36;
+ switch (selector_5)
+ {
+ case 0:
+ {
+ uint _37 = imageAtomicOr(_8, int(0u), _46);
+ _34 = 3u;
+ _36 = _37;
+ break;
+ }
+ case 1:
+ {
+ if ((_30 & 4u) == 0u)
+ {
+ _34 = 2u;
+ _36 = _30;
+ break;
+ }
+ uint _38 = imageAtomicOr(_8, int(0u), _30);
+ _34 = 2u;
+ _36 = _38;
+ break;
+ }
+ case 2:
+ {
+ uint _26 = imageAtomicAdd(_8, int(0u), _21);
+ _34 = 1u;
+ _36 = _26;
+ break;
+ }
+ default:
+ {
+ _34 = frontier_phi_5_pred_3;
+ _36 = frontier_phi_5_pred_4;
+ break;
+ }
+ }
+ uint _40 = imageAtomicAdd(_8, int(0u), _36);
+ uint _42 = imageAtomicAdd(_8, int(0u), _34);
+}
+
+
+#if 0
+// SPIR-V disassembly
+; SPIR-V
+; Version: 1.3
+; Generator: Unknown(30017); 21022
+; Bound: 124
+; Schema: 0
+OpCapability Shader
+OpCapability ImageBuffer
+OpMemoryModel Logical GLSL450
+OpEntryPoint GLCompute %3 "main"
+OpExecutionMode %3 LocalSize 1 1 1
+OpName %3 "main"
+OpName %67 "selector_5"
+OpName %73 "frontier_phi_5.pred"
+OpName %74 "frontier_phi_5.pred"
+OpName %76 "frontier_phi_5.pred"
+OpName %79 "frontier_phi_5.pred"
+OpName %91 "frontier_phi_5.pred"
+OpDecorate %8 DescriptorSet 0
+OpDecorate %8 Binding 0
+%1 = OpTypeVoid
+%2 = OpTypeFunction %1
+%5 = OpTypeInt 32 0
+%6 = OpTypeImage %5 Buffer 0 0 0 2 R32ui
+%7 = OpTypePointer UniformConstant %6
+%8 = OpVariable %7 UniformConstant
+%10 = OpConstant %5 0
+%11 = OpTypePointer Image %5
+%14 = OpConstant %5 1
+%16 = OpConstant %5 13
+%17 = OpTypeBool
+%28 = OpConstant %5 2
+%32 = OpConstant %5 4
+%35 = OpConstant %5 3
+%68 = OpTypeInt 32 1
+%69 = OpConstant %68 -1
+%70 = OpConstant %68 0
+%71 = OpConstant %68 1
+%72 = OpConstant %68 2
+%3 = OpFunction %1 None %2
+%4 = OpLabel
+%75 = OpUndef %5
+%77 = OpUndef %5
+%78 = OpUndef %5
+%80 = OpUndef %5
+%81 = OpUndef %5
+%82 = OpUndef %5
+%83 = OpUndef %5
+%84 = OpUndef %5
+%85 = OpUndef %5
+%86 = OpUndef %5
+%87 = OpUndef %5
+%88 = OpUndef %5
+%89 = OpUndef %5
+%90 = OpUndef %5
+%92 = OpUndef %5
+%93 = OpUndef %5
+%94 = OpUndef %5
+%95 = OpUndef %5
+%96 = OpUndef %5
+%97 = OpUndef %5
+%98 = OpUndef %5
+%99 = OpUndef %5
+%100 = OpUndef %5
+%101 = OpUndef %5
+%102 = OpUndef %5
+OpBranch %103
+%103 = OpLabel
+%9 = OpLoad %6 %8
+%12 = OpImageTexelPointer %11 %8 %10 %10
+%13 = OpAtomicIAdd %5 %12 %14 %10 %14
+%15 = OpBitwiseAnd %5 %13 %16
+%18 = OpIEqual %17 %15 %10
+OpLoopMerge %116 %122 None
+OpBranchConditional %18 %104 %116
+%104 = OpLabel
+%19 = OpBitwiseAnd %5 %13 %14
+%20 = OpIEqual %17 %19 %10
+OpSelectionMerge %105 None
+OpBranchConditional %20 %105 %116
+%105 = OpLabel
+%27 = OpBitwiseAnd %5 %13 %28
+%29 = OpIEqual %17 %27 %10
+OpSelectionMerge %106 None
+OpBranchConditional %29 %106 %116
+%106 = OpLabel
+%43 = OpImageTexelPointer %11 %8 %10 %10
+%22 = OpAtomicIAdd %5 %43 %14 %10 %13
+%44 = OpBitwiseAnd %5 %22 %16
+%45 = OpIEqual %17 %44 %10
+OpSelectionMerge %107 None
+OpBranchConditional %45 %107 %116
+%107 = OpLabel
+%49 = OpBitwiseAnd %5 %22 %14
+%50 = OpIEqual %17 %49 %10
+OpSelectionMerge %108 None
+OpBranchConditional %50 %108 %116
+%108 = OpLabel
+%51 = OpBitwiseAnd %5 %22 %28
+%52 = OpIEqual %17 %51 %10
+OpSelectionMerge %109 None
+OpBranchConditional %52 %109 %116
+%109 = OpLabel
+%53 = OpImageTexelPointer %11 %8 %10 %10
+%23 = OpAtomicIAdd %5 %53 %14 %10 %22
+%54 = OpBitwiseAnd %5 %23 %16
+%55 = OpIEqual %17 %54 %10
+OpSelectionMerge %110 None
+OpBranchConditional %55 %110 %116
+%110 = OpLabel
+%56 = OpBitwiseAnd %5 %23 %14
+%57 = OpIEqual %17 %56 %10
+OpSelectionMerge %111 None
+OpBranchConditional %57 %111 %116
+%111 = OpLabel
+%58 = OpBitwiseAnd %5 %23 %28
+%59 = OpIEqual %17 %58 %10
+OpSelectionMerge %112 None
+OpBranchConditional %59 %112 %116
+%112 = OpLabel
+%60 = OpImageTexelPointer %11 %8 %10 %10
+%24 = OpAtomicIAdd %5 %60 %14 %10 %23
+%61 = OpBitwiseAnd %5 %24 %16
+%62 = OpIEqual %17 %61 %10
+OpSelectionMerge %113 None
+OpBranchConditional %62 %113 %116
+%113 = OpLabel
+%63 = OpBitwiseAnd %5 %24 %14
+%64 = OpIEqual %17 %63 %10
+OpSelectionMerge %114 None
+OpBranchConditional %64 %114 %116
+%114 = OpLabel
+%65 = OpBitwiseAnd %5 %24 %28
+%66 = OpIEqual %17 %65 %10
+OpSelectionMerge %115 None
+OpBranchConditional %66 %116 %115
+%115 = OpLabel
+OpBranch %116
+%122 = OpLabel
+OpBranch %103
+%116 = OpLabel
+%67 = OpPhi %68 %69 %114 %70 %115 %70 %111 %70 %108 %70 %105 %71 %113 %71 %110 %71 %107 %71 %104 %72 %112 %72 %109 %72 %106 %72 %103
+%73 = OpPhi %5 %24 %114 %24 %115 %23 %111 %22 %108 %13 %105 %24 %113 %23 %110 %22 %107 %13 %104 %24 %112 %23 %109 %22 %106 %13 %103
+%74 = OpPhi %5 %24 %114 %24 %115 %23 %111 %22 %108 %13 %105 %24 %113 %23 %110 %22 %107 %13 %104 %23 %112 %22 %109 %13 %106 %75 %103
+%76 = OpPhi %5 %24 %114 %24 %115 %23 %111 %22 %108 %13 %105 %23 %113 %22 %110 %13 %107 %77 %104 %23 %112 %22 %109 %13 %106 %78 %103
+%79 = OpPhi %5 %10 %114 %10 %115 %80 %111 %81 %108 %82 %105 %83 %113 %84 %110 %85 %107 %86 %104 %87 %112 %88 %109 %89 %106 %90 %103
+%91 = OpPhi %5 %24 %114 %24 %115 %92 %111 %93 %108 %94 %105 %95 %113 %96 %110 %97 %107 %98 %104 %99 %112 %100 %109 %101 %106 %102 %103
+%21 = OpCopyObject %5 %73
+%30 = OpCopyObject %5 %74
+%46 = OpCopyObject %5 %76
+OpSelectionMerge %121 None
+OpSwitch %67 %121 0 %120 1 %118 2 %117
+%120 = OpLabel
+%47 = OpImageTexelPointer %11 %8 %10 %10
+%37 = OpAtomicOr %5 %47 %14 %10 %46
+OpBranch %121
+%118 = OpLabel
+%31 = OpBitwiseAnd %5 %30 %32
+%33 = OpIEqual %17 %31 %10
+OpSelectionMerge %119 None
+OpBranchConditional %33 %121 %119
+%119 = OpLabel
+%48 = OpImageTexelPointer %11 %8 %10 %10
+%38 = OpAtomicOr %5 %48 %14 %10 %30
+OpBranch %121
+%117 = OpLabel
+%25 = OpImageTexelPointer %11 %8 %10 %10
+%26 = OpAtomicIAdd %5 %25 %14 %10 %21
+OpBranch %121
+%121 = OpLabel
+%34 = OpPhi %5 %79 %116 %35 %120 %28 %118 %28 %119 %14 %117
+%36 = OpPhi %5 %91 %116 %37 %120 %30 %118 %38 %119 %26 %117
+%39 = OpImageTexelPointer %11 %8 %10 %10
+%40 = OpAtomicIAdd %5 %39 %14 %10 %36
+%41 = OpImageTexelPointer %11 %8 %10 %10
+%42 = OpAtomicIAdd %5 %41 %14 %10 %34
+OpReturn
+OpFunctionEnd
+#endif
diff --git a/shaders/control-flow/interleaved-unrolled-loop-breaks.comp b/shaders/control-flow/interleaved-unrolled-loop-breaks.comp
new file mode 100644
index 0000000..ed0ca79
--- /dev/null
+++ b/shaders/control-flow/interleaved-unrolled-loop-breaks.comp
@@ -0,0 +1,44 @@
+RWStructuredBuffer<uint> RW : register(u0);
+
+[numthreads(1, 1, 1)]
+void main(uint id : SV_DispatchThreadID)
+{
+ uint v;
+ uint w = 1;
+ uint dummy = 0;
+
+ [unroll]
+ for (int i = 0; i < 4; i++)
+ {
+ InterlockedAdd(RW[0], w, v); w = v;
+
+ [branch]
+ if (w & 13)
+ {
+ InterlockedAdd(RW[0], w, v); w = v;
+ dummy = 1;
+ break;
+ }
+
+ [branch]
+ if (w & 1)
+ {
+ [branch]
+ if (w & 4)
+ InterlockedOr(RW[0], w, v); w = v;
+ dummy = 2;
+ break;
+ }
+
+ [branch]
+ if (w & 2)
+ {
+ InterlockedOr(RW[0], w, v); w = v;
+ dummy = 3;
+ break;
+ }
+ }
+
+ InterlockedAdd(RW[0], w, v); w = v;
+ InterlockedAdd(RW[0], dummy, v);
+}