vkd3d: Optimize ExecuteIndirect() if no INDIRECT transitions happened.
[execute-indirect-phase-3]

The D3D12 docs outline this as an implementation detail explicitly, so we should do the same thing.

Signed-off-by: Hans-Kristian Arntzen <post@arntzen-software.no>
author: Hans-Kristian Arntzen <post@arntzen-software.no> 2021-11-26 15:51:51 +0300
committer: Hans-Kristian Arntzen <post@arntzen-software.no> 2022-06-23 15:39:48 +0300
commit: b37ce4b35d09cc8fc2b249e0fc07abc8307dc86e (patch)
tree: 409a8285feacc81416072c67ea4a2153e9c15289
parent: 98bc3bd52ab48a8128aa01304ac39c1603300f47 (diff)
2 files changed, 65 insertions, 26 deletions
diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c
index 5a2fc054..e79d922a 100644
--- a/libs/vkd3d/command.c
+++ b/libs/vkd3d/command.c
@@ -4077,6 +4077,7 @@ static HRESULT d3d12_command_list_batch_reset_query_pools(struct d3d12_command_l
 static HRESULT d3d12_command_list_build_init_commands(struct d3d12_command_list *list)
 {
     const struct vkd3d_vk_device_procs *vk_procs = &list->device->vk_procs;
+    VkMemoryBarrier barrier;
     VkResult vr;
     HRESULT hr;
 
@@ -4086,6 +4087,18 @@ static HRESULT d3d12_command_list_build_init_commands(struct d3d12_command_list
     if (!list->vk_init_commands)
         return S_OK;
 
+    if (list->execute_indirect.has_emitted_indirect_to_compute_barrier)
+    {
+        /* We've patched an indirect command stream here, so do the final barrier now. */
+        barrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
+        barrier.pNext = NULL;
+        barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
+        barrier.dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
+        VK_CALL(vkCmdPipelineBarrier(list->vk_init_commands, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+                VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
+                0, 1, &barrier, 0, NULL, 0, NULL));
+    }
+
     if ((vr = VK_CALL(vkEndCommandBuffer(list->vk_init_commands))) < 0)
     {
         WARN("Failed to end command buffer, vr %d.\n", vr);
@@ -4389,6 +4402,8 @@ static void d3d12_command_list_reset_internal_state(struct d3d12_command_list *l
     list->tracked_copy_buffer_count = 0;
 
     list->rendering_info.state_flags = 0;
+    list->execute_indirect.has_emitted_indirect_to_compute_barrier = false;
+    list->execute_indirect.has_observed_transition_to_indirect = false;
 }
 
 static void d3d12_command_list_reset_state(struct d3d12_command_list *list,
@@ -7340,6 +7355,13 @@ static void STDMETHODCALLTYPE d3d12_command_list_ResourceBarrier(d3d12_command_l
                 VkImageLayout new_layout = VK_IMAGE_LAYOUT_UNDEFINED;
                 uint32_t dsv_decay_mask = 0;
 
+                /* If we have not observed any transition to INDIRECT_ARGUMENT it means
+                 * that in this command buffer there couldn't legally have been writes to an indirect
+                 * command buffer. The docs mention an implementation strategy where we can do this optimization.
+                 * This is very handy when handling back-to-back ExecuteIndirects(). */
+                if (transition->StateAfter == D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT)
+                    list->execute_indirect.has_observed_transition_to_indirect = true;
+
                 if (!is_valid_resource_state(transition->StateBefore))
                 {
                     d3d12_command_list_mark_as_invalid(list,
@@ -9630,6 +9652,7 @@ static void d3d12_command_list_execute_indirect_state_template(
     struct vkd3d_scratch_allocation count_allocation;
     struct vkd3d_execute_indirect_args patch_args;
     VkGeneratedCommandsInfoNV generated;
+    VkCommandBuffer vk_patch_cmd_buffer;
     VkIndirectCommandsStreamNV stream;
     VkDeviceSize preprocess_size;
     VkPipeline current_pipeline;
@@ -9648,16 +9671,6 @@ static void d3d12_command_list_execute_indirect_state_template(
         return;
     current_pipeline = list->current_pipeline;
 
-    /* FIXME: If we're forced to emit non-dynamic vertex strides, and the indirect state
-     * wants to emit dynamic VBOs (dynamic stride), can that possibly work? Extremely unlikely to
-     * actually happen in practice, but something to consider for later ... */
-
-    /* TODO: If we can prove that there have been no transitions to INDIRECT state,
-     * we can hoist all patch jobs to the beginning of the command buffer and build a fixup
-     * command buffer that batches everything. For now, take the slow path always. */
-    d3d12_command_list_end_current_render_pass(list, true);
-    d3d12_command_list_invalidate_current_pipeline(list, true);
-
     memset(&patch_args, 0, sizeof(patch_args));
     if (FAILED(hr = d3d12_command_signature_allocate_preprocess_memory_for_list(
             list, signature, current_pipeline,
@@ -9695,31 +9708,51 @@ static void d3d12_command_list_execute_indirect_state_template(
     patch_args.api_buffer_word_stride = signature->desc.ByteStride / sizeof(uint32_t);
     patch_args.device_generated_commands_word_stride = signature->state_template.stride / sizeof(uint32_t);
 
-    VK_CALL(vkCmdPushConstants(list->vk_command_buffer, signature->state_template.pipeline.vk_pipeline_layout,
-            VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(patch_args), &patch_args));
-    VK_CALL(vkCmdBindPipeline(list->vk_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
-            signature->state_template.pipeline.vk_pipeline));
-
-    /* TODO: We can batch the {prologue barrier} { work } { work } ... {epilogue barrier} later. */
-    /* The argument buffer and indirect count buffers are in indirect state, but we'll need to read it. */
     barrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
     barrier.pNext = NULL;
-
     barrier.srcAccessMask = 0;
     barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
-    VK_CALL(vkCmdPipelineBarrier(list->vk_command_buffer, VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
-            VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &barrier, 0, NULL, 0, NULL));
+
+    if (!list->execute_indirect.has_observed_transition_to_indirect)
+    {
+        /* Fast path, throw the template resolve to the init command buffer. */
+        d3d12_command_allocator_allocate_init_command_buffer(list->allocator, list);
+        vk_patch_cmd_buffer = list->vk_init_commands;
+        if (!list->execute_indirect.has_emitted_indirect_to_compute_barrier)
+        {
+            VK_CALL(vkCmdPipelineBarrier(vk_patch_cmd_buffer, VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
+                    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &barrier, 0, NULL, 0, NULL));
+            list->execute_indirect.has_emitted_indirect_to_compute_barrier = true;
+        }
+    }
+    else
+    {
+        vk_patch_cmd_buffer = list->vk_command_buffer;
+        d3d12_command_list_end_current_render_pass(list, true);
+        VK_CALL(vkCmdPipelineBarrier(vk_patch_cmd_buffer, VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
+                VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &barrier, 0, NULL, 0, NULL));
+        d3d12_command_list_invalidate_current_pipeline(list, true);
+    }
+
+    VK_CALL(vkCmdPushConstants(vk_patch_cmd_buffer, signature->state_template.pipeline.vk_pipeline_layout,
+            VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(patch_args), &patch_args));
+    VK_CALL(vkCmdBindPipeline(vk_patch_cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
+            signature->state_template.pipeline.vk_pipeline));
 
     /* One workgroup processes the patching for one draw. We could potentially use indirect dispatch
      * to restrict the patching work to just the indirect count, but meh, just more barriers.
      * We'll nop out the workgroup early based on direct count, and the number of threads should be trivial either way. */
-    VK_CALL(vkCmdDispatch(list->vk_command_buffer, max_command_count, 1, 1));
+    VK_CALL(vkCmdDispatch(vk_patch_cmd_buffer, max_command_count, 1, 1));
 
-    barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
-    barrier.dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
-    VK_CALL(vkCmdPipelineBarrier(list->vk_command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-            VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
-            0, 1, &barrier, 0, NULL, 0, NULL));
+    if (vk_patch_cmd_buffer == list->vk_command_buffer)
+    {
+        barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
+        barrier.dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
+        VK_CALL(vkCmdPipelineBarrier(vk_patch_cmd_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+                VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
+                0, 1, &barrier, 0, NULL, 0, NULL));
+        /* The barrier is deferred if we moved the dispatch to init command buffer. */
+    }
 
     if (!d3d12_command_list_begin_render_pass(list))
     {
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h
index adc0e7b2..d1618179 100644
--- a/libs/vkd3d/vkd3d_private.h
+++ b/libs/vkd3d/vkd3d_private.h
@@ -2127,6 +2127,12 @@ struct d3d12_command_list
         bool is_dirty;
     } index_buffer;
 
+    struct
+    {
+        bool has_observed_transition_to_indirect;
+        bool has_emitted_indirect_to_compute_barrier;
+    } execute_indirect;
+
     VkCommandBuffer vk_command_buffer;
     VkCommandBuffer vk_init_commands;
author	Hans-Kristian Arntzen <post@arntzen-software.no>	2021-11-26 15:51:51 +0300
committer	Hans-Kristian Arntzen <post@arntzen-software.no>	2022-06-23 15:39:48 +0300
commit	b37ce4b35d09cc8fc2b249e0fc07abc8307dc86e (patch)
tree	409a8285feacc81416072c67ea4a2153e9c15289
parent	98bc3bd52ab48a8128aa01304ac39c1603300f47 (diff)