Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/HansKristian-Work/vkd3d-proton.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Hans-Kristian Arntzen <post@arntzen-software.no>, 2021-11-26 15:51:51 +0300
committer: Hans-Kristian Arntzen <post@arntzen-software.no>, 2022-06-23 15:39:48 +0300
commit: b37ce4b35d09cc8fc2b249e0fc07abc8307dc86e (patch)
tree: 409a8285feacc81416072c67ea4a2153e9c15289
parent: 98bc3bd52ab48a8128aa01304ac39c1603300f47 (diff)
vkd3d: Optimize ExecuteIndirect() if no INDIRECT transitions happened. [ref: execute-indirect-phase-3]
The D3D12 docs outline this as an implementation detail explicitly, so we should do the same thing.

Signed-off-by: Hans-Kristian Arntzen <post@arntzen-software.no>
-rw-r--r--  libs/vkd3d/command.c        85
-rw-r--r--  libs/vkd3d/vkd3d_private.h   6
2 files changed, 65 insertions, 26 deletions
diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c
index 5a2fc054..e79d922a 100644
--- a/libs/vkd3d/command.c
+++ b/libs/vkd3d/command.c
@@ -4077,6 +4077,7 @@ static HRESULT d3d12_command_list_batch_reset_query_pools(struct d3d12_command_l
static HRESULT d3d12_command_list_build_init_commands(struct d3d12_command_list *list)
{
const struct vkd3d_vk_device_procs *vk_procs = &list->device->vk_procs;
+ VkMemoryBarrier barrier;
VkResult vr;
HRESULT hr;
@@ -4086,6 +4087,18 @@ static HRESULT d3d12_command_list_build_init_commands(struct d3d12_command_list
if (!list->vk_init_commands)
return S_OK;
+ if (list->execute_indirect.has_emitted_indirect_to_compute_barrier)
+ {
+ /* We've patched an indirect command stream here, so do the final barrier now. */
+ barrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
+ barrier.pNext = NULL;
+ barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
+ barrier.dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
+ VK_CALL(vkCmdPipelineBarrier(list->vk_init_commands, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+ VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
+ 0, 1, &barrier, 0, NULL, 0, NULL));
+ }
+
if ((vr = VK_CALL(vkEndCommandBuffer(list->vk_init_commands))) < 0)
{
WARN("Failed to end command buffer, vr %d.\n", vr);
@@ -4389,6 +4402,8 @@ static void d3d12_command_list_reset_internal_state(struct d3d12_command_list *l
list->tracked_copy_buffer_count = 0;
list->rendering_info.state_flags = 0;
+ list->execute_indirect.has_emitted_indirect_to_compute_barrier = false;
+ list->execute_indirect.has_observed_transition_to_indirect = false;
}
static void d3d12_command_list_reset_state(struct d3d12_command_list *list,
@@ -7340,6 +7355,13 @@ static void STDMETHODCALLTYPE d3d12_command_list_ResourceBarrier(d3d12_command_l
VkImageLayout new_layout = VK_IMAGE_LAYOUT_UNDEFINED;
uint32_t dsv_decay_mask = 0;
+ /* If we have not observed any transition to INDIRECT_ARGUMENT it means
+ * that in this command buffer there couldn't legally have been writes to an indirect
+ * command buffer. The docs mention an implementation strategy where we can do this optimization.
+ * This is very handy when handling back-to-back ExecuteIndirects(). */
+ if (transition->StateAfter == D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT)
+ list->execute_indirect.has_observed_transition_to_indirect = true;
+
if (!is_valid_resource_state(transition->StateBefore))
{
d3d12_command_list_mark_as_invalid(list,
@@ -9630,6 +9652,7 @@ static void d3d12_command_list_execute_indirect_state_template(
struct vkd3d_scratch_allocation count_allocation;
struct vkd3d_execute_indirect_args patch_args;
VkGeneratedCommandsInfoNV generated;
+ VkCommandBuffer vk_patch_cmd_buffer;
VkIndirectCommandsStreamNV stream;
VkDeviceSize preprocess_size;
VkPipeline current_pipeline;
@@ -9648,16 +9671,6 @@ static void d3d12_command_list_execute_indirect_state_template(
return;
current_pipeline = list->current_pipeline;
- /* FIXME: If we're forced to emit non-dynamic vertex strides, and the indirect state
- * wants to emit dynamic VBOs (dynamic stride), can that possibly work? Extremely unlikely to
- * actually happen in practice, but something to consider for later ... */
-
- /* TODO: If we can prove that there have been no transitions to INDIRECT state,
- * we can hoist all patch jobs to the beginning of the command buffer and build a fixup
- * command buffer that batches everything. For now, take the slow path always. */
- d3d12_command_list_end_current_render_pass(list, true);
- d3d12_command_list_invalidate_current_pipeline(list, true);
-
memset(&patch_args, 0, sizeof(patch_args));
if (FAILED(hr = d3d12_command_signature_allocate_preprocess_memory_for_list(
list, signature, current_pipeline,
@@ -9695,31 +9708,51 @@ static void d3d12_command_list_execute_indirect_state_template(
patch_args.api_buffer_word_stride = signature->desc.ByteStride / sizeof(uint32_t);
patch_args.device_generated_commands_word_stride = signature->state_template.stride / sizeof(uint32_t);
- VK_CALL(vkCmdPushConstants(list->vk_command_buffer, signature->state_template.pipeline.vk_pipeline_layout,
- VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(patch_args), &patch_args));
- VK_CALL(vkCmdBindPipeline(list->vk_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
- signature->state_template.pipeline.vk_pipeline));
-
- /* TODO: We can batch the {prologue barrier} { work } { work } ... {epilogue barrier} later. */
- /* The argument buffer and indirect count buffers are in indirect state, but we'll need to read it. */
barrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
barrier.pNext = NULL;
-
barrier.srcAccessMask = 0;
barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
- VK_CALL(vkCmdPipelineBarrier(list->vk_command_buffer, VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
- VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &barrier, 0, NULL, 0, NULL));
+
+ if (!list->execute_indirect.has_observed_transition_to_indirect)
+ {
+ /* Fast path, throw the template resolve to the init command buffer. */
+ d3d12_command_allocator_allocate_init_command_buffer(list->allocator, list);
+ vk_patch_cmd_buffer = list->vk_init_commands;
+ if (!list->execute_indirect.has_emitted_indirect_to_compute_barrier)
+ {
+ VK_CALL(vkCmdPipelineBarrier(vk_patch_cmd_buffer, VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
+ VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &barrier, 0, NULL, 0, NULL));
+ list->execute_indirect.has_emitted_indirect_to_compute_barrier = true;
+ }
+ }
+ else
+ {
+ vk_patch_cmd_buffer = list->vk_command_buffer;
+ d3d12_command_list_end_current_render_pass(list, true);
+ VK_CALL(vkCmdPipelineBarrier(vk_patch_cmd_buffer, VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
+ VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &barrier, 0, NULL, 0, NULL));
+ d3d12_command_list_invalidate_current_pipeline(list, true);
+ }
+
+ VK_CALL(vkCmdPushConstants(vk_patch_cmd_buffer, signature->state_template.pipeline.vk_pipeline_layout,
+ VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(patch_args), &patch_args));
+ VK_CALL(vkCmdBindPipeline(vk_patch_cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
+ signature->state_template.pipeline.vk_pipeline));
/* One workgroup processes the patching for one draw. We could potentially use indirect dispatch
* to restrict the patching work to just the indirect count, but meh, just more barriers.
* We'll nop out the workgroup early based on direct count, and the number of threads should be trivial either way. */
- VK_CALL(vkCmdDispatch(list->vk_command_buffer, max_command_count, 1, 1));
+ VK_CALL(vkCmdDispatch(vk_patch_cmd_buffer, max_command_count, 1, 1));
- barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
- barrier.dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
- VK_CALL(vkCmdPipelineBarrier(list->vk_command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
- VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
- 0, 1, &barrier, 0, NULL, 0, NULL));
+ if (vk_patch_cmd_buffer == list->vk_command_buffer)
+ {
+ barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
+ barrier.dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
+ VK_CALL(vkCmdPipelineBarrier(vk_patch_cmd_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+ VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
+ 0, 1, &barrier, 0, NULL, 0, NULL));
+ /* The barrier is deferred if we moved the dispatch to init command buffer. */
+ }
if (!d3d12_command_list_begin_render_pass(list))
{
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h
index adc0e7b2..d1618179 100644
--- a/libs/vkd3d/vkd3d_private.h
+++ b/libs/vkd3d/vkd3d_private.h
@@ -2127,6 +2127,12 @@ struct d3d12_command_list
bool is_dirty;
} index_buffer;
+ struct
+ {
+ bool has_observed_transition_to_indirect;
+ bool has_emitted_indirect_to_compute_barrier;
+ } execute_indirect;
+
VkCommandBuffer vk_command_buffer;
VkCommandBuffer vk_init_commands;