From 43a461a9a77515ab95b383ceb3a3f03ee2fcfc52 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Thu, 22 Sep 2022 16:08:40 +0200 Subject: vkd3d: Require VK_KHR_buffer_device_address. The old path is completely untested at this point, and I found a bug with root descriptors since we don't do the proper deref to obtain proper VA anymore, so that would have crashed GPUs hard ... Removes a lot of old jank code. Signed-off-by: Hans-Kristian Arntzen --- README.md | 2 +- libs/vkd3d/command.c | 79 ++++++---------------------- libs/vkd3d/debug_ring.c | 6 --- libs/vkd3d/device.c | 6 +++ libs/vkd3d/memory.c | 26 +++------- libs/vkd3d/resource.c | 14 +---- libs/vkd3d/state.c | 21 ++++---- libs/vkd3d/va_map.c | 127 --------------------------------------------- libs/vkd3d/vkd3d_private.h | 14 ----- 9 files changed, 41 insertions(+), 254 deletions(-) diff --git a/README.md b/README.md index af3444f5..f23ddcf5 100644 --- a/README.md +++ b/README.md @@ -30,11 +30,11 @@ There are some hard requirements on drivers to be able to implement D3D12 in a r - `VK_KHR_dynamic_rendering` - `VK_EXT_extended_dynamic_state` - `VK_EXT_extended_dynamic_state2` +- `VK_KHR_buffer_device_address` Some notable extensions that **should** be supported for optimal or correct behavior. These extensions will likely become mandatory later. -- `VK_KHR_buffer_device_address` - `VK_EXT_image_view_min_lod` `VK_EXT_mutable_descriptor_type` (or the vendor `VALVE` alias) is also highly recommended, but not mandatory. diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c index afe99ff1..4b7b1356 100644 --- a/libs/vkd3d/command.c +++ b/libs/vkd3d/command.c @@ -4296,16 +4296,8 @@ static void vk_access_and_stage_flags_from_d3d12_resource_state(const struct d3d *access |= VK_ACCESS_INDIRECT_COMMAND_READ_BIT; /* D3D12_RESOURCE_STATE_PREDICATION */ - if (device->device_info.buffer_device_address_features.bufferDeviceAddress) - { - *stages |= VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - *access |= VK_ACCESS_SHADER_READ_BIT; - } - else - { - *stages |= VK_PIPELINE_STAGE_TRANSFER_BIT; - *access |= VK_ACCESS_TRANSFER_READ_BIT; - } + *stages |= VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + *access |= VK_ACCESS_SHADER_READ_BIT; break; case D3D12_RESOURCE_STATE_COPY_DEST: @@ -9940,8 +9932,6 @@ static void STDMETHODCALLTYPE d3d12_command_list_SetPredication(d3d12_command_li VkPipelineStageFlags dst_stages, src_stages; struct vkd3d_scratch_allocation scratch; VkAccessFlags dst_access, src_access; - VkCopyBufferInfo2KHR copy_info; - VkBufferCopy2KHR copy_region; VkMemoryBarrier vk_barrier; TRACE("iface %p, buffer %p, aligned_buffer_offset %#"PRIx64", operation %#x.\n", @@ -9952,13 +9942,6 @@ static void STDMETHODCALLTYPE d3d12_command_list_SetPredication(d3d12_command_li if (resource && (aligned_buffer_offset & 0x7)) return; - if (!list->device->device_info.buffer_device_address_features.bufferDeviceAddress && - !list->device->device_info.conditional_rendering_features.conditionalRendering) - { - FIXME_ONCE("Conditional rendering not supported by device.\n"); - return; - } - if (list->predicate_enabled) VK_CALL(vkCmdEndConditionalRenderingEXT(list->vk_command_buffer)); @@ -9975,52 +9958,24 @@ static void STDMETHODCALLTYPE d3d12_command_list_SetPredication(d3d12_command_li begin_info.offset = scratch.offset; begin_info.flags = 0; - if (list->device->device_info.buffer_device_address_features.bufferDeviceAddress) - { - /* Resolve 64-bit predicate into a 32-bit location so that this works with - * VK_EXT_conditional_rendering. We'll handle the predicate operation here - * so setting VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT is not necessary. */ - d3d12_command_list_invalidate_current_pipeline(list, true); - d3d12_command_list_invalidate_root_parameters(list, &list->compute_bindings, true); - - resolve_args.src_va = d3d12_resource_get_va(resource, aligned_buffer_offset); - resolve_args.dst_va = scratch.va; - resolve_args.invert = operation != D3D12_PREDICATION_OP_EQUAL_ZERO; - - VK_CALL(vkCmdBindPipeline(list->vk_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, - predicate_ops->vk_resolve_pipeline)); - VK_CALL(vkCmdPushConstants(list->vk_command_buffer, predicate_ops->vk_resolve_pipeline_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(resolve_args), &resolve_args)); - VK_CALL(vkCmdDispatch(list->vk_command_buffer, 1, 1, 1)); - - src_stages = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - src_access = VK_ACCESS_SHADER_WRITE_BIT; - } - else - { - FIXME_ONCE("64-bit predicates not supported.\n"); - - copy_region.sType = VK_STRUCTURE_TYPE_BUFFER_COPY_2_KHR; - copy_region.pNext = NULL; - copy_region.srcOffset = resource->mem.offset + aligned_buffer_offset; - copy_region.dstOffset = scratch.offset; - copy_region.size = sizeof(uint32_t); - - copy_info.sType = VK_STRUCTURE_TYPE_COPY_BUFFER_INFO_2_KHR; - copy_info.pNext = NULL; - copy_info.srcBuffer = resource->res.vk_buffer; - copy_info.dstBuffer = scratch.buffer; - copy_info.regionCount = 1; - copy_info.pRegions = ©_region; + /* Resolve 64-bit predicate into a 32-bit location so that this works with + * VK_EXT_conditional_rendering. We'll handle the predicate operation here + * so setting VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT is not necessary. */ + d3d12_command_list_invalidate_current_pipeline(list, true); + d3d12_command_list_invalidate_root_parameters(list, &list->compute_bindings, true); - VK_CALL(vkCmdCopyBuffer2KHR(list->vk_command_buffer, ©_info)); + resolve_args.src_va = d3d12_resource_get_va(resource, aligned_buffer_offset); + resolve_args.dst_va = scratch.va; + resolve_args.invert = operation != D3D12_PREDICATION_OP_EQUAL_ZERO; - src_stages = VK_PIPELINE_STAGE_TRANSFER_BIT; - src_access = VK_ACCESS_TRANSFER_WRITE_BIT; + VK_CALL(vkCmdBindPipeline(list->vk_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, + predicate_ops->vk_resolve_pipeline)); + VK_CALL(vkCmdPushConstants(list->vk_command_buffer, predicate_ops->vk_resolve_pipeline_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(resolve_args), &resolve_args)); + VK_CALL(vkCmdDispatch(list->vk_command_buffer, 1, 1, 1)); - if (operation != D3D12_PREDICATION_OP_EQUAL_ZERO) - begin_info.flags = VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT; - } + src_stages = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + src_access = VK_ACCESS_SHADER_WRITE_BIT; if (list->device->device_info.conditional_rendering_features.conditionalRendering) { diff --git a/libs/vkd3d/debug_ring.c b/libs/vkd3d/debug_ring.c index 2d67ee88..26d8db25 100644 --- a/libs/vkd3d/debug_ring.c +++ b/libs/vkd3d/debug_ring.c @@ -373,12 +373,6 @@ HRESULT vkd3d_shader_debug_ring_init(struct vkd3d_shader_debug_ring *ring, INFO("Enabling shader debug ring of size: %zu.\n", ring->ring_size); - if (!device->device_info.buffer_device_address_features.bufferDeviceAddress) - { - ERR("Buffer device address must be supported to use VKD3D_SHADER_DEBUG_RING feature.\n"); - return E_INVALIDARG; - } - memset(&heap_properties, 0, sizeof(heap_properties)); heap_properties.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_WRITE_BACK; heap_properties.Type = D3D12_HEAP_TYPE_CUSTOM; diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c index edc8bb88..b5baaebf 100644 --- a/libs/vkd3d/device.c +++ b/libs/vkd3d/device.c @@ -2181,6 +2181,12 @@ static HRESULT vkd3d_init_device_caps(struct d3d12_device *device, return E_INVALIDARG; } + if (!physical_device_info->buffer_device_address_features.bufferDeviceAddress) + { + ERR("Buffer device address is not supported by this implementation. This is required for correct operation.\n"); + return E_INVALIDARG; + } + return S_OK; } diff --git a/libs/vkd3d/memory.c b/libs/vkd3d/memory.c index bbe4026e..39b7ec69 100644 --- a/libs/vkd3d/memory.c +++ b/libs/vkd3d/memory.c @@ -372,12 +372,7 @@ static HRESULT vkd3d_import_host_memory(struct d3d12_device *device, void *host_ static HRESULT vkd3d_allocation_assign_gpu_address(struct vkd3d_memory_allocation *allocation, struct d3d12_device *device, struct vkd3d_memory_allocator *allocator) { - if (device->device_info.buffer_device_address_features.bufferDeviceAddress) - allocation->resource.va = vkd3d_get_buffer_device_address(device, allocation->resource.vk_buffer); - else if (!(allocation->flags & VKD3D_ALLOCATION_FLAG_INTERNAL_SCRATCH)) - allocation->resource.va = vkd3d_va_map_alloc_fake_va(&allocator->va_map, allocation->resource.size); - else - allocation->resource.va = 0xdeadbeef; + allocation->resource.va = vkd3d_get_buffer_device_address(device, allocation->resource.vk_buffer); if (!allocation->resource.va) { @@ -469,14 +464,10 @@ static void vkd3d_memory_allocation_free(const struct vkd3d_memory_allocation *a if (allocation->flags & VKD3D_ALLOCATION_FLAG_ALLOW_WRITE_WATCH) vkd3d_free_write_watch_pointer(allocation->cpu_address); - if ((allocation->flags & VKD3D_ALLOCATION_FLAG_GPU_ADDRESS) && allocation->resource.va) + if ((allocation->flags & VKD3D_ALLOCATION_FLAG_GPU_ADDRESS) && allocation->resource.va && + !(allocation->flags & VKD3D_ALLOCATION_FLAG_INTERNAL_SCRATCH)) { - if (!(allocation->flags & VKD3D_ALLOCATION_FLAG_INTERNAL_SCRATCH)) - { - vkd3d_va_map_remove(&allocator->va_map, &allocation->resource); - if (!device->device_info.buffer_device_address_features.bufferDeviceAddress) - vkd3d_va_map_free_fake_va(&allocator->va_map, allocation->resource.va, allocation->resource.size); - } + vkd3d_va_map_remove(&allocator->va_map, &allocation->resource); } if (allocation->resource.view_map) @@ -560,9 +551,7 @@ static HRESULT vkd3d_memory_allocation_init(struct vkd3d_memory_allocation *allo if (allocation->resource.vk_buffer) { allocation->flags |= VKD3D_ALLOCATION_FLAG_GPU_ADDRESS; - - if (device->device_info.buffer_device_address_features.bufferDeviceAddress) - flags_info.flags |= VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR; + flags_info.flags |= VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR; } allocation->resource.size = info->memory_requirements.size; @@ -1561,10 +1550,7 @@ HRESULT vkd3d_allocate_buffer_memory(struct d3d12_device *device, VkBuffer vk_bu flags_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO; flags_info.pNext = NULL; - flags_info.flags = 0; - - if (device->device_info.buffer_device_address_features.bufferDeviceAddress) - flags_info.flags |= VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR; + flags_info.flags = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR; VK_CALL(vkGetBufferMemoryRequirements(device->vk_device, vk_buffer, &memory_requirements)); diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c index 80c1f400..f9485de6 100644 --- a/libs/vkd3d/resource.c +++ b/libs/vkd3d/resource.c @@ -162,8 +162,7 @@ HRESULT vkd3d_create_buffer(struct d3d12_device *device, buffer_info.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; } - if (device->device_info.buffer_device_address_features.bufferDeviceAddress) - buffer_info.usage |= VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR; + buffer_info.usage |= VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR; if (desc->Flags & D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS) buffer_info.usage |= VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT; @@ -2614,12 +2613,7 @@ static void d3d12_resource_destroy(struct d3d12_resource *resource, struct d3d12 vkd3d_free(resource->sparse.tilings); if (resource->res.va) - { vkd3d_va_map_remove(&device->memory_allocator.va_map, &resource->res); - - if (!device->device_info.buffer_device_address_features.bufferDeviceAddress) - vkd3d_va_map_free_fake_va(&device->memory_allocator.va_map, resource->res.va, resource->res.size); - } } if (d3d12_resource_is_texture(resource)) @@ -3062,11 +3056,7 @@ HRESULT d3d12_resource_create_reserved(struct d3d12_device *device, if (d3d12_resource_is_buffer(object)) { object->res.size = object->desc.Width; - - if (device->device_info.buffer_device_address_features.bufferDeviceAddress) - object->res.va = vkd3d_get_buffer_device_address(device, object->res.vk_buffer); - else - object->res.va = vkd3d_va_map_alloc_fake_va(&device->memory_allocator.va_map, object->res.size); + object->res.va = vkd3d_get_buffer_device_address(device, object->res.vk_buffer); if (!object->res.va) { diff --git a/libs/vkd3d/state.c b/libs/vkd3d/state.c index adbadfe8..fb73e5b2 100644 --- a/libs/vkd3d/state.c +++ b/libs/vkd3d/state.c @@ -5218,22 +5218,19 @@ static uint32_t vkd3d_bindless_state_get_bindless_flags(struct d3d12_device *dev /* Always use a typed offset buffer. Otherwise, we risk ending up with unbounded size on view maps. */ flags |= VKD3D_TYPED_OFFSET_BUFFER; - if (device_info->buffer_device_address_features.bufferDeviceAddress && (flags & VKD3D_BINDLESS_UAV)) + if (flags & VKD3D_BINDLESS_UAV) flags |= VKD3D_RAW_VA_AUX_BUFFER; /* We must use root SRV and UAV due to alignment requirements for 16-bit storage, * but root CBV is more lax. */ - if (device_info->buffer_device_address_features.bufferDeviceAddress) - { - flags |= VKD3D_RAW_VA_ROOT_DESCRIPTOR_SRV_UAV; - /* CBV's really require push descriptors on NVIDIA to get maximum performance. - * The difference in performance is profound (~15% in some cases). - * On ACO, BDA with NonWritable can be promoted directly to scalar loads, - * which is great. */ - if ((vkd3d_config_flags & VKD3D_CONFIG_FLAG_FORCE_RAW_VA_CBV) || - device_info->properties2.properties.vendorID != VKD3D_VENDOR_ID_NVIDIA) - flags |= VKD3D_RAW_VA_ROOT_DESCRIPTOR_CBV; - } + flags |= VKD3D_RAW_VA_ROOT_DESCRIPTOR_SRV_UAV; + /* CBV's really require push descriptors on NVIDIA to get maximum performance. + * The difference in performance is profound (~15% in some cases). + * On ACO, BDA with NonWritable can be promoted directly to scalar loads, + * which is great. */ + if ((vkd3d_config_flags & VKD3D_CONFIG_FLAG_FORCE_RAW_VA_CBV) || + device_info->properties2.properties.vendorID != VKD3D_VENDOR_ID_NVIDIA) + flags |= VKD3D_RAW_VA_ROOT_DESCRIPTOR_CBV; if (device_info->properties2.properties.vendorID == VKD3D_VENDOR_ID_NVIDIA && !(flags & VKD3D_RAW_VA_ROOT_DESCRIPTOR_CBV)) diff --git a/libs/vkd3d/va_map.c b/libs/vkd3d/va_map.c index d0ef1a03..56b99852 100644 --- a/libs/vkd3d/va_map.c +++ b/libs/vkd3d/va_map.c @@ -300,143 +300,16 @@ VkAccelerationStructureKHR vkd3d_va_map_place_acceleration_structure(struct vkd3 return view->vk_acceleration_structure; } -#define VKD3D_FAKE_VA_ALIGNMENT (65536) - -VkDeviceAddress vkd3d_va_map_alloc_fake_va(struct vkd3d_va_map *va_map, VkDeviceSize size) -{ - struct vkd3d_va_allocator *allocator = &va_map->va_allocator; - struct vkd3d_va_range range; - VkDeviceAddress va; - size_t i; - int rc; - - if ((rc = pthread_mutex_lock(&allocator->mutex))) - { - ERR("Failed to lock mutex, rc %d.\n", rc); - return 0; - } - - memset(&range, 0, sizeof(range)); - size = align(size, VKD3D_FAKE_VA_ALIGNMENT); - - /* The free list is ordered in such a way that the largest range - * is always first, so we don't have to iterate over the list */ - if (allocator->free_range_count) - range = allocator->free_ranges[0]; - - if (range.size >= size) - { - va = range.base; - - range.base += size; - range.size -= size; - - for (i = 0; i < allocator->free_range_count - 1; i++) - { - if (allocator->free_ranges[i + 1].size > range.size) - allocator->free_ranges[i] = allocator->free_ranges[i + 1]; - else - break; - } - - if (range.size) - allocator->free_ranges[i] = range; - else - allocator->free_range_count--; - } - else - { - va = allocator->next_va; - allocator->next_va += size; - } - - pthread_mutex_unlock(&allocator->mutex); - return va; -} - -void vkd3d_va_map_free_fake_va(struct vkd3d_va_map *va_map, VkDeviceAddress va, VkDeviceSize size) -{ - struct vkd3d_va_allocator *allocator = &va_map->va_allocator; - size_t range_idx, range_shift, i; - struct vkd3d_va_range new_range; - int rc; - - if ((rc = pthread_mutex_lock(&allocator->mutex))) - { - ERR("Failed to lock mutex, rc %d.\n", rc); - return; - } - - new_range.base = va; - new_range.size = align(size, VKD3D_FAKE_VA_ALIGNMENT); - - range_idx = allocator->free_range_count; - range_shift = 0; - - /* Find and effectively delete any free range adjacent to new_range */ - for (i = 0; i < allocator->free_range_count; i++) - { - const struct vkd3d_va_range *cur_range = &allocator->free_ranges[i]; - - if (range_shift) - allocator->free_ranges[i - range_shift] = *cur_range; - - if (cur_range->base == new_range.base + new_range.size || cur_range->base + cur_range->size == new_range.base) - { - if (range_idx == allocator->free_range_count) - range_idx = i; - else - range_shift++; - - new_range.base = min(new_range.base, cur_range->base); - new_range.size += cur_range->size; - } - } - - if (range_idx == allocator->free_range_count) - { - /* range_idx will be valid and point to the last element afterwards */ - if (!(vkd3d_array_reserve((void **)&allocator->free_ranges, &allocator->free_ranges_size, - allocator->free_range_count + 1, sizeof(*allocator->free_ranges)))) - { - ERR("Failed to add free range.\n"); - pthread_mutex_unlock(&allocator->mutex); - return; - } - - allocator->free_range_count += 1; - } - else - allocator->free_range_count -= range_shift; - - /* Move ranges smaller than our new free range back to keep the list ordered */ - while (range_idx && allocator->free_ranges[range_idx - 1].size < new_range.size) - { - allocator->free_ranges[range_idx] = allocator->free_ranges[range_idx - 1]; - range_idx--; - } - - allocator->free_ranges[range_idx] = new_range; - pthread_mutex_unlock(&allocator->mutex); -} - void vkd3d_va_map_init(struct vkd3d_va_map *va_map) { memset(va_map, 0, sizeof(*va_map)); pthread_mutex_init(&va_map->mutex, NULL); - pthread_mutex_init(&va_map->va_allocator.mutex, NULL); - - /* Make sure we never return 0 as a valid VA */ - va_map->va_allocator.next_va = VKD3D_VA_BLOCK_SIZE; } void vkd3d_va_map_cleanup(struct vkd3d_va_map *va_map) { vkd3d_va_map_cleanup_tree(&va_map->va_tree); - - pthread_mutex_destroy(&va_map->va_allocator.mutex); pthread_mutex_destroy(&va_map->mutex); - vkd3d_free(va_map->va_allocator.free_ranges); vkd3d_free(va_map->small_entries); } diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h index e83b13f8..797ca0a3 100644 --- a/libs/vkd3d/vkd3d_private.h +++ b/libs/vkd3d/vkd3d_private.h @@ -299,21 +299,9 @@ struct vkd3d_va_range VkDeviceSize size; }; -struct vkd3d_va_allocator -{ - pthread_mutex_t mutex; - - struct vkd3d_va_range *free_ranges; - size_t free_ranges_size; - size_t free_range_count; - - VkDeviceAddress next_va; -}; - struct vkd3d_va_map { struct vkd3d_va_tree va_tree; - struct vkd3d_va_allocator va_allocator; pthread_mutex_t mutex; @@ -328,8 +316,6 @@ const struct vkd3d_unique_resource *vkd3d_va_map_deref(struct vkd3d_va_map *va_m VkAccelerationStructureKHR vkd3d_va_map_place_acceleration_structure(struct vkd3d_va_map *va_map, struct d3d12_device *device, VkDeviceAddress va); -VkDeviceAddress vkd3d_va_map_alloc_fake_va(struct vkd3d_va_map *va_map, VkDeviceSize size); -void vkd3d_va_map_free_fake_va(struct vkd3d_va_map *va_map, VkDeviceAddress va, VkDeviceSize size); void vkd3d_va_map_init(struct vkd3d_va_map *va_map); void vkd3d_va_map_cleanup(struct vkd3d_va_map *va_map); -- cgit v1.2.3