diff options
author | Jason Fielder <jason_apple> | 2022-09-01 22:42:47 +0300 |
---|---|---|
committer | Clément Foucault <foucault.clem@gmail.com> | 2022-09-01 22:45:12 +0300 |
commit | 5f4409b02ef7c54089ff1b491e008d4b86c030f4 (patch) | |
tree | 932cb6e7cb1dad3909f70120e694be0e6d3e7c90 /source/blender | |
parent | 9d59734ffd4093dea6d207ad8ee78f783f9b3fd6 (diff) |
Metal: MTLIndexBuf class implementation.
Implementation also contains a number of optimisations and feature enablements specific to the Metal API and Apple Silicon GPUs.
Ref T96261
Reviewed By: fclem
Maniphest Tasks: T96261
Differential Revision: https://developer.blender.org/D15369
Diffstat (limited to 'source/blender')
-rw-r--r-- | source/blender/gpu/CMakeLists.txt | 2 | ||||
-rw-r--r-- | source/blender/gpu/GPU_index_buffer.h | 3 | ||||
-rw-r--r-- | source/blender/gpu/GPU_primitive.h | 74 | ||||
-rw-r--r-- | source/blender/gpu/intern/gpu_index_buffer.cc | 105 | ||||
-rw-r--r-- | source/blender/gpu/intern/gpu_index_buffer_private.hh | 13 | ||||
-rw-r--r-- | source/blender/gpu/metal/mtl_backend.hh | 1 | ||||
-rw-r--r-- | source/blender/gpu/metal/mtl_backend.mm | 4 | ||||
-rw-r--r-- | source/blender/gpu/metal/mtl_context.hh | 1 | ||||
-rw-r--r-- | source/blender/gpu/metal/mtl_index_buffer.hh | 79 | ||||
-rw-r--r-- | source/blender/gpu/metal/mtl_index_buffer.mm | 516 | ||||
-rw-r--r-- | source/blender/gpu/metal/mtl_query.hh | 2 | ||||
-rw-r--r-- | source/blender/gpu/metal/mtl_query.mm | 6 | ||||
-rw-r--r-- | source/blender/gpu/opengl/gl_index_buffer.hh | 4 |
13 files changed, 788 insertions, 22 deletions
diff --git a/source/blender/gpu/CMakeLists.txt b/source/blender/gpu/CMakeLists.txt index c289a21421a..6758b4b8794 100644 --- a/source/blender/gpu/CMakeLists.txt +++ b/source/blender/gpu/CMakeLists.txt @@ -191,6 +191,7 @@ set(METAL_SRC metal/mtl_context.mm metal/mtl_debug.mm metal/mtl_framebuffer.mm + metal/mtl_index_buffer.mm metal/mtl_memory.mm metal/mtl_query.mm metal/mtl_state.mm @@ -204,6 +205,7 @@ set(METAL_SRC metal/mtl_context.hh metal/mtl_debug.hh metal/mtl_framebuffer.hh + metal/mtl_index_buffer.hh metal/mtl_memory.hh metal/mtl_query.hh metal/mtl_state.hh diff --git a/source/blender/gpu/GPU_index_buffer.h b/source/blender/gpu/GPU_index_buffer.h index bbb431cbc15..e6345b1e43b 100644 --- a/source/blender/gpu/GPU_index_buffer.h +++ b/source/blender/gpu/GPU_index_buffer.h @@ -26,6 +26,9 @@ typedef struct GPUIndexBufBuilder { uint index_len; uint index_min; uint index_max; + uint restart_index_value; + bool uses_restart_indices; + GPUPrimType prim_type; uint32_t *data; } GPUIndexBufBuilder; diff --git a/source/blender/gpu/GPU_primitive.h b/source/blender/gpu/GPU_primitive.h index 4860b037bfb..de2feac2607 100644 --- a/source/blender/gpu/GPU_primitive.h +++ b/source/blender/gpu/GPU_primitive.h @@ -9,6 +9,7 @@ #pragma once +#include "BLI_assert.h" #include "GPU_common.h" #ifdef __cplusplus @@ -42,6 +43,79 @@ typedef enum { GPU_PRIM_CLASS_ANY = GPU_PRIM_CLASS_POINT | GPU_PRIM_CLASS_LINE | GPU_PRIM_CLASS_SURFACE, } GPUPrimClass; +inline int gpu_get_prim_count_from_type(uint vertex_len, GPUPrimType prim_type) +{ + /* does vertex_len make sense for this primitive type? 
*/ + if (vertex_len == 0) { + return 0; + } + + switch (prim_type) { + case GPU_PRIM_POINTS: + return vertex_len; + + case GPU_PRIM_LINES: + BLI_assert(vertex_len % 2 == 0); + return vertex_len / 2; + + case GPU_PRIM_LINE_STRIP: + return vertex_len - 1; + + case GPU_PRIM_LINE_LOOP: + return vertex_len; + + case GPU_PRIM_LINES_ADJ: + BLI_assert(vertex_len % 4 == 0); + return vertex_len / 4; + + case GPU_PRIM_LINE_STRIP_ADJ: + return vertex_len - 2; + + case GPU_PRIM_TRIS: + BLI_assert(vertex_len % 3 == 0); + return vertex_len / 3; + + case GPU_PRIM_TRI_STRIP: + BLI_assert(vertex_len >= 3); + return vertex_len - 2; + + case GPU_PRIM_TRI_FAN: + BLI_assert(vertex_len >= 3); + return vertex_len - 2; + + case GPU_PRIM_TRIS_ADJ: + BLI_assert(vertex_len % 6 == 0); + return vertex_len / 6; + + default: + BLI_assert_unreachable(); + return 0; + } +} + +inline bool is_restart_compatible(GPUPrimType type) +{ + switch (type) { + case GPU_PRIM_POINTS: + case GPU_PRIM_LINES: + case GPU_PRIM_TRIS: + case GPU_PRIM_LINES_ADJ: + case GPU_PRIM_TRIS_ADJ: + case GPU_PRIM_NONE: + default: { + return false; + } + case GPU_PRIM_LINE_STRIP: + case GPU_PRIM_LINE_LOOP: + case GPU_PRIM_TRI_STRIP: + case GPU_PRIM_TRI_FAN: + case GPU_PRIM_LINE_STRIP_ADJ: { + return true; + } + } + return false; +} + /** * TODO: Improve error checking by validating that the shader is suited for this primitive type. 
* GPUPrimClass GPU_primtype_class(GPUPrimType); diff --git a/source/blender/gpu/intern/gpu_index_buffer.cc b/source/blender/gpu/intern/gpu_index_buffer.cc index 146461d1dfb..08c31d0d589 100644 --- a/source/blender/gpu/intern/gpu_index_buffer.cc +++ b/source/blender/gpu/intern/gpu_index_buffer.cc @@ -16,6 +16,8 @@ #include "gpu_index_buffer_private.hh" +#include "GPU_platform.h" + #include <cstring> #define KEEP_SINGLE_COPY 1 @@ -40,6 +42,28 @@ void GPU_indexbuf_init_ex(GPUIndexBufBuilder *builder, builder->index_min = UINT32_MAX; builder->index_max = 0; builder->prim_type = prim_type; + +#ifdef __APPLE__ + /* Only encode restart indices for restart-compatible primitive types. + * Resolves out-of-bounds read error on macOS. Using 0-index will ensure + * degenerative primitives when skipping primitives is required and will + * incur no additional performance cost for rendering. */ + if (GPU_type_matches_ex(GPU_DEVICE_ANY, GPU_OS_MAC, GPU_DRIVER_ANY, GPU_BACKEND_METAL)) { + /* We will still use restart-indices for point primtives and then + * patch these during IndexBuf::init, as we cannot benefit from degenerative + * primitives to eliminate these. */ + builder->restart_index_value = (is_restart_compatible(prim_type) || + prim_type == GPU_PRIM_POINTS) ? 
+ RESTART_INDEX : + 0; + } + else { + builder->restart_index_value = RESTART_INDEX; + } +#else + builder->restart_index_value = RESTART_INDEX; +#endif + builder->uses_restart_indices = false; builder->data = (uint *)MEM_callocN(builder->max_index_len * sizeof(uint), "GPUIndexBuf data"); } @@ -94,7 +118,8 @@ void GPU_indexbuf_add_primitive_restart(GPUIndexBufBuilder *builder) assert(builder->data != nullptr); assert(builder->index_len < builder->max_index_len); #endif - builder->data[builder->index_len++] = RESTART_INDEX; + builder->data[builder->index_len++] = builder->restart_index_value; + builder->uses_restart_indices = true; } void GPU_indexbuf_add_point_vert(GPUIndexBufBuilder *builder, uint v) @@ -186,8 +211,9 @@ void GPU_indexbuf_set_point_restart(GPUIndexBufBuilder *builder, uint elem) { BLI_assert(builder->prim_type == GPU_PRIM_POINTS); BLI_assert(elem < builder->max_index_len); - builder->data[elem++] = RESTART_INDEX; + builder->data[elem++] = builder->restart_index_value; builder->index_len = MAX2(builder->index_len, elem); + builder->uses_restart_indices = true; } void GPU_indexbuf_set_line_restart(GPUIndexBufBuilder *builder, uint elem) @@ -195,9 +221,10 @@ void GPU_indexbuf_set_line_restart(GPUIndexBufBuilder *builder, uint elem) BLI_assert(builder->prim_type == GPU_PRIM_LINES); BLI_assert((elem + 1) * 2 <= builder->max_index_len); uint idx = elem * 2; - builder->data[idx++] = RESTART_INDEX; - builder->data[idx++] = RESTART_INDEX; + builder->data[idx++] = builder->restart_index_value; + builder->data[idx++] = builder->restart_index_value; builder->index_len = MAX2(builder->index_len, idx); + builder->uses_restart_indices = true; } void GPU_indexbuf_set_tri_restart(GPUIndexBufBuilder *builder, uint elem) @@ -205,10 +232,11 @@ void GPU_indexbuf_set_tri_restart(GPUIndexBufBuilder *builder, uint elem) BLI_assert(builder->prim_type == GPU_PRIM_TRIS); BLI_assert((elem + 1) * 3 <= builder->max_index_len); uint idx = elem * 3; - builder->data[idx++] = 
RESTART_INDEX; - builder->data[idx++] = RESTART_INDEX; - builder->data[idx++] = RESTART_INDEX; + builder->data[idx++] = builder->restart_index_value; + builder->data[idx++] = builder->restart_index_value; + builder->data[idx++] = builder->restart_index_value; builder->index_len = MAX2(builder->index_len, idx); + builder->uses_restart_indices = true; } /** \} */ @@ -226,7 +254,12 @@ IndexBuf::~IndexBuf() } } -void IndexBuf::init(uint indices_len, uint32_t *indices, uint min_index, uint max_index) +void IndexBuf::init(uint indices_len, + uint32_t *indices, + uint min_index, + uint max_index, + GPUPrimType prim_type, + bool uses_restart_indices) { is_init_ = true; data_ = indices; @@ -234,6 +267,21 @@ void IndexBuf::init(uint indices_len, uint32_t *indices, uint min_index, uint ma index_len_ = indices_len; is_empty_ = min_index > max_index; + /* Patch index buffer to remove restart indices from + * non-restart-compatible primitive types. Restart indices + * are situationally added to selectively hide vertices. + * Metal does not support restart-indices for non-restart-compatible + * types, as such we should remove these indices. + * + * We only need to perform this for point primitives, as + * line primitives/triangle primitives can use index 0 for all + * vertices to create a degenerative primitive, where all + * vertices share the same index and skip rendering via HW + * culling. */ + if (prim_type == GPU_PRIM_POINTS && uses_restart_indices) { + this->strip_restart_indices(); + } + #if GPU_TRACK_INDEX_RANGE /* Everything remains 32 bit while building to keep things simple. * Find min/max after, then convert to smallest index type possible. 
*/ @@ -243,7 +291,18 @@ void IndexBuf::init(uint indices_len, uint32_t *indices, uint min_index, uint ma if (range <= 0xFFFF) { index_type_ = GPU_INDEX_U16; - this->squeeze_indices_short(min_index, max_index); + bool do_clamp_indices = false; +# ifdef __APPLE__ + /* NOTE: For the Metal Backend, we use degenerative primitives to hide vertices + * which are not restart compatible. When this is done, we need to ensure + * that compressed index ranges clamp all index values within the valid + * range, rather than maximally clamping against the USHORT restart index + * value of 0xFFFFu, as this will cause an out-of-bounds read during + * vertex assembly. */ + do_clamp_indices = GPU_type_matches_ex( + GPU_DEVICE_ANY, GPU_OS_MAC, GPU_DRIVER_ANY, GPU_BACKEND_METAL); +# endif + this->squeeze_indices_short(min_index, max_index, prim_type, do_clamp_indices); } #endif } @@ -302,7 +361,10 @@ uint IndexBuf::index_range(uint *r_min, uint *r_max) return max_value - min_value; } -void IndexBuf::squeeze_indices_short(uint min_idx, uint max_idx) +void IndexBuf::squeeze_indices_short(uint min_idx, + uint max_idx, + GPUPrimType prim_type, + bool clamp_indices_in_range) { /* data will never be *larger* than builder->data... * converting in place to avoid extra allocation */ @@ -311,8 +373,22 @@ void IndexBuf::squeeze_indices_short(uint min_idx, uint max_idx) if (max_idx >= 0xFFFF) { index_base_ = min_idx; + /* NOTE: When using restart_index=0 for degenerative primitives indices, + * the compressed index will go below zero and wrap around when min_idx > 0. + * In order to ensure the resulting index is still within range, we instead + * clamp index to the maximum within the index range. + * + * `clamp_max_idx` represents the maximum possible index to clamp against. If primitive is + * restart-compatible, we can just clamp against the primtive-restart value, otherwise, we + * must assign to a valid index within the range. 
+ * + * NOTE: For OpenGL we skip this by disabling clamping, as we still need to use + * restart index values for point primitives to disable rendering. */ + uint16_t clamp_max_idx = (is_restart_compatible(prim_type) || !clamp_indices_in_range) ? + 0xFFFFu : + (max_idx - min_idx); for (uint i = 0; i < index_len_; i++) { - ushort_idx[i] = (uint16_t)MIN2(0xFFFF, uint_idx[i] - min_idx); + ushort_idx[i] = (uint16_t)MIN2(clamp_max_idx, uint_idx[i] - min_idx); } } else { @@ -363,7 +439,12 @@ void GPU_indexbuf_build_in_place(GPUIndexBufBuilder *builder, GPUIndexBuf *elem) BLI_assert(builder->data != nullptr); /* Transfer data ownership to GPUIndexBuf. * It will be uploaded upon first use. */ - unwrap(elem)->init(builder->index_len, builder->data, builder->index_min, builder->index_max); + unwrap(elem)->init(builder->index_len, + builder->data, + builder->index_min, + builder->index_max, + builder->prim_type, + builder->uses_restart_indices); builder->data = nullptr; } diff --git a/source/blender/gpu/intern/gpu_index_buffer_private.hh b/source/blender/gpu/intern/gpu_index_buffer_private.hh index 84903b05273..4099d6641a6 100644 --- a/source/blender/gpu/intern/gpu_index_buffer_private.hh +++ b/source/blender/gpu/intern/gpu_index_buffer_private.hh @@ -59,7 +59,12 @@ class IndexBuf { IndexBuf(){}; virtual ~IndexBuf(); - void init(uint indices_len, uint32_t *indices, uint min_index, uint max_index); + void init(uint indices_len, + uint32_t *indices, + uint min_index, + uint max_index, + GPUPrimType prim_type, + bool uses_restart_indices); void init_subrange(IndexBuf *elem_src, uint start, uint length); void init_build_on_device(uint index_len); @@ -99,8 +104,12 @@ class IndexBuf { virtual void update_sub(uint start, uint len, const void *data) = 0; private: - inline void squeeze_indices_short(uint min_idx, uint max_idx); + inline void squeeze_indices_short(uint min_idx, + uint max_idx, + GPUPrimType prim_type, + bool clamp_indices_in_range); inline uint index_range(uint *r_min, 
uint *r_max); + virtual void strip_restart_indices() = 0; }; /* Syntactic sugar. */ diff --git a/source/blender/gpu/metal/mtl_backend.hh b/source/blender/gpu/metal/mtl_backend.hh index fe49a0fce60..214a5d738a9 100644 --- a/source/blender/gpu/metal/mtl_backend.hh +++ b/source/blender/gpu/metal/mtl_backend.hh @@ -16,7 +16,6 @@ namespace blender::gpu { class Batch; class DrawList; class FrameBuffer; -class IndexBuf; class QueryPool; class Shader; class UniformBuf; diff --git a/source/blender/gpu/metal/mtl_backend.mm b/source/blender/gpu/metal/mtl_backend.mm index a15da4df083..361b2ca05f5 100644 --- a/source/blender/gpu/metal/mtl_backend.mm +++ b/source/blender/gpu/metal/mtl_backend.mm @@ -10,6 +10,7 @@ #include "mtl_backend.hh" #include "mtl_context.hh" #include "mtl_framebuffer.hh" +#include "mtl_index_buffer.hh" #include "mtl_query.hh" #include "mtl_uniform_buffer.hh" @@ -60,8 +61,7 @@ FrameBuffer *MTLBackend::framebuffer_alloc(const char *name) IndexBuf *MTLBackend::indexbuf_alloc() { - /* TODO(Metal): Implement MTLIndexBuf. 
*/ - return nullptr; + return new MTLIndexBuf(); }; QueryPool *MTLBackend::querypool_alloc() diff --git a/source/blender/gpu/metal/mtl_context.hh b/source/blender/gpu/metal/mtl_context.hh index 0db87bf5da5..d542f0e1025 100644 --- a/source/blender/gpu/metal/mtl_context.hh +++ b/source/blender/gpu/metal/mtl_context.hh @@ -3,7 +3,6 @@ /** \file * \ingroup gpu */ - #pragma once #include "MEM_guardedalloc.h" diff --git a/source/blender/gpu/metal/mtl_index_buffer.hh b/source/blender/gpu/metal/mtl_index_buffer.hh new file mode 100644 index 00000000000..5182eeab5e3 --- /dev/null +++ b/source/blender/gpu/metal/mtl_index_buffer.hh @@ -0,0 +1,79 @@ + +/** \file + * \ingroup gpu + */ + +#pragma once + +#include "MEM_guardedalloc.h" +#include "gpu_index_buffer_private.hh" +#include "mtl_context.hh" +#include <Cocoa/Cocoa.h> +#include <Metal/Metal.h> +#include <QuartzCore/QuartzCore.h> + +namespace blender::gpu { + +class MTLIndexBuf : public IndexBuf { + friend class MTLBatch; + friend class MTLDrawList; + + private: + /* Metal buffer resource. */ + gpu::MTLBuffer *ibo_ = nullptr; + uint64_t alloc_size_ = 0; + +#ifndef NDEBUG + /* Flags whether point index buffer has been compacted + * to remove false retart indices. */ + bool point_restarts_stripped_ = false; +#endif + + /* Optimised index buffers. + * NOTE(Metal): This optimization encodes a new index buffer following + * TriangleList topology. Parsing of Index buffers is more optimal + * when not using restart-compatible primitive topology types. */ + GPUPrimType optimized_primitive_type_; + gpu::MTLBuffer *optimized_ibo_ = nullptr; + uint32_t emulated_v_count = 0; + void free_optimized_buffer(); + + /* Flags whether an index buffer can be optimized. + * For index buffers which are partially modified + * on the host, or by the GPU, optimization cannot be performed. 
*/ + bool can_optimize_ = true; + + public: + ~MTLIndexBuf(); + + void bind_as_ssbo(uint32_t binding) override; + const uint32_t *read() const override; + + void upload_data() override; + void update_sub(uint32_t start, uint32_t len, const void *data) override; + + /* get_index_buffer can conditionally return an optimized index buffer of a + * differing format, if it is concluded that optimization is preferred + * for the given inputs. + * Index buffer optimization is used to replace restart-compatbiele + * primitive types with non-restart-compatible ones such as TriangleList and + * LineList. This improves GPU execution for these types significantly, while + * only incuring a small performance penalty. + * + * This is also used to emulate unsupported topology types + * such as triangle fan. */ + id<MTLBuffer> get_index_buffer(GPUPrimType &in_out_primitive_type, uint &in_out_v_count); + void flag_can_optimize(bool can_optimize); + + static MTLIndexType gpu_index_type_to_metal(GPUIndexBufType type) + { + return (type == GPU_INDEX_U16) ? MTLIndexTypeUInt16 : MTLIndexTypeUInt32; + } + + private: + void strip_restart_indices() override; + + MEM_CXX_CLASS_ALLOC_FUNCS("MTLIndexBuf") +}; + +} // namespace blender::gpu diff --git a/source/blender/gpu/metal/mtl_index_buffer.mm b/source/blender/gpu/metal/mtl_index_buffer.mm new file mode 100644 index 00000000000..4a7875aaeb0 --- /dev/null +++ b/source/blender/gpu/metal/mtl_index_buffer.mm @@ -0,0 +1,516 @@ + +/** \file + * \ingroup gpu + */ +#include "mtl_index_buffer.hh" +#include "mtl_context.hh" +#include "mtl_debug.hh" + +#include "BLI_span.hh" + +namespace blender::gpu { + +/* -------------------------------------------------------------------- */ +/** \name Core MTLIndexBuf implementation. 
+ * \{ */ + +MTLIndexBuf::~MTLIndexBuf() +{ + if (ibo_ != nullptr && !this->is_subrange_) { + ibo_->free(); + } + this->free_optimized_buffer(); +} + +void MTLIndexBuf::free_optimized_buffer() +{ + if (optimized_ibo_) { + optimized_ibo_->free(); + optimized_ibo_ = nullptr; + } +} + +void MTLIndexBuf::bind_as_ssbo(uint32_t binding) +{ + /* Flag buffer as incompatible with optimized/patched buffers as contents + * can now have partial modifications from the GPU. */ + this->flag_can_optimize(false); + this->free_optimized_buffer(); + + /* Ensure we have a valid IBO. */ + BLI_assert(this->ibo_); + + /* TODO(Metal): Support index buffer SSBOs. Dependent on compute impl. */ + MTL_LOG_WARNING("MTLIndexBuf::bind_as_ssbo not yet implemented!\n"); +} + +const uint32_t *MTLIndexBuf::read() const +{ + if (ibo_ != nullptr) { + + /* Return host pointer. */ + void *data = ibo_->get_host_ptr(); + return static_cast<uint32_t *>(data); + } + BLI_assert(false && "Index buffer not ready to be read."); + return nullptr; +} + +void MTLIndexBuf::upload_data() +{ + /* Handle subrange upload. */ + if (is_subrange_) { + MTLIndexBuf *mtlsrc = static_cast<MTLIndexBuf *>(src_); + mtlsrc->upload_data(); + +#ifndef NDEBUG + BLI_assert_msg(!mtlsrc->point_restarts_stripped_, + "Cannot use subrange on stripped point buffer."); +#endif + + /* If parent subrange allocation has changed, + * update our index buffer. */ + if (alloc_size_ != mtlsrc->alloc_size_ || ibo_ != mtlsrc->ibo_) { + + /* Update index buffer and allocation from source. */ + alloc_size_ = mtlsrc->alloc_size_; + ibo_ = mtlsrc->ibo_; + + /* Reset any allocated patched or optimized index buffers. */ + this->free_optimized_buffer(); + } + return; + } + + /* If new data ready, and index buffer already exists, release current. */ + if ((ibo_ != nullptr) && (this->data_ != nullptr)) { + MTL_LOG_INFO("Re-creating index buffer with new data. IndexBuf %p\n", this); + ibo_->free(); + ibo_ = nullptr; + } + + /* Prepare Buffer and Upload Data. 
*/ + if (ibo_ == nullptr && data_ != nullptr) { + alloc_size_ = this->size_get(); + if (alloc_size_ == 0) { + MTL_LOG_WARNING("[Metal] Warning! Trying to allocate index buffer with size=0 bytes\n"); + } + else { + ibo_ = MTLContext::get_global_memory_manager().allocate_with_data(alloc_size_, true, data_); + BLI_assert(ibo_); + ibo_->set_label(@"Index Buffer"); + } + + /* No need to keep copy of data_ in system memory. */ + MEM_SAFE_FREE(data_); + } +} + +void MTLIndexBuf::update_sub(uint32_t start, uint32_t len, const void *data) +{ + BLI_assert(!is_subrange_); + + /* If host-side data still exists, modify and upload as normal */ + if (data_ != nullptr) { + + /* Free index buffer if one exists. */ + if (ibo_ != nullptr && !this->is_subrange_) { + ibo_->free(); + ibo_ = nullptr; + } + + BLI_assert(start + len < this->size_get()); + + /* Apply start byte offset to data pointer. */ + void *modified_base_ptr = data_; + uint8_t *ptr = static_cast<uint8_t *>(modified_base_ptr); + ptr += start; + modified_base_ptr = static_cast<void *>(ptr); + + /* Modify host-side data. */ + memcpy(modified_base_ptr, data, len); + return; + } + + /* Verify buffer. */ + BLI_assert(ibo_ != nullptr); + + /* Otherwise, we will inject a data update, using staged data, into the command stream. + * Stage update contents in temporary buffer*/ + MTLContext *ctx = static_cast<MTLContext *>(unwrap(GPU_context_active_get())); + BLI_assert(ctx); + MTLTemporaryBuffer range = ctx->get_scratchbuffer_manager().scratch_buffer_allocate_range(len); + memcpy(range.data, data, len); + + /* Copy updated contents into primary buffer. + * These changes need to be uploaded via blit to ensure the data copies happen in-order. 
*/ + id<MTLBuffer> dest_buffer = ibo_->get_metal_buffer(); + BLI_assert(dest_buffer != nil); + + id<MTLBlitCommandEncoder> enc = ctx->main_command_buffer.ensure_begin_blit_encoder(); + [enc copyFromBuffer:range.metal_buffer + sourceOffset:(uint32_t)range.buffer_offset + toBuffer:dest_buffer + destinationOffset:start + size:len]; + + /* Synchronise changes back to host to ensure CPU-side data is up-to-date for non + * Shared buffers. */ + if (dest_buffer.storageMode == MTLStorageModeManaged) { + [enc synchronizeResource:dest_buffer]; + } + + /* Invalidate patched/optimized buffers. */ + this->free_optimized_buffer(); + + /* Flag buffer as incompatible with optimized/patched buffers as contents + * have partial modifications. */ + this->flag_can_optimize(false); + + BLI_assert(false); +} + +void MTLIndexBuf::flag_can_optimize(bool can_optimize) +{ + can_optimize_ = can_optimize; +} + +/** \} */ + +/** \name Index buffer optimization and topology emulation. + * Index buffer optimization and emulation. Optimise index buffers by + * eliminating restart-indices. + * Emulate unsupported index types e.g. Triangle Fan and Line Loop. + * \{ */ + +/* Returns total vertices in new buffer. */ +template<typename T> +static uint32_t populate_optimized_tri_strip_buf(Span<T> original_data, + MutableSpan<T> output_data, + uint32_t input_index_len) +{ + /* Generate TriangleList from TriangleStrip. */ + uint32_t current_vert_len = 0; + uint32_t current_output_ind = 0; + T indices[3]; + + for (int c_index = 0; c_index < input_index_len; c_index++) { + T current_index = original_data[c_index]; + if (current_index == T(-1)) { + /* Stop current primitive. Move onto next. */ + current_vert_len = 0; + } + else { + if (current_vert_len < 3) { + /* prepare first triangle. + * Cache indices before genrating a triangle, + * in case we have bad primitive-restarts. */ + indices[current_vert_len] = current_index; + } + + /* emit triangle once we reach 3 input verts in current strip. 
*/ + if (current_vert_len == 2) { + /* First triangle in strip. `current_vert_len` is only incremented at the end of the iteration, so a value of 2 means the third vertex has just been cached. */ + output_data[current_output_ind++] = indices[0]; + output_data[current_output_ind++] = indices[1]; + output_data[current_output_ind++] = indices[2]; + } + else if (current_vert_len > 2) { + /* All other triangles in strip. + * These triangles are populated using data from previous 2 vertices + * and the latest index. The parity of `tri_id` alternates the vertex order. */ + uint32_t tri_id = current_vert_len - 3; + uint32_t base_output_ind = current_output_ind; + if ((tri_id % 2) == 0) { + output_data[base_output_ind + 0] = output_data[base_output_ind - 2]; + output_data[base_output_ind + 1] = current_index; + output_data[base_output_ind + 2] = output_data[base_output_ind - 1]; + } + else { + output_data[base_output_ind + 0] = output_data[base_output_ind - 1]; + output_data[base_output_ind + 1] = output_data[base_output_ind - 2]; + output_data[base_output_ind + 2] = current_index; + } + current_output_ind += 3; + } + + /* Increment relative vertex index. */ + current_vert_len++; + } + } + return current_output_ind; +} + +/* Returns total vertices in new buffer. */ +template<typename T> +static uint32_t populate_emulated_tri_fan_buf(Span<T> original_data, + MutableSpan<T> output_data, + uint32_t input_index_len) +{ + /* Generate TriangleList from TriangleFan. */ + T base_prim_ind_val = 0; + uint32_t current_vert_len = 0; + uint32_t current_output_ind = 0; + T indices[3]; + + for (int c_index = 0; c_index < input_index_len; c_index++) { + T current_index = original_data[c_index]; + if (current_index == T(-1)) { + /* Stop current primitive. Move onto next. */ + current_vert_len = 0; + } + else { + if (current_vert_len < 3) { + /* prepare first triangle. + * Cache indices before generating a triangle, + * in case we have bad primitive-restarts. */ + indices[current_vert_len] = current_index; + } + + /* emit triangle once we reach 3 input verts in current fan. */ + if (current_vert_len == 2) { + /* First triangle in fan (third vertex has just been cached).
*/ + output_data[current_output_ind++] = indices[0]; + output_data[current_output_ind++] = indices[1]; + output_data[current_output_ind++] = indices[2]; + base_prim_ind_val = indices[0]; + } + else if (current_vert_len > 2) { + /* All other triangles in fan. + * These triangles are populated using the fan base vertex, the previous vertex + * and the latest index. */ + uint32_t base_output_ind = current_output_ind; + + output_data[base_output_ind + 0] = base_prim_ind_val; + output_data[base_output_ind + 1] = output_data[base_output_ind - 1]; + output_data[base_output_ind + 2] = current_index; + current_output_ind += 3; + } + + /* Increment relative vertex index. */ + current_vert_len++; + } + } + return current_output_ind; +} + +id<MTLBuffer> MTLIndexBuf::get_index_buffer(GPUPrimType &in_out_primitive_type, + uint32_t &in_out_v_count) +{ + /* Determine whether to return the original index buffer, or whether we + * should emulate an unsupported primitive type, or optimize a restart- + * compatible type for faster performance. */ + bool should_optimize_or_emulate = (in_out_primitive_type == GPU_PRIM_TRI_FAN) || + (in_out_primitive_type == GPU_PRIM_TRI_STRIP); + if (!should_optimize_or_emulate || is_subrange_ || !can_optimize_) { + /* Ensure we are not optimized. */ + BLI_assert(this->optimized_ibo_ == nullptr); + + /* Return regular index buffer. */ + BLI_assert(this->ibo_ && this->ibo_->get_metal_buffer()); + return this->ibo_->get_metal_buffer(); + } + + /* Perform optimization on type. */ + GPUPrimType input_prim_type = in_out_primitive_type; + this->upload_data(); + if (!ibo_ && optimized_ibo_ == nullptr) { + /* Cannot optimize buffer if no source IBO exists. */ + return nil; + } + + /* Verify whether existing index buffer is valid.
*/ + if (optimized_ibo_ != nullptr && optimized_primitive_type_ != input_prim_type) { + BLI_assert_msg(false, + "Cannot change the optimized primitive format after generation, as source " + "index buffer data is discarded."); + return nil; + } + + /* Generate optimized index buffer. */ + if (optimized_ibo_ == nullptr) { + + /* Generate unwrapped index buffer. */ + switch (input_prim_type) { + case GPU_PRIM_TRI_FAN: { + + /* Calculate maximum size. */ + uint32_t max_possible_verts = (this->index_len_ - 2) * 3; + BLI_assert(max_possible_verts > 0); + + /* Allocate new buffer. */ + optimized_ibo_ = MTLContext::get_global_memory_manager().allocate( + max_possible_verts * + ((index_type_ == GPU_INDEX_U16) ? sizeof(uint16_t) : sizeof(uint32_t)), + true); + + /* Populate new index buffer. The output span covers the whole allocation (`max_possible_verts`), as more indices are written than are read. */ + if (index_type_ == GPU_INDEX_U16) { + Span<uint16_t> orig_data(static_cast<const uint16_t *>(ibo_->get_host_ptr()), + this->index_len_); + MutableSpan<uint16_t> output_data( + static_cast<uint16_t *>(optimized_ibo_->get_host_ptr()), max_possible_verts); + emulated_v_count = populate_emulated_tri_fan_buf<uint16_t>( + orig_data, output_data, this->index_len_); + } + else { + Span<uint32_t> orig_data(static_cast<const uint32_t *>(ibo_->get_host_ptr()), + this->index_len_); + MutableSpan<uint32_t> output_data( + static_cast<uint32_t *>(optimized_ibo_->get_host_ptr()), max_possible_verts); + emulated_v_count = populate_emulated_tri_fan_buf<uint32_t>( + orig_data, output_data, this->index_len_); + } + + BLI_assert(emulated_v_count <= max_possible_verts); + + /* Flush buffer and output. */ + optimized_ibo_->flush(); + optimized_primitive_type_ = input_prim_type; + in_out_v_count = emulated_v_count; + in_out_primitive_type = GPU_PRIM_TRIS; + } break; + + case GPU_PRIM_TRI_STRIP: { + + /* Calculate maximum size. */ + uint32_t max_possible_verts = (this->index_len_ - 2) * 3; + BLI_assert(max_possible_verts > 0); + + /* Allocate new buffer.
*/ + optimized_ibo_ = MTLContext::get_global_memory_manager().allocate( + max_possible_verts * + ((index_type_ == GPU_INDEX_U16) ? sizeof(uint16_t) : sizeof(uint32_t)), + true); + + /* Populate new index buffer. The output span covers the whole allocation (`max_possible_verts`), as more indices are written than are read. */ + if (index_type_ == GPU_INDEX_U16) { + Span<uint16_t> orig_data(static_cast<const uint16_t *>(ibo_->get_host_ptr()), + this->index_len_); + MutableSpan<uint16_t> output_data( + static_cast<uint16_t *>(optimized_ibo_->get_host_ptr()), max_possible_verts); + emulated_v_count = populate_optimized_tri_strip_buf<uint16_t>( + orig_data, output_data, this->index_len_); + } + else { + Span<uint32_t> orig_data(static_cast<const uint32_t *>(ibo_->get_host_ptr()), + this->index_len_); + MutableSpan<uint32_t> output_data( + static_cast<uint32_t *>(optimized_ibo_->get_host_ptr()), max_possible_verts); + emulated_v_count = populate_optimized_tri_strip_buf<uint32_t>( + orig_data, output_data, this->index_len_); + } + + BLI_assert(emulated_v_count <= max_possible_verts); + + /* Flush buffer and output. */ + optimized_ibo_->flush(); + optimized_primitive_type_ = input_prim_type; + in_out_v_count = emulated_v_count; + in_out_primitive_type = GPU_PRIM_TRIS; + } break; + + case GPU_PRIM_LINE_STRIP: { + /* TODO(Metal): Line strip topology types would benefit from optimization to remove + * primitive restarts, however, these do not occur frequently, nor with + * significant geometry counts. */ + MTL_LOG_INFO("TODO: Primitive topology: Optimise line strip topology types\n"); + } break; + + case GPU_PRIM_LINE_LOOP: { + /* TODO(Metal): Line Loop primitive type requires use of optimized index buffer for + * emulation, if used with indexed rendering. This path is currently not hit as LineLoop + * does not currently appear to be used alongside an index buffer.
*/ + MTL_LOG_WARNING( + "TODO: Primitive topology: Line Loop Index buffer optimization required for " + "emulation.\n"); + } break; + + case GPU_PRIM_TRIS: + case GPU_PRIM_LINES: + case GPU_PRIM_POINTS: { + /* Should not get here - TRIS/LINES/POINTS do not require emulation or optimization. */ + BLI_assert_unreachable(); + return nil; + } + + default: + /* Should not get here - Invalid primitive type. */ + BLI_assert_unreachable(); + break; + } + } + + /* Return optimized buffer. */ + if (optimized_ibo_ != nullptr) { + + /* Delete original buffer if one still exists, as we do not need it. */ + if (ibo_ != nullptr) { + ibo_->free(); + ibo_ = nullptr; + } + + /* Output params. */ + in_out_v_count = emulated_v_count; + in_out_primitive_type = GPU_PRIM_TRIS; + return optimized_ibo_->get_metal_buffer(); + } + return nil; +} + +void MTLIndexBuf::strip_restart_indices() +{ + /* We remove point buffer primitive restart indices by swapping restart indices + * with the first valid index at the end of the index buffer and reducing the + * length. Primitive restarts are invalid in Metal for non-restart-compatible + * primitive types. We also cannot just use zero unlike for Lines and Triangles, + * as we cannot create de-generative point primitives to hide geometry, as each + * point is independent. + * Instead, we must remove these hidden indices from the index buffer. + * Note: This happens prior to index squeezing so operate on 32-bit indices. */ + MutableSpan<uint32_t> uint_idx(static_cast<uint32_t *>(data_), index_len_); + for (uint i = 0; i < index_len_; i++) { + if (uint_idx[i] == 0xFFFFFFFFu) { + + /* Find swap index at end of index buffer. `j` is unsigned: test before decrementing so the scan stops at `i` without wrapping when `i == 0`. */ + int swap_index = -1; + for (uint j = index_len_; j-- > i;) { + /* If end index is restart, just reduce length. */ + if (uint_idx[j] == 0xFFFFFFFFu) { + index_len_--; + continue; + } + /* Otherwise assign swap index.
*/ + swap_index = j; + break; + } + + /* If swap index is not valid, then there were no valid non-restart indices + * to swap with. However, the above loop will have removed these indices by + * reducing the length of indices. Debug assertions verify that the restart + * index is no longer included. */ + if (swap_index == -1) { + BLI_assert(index_len_ <= i); + } + else { + /* If we have found an index we can swap with, flip the values. + * We also reduce the length. As per above loop, swap_index should + * now be outside the index length range. */ + uint32_t swap_index_value = uint_idx[swap_index]; + uint_idx[i] = swap_index_value; + uint_idx[swap_index] = 0xFFFFFFFFu; + index_len_--; + BLI_assert(index_len_ <= swap_index); + } + } + } + +#ifndef NDEBUG + /* Flag as having been stripped to ensure invalid usage is tracked. */ + point_restarts_stripped_ = true; +#endif +} + +/** \} */ + +} // blender::gpu diff --git a/source/blender/gpu/metal/mtl_query.hh b/source/blender/gpu/metal/mtl_query.hh index c1ec9a2a0f5..03436fcd67d 100644 --- a/source/blender/gpu/metal/mtl_query.hh +++ b/source/blender/gpu/metal/mtl_query.hh @@ -25,7 +25,7 @@ class MTLQueryPool : public QueryPool { MTLVisibilityResultMode mtl_type_; Vector<gpu::MTLBuffer *> buffer_; - void allocate_buffer(); + void allocate(); public: MTLQueryPool(); diff --git a/source/blender/gpu/metal/mtl_query.mm b/source/blender/gpu/metal/mtl_query.mm index 8983ea7ec44..f4bd5754b77 100644 --- a/source/blender/gpu/metal/mtl_query.mm +++ b/source/blender/gpu/metal/mtl_query.mm @@ -16,7 +16,7 @@ static const size_t VISIBILITY_RESULT_SIZE_IN_BYTES = 8; MTLQueryPool::MTLQueryPool() { - allocate_buffer(); + allocate(); } MTLQueryPool::~MTLQueryPool() { @@ -26,7 +26,7 @@ MTLQueryPool::~MTLQueryPool() } } -void MTLQueryPool::allocate_buffer() +void MTLQueryPool::allocate() { /* Allocate Metal buffer for visibility results. 
*/ size_t buffer_size_in_bytes = VISIBILITY_COUNT_PER_BUFFER * VISIBILITY_RESULT_SIZE_IN_BYTES; @@ -62,7 +62,7 @@ void MTLQueryPool::begin_query() int query_id = query_issued_; int requested_buffer = query_id / VISIBILITY_COUNT_PER_BUFFER; if (requested_buffer >= buffer_.size()) { - allocate_buffer(); + allocate(); } BLI_assert(requested_buffer < buffer_.size()); diff --git a/source/blender/gpu/opengl/gl_index_buffer.hh b/source/blender/gpu/opengl/gl_index_buffer.hh index d9bd85cefb3..974c01d2b65 100644 --- a/source/blender/gpu/opengl/gl_index_buffer.hh +++ b/source/blender/gpu/opengl/gl_index_buffer.hh @@ -53,6 +53,10 @@ class GLIndexBuf : public IndexBuf { private: bool is_active() const; + void strip_restart_indices() override + { + /* No-op. */ + } MEM_CXX_CLASS_ALLOC_FUNCS("GLIndexBuf") }; |