/* SPDX-License-Identifier: GPL-2.0-or-later */

/** \file
 * \ingroup gpu
 */

#include "mtl_index_buffer.hh"
#include "mtl_context.hh"
#include "mtl_debug.hh"

#include "BLI_span.hh"

namespace blender::gpu {

/* -------------------------------------------------------------------- */
/** \name Core MTLIndexBuf implementation.
 * \{ */

MTLIndexBuf::~MTLIndexBuf()
{
  if (ibo_ != nullptr && !this->is_subrange_) {
    ibo_->free();
  }
  this->free_optimized_buffer();
}

void MTLIndexBuf::free_optimized_buffer()
{
  if (optimized_ibo_) {
    optimized_ibo_->free();
    optimized_ibo_ = nullptr;
  }
}

void MTLIndexBuf::bind_as_ssbo(uint32_t binding)
{
  /* Flag buffer as incompatible with optimized/patched buffers as contents
   * can now have partial modifications from the GPU. */
  this->flag_can_optimize(false);
  this->free_optimized_buffer();

  /* Ensure we have a valid IBO. */
  BLI_assert(this->ibo_);

  /* TODO(Metal): Support index buffer SSBO's. Dependent on compute implementation. */
  MTL_LOG_WARNING("MTLIndexBuf::bind_as_ssbo not yet implemented!\n");
}

const uint32_t *MTLIndexBuf::read() const
{
  if (ibo_ != nullptr) {
    /* Return host pointer. */
    void *data = ibo_->get_host_ptr();
    return static_cast<const uint32_t *>(data);
  }
  BLI_assert(false && "Index buffer not ready to be read.");
  return nullptr;
}

void MTLIndexBuf::upload_data()
{
  /* Handle sub-range upload. */
  if (is_subrange_) {
    MTLIndexBuf *mtlsrc = static_cast<MTLIndexBuf *>(src_);
    mtlsrc->upload_data();

#ifndef NDEBUG
    BLI_assert_msg(!mtlsrc->point_restarts_stripped_,
                   "Cannot use sub-range on stripped point buffer.");
#endif

    /* If the parent sub-range allocation has changed, update our index buffer. */
    if (alloc_size_ != mtlsrc->alloc_size_ || ibo_ != mtlsrc->ibo_) {
      /* Update index buffer and allocation from source. */
      alloc_size_ = mtlsrc->alloc_size_;
      ibo_ = mtlsrc->ibo_;

      /* Reset any allocated patched or optimized index buffers. */
      this->free_optimized_buffer();
    }
    return;
  }

  /* If new data is ready and an index buffer already exists, release the current one. */
  if ((ibo_ != nullptr) && (this->data_ != nullptr)) {
    MTL_LOG_INFO("Re-creating index buffer with new data. IndexBuf %p\n", this);
    ibo_->free();
    ibo_ = nullptr;
  }

  /* Prepare buffer and upload data. */
  if (ibo_ == nullptr && data_ != nullptr) {
    alloc_size_ = this->size_get();
    if (alloc_size_ == 0) {
      MTL_LOG_WARNING("[Metal] Warning! Trying to allocate index buffer with size=0 bytes\n");
    }
    else {
      ibo_ = MTLContext::get_global_memory_manager().allocate_with_data(alloc_size_, true, data_);
      BLI_assert(ibo_);
      ibo_->set_label(@"Index Buffer");
    }

    /* No need to keep a copy of data_ in system memory. */
    MEM_SAFE_FREE(data_);
  }
}
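
/* Illustrative note: a sub-range index buffer aliases its parent's allocation
 * rather than owning one. After `upload_data()` on a sub-range:
 *
 *   sub->ibo_ == parent->ibo_   (shared gpu::MTLBuffer)
 *   sub->is_subrange_ == true   (so the destructor never frees `ibo_`)
 *
 * which is why the sub-range path above only refreshes the cached
 * `ibo_`/`alloc_size_` after asking the parent to upload. */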

void MTLIndexBuf::update_sub(uint32_t start, uint32_t len, const void *data)
{
  BLI_assert(!is_subrange_);

  /* If host-side data still exists, modify and upload as normal. */
  if (data_ != nullptr) {
    /* Free index buffer if one exists. */
    if (ibo_ != nullptr && !this->is_subrange_) {
      ibo_->free();
      ibo_ = nullptr;
    }

    BLI_assert(start + len <= this->size_get());

    /* Apply start byte offset to data pointer. */
    uint8_t *modified_base_ptr = static_cast<uint8_t *>(data_) + start;

    /* Modify host-side data. */
    memcpy(modified_base_ptr, data, len);
    return;
  }

  /* Verify buffer. */
  BLI_assert(ibo_ != nullptr);

  /* Otherwise, we will inject a data update, using staged data, into the command stream.
   * Stage update contents in a temporary buffer. */
  MTLContext *ctx = static_cast<MTLContext *>(unwrap(GPU_context_active_get()));
  BLI_assert(ctx);
  MTLTemporaryBuffer range = ctx->get_scratchbuffer_manager().scratch_buffer_allocate_range(len);
  memcpy(range.data, data, len);

  /* Copy updated contents into the primary buffer.
   * These changes need to be uploaded via blit to ensure the data copies happen in-order. */
  id<MTLBuffer> dest_buffer = ibo_->get_metal_buffer();
  BLI_assert(dest_buffer != nil);

  id<MTLBlitCommandEncoder> enc = ctx->main_command_buffer.ensure_begin_blit_encoder();
  [enc copyFromBuffer:range.metal_buffer
         sourceOffset:(uint32_t)range.buffer_offset
             toBuffer:dest_buffer
    destinationOffset:start
                 size:len];

  /* Synchronize changes back to host to ensure CPU-side data is up-to-date for
   * non-Shared buffers. */
  if (dest_buffer.storageMode == MTLStorageModeManaged) {
    [enc synchronizeResource:dest_buffer];
  }

  /* Invalidate patched/optimized buffers. */
  this->free_optimized_buffer();

  /* Flag buffer as incompatible with optimized/patched buffers as contents
   * have partial modifications. */
  this->flag_can_optimize(false);
}

void MTLIndexBuf::flag_can_optimize(bool can_optimize)
{
  can_optimize_ = can_optimize;
}

/** \} */

/* -------------------------------------------------------------------- */
/** \name Index buffer optimization and topology emulation
 *
 * Optimize index buffers by eliminating restart indices, and emulate
 * unsupported primitive types, e.g. #TriangleFan and #LineLoop.
 * \{ */

/* Returns total vertices in the new buffer. */
template<typename T>
static uint32_t populate_optimized_tri_strip_buf(Span<T> original_data,
                                                 MutableSpan<T> output_data,
                                                 uint32_t input_index_len)
{
  /* Generate #TriangleList from #TriangleStrip. */
  uint32_t current_vert_len = 0;
  uint32_t current_output_ind = 0;
  T indices[3];

  for (uint32_t c_index = 0; c_index < input_index_len; c_index++) {
    T current_index = original_data[c_index];
    if (current_index == T(-1)) {
      /* Stop current primitive. Move onto next. */
      current_vert_len = 0;
    }
    else {
      if (current_vert_len < 3) {
        /* Prepare first triangle.
         * Cache indices before generating a triangle, in case we have bad primitive-restarts. */
        indices[current_vert_len] = current_index;
      }

      /* Emit the first triangle once we reach 3 input verts in the current strip
       * (`current_vert_len == 2` means the current index is the strip's third vertex). */
      if (current_vert_len == 2) {
        output_data[current_output_ind++] = indices[0];
        output_data[current_output_ind++] = indices[1];
        output_data[current_output_ind++] = indices[2];
      }
      else if (current_vert_len > 2) {
        /* All other triangles in the strip are populated using data from the
         * previous two vertices and the latest index, alternating the winding. */
        uint32_t tri_id = current_vert_len - 3;
        uint32_t base_output_ind = current_output_ind;
        if ((tri_id % 2) == 0) {
          output_data[base_output_ind + 0] = output_data[base_output_ind - 2];
          output_data[base_output_ind + 1] = current_index;
          output_data[base_output_ind + 2] = output_data[base_output_ind - 1];
        }
        else {
          output_data[base_output_ind + 0] = output_data[base_output_ind - 1];
          output_data[base_output_ind + 1] = output_data[base_output_ind - 2];
          output_data[base_output_ind + 2] = current_index;
        }
        current_output_ind += 3;
      }

      /* Increment relative vertex index. */
      current_vert_len++;
    }
  }
  return current_output_ind;
}
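
/* Worked example (derived from the loop above): for an input strip
 * {v0, v1, v2, v3, v4} with no restarts, the emitted triangle list is
 *
 *   {v0, v1, v2,   v1, v3, v2,   v2, v3, v4}
 *
 * i.e. (n - 2) triangles with alternating winding. A restart index (T(-1))
 * resets the strip, so no triangle is emitted across it. */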

/* Returns total vertices in the new buffer. */
template<typename T>
static uint32_t populate_emulated_tri_fan_buf(Span<T> original_data,
                                              MutableSpan<T> output_data,
                                              uint32_t input_index_len)
{
  /* Generate #TriangleList from #TriangleFan. */
  T base_prim_ind_val = 0;
  uint32_t current_vert_len = 0;
  uint32_t current_output_ind = 0;
  T indices[3];

  for (uint32_t c_index = 0; c_index < input_index_len; c_index++) {
    T current_index = original_data[c_index];
    if (current_index == T(-1)) {
      /* Stop current primitive. Move onto next. */
      current_vert_len = 0;
    }
    else {
      if (current_vert_len < 3) {
        /* Prepare first triangle.
         * Cache indices before generating a triangle, in case we have bad primitive-restarts. */
        indices[current_vert_len] = current_index;
      }

      /* Emit the first triangle once we reach 3 input verts in the current fan
       * (`current_vert_len == 2` means the current index is the fan's third vertex). */
      if (current_vert_len == 2) {
        output_data[current_output_ind++] = indices[0];
        output_data[current_output_ind++] = indices[1];
        output_data[current_output_ind++] = indices[2];
        base_prim_ind_val = indices[0];
      }
      else if (current_vert_len > 2) {
        /* All other triangles in the fan are populated using the fan origin,
         * the previous vertex and the latest index. */
        uint32_t base_output_ind = current_output_ind;
        output_data[base_output_ind + 0] = base_prim_ind_val;
        output_data[base_output_ind + 1] = output_data[base_output_ind - 1];
        output_data[base_output_ind + 2] = current_index;
        current_output_ind += 3;
      }

      /* Increment relative vertex index. */
      current_vert_len++;
    }
  }
  return current_output_ind;
}
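
/* Worked example (derived from the loop above): for an input fan
 * {v0, v1, v2, v3, v4} with no restarts, the emitted triangle list is
 *
 *   {v0, v1, v2,   v0, v2, v3,   v0, v3, v4}
 *
 * where v0 is the fan origin. After a restart, the first index of the next
 * fan becomes the new origin. */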

id<MTLBuffer> MTLIndexBuf::get_index_buffer(GPUPrimType &in_out_primitive_type,
                                            uint32_t &in_out_v_count)
{
  /* Determine whether to return the original index buffer, or whether we
   * should emulate an unsupported primitive type, or optimize a restart-
   * compatible type for faster performance. */
  bool should_optimize_or_emulate = (in_out_primitive_type == GPU_PRIM_TRI_FAN) ||
                                    (in_out_primitive_type == GPU_PRIM_TRI_STRIP);
  if (!should_optimize_or_emulate || is_subrange_ || !can_optimize_) {
    /* Ensure we are not optimized. */
    BLI_assert(this->optimized_ibo_ == nullptr);

    /* Return regular index buffer. */
    BLI_assert(this->ibo_ && this->ibo_->get_metal_buffer());
    return this->ibo_->get_metal_buffer();
  }

  /* Perform optimization on type. */
  GPUPrimType input_prim_type = in_out_primitive_type;
  this->upload_data();
  if (!ibo_ && optimized_ibo_ == nullptr) {
    /* Cannot optimize buffer if no source IBO exists. */
    return nil;
  }

  /* Verify whether the existing optimized index buffer is valid. */
  if (optimized_ibo_ != nullptr && optimized_primitive_type_ != input_prim_type) {
    BLI_assert_msg(false,
                   "Cannot change the optimized primitive format after generation, as source "
                   "index buffer data is discarded.");
    return nil;
  }

  /* Generate optimized index buffer. */
  if (optimized_ibo_ == nullptr) {
    /* Generate unwrapped index buffer. */
    switch (input_prim_type) {
      case GPU_PRIM_TRI_FAN: {
        /* Calculate maximum size. */
        uint32_t max_possible_verts = (this->index_len_ - 2) * 3;
        BLI_assert(max_possible_verts > 0);

        /* Allocate new buffer. */
        optimized_ibo_ = MTLContext::get_global_memory_manager().allocate(
            max_possible_verts *
                ((index_type_ == GPU_INDEX_U16) ? sizeof(uint16_t) : sizeof(uint32_t)),
            true);

        /* Populate new index buffer. */
        if (index_type_ == GPU_INDEX_U16) {
          Span<uint16_t> orig_data(static_cast<const uint16_t *>(ibo_->get_host_ptr()),
                                   this->index_len_);
          MutableSpan<uint16_t> output_data(
              static_cast<uint16_t *>(optimized_ibo_->get_host_ptr()), max_possible_verts);
          emulated_v_count = populate_emulated_tri_fan_buf<uint16_t>(
              orig_data, output_data, this->index_len_);
        }
        else {
          Span<uint32_t> orig_data(static_cast<const uint32_t *>(ibo_->get_host_ptr()),
                                   this->index_len_);
          MutableSpan<uint32_t> output_data(
              static_cast<uint32_t *>(optimized_ibo_->get_host_ptr()), max_possible_verts);
          emulated_v_count = populate_emulated_tri_fan_buf<uint32_t>(
              orig_data, output_data, this->index_len_);
        }
        BLI_assert(emulated_v_count <= max_possible_verts);

        /* Flush buffer and output. */
        optimized_ibo_->flush();
        optimized_primitive_type_ = input_prim_type;
        in_out_v_count = emulated_v_count;
        in_out_primitive_type = GPU_PRIM_TRIS;
      } break;

      case GPU_PRIM_TRI_STRIP: {
        /* Calculate maximum size. */
        uint32_t max_possible_verts = (this->index_len_ - 2) * 3;
        BLI_assert(max_possible_verts > 0);

        /* Allocate new buffer. */
        optimized_ibo_ = MTLContext::get_global_memory_manager().allocate(
            max_possible_verts *
                ((index_type_ == GPU_INDEX_U16) ? sizeof(uint16_t) : sizeof(uint32_t)),
            true);

        /* Populate new index buffer. */
        if (index_type_ == GPU_INDEX_U16) {
          Span<uint16_t> orig_data(static_cast<const uint16_t *>(ibo_->get_host_ptr()),
                                   this->index_len_);
          MutableSpan<uint16_t> output_data(
              static_cast<uint16_t *>(optimized_ibo_->get_host_ptr()), max_possible_verts);
          emulated_v_count = populate_optimized_tri_strip_buf<uint16_t>(
              orig_data, output_data, this->index_len_);
        }
        else {
          Span<uint32_t> orig_data(static_cast<const uint32_t *>(ibo_->get_host_ptr()),
                                   this->index_len_);
          MutableSpan<uint32_t> output_data(
              static_cast<uint32_t *>(optimized_ibo_->get_host_ptr()), max_possible_verts);
          emulated_v_count = populate_optimized_tri_strip_buf<uint32_t>(
              orig_data, output_data, this->index_len_);
        }
        BLI_assert(emulated_v_count <= max_possible_verts);

        /* Flush buffer and output. */
        optimized_ibo_->flush();
        optimized_primitive_type_ = input_prim_type;
        in_out_v_count = emulated_v_count;
        in_out_primitive_type = GPU_PRIM_TRIS;
      } break;

      case GPU_PRIM_LINE_STRIP: {
        /* TODO(Metal): Line strip topology types would benefit from optimization to remove
         * primitive restarts, however, these do not occur frequently, nor with
         * significant geometry counts. */
        MTL_LOG_INFO("TODO: Primitive topology: Optimize line strip topology types\n");
      } break;

      case GPU_PRIM_LINE_LOOP: {
        /* TODO(Metal): Line Loop primitive type requires use of optimized index buffer for
         * emulation, if used with indexed rendering. This path is currently not hit as #LineLoop
         * does not currently appear to be used alongside an index buffer. */
        MTL_LOG_WARNING(
            "TODO: Primitive topology: Line Loop Index buffer optimization required for "
            "emulation.\n");
      } break;

      case GPU_PRIM_TRIS:
      case GPU_PRIM_LINES:
      case GPU_PRIM_POINTS: {
        /* Should not get here - TRIS/LINES/POINTS do not require emulation or optimization. */
        BLI_assert_unreachable();
        return nil;
      }

      default:
        /* Should not get here - Invalid primitive type. */
        BLI_assert_unreachable();
        break;
    }
  }

  /* Return optimized buffer. */
  if (optimized_ibo_ != nullptr) {
    /* Delete the original buffer if one still exists, as we no longer need it. */
    if (ibo_ != nullptr) {
      ibo_->free();
      ibo_ = nullptr;
    }

    /* Output params. */
    in_out_v_count = emulated_v_count;
    in_out_primitive_type = GPU_PRIM_TRIS;
    return optimized_ibo_->get_metal_buffer();
  }
  return nil;
}
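
/* Illustrative usage sketch (hypothetical call-site, shown for clarity): the
 * caller passes the requested topology and vertex count by reference and
 * consumes the possibly-rewritten values:
 *
 *   GPUPrimType prim = GPU_PRIM_TRI_FAN;
 *   uint32_t v_count = ...;  // Index count for the draw.
 *   id<MTLBuffer> buf = mtl_index_buf->get_index_buffer(prim, v_count);
 *   // After a successful fan/strip unwrap, prim == GPU_PRIM_TRIS and
 *   // v_count holds the unwrapped triangle-list vertex count.
 */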

void MTLIndexBuf::strip_restart_indices()
{
  /* We remove point-buffer primitive restart indices by swapping each restart
   * index with the last valid index in the buffer and reducing the length.
   * Primitive restarts are invalid in Metal for non-restart-compatible
   * primitive types. Unlike for lines and triangles, we also cannot simply
   * substitute zero, as we cannot create degenerate point primitives to hide
   * geometry: each point is independent.
   * Instead, we must remove these hidden indices from the index buffer.
   * For example, {A, RESTART, B, C, RESTART} (length 5) becomes {A, C, B}
   * (length 3).
   * NOTE: This happens prior to index squeezing, so operate on 32-bit indices. */
  MutableSpan<uint32_t> uint_idx(static_cast<uint32_t *>(data_), index_len_);
  for (uint i = 0; i < index_len_; i++) {
    if (uint_idx[i] == 0xFFFFFFFFu) {

      /* Find swap index at the end of the index buffer.
       * NOTE: `j` is signed so the loop cannot wrap around below zero when `i` is 0. */
      int64_t swap_index = -1;
      for (int64_t j = index_len_ - 1; j >= i; j--) {
        /* If the end index is also a restart, just reduce the length. */
        if (uint_idx[j] == 0xFFFFFFFFu) {
          index_len_--;
          continue;
        }
        /* Otherwise assign the swap index. */
        swap_index = j;
        break;
      }

      /* If the swap index is not valid, then there were no valid non-restart indices
       * to swap with. However, the above loop will have removed these indices by
       * reducing the length. Debug assertions verify that the restart index is no
       * longer included. */
      if (swap_index == -1) {
        BLI_assert(index_len_ <= i);
      }
      else {
        /* If we have found an index we can swap with, flip the values.
         * We also reduce the length. As per the above loop, `swap_index` should
         * now be outside the index length range. */
        uint32_t swap_index_value = uint_idx[swap_index];
        uint_idx[i] = swap_index_value;
        uint_idx[swap_index] = 0xFFFFFFFFu;
        index_len_--;
        BLI_assert(index_len_ <= swap_index);
      }
    }
  }

#ifndef NDEBUG
  /* Flag as having been stripped to ensure invalid usage is tracked. */
  point_restarts_stripped_ = true;
#endif
}

/** \} */

}  // namespace blender::gpu