From 5f4409b02ef7c54089ff1b491e008d4b86c030f4 Mon Sep 17 00:00:00 2001
From: Jason Fielder
Date: Thu, 1 Sep 2022 21:42:47 +0200
Subject: Metal: MTLIndexBuf class implementation.

Implementation also contains a number of optimisations and feature
enablements specific to the Metal API and Apple Silicon GPUs.

Ref T96261

Reviewed By: fclem

Maniphest Tasks: T96261

Differential Revision: https://developer.blender.org/D15369
---
 source/blender/gpu/metal/mtl_backend.hh      |   1 -
 source/blender/gpu/metal/mtl_backend.mm      |   4 +-
 source/blender/gpu/metal/mtl_context.hh      |   1 -
 source/blender/gpu/metal/mtl_index_buffer.hh |  79 ++++
 source/blender/gpu/metal/mtl_index_buffer.mm | 516 +++++++++++++++++++++++++++
 source/blender/gpu/metal/mtl_query.hh        |   2 +-
 source/blender/gpu/metal/mtl_query.mm        |   6 +-
 7 files changed, 601 insertions(+), 8 deletions(-)
 create mode 100644 source/blender/gpu/metal/mtl_index_buffer.hh
 create mode 100644 source/blender/gpu/metal/mtl_index_buffer.mm

(limited to 'source/blender/gpu/metal')

diff --git a/source/blender/gpu/metal/mtl_backend.hh b/source/blender/gpu/metal/mtl_backend.hh
index fe49a0fce60..214a5d738a9 100644
--- a/source/blender/gpu/metal/mtl_backend.hh
+++ b/source/blender/gpu/metal/mtl_backend.hh
@@ -16,7 +16,6 @@ namespace blender::gpu {
 class Batch;
 class DrawList;
 class FrameBuffer;
-class IndexBuf;
 class QueryPool;
 class Shader;
 class UniformBuf;

diff --git a/source/blender/gpu/metal/mtl_backend.mm b/source/blender/gpu/metal/mtl_backend.mm
index a15da4df083..361b2ca05f5 100644
--- a/source/blender/gpu/metal/mtl_backend.mm
+++ b/source/blender/gpu/metal/mtl_backend.mm
@@ -10,6 +10,7 @@
 #include "mtl_backend.hh"
 #include "mtl_context.hh"
 #include "mtl_framebuffer.hh"
+#include "mtl_index_buffer.hh"
 #include "mtl_query.hh"
 #include "mtl_uniform_buffer.hh"
@@ -60,8 +61,7 @@ FrameBuffer *MTLBackend::framebuffer_alloc(const char *name)

 IndexBuf *MTLBackend::indexbuf_alloc()
 {
-  /* TODO(Metal): Implement MTLIndexBuf. */
-  return nullptr;
+  return new MTLIndexBuf();
 };

 QueryPool *MTLBackend::querypool_alloc()

diff --git a/source/blender/gpu/metal/mtl_context.hh b/source/blender/gpu/metal/mtl_context.hh
index 0db87bf5da5..d542f0e1025 100644
--- a/source/blender/gpu/metal/mtl_context.hh
+++ b/source/blender/gpu/metal/mtl_context.hh
@@ -3,7 +3,6 @@
 /** \file
  * \ingroup gpu
  */
-
 #pragma once

 #include "MEM_guardedalloc.h"

diff --git a/source/blender/gpu/metal/mtl_index_buffer.hh b/source/blender/gpu/metal/mtl_index_buffer.hh
new file mode 100644
index 00000000000..5182eeab5e3
--- /dev/null
+++ b/source/blender/gpu/metal/mtl_index_buffer.hh
@@ -0,0 +1,79 @@

/** \file
 * \ingroup gpu
 */

#pragma once

#include "MEM_guardedalloc.h"
#include "gpu_index_buffer_private.hh"
#include "mtl_context.hh"

#include <Cocoa/Cocoa.h>
#include <Metal/Metal.h>
#include <QuartzCore/QuartzCore.h>

namespace blender::gpu {

class MTLIndexBuf : public IndexBuf {
  friend class MTLBatch;
  friend class MTLDrawList;

 private:
  /* Metal buffer resource. */
  gpu::MTLBuffer *ibo_ = nullptr;
  uint64_t alloc_size_ = 0;

#ifndef NDEBUG
  /* Flags whether the point index buffer has been compacted
   * to remove false restart indices. */
  bool point_restarts_stripped_ = false;
#endif

  /* Optimised index buffers.
   * NOTE(Metal): This optimization encodes a new index buffer following
   * TriangleList topology. Parsing of index buffers is more optimal
   * when not using restart-compatible primitive topology types. */
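  /* For example, a TriangleStrip [0, 1, 2, 3] is re-encoded as the
   * TriangleList [0, 1, 2,  1, 3, 2], and a TriangleFan [0, 1, 2, 3]
   * as the TriangleList [0, 1, 2,  0, 2, 3]. */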
  GPUPrimType optimized_primitive_type_;
  gpu::MTLBuffer *optimized_ibo_ = nullptr;
  uint32_t emulated_v_count = 0;
  void free_optimized_buffer();

  /* Flags whether an index buffer can be optimized.
   * For index buffers which are partially modified
   * on the host, or by the GPU, optimization cannot be performed. */
  bool can_optimize_ = true;

 public:
  ~MTLIndexBuf();

  void bind_as_ssbo(uint32_t binding) override;
  const uint32_t *read() const override;

  void upload_data() override;
  void update_sub(uint32_t start, uint32_t len, const void *data) override;

  /* get_index_buffer can conditionally return an optimized index buffer of a
   * differing format, if it is concluded that optimization is preferred
   * for the given inputs.
   * Index buffer optimization is used to replace restart-compatible
   * primitive types with non-restart-compatible ones such as TriangleList and
   * LineList. This improves GPU execution for these types significantly, while
   * only incurring a small performance penalty.
   *
   * This is also used to emulate unsupported topology types
   * such as triangle fan. */
  id<MTLBuffer> get_index_buffer(GPUPrimType &in_out_primitive_type, uint32_t &in_out_v_count);
  void flag_can_optimize(bool can_optimize);

  static MTLIndexType gpu_index_type_to_metal(GPUIndexBufType type)
  {
    return (type == GPU_INDEX_U16) ? MTLIndexTypeUInt16 : MTLIndexTypeUInt32;
  }

 private:
  void strip_restart_indices() override;

  MEM_CXX_CLASS_ALLOC_FUNCS("MTLIndexBuf")
};

}  // namespace blender::gpu

diff --git a/source/blender/gpu/metal/mtl_index_buffer.mm b/source/blender/gpu/metal/mtl_index_buffer.mm
new file mode 100644
index 00000000000..4a7875aaeb0
--- /dev/null
+++ b/source/blender/gpu/metal/mtl_index_buffer.mm
@@ -0,0 +1,516 @@

/** \file
 * \ingroup gpu
 */
#include "mtl_index_buffer.hh"
#include "mtl_context.hh"
#include "mtl_debug.hh"

#include "BLI_span.hh"

namespace blender::gpu {

/* -------------------------------------------------------------------- */
/** \name Core MTLIndexBuf implementation.
 * \{ */

MTLIndexBuf::~MTLIndexBuf()
{
  if (ibo_ != nullptr && !this->is_subrange_) {
    ibo_->free();
  }
  this->free_optimized_buffer();
}

void MTLIndexBuf::free_optimized_buffer()
{
  if (optimized_ibo_) {
    optimized_ibo_->free();
    optimized_ibo_ = nullptr;
  }
}

void MTLIndexBuf::bind_as_ssbo(uint32_t binding)
{
  /* Flag buffer as incompatible with optimized/patched buffers as contents
   * can now have partial modifications from the GPU. */
  this->flag_can_optimize(false);
  this->free_optimized_buffer();

  /* Ensure we have a valid IBO. */
  BLI_assert(this->ibo_);

  /* TODO(Metal): Support index buffer SSBOs. Dependent on compute impl. */
  MTL_LOG_WARNING("MTLIndexBuf::bind_as_ssbo not yet implemented!\n");
}

const uint32_t *MTLIndexBuf::read() const
{
  if (ibo_ != nullptr) {
    /* Return host pointer. */
    void *data = ibo_->get_host_ptr();
    return static_cast<const uint32_t *>(data);
  }
  BLI_assert(false && "Index buffer not ready to be read.");
  return nullptr;
}

void MTLIndexBuf::upload_data()
{
  /* Handle subrange upload. */
  if (is_subrange_) {
    MTLIndexBuf *mtlsrc = static_cast<MTLIndexBuf *>(src_);
    mtlsrc->upload_data();

#ifndef NDEBUG
    BLI_assert_msg(!mtlsrc->point_restarts_stripped_,
                   "Cannot use subrange on stripped point buffer.");
#endif

    /* If the parent allocation has changed,
     * update our index buffer to match. */
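    /* NOTE: A subrange does not own its own Metal allocation; it aliases the
     * parent's gpu::MTLBuffer handle and size, so both are refreshed here
     * whenever the parent reallocates. */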
    if (alloc_size_ != mtlsrc->alloc_size_ || ibo_ != mtlsrc->ibo_) {
      /* Update index buffer and allocation from source. */
      alloc_size_ = mtlsrc->alloc_size_;
      ibo_ = mtlsrc->ibo_;

      /* Reset any allocated patched or optimized index buffers. */
      this->free_optimized_buffer();
    }
    return;
  }

  /* If new data is ready and an index buffer already exists, release the current one. */
  if ((ibo_ != nullptr) && (this->data_ != nullptr)) {
    MTL_LOG_INFO("Re-creating index buffer with new data. IndexBuf %p\n", this);
    ibo_->free();
    ibo_ = nullptr;
  }

  /* Prepare buffer and upload data. */
  if (ibo_ == nullptr && data_ != nullptr) {
    alloc_size_ = this->size_get();
    if (alloc_size_ == 0) {
      MTL_LOG_WARNING("[Metal] Warning! Trying to allocate index buffer with size=0 bytes\n");
    }
    else {
      ibo_ = MTLContext::get_global_memory_manager().allocate_with_data(alloc_size_, true, data_);
      BLI_assert(ibo_);
      ibo_->set_label(@"Index Buffer");
    }

    /* No need to keep copy of data_ in system memory. */
    MEM_SAFE_FREE(data_);
  }
}

void MTLIndexBuf::update_sub(uint32_t start, uint32_t len, const void *data)
{
  BLI_assert(!is_subrange_);

  /* If host-side data still exists, modify and upload as normal. */
  if (data_ != nullptr) {

    /* Free index buffer if one exists. */
    if (ibo_ != nullptr && !this->is_subrange_) {
      ibo_->free();
      ibo_ = nullptr;
    }

    BLI_assert(start + len <= this->size_get());

    /* Apply start byte offset to data pointer and modify host-side data. */
    uint8_t *modified_base_ptr = static_cast<uint8_t *>(data_) + start;
    memcpy(modified_base_ptr, data, len);
    return;
  }

  /* Verify buffer. */
  BLI_assert(ibo_ != nullptr);

  /* Otherwise, we will inject a data update, using staged data, into the command stream.
   * Stage update contents in a temporary buffer. */
  MTLContext *ctx = static_cast<MTLContext *>(unwrap(GPU_context_active_get()));
  BLI_assert(ctx);
  MTLTemporaryBuffer range = ctx->get_scratchbuffer_manager().scratch_buffer_allocate_range(len);
  memcpy(range.data, data, len);

  /* Copy updated contents into the primary buffer.
   * These changes need to be uploaded via blit to ensure the data copies happen in-order. */
  id<MTLBuffer> dest_buffer = ibo_->get_metal_buffer();
  BLI_assert(dest_buffer != nil);

  id<MTLBlitCommandEncoder> enc = ctx->main_command_buffer.ensure_begin_blit_encoder();
  [enc copyFromBuffer:range.metal_buffer
         sourceOffset:(uint32_t)range.buffer_offset
             toBuffer:dest_buffer
    destinationOffset:start
                 size:len];

  /* Synchronise changes back to host to ensure CPU-side data is up-to-date for
   * non-Shared buffers. */
  if (dest_buffer.storageMode == MTLStorageModeManaged) {
    [enc synchronizeResource:dest_buffer];
  }

  /* Invalidate patched/optimized buffers. */
  this->free_optimized_buffer();

  /* Flag buffer as incompatible with optimized/patched buffers as contents
   * have partial modifications. */
  this->flag_can_optimize(false);
}

void MTLIndexBuf::flag_can_optimize(bool can_optimize)
{
  can_optimize_ = can_optimize;
}

/** \} */

/* -------------------------------------------------------------------- */
/** \name Index buffer optimization and topology emulation.
 * Optimise index buffers by eliminating restart indices, and emulate
 * unsupported index types such as TriangleFan and LineLoop.
 * \{ */

/* Returns the total number of vertices in the new buffer. */
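/* For example, with restart index R, the strip [0, 1, 2, 3, R, 4, 5, 6]
 * becomes the TriangleList [0, 1, 2,  1, 3, 2,  4, 5, 6]: each triangle after
 * the first in a strip is rebuilt from the previous two emitted indices plus
 * the incoming index, flipping the winding of every other triangle to
 * preserve facing. */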
template<typename T>
static uint32_t populate_optimized_tri_strip_buf(Span<T> original_data,
                                                 MutableSpan<T> output_data,
                                                 uint32_t input_index_len)
{
  /* Generate TriangleList from TriangleStrip. */
  uint32_t current_vert_len = 0;
  uint32_t current_output_ind = 0;
  T indices[3];

  for (uint32_t c_index = 0; c_index < input_index_len; c_index++) {
    T current_index = original_data[c_index];
    if (current_index == T(-1)) {
      /* Stop current primitive. Move onto next. */
      current_vert_len = 0;
    }
    else {
      if (current_vert_len < 3) {
        /* Prepare first triangle.
         * Cache indices before generating a triangle,
         * in case we have bad primitive-restarts. */
        indices[current_vert_len] = current_index;
      }

      /* Emit the first triangle once the first 3 indices of a strip are cached. */
      if (current_vert_len == 2) {
        output_data[current_output_ind++] = indices[0];
        output_data[current_output_ind++] = indices[1];
        output_data[current_output_ind++] = indices[2];
      }
      else if (current_vert_len >= 3) {
        /* All other triangles in the strip are populated using data from the
         * previous two emitted indices and the latest index. */
        uint32_t tri_id = current_vert_len - 3;
        uint32_t base_output_ind = current_output_ind;
        if ((tri_id % 2) == 0) {
          output_data[base_output_ind + 0] = output_data[base_output_ind - 2];
          output_data[base_output_ind + 1] = current_index;
          output_data[base_output_ind + 2] = output_data[base_output_ind - 1];
        }
        else {
          output_data[base_output_ind + 0] = output_data[base_output_ind - 1];
          output_data[base_output_ind + 1] = output_data[base_output_ind - 2];
          output_data[base_output_ind + 2] = current_index;
        }
        current_output_ind += 3;
      }

      /* Increment relative vertex index. */
      current_vert_len++;
    }
  }
  return current_output_ind;
}

/* Returns the total number of vertices in the new buffer. */
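/* For example, the fan [0, 1, 2, 3, 4] becomes the TriangleList
 * [0, 1, 2,  0, 2, 3,  0, 3, 4]: every triangle shares the fan origin and
 * chains the previously emitted index with the incoming one. */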
template<typename T>
static uint32_t populate_emulated_tri_fan_buf(Span<T> original_data,
                                              MutableSpan<T> output_data,
                                              uint32_t input_index_len)
{
  /* Generate TriangleList from TriangleFan. */
  T base_prim_ind_val = 0;
  uint32_t current_vert_len = 0;
  uint32_t current_output_ind = 0;
  T indices[3];

  for (uint32_t c_index = 0; c_index < input_index_len; c_index++) {
    T current_index = original_data[c_index];
    if (current_index == T(-1)) {
      /* Stop current primitive. Move onto next. */
      current_vert_len = 0;
    }
    else {
      if (current_vert_len < 3) {
        /* Prepare first triangle.
         * Cache indices before generating a triangle,
         * in case we have bad primitive-restarts. */
        indices[current_vert_len] = current_index;
      }

      /* Emit the first triangle once the first 3 indices of a fan are cached. */
      if (current_vert_len == 2) {
        output_data[current_output_ind++] = indices[0];
        output_data[current_output_ind++] = indices[1];
        output_data[current_output_ind++] = indices[2];
        base_prim_ind_val = indices[0];
      }
      else if (current_vert_len >= 3) {
        /* All other triangles in the fan are populated using the fan origin,
         * the previously emitted index and the latest index. */
        uint32_t base_output_ind = current_output_ind;

        output_data[base_output_ind + 0] = base_prim_ind_val;
        output_data[base_output_ind + 1] = output_data[base_output_ind - 1];
        output_data[base_output_ind + 2] = current_index;
        current_output_ind += 3;
      }

      /* Increment relative vertex index. */
      current_vert_len++;
    }
  }
  return current_output_ind;
}

id<MTLBuffer> MTLIndexBuf::get_index_buffer(GPUPrimType &in_out_primitive_type,
                                            uint32_t &in_out_v_count)
{
  /* Determine whether to return the original index buffer, emulate an
   * unsupported primitive type, or optimise a restart-compatible type for
   * faster performance. */
  bool should_optimize_or_emulate = (in_out_primitive_type == GPU_PRIM_TRI_FAN) ||
                                    (in_out_primitive_type == GPU_PRIM_TRI_STRIP);
  if (!should_optimize_or_emulate || is_subrange_ || !can_optimize_) {
    /* Ensure we are not optimized. */
    BLI_assert(this->optimized_ibo_ == nullptr);

    /* Return regular index buffer. */
    BLI_assert(this->ibo_ && this->ibo_->get_metal_buffer());
    return this->ibo_->get_metal_buffer();
  }

  /* Perform optimization on type. */
  GPUPrimType input_prim_type = in_out_primitive_type;
  this->upload_data();
  if (!ibo_ && optimized_ibo_ == nullptr) {
    /* Cannot optimize buffer if no source IBO exists. */
    return nil;
  }

  /* Verify whether an existing optimized index buffer is still valid. */
  if (optimized_ibo_ != nullptr && optimized_primitive_type_ != input_prim_type) {
    BLI_assert_msg(false,
                   "Cannot change the optimized primitive format after generation, as source "
                   "index buffer data is discarded.");
    return nil;
  }

  /* Generate optimized index buffer. */
  if (optimized_ibo_ == nullptr) {

    /* Generate unwrapped index buffer. */
    switch (input_prim_type) {
      case GPU_PRIM_TRI_FAN: {

        /* Calculate maximum size. */
        uint32_t max_possible_verts = (this->index_len_ - 2) * 3;
        BLI_assert(max_possible_verts > 0);

        /* Allocate new buffer. */
        optimized_ibo_ = MTLContext::get_global_memory_manager().allocate(
            max_possible_verts *
                ((index_type_ == GPU_INDEX_U16) ? sizeof(uint16_t) : sizeof(uint32_t)),
            true);

        /* Populate new index buffer. */
        if (index_type_ == GPU_INDEX_U16) {
          Span<uint16_t> orig_data(static_cast<const uint16_t *>(ibo_->get_host_ptr()),
                                   this->index_len_);
          MutableSpan<uint16_t> output_data(
              static_cast<uint16_t *>(optimized_ibo_->get_host_ptr()), max_possible_verts);
          emulated_v_count = populate_emulated_tri_fan_buf<uint16_t>(
              orig_data, output_data, this->index_len_);
        }
        else {
          Span<uint32_t> orig_data(static_cast<const uint32_t *>(ibo_->get_host_ptr()),
                                   this->index_len_);
          MutableSpan<uint32_t> output_data(
              static_cast<uint32_t *>(optimized_ibo_->get_host_ptr()), max_possible_verts);
          emulated_v_count = populate_emulated_tri_fan_buf<uint32_t>(
              orig_data, output_data, this->index_len_);
        }

        BLI_assert(emulated_v_count <= max_possible_verts);

        /* Flush buffer and output. */
        optimized_ibo_->flush();
        optimized_primitive_type_ = input_prim_type;
        in_out_v_count = emulated_v_count;
        in_out_primitive_type = GPU_PRIM_TRIS;
      } break;

      case GPU_PRIM_TRI_STRIP: {
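        /* A strip of N indices describes at most N - 2 triangles, so the
         * unwrapped TriangleList needs at most 3 * (N - 2) indices; restart
         * indices only ever reduce the final count below this bound. */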
        /* Calculate maximum size. */
        uint32_t max_possible_verts = (this->index_len_ - 2) * 3;
        BLI_assert(max_possible_verts > 0);

        /* Allocate new buffer. */
        optimized_ibo_ = MTLContext::get_global_memory_manager().allocate(
            max_possible_verts *
                ((index_type_ == GPU_INDEX_U16) ? sizeof(uint16_t) : sizeof(uint32_t)),
            true);

        /* Populate new index buffer. */
        if (index_type_ == GPU_INDEX_U16) {
          Span<uint16_t> orig_data(static_cast<const uint16_t *>(ibo_->get_host_ptr()),
                                   this->index_len_);
          MutableSpan<uint16_t> output_data(
              static_cast<uint16_t *>(optimized_ibo_->get_host_ptr()), max_possible_verts);
          emulated_v_count = populate_optimized_tri_strip_buf<uint16_t>(
              orig_data, output_data, this->index_len_);
        }
        else {
          Span<uint32_t> orig_data(static_cast<const uint32_t *>(ibo_->get_host_ptr()),
                                   this->index_len_);
          MutableSpan<uint32_t> output_data(
              static_cast<uint32_t *>(optimized_ibo_->get_host_ptr()), max_possible_verts);
          emulated_v_count = populate_optimized_tri_strip_buf<uint32_t>(
              orig_data, output_data, this->index_len_);
        }

        BLI_assert(emulated_v_count <= max_possible_verts);

        /* Flush buffer and output. */
        optimized_ibo_->flush();
        optimized_primitive_type_ = input_prim_type;
        in_out_v_count = emulated_v_count;
        in_out_primitive_type = GPU_PRIM_TRIS;
      } break;

      case GPU_PRIM_LINE_STRIP: {
        /* TODO(Metal): Line strip topology types would benefit from optimization to remove
         * primitive restarts; however, these do not occur frequently, nor with
         * significant geometry counts. */
        MTL_LOG_INFO("TODO: Primitive topology: Optimise line strip topology types\n");
      } break;

      case GPU_PRIM_LINE_LOOP: {
        /* TODO(Metal): The LineLoop primitive type requires an optimized index buffer for
         * emulation if used with indexed rendering. This path is currently not hit, as LineLoop
         * does not currently appear to be used alongside an index buffer. */
        MTL_LOG_WARNING(
            "TODO: Primitive topology: Line Loop index buffer optimization required for "
            "emulation.\n");
      } break;

      case GPU_PRIM_TRIS:
      case GPU_PRIM_LINES:
      case GPU_PRIM_POINTS: {
        /* Should not get here - TRIS/LINES/POINTS do not require emulation or optimization. */
        BLI_assert_unreachable();
        return nil;
      }

      default:
        /* Should not get here - Invalid primitive type. */
        BLI_assert_unreachable();
        break;
    }
  }

  /* Return optimized buffer. */
  if (optimized_ibo_ != nullptr) {

    /* Delete the original buffer if one still exists, as we no longer need it. */
    if (ibo_ != nullptr) {
      ibo_->free();
      ibo_ = nullptr;
    }

    /* Output params. */
    in_out_v_count = emulated_v_count;
    in_out_primitive_type = GPU_PRIM_TRIS;
    return optimized_ibo_->get_metal_buffer();
  }
  return nil;
}
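/* For example, with restart index R, the point index buffer [R, 5, 7, R, 9]
 * is compacted in place to [9, 5, 7] (index_len_ = 3): each restart is either
 * swapped with the last valid index or trimmed off the end of the buffer. */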
void MTLIndexBuf::strip_restart_indices()
{
  /* We remove point buffer primitive restart indices by swapping restart indices
   * with the first valid index at the end of the index buffer and reducing the
   * length. Primitive restarts are invalid in Metal for non-restart-compatible
   * primitive types. We also cannot just use zero, unlike for Lines and Triangles,
   * as we cannot create degenerate point primitives to hide geometry, because each
   * point is independent.
   * Instead, we must remove these hidden indices from the index buffer.
   * NOTE: This happens prior to index squeezing, so we operate on 32-bit indices. */
  MutableSpan<uint32_t> uint_idx(static_cast<uint32_t *>(data_), index_len_);
  for (uint i = 0; i < index_len_; i++) {
    if (uint_idx[i] == 0xFFFFFFFFu) {

      /* Find swap index at the end of the index buffer. */
      int swap_index = -1;
      for (int j = index_len_ - 1; j >= int(i); j--) {
        /* If the end index is also a restart, just reduce the length. */
        if (uint_idx[j] == 0xFFFFFFFFu) {
          index_len_--;
          continue;
        }
        /* Otherwise assign the swap index. */
        swap_index = j;
        break;
      }

      /* If the swap index is not valid, then there were no valid non-restart indices
       * to swap with. However, the above loop will have removed these indices by
       * reducing the length. Debug assertions verify that the restart
       * index is no longer included. */
      if (swap_index == -1) {
        BLI_assert(index_len_ <= i);
      }
      else {
        /* If we have found an index we can swap with, flip the values.
         * We also reduce the length. As per the above loop, swap_index should
         * now be outside the index length range. */
        uint32_t swap_index_value = uint_idx[swap_index];
        uint_idx[i] = swap_index_value;
        uint_idx[swap_index] = 0xFFFFFFFFu;
        index_len_--;
        BLI_assert(index_len_ <= uint(swap_index));
      }
    }
  }

#ifndef NDEBUG
  /* Flag as having been stripped to ensure invalid usage is tracked. */
  point_restarts_stripped_ = true;
#endif
}

/** \} */

}  // namespace blender::gpu

diff --git a/source/blender/gpu/metal/mtl_query.hh b/source/blender/gpu/metal/mtl_query.hh
index c1ec9a2a0f5..03436fcd67d 100644
--- a/source/blender/gpu/metal/mtl_query.hh
+++ b/source/blender/gpu/metal/mtl_query.hh
@@ -25,7 +25,7 @@ class MTLQueryPool : public QueryPool {
   MTLVisibilityResultMode mtl_type_;
   Vector buffer_;

-  void allocate_buffer();
+  void allocate();

 public:
  MTLQueryPool();

diff --git a/source/blender/gpu/metal/mtl_query.mm b/source/blender/gpu/metal/mtl_query.mm
index 8983ea7ec44..f4bd5754b77 100644
--- a/source/blender/gpu/metal/mtl_query.mm
+++ b/source/blender/gpu/metal/mtl_query.mm
@@ -16,7 +16,7 @@ static const size_t VISIBILITY_RESULT_SIZE_IN_BYTES = 8;

 MTLQueryPool::MTLQueryPool()
 {
-  allocate_buffer();
+  allocate();
 }

 MTLQueryPool::~MTLQueryPool()
@@ -26,7 +26,7 @@ MTLQueryPool::~MTLQueryPool()
   }
 }

-void MTLQueryPool::allocate_buffer()
+void MTLQueryPool::allocate()
 {
   /* Allocate Metal buffer for visibility results. */
   size_t buffer_size_in_bytes = VISIBILITY_COUNT_PER_BUFFER * VISIBILITY_RESULT_SIZE_IN_BYTES;
@@ -62,7 +62,7 @@ void MTLQueryPool::begin_query()
   int query_id = query_issued_;
   int requested_buffer = query_id / VISIBILITY_COUNT_PER_BUFFER;
   if (requested_buffer >= buffer_.size()) {
-    allocate_buffer();
+    allocate();
   }

   BLI_assert(requested_buffer < buffer_.size());
-- cgit v1.2.3