/* SPDX-License-Identifier: GPL-2.0-or-later */

#pragma once

#include <atomic>
#include <functional>
#include <map>
#include <mutex>
#include <set>
#include <unordered_map>

#include "mtl_common.hh"

#include <Cocoa/Cocoa.h>
#include <Metal/Metal.h>
#include <QuartzCore/QuartzCore.h>

@class CAMetalLayer;
@class MTLCommandQueue;
@class MTLRenderPipelineState;

/* Metal Memory Manager Overview. */

/*
 * The Metal Backend Memory manager is designed to provide an interface
 * for all other MTL_* modules where memory allocation is required.
 *
 * Different allocation strategies and data-structures are used depending
 * on how the data is used by the backend. These aim to optimally handle
 * system memory and abstract away any complexity from the MTL_* modules
 * themselves.
 *
 * There are two primary allocation modes which can be used:
 *
 * ** MTLScratchBufferManager **
 *
 *    Each MTLContext owns a ScratchBufferManager which is implemented
 *    as a pool of circular buffers, designed to handle temporary
 *    memory allocations which occur on a per-frame basis. The scratch
 *    buffers allow flushing of host memory to the GPU to be batched.
 *
 *    Each frame, the next scratch buffer is reset, then later flushed upon
 *    command buffer submission.
 *
 *    Note: This is allocated per-context, as allocations are tied to
 *    context-specific workload submissions.
 *
 *    Examples of scratch buffer usage are:
 *    - Immediate-mode temporary vertex buffers.
 *    - Shader uniform data updates.
 *    - Staging of data for resource copies, or data reads/writes.
 *
 *    Usage:
 *    MTLContext::get_scratchbuffer_manager() - to fetch active manager.
 *
 *    MTLTemporaryBuffer scratch_buffer_allocate_range(size)
 *    MTLTemporaryBuffer scratch_buffer_allocate_range_aligned(size, align)
 *
 * ---------------------------------------------------------------------------------
 * ** MTLBufferPool **
 *
 *    For static and longer-lasting memory allocations, such as those for UBOs,
 *    vertex buffers, index buffers, etc; we want an optimal abstraction for
 *    fetching a MTLBuffer of the desired size and resource options.
 *
 *    Memory allocations can be expensive, so the MTLBufferPool provides
 *    functionality to track usage of these buffers. Once a buffer
 *    is no longer in use, it is returned to the buffer pool for use
 *    by another backend resource.
 *
 *    The MTLBufferPool provides functionality for safe tracking of resources,
 *    as buffers freed on the host side must have their usage by the GPU tracked,
 *    to ensure they are not re-used before the GPU has finished with them.
 *
 *    Note: The MTLBufferPool is a global construct which can be fetched from anywhere.
 *
 *    Usage:
 *    MTLContext::get_global_memory_manager(); - static routine to fetch global memory manager.
 *
 *    gpu::MTLBuffer *allocate_buffer(size, is_cpu_visible, bytes=nullptr)
 *    gpu::MTLBuffer *allocate_buffer_aligned(size, alignment, is_cpu_visible, bytes=nullptr)
 */
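/* Example (illustrative sketch, not part of this header's API surface): the two allocation
 * modes side by side. Assumes a valid MTLContext `ctx` inside an active frame, that
 * `get_scratchbuffer_manager()` returns a reference to the context's manager, that
 * `get_global_memory_manager()` yields a pointer to the MTLBufferPool, and that
 * `src_data`/`src_size` describe caller-provided CPU data.
 *
 *   // Per-frame temporary data via the scratch buffer manager:
 *   MTLScratchBufferManager &scratch = ctx->get_scratchbuffer_manager();
 *   MTLTemporaryBuffer range = scratch.scratch_buffer_allocate_range_aligned(src_size, 256);
 *   memcpy(range.data, src_data, src_size);
 *   // Bind range.metal_buffer at range.buffer_offset; contents are flushed in bulk
 *   // upon command buffer submission.
 *
 *   // Longer-lived allocation via the global buffer pool:
 *   gpu::MTLBuffer *buf = MTLContext::get_global_memory_manager()->allocate_buffer(
 *       src_size, true, src_data);
 *   // ... use across frames ...
 *   buf->free(); // Returns the buffer to the pool once GPU usage completes.
 */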
/* Debug memory statistics: Disabled by Macro rather than guarded for
 * performance considerations. */
#define MTL_DEBUG_MEMORY_STATISTICS 0

/* Allows a scratch buffer to temporarily grow beyond its maximum, which allows submission
 * of one-time-use data packets which are too large. */
#define MTL_SCRATCH_BUFFER_ALLOW_TEMPORARY_EXPANSION 1

namespace blender::gpu {

/* Forward Declarations. */
class MTLContext;
class MTLCommandBufferManager;
class MTLUniformBuf;

/* -------------------------------------------------------------------- */
/** \name Memory Management.
 * \{ */

/* MTLBuffer allocation wrapper. */
class MTLBuffer {

 private:
  /* Metal resource. */
  id<MTLBuffer> metal_buffer_;

  /* Host-visible mapped-memory pointer. Behavior depends on buffer type:
   * - Shared buffers: pointer represents base address of #MTLBuffer whose data
   *                   access has shared access by both the CPU and GPU on
   *                   Unified Memory Architectures (UMA).
   * - Managed buffer: Host-side mapped buffer region for CPU (Host) access. Managed buffers
   *                   must be manually flushed to transfer data to the GPU-resident buffer.
   * - Private buffer: Host access is invalid, `data` will be nullptr. */
  void *data_;

  /* Whether buffer is allocated from an external source. */
  bool is_external_ = false;

  /* Allocation info. */
  MTLResourceOptions options_;
  id<MTLDevice> device_;
  uint64_t alignment_;
  uint64_t size_;

  /* Allocated size may be larger than actual size. */
  uint64_t usage_size_;

  /* Lifetime info - whether the current buffer is actively in use. A buffer
   * should be in use after it has been allocated. De-allocating the buffer, and
   * returning it to the free buffer pool, will set in_use to false. Using a buffer
   * while it is not in-use should not be allowed and results in an error. */
  std::atomic<bool> in_use_;

 public:
  MTLBuffer(id<MTLDevice> device, uint64_t size, MTLResourceOptions options, uint alignment = 1);
  MTLBuffer(id<MTLBuffer> external_buffer);
  ~MTLBuffer();

  /* Fetch information about backing MTLBuffer. */
  id<MTLBuffer> get_metal_buffer() const;
  void *get_host_ptr() const;
  uint64_t get_size_used() const;
  uint64_t get_size() const;

  /* Flush data to GPU. */
  void flush();
  void flush_range(uint64_t offset, uint64_t length);
  bool requires_flush();

  /* Buffer usage tracking. */
  void flag_in_use(bool used);
  bool get_in_use();
  void set_usage_size(uint64_t size_used);

  /* Debug. */
  void set_label(NSString *str);

  /* Read properties. */
  MTLResourceOptions get_resource_options();
  uint64_t get_alignment();

  /* Resource-local free: For buffers allocated via the memory manager,
   * this will call the context `free_buffer` method to return the buffer to the context memory
   * pool.
   *
   * Otherwise, free will release the associated metal resource.
   * As a note, calling the destructor will also destroy the buffer and associated metal
   * resource. */
  void free();

  /* Safety check to ensure buffers are not used after free. */
  void debug_ensure_used();
};

/* View into part of an MTLBuffer. */
struct MTLBufferRange {
  id<MTLBuffer> metal_buffer;
  void *data;
  uint64_t buffer_offset;
  uint64_t size;
  MTLResourceOptions options;

  void flush();
  bool requires_flush();
};

/* Circular scratch buffer allocations should be seen as temporary and only used within the
 * lifetime of the frame. */
using MTLTemporaryBuffer = MTLBufferRange;

/* Round-Robin Circular-buffer. */
class MTLCircularBuffer {
  friend class MTLScratchBufferManager;

 private:
  MTLContext &own_context_;

  /* Wrapped MTLBuffer allocation handle. */
  gpu::MTLBuffer *cbuffer_;

  /* Current offset where the next allocation will begin. */
  uint64_t current_offset_;

  /* Whether the Circular Buffer can grow during re-allocation if
   * the size is exceeded. */
  bool can_resize_;

  /* Usage information. */
  uint64_t used_frame_index_;
  uint64_t last_flush_base_offset_;

 public:
  MTLCircularBuffer(MTLContext &ctx, uint64_t initial_size, bool allow_grow);
  ~MTLCircularBuffer();

  MTLTemporaryBuffer allocate_range(uint64_t alloc_size);
  MTLTemporaryBuffer allocate_range_aligned(uint64_t alloc_size, uint alignment);
  void flush();

  /* Reset pointer back to start of circular buffer. */
  void reset();
};
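/* Example (hedged sketch): writing through a temporary range returned by a circular buffer.
 * Assumes `cbuf` is an initialized MTLCircularBuffer and `uniforms`/`size` are caller-provided.
 *
 *   MTLTemporaryBuffer range = cbuf.allocate_range_aligned(size, 256);
 *   memcpy(range.data, uniforms, size);
 *   if (range.requires_flush()) {
 *     // Managed buffers require an explicit flush to make CPU writes visible to the GPU;
 *     // shared (UMA) buffers report false here, and the copy alone suffices.
 *     range.flush();
 *   }
 */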
/* Wrapper struct used by the Memory Manager to sort and compare gpu::MTLBuffer resources inside
 * the memory pools. */
struct MTLBufferHandle {
  gpu::MTLBuffer *buffer;
  uint64_t buffer_size;

  inline MTLBufferHandle(gpu::MTLBuffer *buf)
  {
    this->buffer = buf;
    this->buffer_size = this->buffer->get_size();
  }

  inline MTLBufferHandle(uint64_t compare_size)
  {
    this->buffer = nullptr;
    this->buffer_size = compare_size;
  }
};

struct CompareMTLBuffer {
  bool operator()(const MTLBufferHandle &lhs, const MTLBufferHandle &rhs) const
  {
    return lhs.buffer_size < rhs.buffer_size;
  }
};

/* An MTLSafeFreeList is a temporary list of gpu::MTLBuffers which have
 * been freed by the high-level backend, but are pending GPU work execution before
 * the gpu::MTLBuffers can be returned to the Memory manager pools.
 * This list is implemented as a chunked linked-list.
 *
 * Only a single MTLSafeFreeList is active at one time and is associated with current command
 * buffer submissions. If an MTLBuffer is freed during the lifetime of a command buffer, it could
 * still possibly be in-use and as such, the MTLSafeFreeList will increment its reference count
 * for each command buffer submitted while the current pool is active.
 *
 * -- Reference count is incremented upon MTLCommandBuffer commit.
 * -- Reference count is decremented in the MTLCommandBuffer completion callback handler.
 *
 * A new MTLSafeFreeList will begin each render step (frame). This pooling of buffers, rather
 * than individual buffer resource tracking, reduces performance overhead.
 *
 * The reference count starts at 1 to ensure that the reference count cannot prematurely reach
 * zero before all associated command buffers have been submitted. This additional decrement
 * happens when the next MTLSafeFreeList is created, allowing the existing pool to be released
 * once the reference count hits zero after submitted command buffers complete.
 *
 * Note: the Metal API independently tracks resources used by command buffers for the purpose of
 * keeping resources alive while in-use by the driver and CPU. However, this differs from the
 * MTLSafeFreeList mechanism in the Metal backend, which exists for the purpose of allowing
 * previously allocated MTLBuffer resources to be re-used. This allows us to save on the
 * expensive cost of memory allocation. */
class MTLSafeFreeList {
  friend class MTLBufferPool;

 private:
  std::atomic<int> reference_count_;
  std::atomic<bool> in_free_queue_;
  std::recursive_mutex lock_;

  /* Linked list of next MTLSafeFreeList chunk if current chunk is full. */
  std::atomic<bool> has_next_pool_;
  std::atomic<MTLSafeFreeList *> next_;

  /* Lockless list. MAX_NUM_BUFFERS_ within a chunk based on considerations
   * for performance and memory. */
  static const int MAX_NUM_BUFFERS_ = 1024;
  std::atomic<int> current_list_index_;
  gpu::MTLBuffer *safe_free_pool_[MAX_NUM_BUFFERS_];

 public:
  MTLSafeFreeList();

  /* Add buffer to Safe Free List; can be called from secondary threads.
   * Performs a lockless list insert. */
  void insert_buffer(gpu::MTLBuffer *buffer);

  /* Increments command buffer reference count. */
  void increment_reference();

  /* Decrement and return of buffers to pool occur on the MTLCommandBuffer completion callback
   * thread. */
  void decrement_reference();

  void flag_in_queue()
  {
    in_free_queue_ = true;
    if (has_next_pool_) {
      MTLSafeFreeList *next_pool = next_.load();
      BLI_assert(next_pool != nullptr);
      next_pool->flag_in_queue();
    }
  }
};
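/* Reference-count lifecycle sketch (illustrative, mirroring the description above; `pool` is
 * the owning MTLBufferPool declared below):
 *
 *   MTLSafeFreeList *list = pool.get_current_safe_list(); // Reference count starts at 1.
 *   list->insert_buffer(buffer);   // Host-side free; actual release is deferred.
 *   list->increment_reference();   // On each MTLCommandBuffer commit.
 *   // ... GPU executes the submitted work ...
 *   list->decrement_reference();   // In each command buffer's completion callback.
 *   pool.begin_new_safe_list();    // Next frame: drops the initial reference, so the list
 *                                  // can reach zero once in-flight work completes.
 */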
/* MTLBuffer pools. */
/* Allocating Metal buffers is expensive, so we cache all allocated buffers,
 * and when requesting a new buffer, find one which fits the required dimensions
 * from an existing pool of buffers.
 *
 * When freeing MTLBuffers, we insert them into the current MTLSafeFreeList, which defers
 * release of the buffer until the associated command buffers have finished executing.
 * This prevents a buffer from being re-used while it is still in-use by the GPU.
 *
 * - Once command buffers complete, MTLSafeFreeLists associated with the current
 *   command buffer submission are added to the `completed_safelist_queue_`.
 *
 * - At a set point in time, all MTLSafeFreeLists in `completed_safelist_queue_` have their
 *   MTLBuffers re-inserted into the Memory Manager's pools. */
class MTLBufferPool {

 private:
  /* Memory statistics. */
  long long int total_allocation_bytes_ = 0;

#if MTL_DEBUG_MEMORY_STATISTICS == 1
  /* Debug statistics. */
  std::atomic<int> per_frame_allocation_count_;
  std::atomic<long long int> allocations_in_pool_;
  std::atomic<long long int> buffers_in_pool_;
#endif

  /* Metal resources. */
  bool ensure_initialised_ = false;
  id<MTLDevice> device_ = nil;

  /* The buffer selection aims to pick a buffer which meets the minimum size requirements.
   * To do this, we keep an ordered set of all available buffers. If a candidate buffer is
   * larger than the desired allocation size, we check it against
   * `mtl_buffer_size_threshold_factor_`, which defines how much larger than the requested
   * allocation a pooled buffer may be (e.g. 1.26 permits a buffer up to 26% larger).
   * - A higher value results in greater re-use of previously allocated buffers of similar
   *   sizes.
   * - A lower value may result in more dynamic allocations, but minimizes memory usage for a
   *   given scenario.
   * The current value of 1.26 is calibrated for optimal performance and memory utilization. */
  static constexpr float mtl_buffer_size_threshold_factor_ = 1.26;

  /* Buffer pools using MTLResourceOptions as key for allocation type.
   * Aliased as 'uint64_t' for map type compatibility.
   * - A size-ordered list (MultiSet) of allocated buffers is kept per MTLResourceOptions
   *   permutation. This allows efficient lookup for buffers of a given requested size.
   * - MTLBufferHandle wraps a gpu::MTLBuffer pointer to achieve easy size-based sorting
   *   via CompareMTLBuffer. */
  using MTLBufferPoolOrderedList = std::multiset<MTLBufferHandle, CompareMTLBuffer>;
  using MTLBufferResourceOptions = uint64_t;

  blender::Map<MTLBufferResourceOptions, MTLBufferPoolOrderedList *> buffer_pools_;
  blender::Vector<gpu::MTLBuffer *> allocations_;

  /* Maintain a queue of all MTLSafeFreeLists that have been released
   * by the GPU and are ready to have their buffers re-inserted into the
   * MemoryManager pools.
   * Access to this queue is made thread-safe through safelist_lock_. */
  std::mutex safelist_lock_;
  blender::Vector<MTLSafeFreeList *> completed_safelist_queue_;

  /* Current free list, associated with active MTLCommandBuffer submission. */
  /* MTLBuffer::free() can be called from separate threads, due to usage within animation
   * system/worker threads. */
  std::atomic<MTLSafeFreeList *> current_free_list_;

 public:
  void init(id<MTLDevice> device);
  ~MTLBufferPool();

  gpu::MTLBuffer *allocate_buffer(uint64_t size, bool cpu_visible, const void *bytes = nullptr);
  gpu::MTLBuffer *allocate_buffer_aligned(uint64_t size,
                                          uint alignment,
                                          bool cpu_visible,
                                          const void *bytes = nullptr);
  bool free_buffer(gpu::MTLBuffer *buffer);

  /* Flush MTLSafeFreeList buffers, for completed lists in `completed_safelist_queue_`,
   * back to memory pools. */
  void update_memory_pools();

  /* Access and control over active MTLSafeFreeList. */
  MTLSafeFreeList *get_current_safe_list();
  void begin_new_safe_list();

  /* Add a completed MTLSafeFreeList to `completed_safelist_queue_`. */
  void push_completed_safe_list(MTLSafeFreeList *list);

 private:
  void ensure_buffer_pool(MTLResourceOptions options);
  void insert_buffer_into_pool(MTLResourceOptions options, gpu::MTLBuffer *buffer);
  void free();
};
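/* Example (hedged sketch): a pooled allocation round trip. Assumes the global memory manager
 * has been initialized with a valid device, that `get_global_memory_manager()` yields a
 * pointer to the MTLBufferPool, and that `data_size`/`initial_data` are caller-provided.
 *
 *   gpu::MTLBuffer *ubo = MTLContext::get_global_memory_manager()->allocate_buffer_aligned(
 *       data_size, 256, true, initial_data);
 *   ubo->set_label(@"Example UBO");
 *   // ... bind and use within command buffers ...
 *   ubo->free(); // Inserted into the active MTLSafeFreeList rather than released
 *                // immediately; re-enters the pool via update_memory_pools() once the
 *                // GPU has finished with it.
 */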
/* Scratch buffers are circular-buffers used for temporary data within the current frame.
 * In order to preserve the integrity of contents when having multiple frames in flight,
 * we cycle through a collection of scratch buffers which are reset upon next use.
 *
 * Below are a series of properties, declared to manage scratch buffers. If a scratch buffer
 * overflows, the original buffer will be flushed and submitted, with references retained
 * by usage within the command buffer, and a new buffer will be created.
 * - The new buffer will grow in size to account for increased demand in temporary memory. */
class MTLScratchBufferManager {

 private:
  /* Maximum number of scratch buffers to allocate. This should be the maximum number of
   * simultaneous frames in flight. */
  static constexpr uint mtl_max_scratch_buffers_ = MTL_NUM_SAFE_FRAMES;

 public:
  /* Maximum size of single scratch buffer allocation. When re-sizing, this is the maximum size
   * the newly allocated buffers will grow to. Larger allocations are possible if
   * `MTL_SCRATCH_BUFFER_ALLOW_TEMPORARY_EXPANSION` is enabled, but these will instead allocate
   * new buffers from the memory pools on the fly. */
  static constexpr uint mtl_scratch_buffer_max_size_ = 128 * 1024 * 1024;

  /* Initial size of circular scratch buffers prior to growth. */
  static constexpr uint mtl_scratch_buffer_initial_size_ = 16 * 1024 * 1024;

 private:
  /* Parent MTLContext. */
  MTLContext &context_;
  bool initialised_ = false;

  /* Scratch buffer currently in-use. */
  uint current_scratch_buffer_ = 0;

  /* Scratch buffer pool. */
  MTLCircularBuffer *scratch_buffers_[mtl_max_scratch_buffers_];

 public:
  MTLScratchBufferManager(MTLContext &context) : context_(context) {}
  ~MTLScratchBufferManager();

  /* Explicit initialization and freeing of resources.
   * Initialization must occur after device creation. */
  void init();
  void free();

  /* Allocation functions for creating temporary allocations from the active circular buffer. */
  MTLTemporaryBuffer scratch_buffer_allocate_range(uint64_t alloc_size);
  MTLTemporaryBuffer scratch_buffer_allocate_range_aligned(uint64_t alloc_size, uint alignment);

  /* Ensure a new scratch buffer is started if we move onto a new frame.
   * Called when a new command buffer begins. */
  void ensure_increment_scratch_buffer();

  /* Flush memory for active scratch buffer to GPU.
   * This call will perform a partial flush of the buffer starting from
   * the last offset the data was flushed from, to the current offset. */
  void flush_active_scratch_buffer();
};

/** \} */

}  // namespace blender::gpu
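/* Example (hedged sketch): the per-frame scratch buffer cycle, as driven by a context.
 * Assumes `mgr` is the context's initialized MTLScratchBufferManager and `data`/`data_size`
 * are caller-provided.
 *
 *   mgr.ensure_increment_scratch_buffer(); // New command buffer: cycle to the next
 *                                          // circular buffer and reset it.
 *   MTLTemporaryBuffer tmp = mgr.scratch_buffer_allocate_range(data_size);
 *   memcpy(tmp.data, data, data_size);
 *   mgr.flush_active_scratch_buffer();     // On submission: partial flush from the last
 *                                          // flushed offset up to the current offset.
 */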