/* SPDX-License-Identifier: GPL-2.0-or-later */

#pragma once

#include <atomic>
#include <functional>
#include <map>
#include <mutex>
#include <set>
#include <unordered_map>

#include "mtl_common.hh"

#include <Cocoa/Cocoa.h>
#include <Metal/Metal.h>
#include <QuartzCore/QuartzCore.h>

@class CAMetalLayer;
@class MTLCommandQueue;
@class MTLRenderPipelineState;

/* Metal Memory Manager Overview. */
/*
 * The Metal Backend Memory manager is designed to provide an interface
 * for all other MTL_* modules where memory allocation is required.
 *
 * Different allocation strategies and data-structures are used depending
 * on how the data is used by the backend. These aim to optimally handle
 * system memory and abstract away any complexity from the MTL_* modules
 * themselves.
 *
 * There are two primary allocation modes which can be used:
 *
 * ** MTLScratchBufferManager **
 *
 *    Each MTLContext owns a ScratchBufferManager which is implemented
 *    as a pool of circular buffers, designed to handle temporary
 *    memory allocations which occur on a per-frame basis. The scratch
 *    buffers allow flushing of host memory to the GPU to be batched.
 *
 *    Each frame, the next scratch buffer is reset, then later flushed upon
 *    command buffer submission.
 *
 *    Note: This is allocated per-context, as allocations are tied to
 *    context-specific command buffer submissions.
 *
 *    Examples of scratch buffer usage are:
 *      - Immediate-mode temporary vertex buffers.
 *      - Shader uniform data updates.
 *      - Staging of data for resource copies, or data reads/writes.
 *
 *  Usage:
 *
 *    MTLContext::get_scratchbuffer_manager() - to fetch active manager.
 *
 *    MTLTemporaryBuffer scratch_buffer_allocate_range(size)
 *    MTLTemporaryBuffer scratch_buffer_allocate_range_aligned(size, align)
 *
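 *    A minimal usage sketch (illustrative only; assumes an active `MTLContext *ctx`,
 *    that the manager is returned by reference, and host data `src` of `size` bytes):
 *
 *      MTLScratchBufferManager &mgr = ctx->get_scratchbuffer_manager();
 *      MTLTemporaryBuffer range = mgr.scratch_buffer_allocate_range_aligned(size, 256);
 *      memcpy(range.data, src, size);  // Write into the host-visible mapping.
 *
 *    The written contents are flushed to the GPU in bulk upon command buffer
 *    submission (see flush_active_scratch_buffer).
 *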
 * ---------------------------------------------------------------------------------
 *  ** MTLBufferPool **
 *
 *    For static and longer-lasting memory allocations, such as those for UBOs,
 *    vertex buffers, index buffers, etc., we want an optimal abstraction for
 *    fetching a MTLBuffer of the desired size and resource options.
 *
 *    Memory allocations can be expensive, so the MTLBufferPool provides
 *    functionality to track usage of these buffers. Once a buffer is no longer
 *    in use, it is returned to the buffer pool for use by another backend
 *    resource.
 *
 *    The MTLBufferPool provides functionality for safe tracking of resources:
 *    buffers freed on the host side must have their GPU usage tracked, to
 *    ensure they are not re-used while the GPU is still working with them.
 *
 *    Note: The MTLBufferPool is a global construct which can be fetched from anywhere.
 *
 *  Usage:
 *    MTLContext::get_global_memory_manager() - static routine to fetch the global memory manager.
 *
 *    gpu::MTLBuffer *allocate_buffer(size, is_cpu_visible, bytes=nullptr)
 *    gpu::MTLBuffer *allocate_buffer_aligned(size, alignment, is_cpu_visible, bytes=nullptr)
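 *
 *    A minimal usage sketch (illustrative only; assumes the global manager is
 *    exposed as a `MTLBufferPool &`, with `bytes` and `size` as host inputs):
 *
 *      MTLBufferPool &pool = MTLContext::get_global_memory_manager();
 *      gpu::MTLBuffer *buf = pool.allocate_buffer(size, true, bytes);
 *      // ... bind/use the buffer in GPU work ...
 *      buf->free();  // Returned to the pool once pending GPU work completes.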
 */

/* Debug memory statistics: disabled via compile-time macro rather than a
 * runtime guard, for performance reasons. */
#define MTL_DEBUG_MEMORY_STATISTICS 0

/* Allows a scratch buffer to temporarily grow beyond its maximum size, enabling
 * submission of one-time-use data packets which are too large for the buffer. */
#define MTL_SCRATCH_BUFFER_ALLOW_TEMPORARY_EXPANSION 1

namespace blender::gpu {

/* Forward Declarations. */
class MTLContext;
class MTLCommandBufferManager;
class MTLUniformBuf;

/* -------------------------------------------------------------------- */
/** \name Memory Management.
 * \{ */

/* MTLBuffer allocation wrapper. */
class MTLBuffer {

 private:
  /* Metal resource. */
  id<MTLBuffer> metal_buffer_;

  /* Host-visible mapped-memory pointer. Behavior depends on buffer type:
   * - Shared buffers: pointer is the base address of the #MTLBuffer, whose data
   *                   is accessible by both the CPU and GPU on Unified Memory
   *                   Architectures (UMA).
   * - Managed buffer: Host-side mapped buffer region for CPU (Host) access. Managed buffers
   *                   must be manually flushed to transfer data to the GPU-resident buffer.
   * - Private buffer: Host access is invalid, `data_` will be nullptr. */
  void *data_;

  /* Whether buffer is allocated from an external source. */
  bool is_external_ = false;

  /* Allocation info. */
  MTLResourceOptions options_;
  id<MTLDevice> device_;
  uint64_t alignment_;
  uint64_t size_;

  /* Size of the buffer contents in use. The allocated size may be larger. */
  uint64_t usage_size_;

  /* Lifetime info - whether the current buffer is actively in use. A buffer
   * should be in use after it has been allocated. De-allocating the buffer and
   * returning it to the free buffer pool sets `in_use_` to false. Using a
   * buffer while it is not in use is invalid and will result in an error. */
  std::atomic<bool> in_use_;

 public:
  MTLBuffer(id<MTLDevice> device, uint64_t size, MTLResourceOptions options, uint alignment = 1);
  MTLBuffer(id<MTLBuffer> external_buffer);
  ~MTLBuffer();

  /* Fetch information about backing MTLBuffer. */
  id<MTLBuffer> get_metal_buffer() const;
  void *get_host_ptr() const;
  uint64_t get_size_used() const;
  uint64_t get_size() const;

  /* Flush data to GPU. */
  void flush();
  void flush_range(uint64_t offset, uint64_t length);
  bool requires_flush();

  /* Buffer usage tracking. */
  void flag_in_use(bool used);
  bool get_in_use();
  void set_usage_size(uint64_t size_used);

  /* Debug. */
  void set_label(NSString *str);

  /* Read properties. */
  MTLResourceOptions get_resource_options();
  uint64_t get_alignment();

  /* Resource-local free: For buffers allocated via memory manager,
   * this will call the context `free_buffer` method to return the buffer to the context memory
   * pool.
   *
   * Otherwise, free will release the associated metal resource.
   * As a note, calling the destructor will also destroy the buffer and associated metal
   * resource. */
  void free();

  /* Safety check to ensure buffers are not used after free. */
  void debug_ensure_used();
};
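
/* A minimal write-then-flush sketch for a CPU-visible (managed) MTLBuffer
 * (illustrative only; `buf`, `src` and `len` are assumed inputs):
 *
 *   void *dst = buf->get_host_ptr();
 *   memcpy(dst, src, len);
 *   buf->set_usage_size(len);
 *   if (buf->requires_flush()) {
 *     buf->flush_range(0, len);  // Managed storage: push host writes to the GPU copy.
 *   }
 *
 * On UMA shared buffers, requires_flush() would report false and the memcpy
 * alone suffices. */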

/* View into part of an MTLBuffer. */
struct MTLBufferRange {
  id<MTLBuffer> metal_buffer;
  void *data;
  uint64_t buffer_offset;
  uint64_t size;
  MTLResourceOptions options;

  void flush();
  bool requires_flush();
};

/* Circular scratch buffer allocations should be seen as temporary and only used within the
 * lifetime of the frame. */
using MTLTemporaryBuffer = MTLBufferRange;

/* Round-Robin Circular-buffer. */
class MTLCircularBuffer {
  friend class MTLScratchBufferManager;

 private:
  MTLContext &own_context_;

  /* Wrapped MTLBuffer allocation handle. */
  gpu::MTLBuffer *cbuffer_;

  /* Current offset where next allocation will begin. */
  uint64_t current_offset_;

  /* Whether the Circular Buffer can grow during re-allocation if
   * the size is exceeded. */
  bool can_resize_;

  /* Usage information. */
  uint64_t used_frame_index_;
  uint64_t last_flush_base_offset_;

 public:
  MTLCircularBuffer(MTLContext &ctx, uint64_t initial_size, bool allow_grow);
  ~MTLCircularBuffer();
  MTLTemporaryBuffer allocate_range(uint64_t alloc_size);
  MTLTemporaryBuffer allocate_range_aligned(uint64_t alloc_size, uint alignment);
  void flush();

  /* Reset pointer back to start of circular buffer. */
  void reset();
};
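
/* A sketch of the aligned sub-allocation arithmetic inside allocate_range_aligned
 * (illustrative only, not the actual implementation; assumes a power-of-two
 * alignment):
 *
 *   uint64_t aligned = (current_offset_ + (alignment - 1)) & ~uint64_t(alignment - 1);
 *   MTLTemporaryBuffer range{...};            // Offset `aligned`, size `alloc_size`.
 *   current_offset_ = aligned + alloc_size;   // Next allocation begins here.
 *
 * If the new offset exceeds the buffer size, the buffer is flushed and grown or
 * reset per the `can_resize_` policy. */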

/* Wrapper struct used by Memory Manager to sort and compare gpu::MTLBuffer resources inside the
 * memory pools. */
struct MTLBufferHandle {
  gpu::MTLBuffer *buffer;
  uint64_t buffer_size;

  inline MTLBufferHandle(gpu::MTLBuffer *buf)
  {
    this->buffer = buf;
    this->buffer_size = this->buffer->get_size();
  }

  inline MTLBufferHandle(uint64_t compare_size)
  {
    this->buffer = nullptr;
    this->buffer_size = compare_size;
  }
};

struct CompareMTLBuffer {
  bool operator()(const MTLBufferHandle &lhs, const MTLBufferHandle &rhs) const
  {
    return lhs.buffer_size < rhs.buffer_size;
  }
};
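
/* A sketch showing how the size-only MTLBufferHandle constructor pairs with
 * CompareMTLBuffer for best-fit pool queries (illustrative only):
 *
 *   std::multiset<MTLBufferHandle, CompareMTLBuffer> pool;
 *   auto it = pool.lower_bound(MTLBufferHandle(required_size));
 *   // `it` is the smallest pooled buffer with buffer_size >= required_size,
 *   // or pool.end() if no such buffer exists.
 */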

/* An MTLSafeFreeList is a temporary list of gpu::MTLBuffers which have
 * been freed by the high level backend, but are pending GPU work execution before
 * the gpu::MTLBuffers can be returned to the Memory manager pools.
 * This list is implemented as a chunked linked-list.
 *
 * Only a single MTLSafeFreeList is active at one time and is associated with current command
 * buffer submissions. If an MTLBuffer is freed during the lifetime of a command buffer, it could
 * still possibly be in-use and as such, the MTLSafeFreeList will increment its reference count for
 * each command buffer submitted while the current pool is active.
 *
 * -- Reference count is incremented upon MTLCommandBuffer commit.
 * -- Reference count is decremented in the MTLCommandBuffer completion callback handler.
 *
 * A new MTLSafeFreeList begins with each render step (frame). This pooling of buffers, rather
 * than individual buffer resource tracking, reduces performance overhead.
 *
 * The reference count starts at 1 to ensure it cannot prematurely reach zero before all
 * associated command buffers have been submitted. The matching decrement happens when the
 * next MTLSafeFreeList is created, allowing the existing pool to be released once its
 * submitted command buffers complete.
 *
 * Note: the Metal API independently tracks resources used by command buffers, for the purpose
 * of keeping resources alive while in use by the driver and GPU. However, this differs from the
 * MTLSafeFreeList mechanism in the Metal backend, which exists to allow previously allocated
 * MTLBuffer resources to be re-used. This allows us to save on the expensive
 * cost of memory allocation.
 */
class MTLSafeFreeList {
  friend class MTLBufferPool;

 private:
  std::atomic<int> reference_count_;
  std::atomic<bool> in_free_queue_;
  std::recursive_mutex lock_;

  /* Linked list of next MTLSafeFreeList chunk if current chunk is full. */
  std::atomic<int> has_next_pool_;
  std::atomic<MTLSafeFreeList *> next_;

  /* Lockless list. MAX_NUM_BUFFERS_ per chunk is chosen as a trade-off between
   * performance and memory use. */
  static const int MAX_NUM_BUFFERS_ = 1024;
  std::atomic<int> current_list_index_;
  gpu::MTLBuffer *safe_free_pool_[MAX_NUM_BUFFERS_];

 public:
  MTLSafeFreeList();

  /* Add a buffer to the Safe Free List. Can be called from secondary threads.
   * Performs a lockless list insert. */
  void insert_buffer(gpu::MTLBuffer *buffer);

  /* Increments command buffer reference count. */
  void increment_reference();

  /* Decrement the reference count. The return of buffers to the pool occurs on
   * the MTLCommandBuffer completion callback thread. */
  void decrement_reference();

  void flag_in_queue()
  {
    in_free_queue_ = true;
    if (has_next_pool_) {
      MTLSafeFreeList *next_pool = next_.load();
      BLI_assert(next_pool != nullptr);
      next_pool->flag_in_queue();
    }
  }
};
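
/* A sketch of the reference-count lifecycle (illustrative only; simplified
 * call sites, `pool` is the MTLBufferPool):
 *
 *   MTLSafeFreeList *list = pool.get_current_safe_list();
 *   list->insert_buffer(buffer);   // Host-side free; the GPU may still use it.
 *   list->increment_reference();   // On each MTLCommandBuffer commit.
 *   ...
 *   list->decrement_reference();   // In each completion callback; once the count
 *                                  // reaches zero, buffers return to the pool.
 */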

/* MTLBuffer pools. */
/* Allocating Metal buffers is expensive, so we cache all allocated buffers,
 * and when requesting a new buffer, find one which fits the required dimensions
 * from an existing pool of buffers.
 *
 * When freeing MTLBuffers, we insert them into the current MTLSafeFreeList, which defers
 * release of the buffer until the associated command buffers have finished executing.
 * This prevents a buffer from being re-used while it is still in-use by the GPU.
 *
 * * Once command buffers complete, the MTLSafeFreeLists associated with the current
 *   command buffer submission are added to the `completed_safelist_queue_`.
 *
 * * At a set point in time, all MTLSafeFreeLists in `completed_safelist_queue_` have their
 *   MTLBuffers re-inserted into the Memory Manager's pools. */
class MTLBufferPool {

 private:
  /* Memory statistics. */
  long long int total_allocation_bytes_ = 0;

#if MTL_DEBUG_MEMORY_STATISTICS == 1
  /* Debug statistics. */
  std::atomic<int> per_frame_allocation_count_;
  std::atomic<long long int> allocations_in_pool_;
  std::atomic<long long int> buffers_in_pool_;
#endif

  /* Metal resources. */
  bool ensure_initialised_ = false;
  id<MTLDevice> device_ = nil;

  /* The buffer selection aims to pick a buffer which meets the minimum size requirements.
   * To do this, we keep an ordered set of all available buffers. If a buffer is larger than the
   * desired allocation size, we check it against `mtl_buffer_size_threshold_factor_`,
   * which defines how much larger than the requested allocation a candidate buffer may be.
   * - A higher value results in greater re-use of previously allocated buffers of similar sizes.
   * - A lower value may result in more dynamic allocations, but minimizes memory usage for a
   *   given scenario.
   * The current value of 1.26 is calibrated for optimal performance and memory utilization. */
  static constexpr float mtl_buffer_size_threshold_factor_ = 1.26;

  /* Buffer pools using MTLResourceOptions as key for allocation type.
   * Aliased as 'uint64_t' for map type compatibility.
   * - A size-ordered list (MultiSet) of allocated buffers is kept per MTLResourceOptions
   *   permutation. This allows efficient lookup for buffers of a given requested size.
   * - MTLBufferHandle wraps a gpu::MTLBuffer pointer to achieve easy size-based sorting
   *   via CompareMTLBuffer. */
  using MTLBufferPoolOrderedList = std::multiset<MTLBufferHandle, CompareMTLBuffer>;
  using MTLBufferResourceOptions = uint64_t;

  blender::Map<MTLBufferResourceOptions, MTLBufferPoolOrderedList *> buffer_pools_;
  blender::Vector<gpu::MTLBuffer *> allocations_;

  /* Maintain a queue of all MTLSafeFreeLists whose GPU work has completed and
   * which are ready to have their buffers re-inserted into the MemoryManager
   * pools. Access to this queue is made thread-safe through `safelist_lock_`. */
  std::mutex safelist_lock_;
  blender::Vector<MTLSafeFreeList *> completed_safelist_queue_;

  /* Current free list, associated with the active MTLCommandBuffer submission.
   * MTLBuffer::free() can be called from separate threads, due to usage within
   * animation system/worker threads. */
  std::atomic<MTLSafeFreeList *> current_free_list_;

 public:
  void init(id<MTLDevice> device);
  ~MTLBufferPool();

  gpu::MTLBuffer *allocate_buffer(uint64_t size, bool cpu_visible, const void *bytes = nullptr);
  gpu::MTLBuffer *allocate_buffer_aligned(uint64_t size,
                                          uint alignment,
                                          bool cpu_visible,
                                          const void *bytes = nullptr);
  bool free_buffer(gpu::MTLBuffer *buffer);

  /* Flush MTLSafeFreeList buffers, for completed lists in `completed_safelist_queue_`,
   * back to memory pools. */
  void update_memory_pools();

  /* Access and control over active MTLSafeFreeList. */
  MTLSafeFreeList *get_current_safe_list();
  void begin_new_safe_list();

  /* Add a completed MTLSafeFreeList to completed_safelist_queue_. */
  void push_completed_safe_list(MTLSafeFreeList *list);

 private:
  void ensure_buffer_pool(MTLResourceOptions options);
  void insert_buffer_into_pool(MTLResourceOptions options, gpu::MTLBuffer *buffer);
  void free();
};
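
/* A sketch of the per-frame pool maintenance flow (illustrative only;
 * simplified call sites):
 *
 *   pool.begin_new_safe_list();   // Start collecting this frame's freed buffers.
 *   // ... render; MTLBuffer::free() routes buffers into the current safe list ...
 *   pool.update_memory_pools();   // Re-insert buffers from completed safe lists.
 */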

/* Scratch buffers are circular buffers used for temporary data within the current frame.
 * To preserve the integrity of their contents when multiple frames are in flight,
 * we cycle through a collection of scratch buffers which are reset upon next use.
 *
 * The properties below manage scratch buffer behavior. If a scratch buffer overflows,
 * the original buffer is flushed and submitted, with references retained by its usage
 * within the command buffer, and a new buffer is created.
 * - The new buffer will grow in size to account for increased demand for temporary memory.
 */
class MTLScratchBufferManager {

 private:
  /* Maximum number of scratch buffers to allocate. This should be the maximum number of
   * simultaneous frames in flight. */
  static constexpr uint mtl_max_scratch_buffers_ = MTL_NUM_SAFE_FRAMES;

 public:
  /* Maximum size of single scratch buffer allocation. When re-sizing, this is the maximum size the
   * newly allocated buffers will grow to. Larger allocations are possible if
   * `MTL_SCRATCH_BUFFER_ALLOW_TEMPORARY_EXPANSION` is enabled, but these will instead allocate new
   * buffers from the memory pools on the fly. */
  static constexpr uint mtl_scratch_buffer_max_size_ = 128 * 1024 * 1024;

  /* Initial size of circular scratch buffers prior to growth. */
  static constexpr uint mtl_scratch_buffer_initial_size_ = 16 * 1024 * 1024;

 private:
  /* Parent MTLContext. */
  MTLContext &context_;
  bool initialised_ = false;

  /* Scratch buffer currently in-use. */
  uint current_scratch_buffer_ = 0;

  /* Scratch buffer pool. */
  MTLCircularBuffer *scratch_buffers_[mtl_max_scratch_buffers_];

 public:
  MTLScratchBufferManager(MTLContext &context) : context_(context) {}
  ~MTLScratchBufferManager();

  /* Explicit initialization and freeing of resources.
   * Initialization must occur after device creation. */
  void init();
  void free();

  /* Allocation functions for creating temporary allocations from active circular buffer. */
  MTLTemporaryBuffer scratch_buffer_allocate_range(uint64_t alloc_size);
  MTLTemporaryBuffer scratch_buffer_allocate_range_aligned(uint64_t alloc_size, uint alignment);

  /* Ensure a new scratch buffer is started if we have moved on to a new frame.
   * Called when a new command buffer begins. */
  void ensure_increment_scratch_buffer();

  /* Flush memory for active scratch buffer to GPU.
   * This call will perform a partial flush of the buffer starting from
   * the last offset the data was flushed from, to the current offset. */
  void flush_active_scratch_buffer();
};
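
/* A sketch of the per-command-buffer cadence (illustrative only; simplified
 * call sites, `mgr` is the context's scratch buffer manager):
 *
 *   mgr.ensure_increment_scratch_buffer();  // At command buffer begin: move to the
 *                                           // next scratch buffer on a new frame.
 *   // ... temporary allocations via scratch_buffer_allocate_range*() ...
 *   mgr.flush_active_scratch_buffer();      // Before submission: flush the range
 *                                           // written since the last flush.
 */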

/** \} */

}  // namespace blender::gpu