diff options
author | Michael Jones <michael_jones> | 2022-06-07 13:08:21 +0300 |
---|---|---|
committer | Michael Jones <michael_p_jones@apple.com> | 2022-06-07 13:08:39 +0300 |
commit | 4412e14708c5625c3fe84bc75fce2ca6de6f58c9 (patch) | |
tree | 93efcc0fec90881989cab9b4c643b4a7f1e36a04 /intern/cycles/device/metal/queue.mm | |
parent | 4fc7e1a8800473eb67c5234ab9ec4f20713fe7c3 (diff) |
Cycles: Useful Metal backend debug & profiling functionality
This patch adds some useful debugging & profiling env vars to the Metal backend:
- `CYCLES_METAL_PROFILING`: output a per-kernel timing report at the end of the render
- `CYCLES_METAL_DEBUG`: enable per-dispatch tracing (very verbose)
- `CYCLES_DEBUG_METAL_CAPTURE_KERNEL`: enable programatic .gputrace capture for a specified kernel index
Here's an example of the timing report with `CYCLES_METAL_PROFILING` enabled:
```
---------------------------------------------------------------------------------------------------
Kernel name Total threads Dispatches Avg. T/D Time Time%
---------------------------------------------------------------------------------------------------
integrator_init_from_camera 657,407,232 161 4,083,274 0.24s 0.51%
integrator_intersect_closest 1,629,288,440 681 2,392,494 15.18s 32.12%
integrator_intersect_shadow 751,652,291 470 1,599,260 5.80s 12.28%
integrator_shade_background 304,612,074 263 1,158,220 1.16s 2.45%
integrator_shade_surface 1,159,764,041 676 1,715,627 20.57s 43.52%
integrator_shade_shadow 598,885,847 418 1,432,741 1.27s 2.69%
integrator_queued_paths_array 2,969,650,130 805 3,689,006 0.35s 0.74%
integrator_queued_shadow_paths_array 593,936,619 379 1,567,115 0.14s 0.29%
integrator_terminated_paths_array 22,205,417 155 143,260 0.05s 0.10%
integrator_sorted_paths_array 2,517,140,043 676 3,723,579 1.65s 3.50%
integrator_compact_paths_array 648,912,748 155 4,186,533 0.03s 0.07%
integrator_compact_states 20,872,687 155 134,662 0.14s 0.29%
integrator_terminated_shadow_paths_array 374,100,675 438 854,111 0.16s 0.33%
integrator_compact_shadow_paths_array 503,768,657 438 1,150,156 0.05s 0.10%
integrator_compact_shadow_states 37,664,941 202 186,460 0.23s 0.50%
integrator_reset 25,165,824 6 4,194,304 0.06s 0.12%
film_convert_combined_half_rgba 3,110,400 6 518,400 0.00s 0.01%
prefix_sum 676 676 1 0.19s 0.40%
---------------------------------------------------------------------------------------------------
6,760 47.27s 100.00%
---------------------------------------------------------------------------------------------------
```
Reviewed By: brecht
Differential Revision: https://developer.blender.org/D15044
Diffstat (limited to 'intern/cycles/device/metal/queue.mm')
-rw-r--r-- | intern/cycles/device/metal/queue.mm | 226 |
1 files changed, 224 insertions, 2 deletions
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm index c1dab5b0d8f..8b2d5d81859 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -37,6 +37,61 @@ MetalDeviceQueue::MetalDeviceQueue(MetalDevice *device) } wait_semaphore = dispatch_semaphore_create(0); + + if (@available(macos 10.14, *)) { + if (getenv("CYCLES_METAL_PROFILING")) { + /* Enable per-kernel timing breakdown (shown at end of render). */ + timing_shared_event = [mtlDevice newSharedEvent]; + } + if (getenv("CYCLES_METAL_DEBUG")) { + /* Enable very verbose tracing (shows every dispatch). */ + verbose_tracing = true; + } + timing_shared_event_id = 1; + } + + capture_kernel = DeviceKernel(-1); + if (auto capture_kernel_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) { + /* Enable .gputrace capture for the specified DeviceKernel. */ + MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager]; + mtlCaptureScope = [captureManager newCaptureScopeWithDevice:mtlDevice]; + mtlCaptureScope.label = [NSString stringWithFormat:@"Cycles kernel dispatch"]; + [captureManager setDefaultCaptureScope:mtlCaptureScope]; + + capture_dispatch = -1; + if (auto capture_dispatch_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_DISPATCH")) { + capture_dispatch = atoi(capture_dispatch_str); + capture_dispatch_counter = 0; + } + + capture_kernel = DeviceKernel(atoi(capture_kernel_str)); + printf("Capture kernel: %d = %s\n", capture_kernel, device_kernel_as_string(capture_kernel)); + + if (auto capture_url = getenv("CYCLES_DEBUG_METAL_CAPTURE_URL")) { + if (@available(macos 10.15, *)) { + if ([captureManager supportsDestination:MTLCaptureDestinationGPUTraceDocument]) { + + MTLCaptureDescriptor *captureDescriptor = [[MTLCaptureDescriptor alloc] init]; + captureDescriptor.captureObject = mtlCaptureScope; + captureDescriptor.destination = MTLCaptureDestinationGPUTraceDocument; + captureDescriptor.outputURL = [NSURL fileURLWithPath:@(capture_url)]; + + NSError *error; + if (![captureManager startCaptureWithDescriptor:captureDescriptor error:&error]) { + NSString *err = [error localizedDescription]; + printf("Start capture failed: %s\n", [err UTF8String]); + } + else { + printf("Capture started (URL: %s)\n", capture_url); + is_capturing_to_disk = true; + } + } + else { + printf("Capture to file is not supported\n"); + } + } + } + } } MetalDeviceQueue::~MetalDeviceQueue() @@ -58,6 +113,56 @@ MetalDeviceQueue::~MetalDeviceQueue() [mtlCommandQueue release]; mtlCommandQueue = nil; } + + if (mtlCaptureScope) { + [mtlCaptureScope release]; + } + + double total_time = 0.0; + + /* Show per-kernel timings, if gathered (see CYCLES_METAL_PROFILING). */ + int64_t total_work_size = 0; + int64_t num_dispatches = 0; + for (auto &stat : timing_stats) { + total_time += stat.total_time; + total_work_size += stat.total_work_size; + num_dispatches += stat.num_dispatches; + } + + if (num_dispatches) { + printf("\nMetal dispatch stats:\n\n"); + auto header = string_printf("%-40s %16s %12s %12s %7s %7s", + "Kernel name", + "Total threads", + "Dispatches", + "Avg. T/D", + "Time", + "Time%"); + auto divider = string(header.length(), '-'); + printf("%s\n%s\n%s\n", divider.c_str(), header.c_str(), divider.c_str()); + + for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) { + auto &stat = timing_stats[i]; + if (stat.num_dispatches > 0) { + printf("%-40s %16s %12s %12s %6.2fs %6.2f%%\n", + device_kernel_as_string(DeviceKernel(i)), + string_human_readable_number(stat.total_work_size).c_str(), + string_human_readable_number(stat.num_dispatches).c_str(), + string_human_readable_number(stat.total_work_size / stat.num_dispatches).c_str(), + stat.total_time, + stat.total_time * 100.0 / total_time); + } + } + printf("%s\n", divider.c_str()); + printf("%-40s %16s %12s %12s %6.2fs %6.2f%%\n", + "", + "", + string_human_readable_number(num_dispatches).c_str(), + "", + total_time, + 100.0); + printf("%s\n\n", divider.c_str()); + } } int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const @@ -101,6 +206,19 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, DeviceKernelArguments const &args) { + if (kernel == capture_kernel) { + if (capture_dispatch < 0 || capture_dispatch == capture_dispatch_counter) { + /* Start gputrace capture. */ + if (mtlCommandBuffer) { + synchronize(); + } + [mtlCaptureScope beginScope]; + printf("[mtlCaptureScope beginScope]\n"); + is_capturing = true; + } + capture_dispatch_counter += 1; + } + if (metal_device->have_error()) { return false; } @@ -110,6 +228,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel); + if (timing_shared_event) { + command_encoder_labels.push_back({kernel, work_size, timing_shared_event_id}); + } + /* Determine size requirement for argument buffer. */ size_t arg_buffer_length = 0; for (size_t i = 0; i < args.count; i++) { @@ -189,6 +311,14 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, /* Encode KernelParamsMetal buffers */ [metal_device->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer offset:globals_offsets]; + if (verbose_tracing || timing_shared_event || is_capturing) { + /* Add human-readable labels if we're doing any form of debugging / profiling. */ + mtlComputeCommandEncoder.label = [[NSString alloc] + initWithFormat:@"Metal queue launch %s, work_size %d", + device_kernel_as_string(kernel), + work_size]; + } + /* this relies on IntegratorStateGPU layout being contiguous device_ptrs */ const size_t pointer_block_end = offsetof(KernelParamsMetal, __integrator_state) + sizeof(IntegratorStateGPU); @@ -196,7 +326,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, int pointer_index = offset / sizeof(device_ptr); MetalDevice::MetalMem *mmem = *( MetalDevice::MetalMem **)((uint8_t *)&metal_device->launch_params + offset); - if (mmem && (mmem->mtlBuffer || mmem->mtlTexture)) { + if (mmem && mmem->mem && (mmem->mtlBuffer || mmem->mtlTexture)) { [metal_device->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:pointer_index]; @@ -344,12 +474,53 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, } }]; + if (verbose_tracing || is_capturing) { + /* Force a sync we've enabled step-by-step verbose tracing or if we're capturing. */ + synchronize(); + + /* Show queue counters and dispatch timing. */ + if (verbose_tracing) { + if (kernel == DEVICE_KERNEL_INTEGRATOR_RESET) { + printf( + "_____________________________________.____________________.______________.___________" + "______________________________________\n"); + } + + printf("%-40s| %7d threads |%5.2fms | buckets [", + device_kernel_as_string(kernel), + work_size, + last_completion_time * 1000.0); + std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex); + for (auto &it : metal_device->metal_mem_map) { + const string c_integrator_queue_counter = "integrator_queue_counter"; + if (it.first->name == c_integrator_queue_counter) { + /* Workaround "device_copy_from" being protected. */ + struct MyDeviceMemory : device_memory { + void device_copy_from__IntegratorQueueCounter() + { + device_copy_from(0, data_width, 1, sizeof(IntegratorQueueCounter)); + } + }; + ((MyDeviceMemory *)it.first)->device_copy_from__IntegratorQueueCounter(); + + if (IntegratorQueueCounter *queue_counter = (IntegratorQueueCounter *) + it.first->host_pointer) { + for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) + printf("%s%d", i == 0 ? "" : ",", int(queue_counter->num_queued[i])); + } + break; + } + } + printf("]\n"); + } + } + return !(metal_device->have_error()); } bool MetalDeviceQueue::synchronize() { - if (metal_device->have_error()) { + if (has_captured_to_disk || metal_device->have_error()) { return false; } @@ -359,6 +530,28 @@ bool MetalDeviceQueue::synchronize() close_blit_encoder(); if (mtlCommandBuffer) { + scoped_timer timer; + if (timing_shared_event) { + /* For per-kernel timing, add event handlers to measure & accumulate dispatch times. */ + __block double completion_time = 0; + for (uint64_t i = command_buffer_start_timing_id; i < timing_shared_event_id; i++) { + [timing_shared_event notifyListener:shared_event_listener + atValue:i + block:^(id<MTLSharedEvent> sharedEvent, uint64_t value) { + completion_time = timer.get_time() - completion_time; + last_completion_time = completion_time; + for (auto label : command_encoder_labels) { + if (label.timing_id == value) { + TimingStats &stat = timing_stats[label.kernel]; + stat.num_dispatches++; + stat.total_time += completion_time; + stat.total_work_size += label.work_size; + } + } + }]; + } + } + uint64_t shared_event_id = this->shared_event_id++; if (@available(macos 10.14, *)) { @@ -374,6 +567,22 @@ bool MetalDeviceQueue::synchronize() dispatch_semaphore_wait(wait_semaphore, DISPATCH_TIME_FOREVER); } + if (is_capturing) { + [mtlCaptureScope endScope]; + is_capturing = false; + printf("[mtlCaptureScope endScope]\n"); + + if (is_capturing_to_disk) { + if (@available(macos 10.15, *)) { + [[MTLCaptureManager sharedCaptureManager] stopCapture]; + has_captured_to_disk = true; + is_capturing_to_disk = false; + is_capturing = false; + printf("Capture stopped\n"); + } + } + } + [mtlCommandBuffer release]; for (const CopyBack &mmem : copy_back_mem) { @@ -385,6 +594,7 @@ bool MetalDeviceQueue::synchronize() metal_device->flush_delayed_free_list(); mtlCommandBuffer = nil; + command_encoder_labels.clear(); } return !(metal_device->have_error()); @@ -530,6 +740,13 @@ id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel { bool concurrent = (kernel < DEVICE_KERNEL_INTEGRATOR_NUM); + if (timing_shared_event) { + /* Close the current encoder to ensure we're able to capture per-encoder timing data. */ + if (mtlComputeEncoder) { + close_compute_encoder(); + } + } + if (@available(macos 10.14, *)) { if (mtlComputeEncoder) { if (mtlComputeEncoder.dispatchType == concurrent ? MTLDispatchTypeConcurrent : @@ -575,6 +792,7 @@ id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder() if (!mtlCommandBuffer) { mtlCommandBuffer = [mtlCommandQueue commandBuffer]; [mtlCommandBuffer retain]; + command_buffer_start_timing_id = timing_shared_event_id; } mtlBlitEncoder = [mtlCommandBuffer blitCommandEncoder]; @@ -585,6 +803,10 @@ void MetalDeviceQueue::close_compute_encoder() { [mtlComputeEncoder endEncoding]; mtlComputeEncoder = nil; + + if (timing_shared_event) { + [mtlCommandBuffer encodeSignalEvent:timing_shared_event value:timing_shared_event_id++]; + } } void MetalDeviceQueue::close_blit_encoder() |