diff options
author | Michael Jones <michael_jones> | 2022-06-07 13:08:21 +0300 |
---|---|---|
committer | Michael Jones <michael_p_jones@apple.com> | 2022-06-07 13:08:39 +0300 |
commit | 4412e14708c5625c3fe84bc75fce2ca6de6f58c9 (patch) | |
tree | 93efcc0fec90881989cab9b4c643b4a7f1e36a04 /intern/cycles | |
parent | 4fc7e1a8800473eb67c5234ab9ec4f20713fe7c3 (diff) |
Cycles: Useful Metal backend debug & profiling functionality
This patch adds some useful debugging & profiling env vars to the Metal backend:
- `CYCLES_METAL_PROFILING`: output a per-kernel timing report at the end of the render
- `CYCLES_METAL_DEBUG`: enable per-dispatch tracing (very verbose)
- `CYCLES_DEBUG_METAL_CAPTURE_KERNEL`: enable programatic .gputrace capture for a specified kernel index
Here's an example of the timing report with `CYCLES_METAL_PROFILING` enabled:
```
---------------------------------------------------------------------------------------------------
Kernel name Total threads Dispatches Avg. T/D Time Time%
---------------------------------------------------------------------------------------------------
integrator_init_from_camera 657,407,232 161 4,083,274 0.24s 0.51%
integrator_intersect_closest 1,629,288,440 681 2,392,494 15.18s 32.12%
integrator_intersect_shadow 751,652,291 470 1,599,260 5.80s 12.28%
integrator_shade_background 304,612,074 263 1,158,220 1.16s 2.45%
integrator_shade_surface 1,159,764,041 676 1,715,627 20.57s 43.52%
integrator_shade_shadow 598,885,847 418 1,432,741 1.27s 2.69%
integrator_queued_paths_array 2,969,650,130 805 3,689,006 0.35s 0.74%
integrator_queued_shadow_paths_array 593,936,619 379 1,567,115 0.14s 0.29%
integrator_terminated_paths_array 22,205,417 155 143,260 0.05s 0.10%
integrator_sorted_paths_array 2,517,140,043 676 3,723,579 1.65s 3.50%
integrator_compact_paths_array 648,912,748 155 4,186,533 0.03s 0.07%
integrator_compact_states 20,872,687 155 134,662 0.14s 0.29%
integrator_terminated_shadow_paths_array 374,100,675 438 854,111 0.16s 0.33%
integrator_compact_shadow_paths_array 503,768,657 438 1,150,156 0.05s 0.10%
integrator_compact_shadow_states 37,664,941 202 186,460 0.23s 0.50%
integrator_reset 25,165,824 6 4,194,304 0.06s 0.12%
film_convert_combined_half_rgba 3,110,400 6 518,400 0.00s 0.01%
prefix_sum 676 676 1 0.19s 0.40%
---------------------------------------------------------------------------------------------------
6,760 47.27s 100.00%
---------------------------------------------------------------------------------------------------
```
Reviewed By: brecht
Differential Revision: https://developer.blender.org/D15044
Diffstat (limited to 'intern/cycles')
-rw-r--r-- | intern/cycles/device/metal/bvh.mm | 2 | ||||
-rw-r--r-- | intern/cycles/device/metal/device_impl.h | 2 | ||||
-rw-r--r-- | intern/cycles/device/metal/device_impl.mm | 6 | ||||
-rw-r--r-- | intern/cycles/device/metal/queue.h | 34 | ||||
-rw-r--r-- | intern/cycles/device/metal/queue.mm | 226 | ||||
-rw-r--r-- | intern/cycles/device/metal/util.h | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/device/gpu/kernel.h | 4 |
7 files changed, 269 insertions, 7 deletions
diff --git a/intern/cycles/device/metal/bvh.mm b/intern/cycles/device/metal/bvh.mm index 086fbb093ba..09c4ace081e 100644 --- a/intern/cycles/device/metal/bvh.mm +++ b/intern/cycles/device/metal/bvh.mm @@ -11,6 +11,7 @@ # include "util/progress.h" # include "device/metal/bvh.h" +# include "device/metal/util.h" CCL_NAMESPACE_BEGIN @@ -18,6 +19,7 @@ CCL_NAMESPACE_BEGIN { \ string str = string_printf(__VA_ARGS__); \ progress.set_substatus(str); \ + metal_printf("%s\n", str.c_str()); \ } BVHMetal::BVHMetal(const BVHParams ¶ms_, diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index 7506b9b069f..0e6817d94f8 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -31,6 +31,8 @@ class MetalDevice : public Device { string source[PSO_NUM]; string source_md5[PSO_NUM]; + bool capture_enabled = false; + KernelParamsMetal launch_params = {0}; /* MetalRT members ----------------------------------*/ diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 16aabacb4cf..086bf0af979 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -86,6 +86,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile use_metalrt = (atoi(metalrt) != 0); } + if (getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) { + capture_enabled = true; + } + MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init]; arg_desc_params.dataType = MTLDataTypePointer; arg_desc_params.access = MTLArgumentAccessReadOnly; @@ -394,7 +398,7 @@ MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem) } if (size > 0) { - if (mem.type == MEM_DEVICE_ONLY) { + if (mem.type == MEM_DEVICE_ONLY && !capture_enabled) { options = MTLResourceStorageModePrivate; } diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h index 6cc84a20787..de20514de0b 100644 --- a/intern/cycles/device/metal/queue.h +++ b/intern/cycles/device/metal/queue.h @@ -12,8 +12,6 @@ # include "device/metal/util.h" # include "kernel/device/metal/globals.h" -# define metal_printf VLOG(4) << string_printf - CCL_NAMESPACE_BEGIN class MetalDevice; @@ -77,6 +75,38 @@ class MetalDeviceQueue : public DeviceQueue { void close_compute_encoder(); void close_blit_encoder(); + + bool verbose_tracing = false; + + /* Per-kernel profiling (see CYCLES_METAL_PROFILING). */ + + struct TimingData { + DeviceKernel kernel; + int work_size; + uint64_t timing_id; + }; + std::vector<TimingData> command_encoder_labels; + id<MTLSharedEvent> timing_shared_event = nil; + uint64_t timing_shared_event_id; + uint64_t command_buffer_start_timing_id; + + struct TimingStats { + double total_time = 0.0; + uint64_t total_work_size = 0; + uint64_t num_dispatches = 0; + }; + TimingStats timing_stats[DEVICE_KERNEL_NUM]; + double last_completion_time = 0.0; + + /* .gputrace capture (see CYCLES_DEBUG_METAL_CAPTURE_...). */ + + id<MTLCaptureScope> mtlCaptureScope = nil; + DeviceKernel capture_kernel; + int capture_dispatch = 0; + int capture_dispatch_counter = 0; + bool is_capturing = false; + bool is_capturing_to_disk = false; + bool has_captured_to_disk = false; }; CCL_NAMESPACE_END diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm index c1dab5b0d8f..8b2d5d81859 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -37,6 +37,61 @@ MetalDeviceQueue::MetalDeviceQueue(MetalDevice *device) } wait_semaphore = dispatch_semaphore_create(0); + + if (@available(macos 10.14, *)) { + if (getenv("CYCLES_METAL_PROFILING")) { + /* Enable per-kernel timing breakdown (shown at end of render). */ + timing_shared_event = [mtlDevice newSharedEvent]; + } + if (getenv("CYCLES_METAL_DEBUG")) { + /* Enable very verbose tracing (shows every dispatch). */ + verbose_tracing = true; + } + timing_shared_event_id = 1; + } + + capture_kernel = DeviceKernel(-1); + if (auto capture_kernel_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) { + /* Enable .gputrace capture for the specified DeviceKernel. */ + MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager]; + mtlCaptureScope = [captureManager newCaptureScopeWithDevice:mtlDevice]; + mtlCaptureScope.label = [NSString stringWithFormat:@"Cycles kernel dispatch"]; + [captureManager setDefaultCaptureScope:mtlCaptureScope]; + + capture_dispatch = -1; + if (auto capture_dispatch_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_DISPATCH")) { + capture_dispatch = atoi(capture_dispatch_str); + capture_dispatch_counter = 0; + } + + capture_kernel = DeviceKernel(atoi(capture_kernel_str)); + printf("Capture kernel: %d = %s\n", capture_kernel, device_kernel_as_string(capture_kernel)); + + if (auto capture_url = getenv("CYCLES_DEBUG_METAL_CAPTURE_URL")) { + if (@available(macos 10.15, *)) { + if ([captureManager supportsDestination:MTLCaptureDestinationGPUTraceDocument]) { + + MTLCaptureDescriptor *captureDescriptor = [[MTLCaptureDescriptor alloc] init]; + captureDescriptor.captureObject = mtlCaptureScope; + captureDescriptor.destination = MTLCaptureDestinationGPUTraceDocument; + captureDescriptor.outputURL = [NSURL fileURLWithPath:@(capture_url)]; + + NSError *error; + if (![captureManager startCaptureWithDescriptor:captureDescriptor error:&error]) { + NSString *err = [error localizedDescription]; + printf("Start capture failed: %s\n", [err UTF8String]); + } + else { + printf("Capture started (URL: %s)\n", capture_url); + is_capturing_to_disk = true; + } + } + else { + printf("Capture to file is not supported\n"); + } + } + } + } } MetalDeviceQueue::~MetalDeviceQueue() @@ -58,6 +113,56 @@ MetalDeviceQueue::~MetalDeviceQueue() [mtlCommandQueue release]; mtlCommandQueue = nil; } + + if (mtlCaptureScope) { + [mtlCaptureScope release]; + } + + double total_time = 0.0; + + /* Show per-kernel timings, if gathered (see CYCLES_METAL_PROFILING). */ + int64_t total_work_size = 0; + int64_t num_dispatches = 0; + for (auto &stat : timing_stats) { + total_time += stat.total_time; + total_work_size += stat.total_work_size; + num_dispatches += stat.num_dispatches; + } + + if (num_dispatches) { + printf("\nMetal dispatch stats:\n\n"); + auto header = string_printf("%-40s %16s %12s %12s %7s %7s", + "Kernel name", + "Total threads", + "Dispatches", + "Avg. T/D", + "Time", + "Time%"); + auto divider = string(header.length(), '-'); + printf("%s\n%s\n%s\n", divider.c_str(), header.c_str(), divider.c_str()); + + for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) { + auto &stat = timing_stats[i]; + if (stat.num_dispatches > 0) { + printf("%-40s %16s %12s %12s %6.2fs %6.2f%%\n", + device_kernel_as_string(DeviceKernel(i)), + string_human_readable_number(stat.total_work_size).c_str(), + string_human_readable_number(stat.num_dispatches).c_str(), + string_human_readable_number(stat.total_work_size / stat.num_dispatches).c_str(), + stat.total_time, + stat.total_time * 100.0 / total_time); + } + } + printf("%s\n", divider.c_str()); + printf("%-40s %16s %12s %12s %6.2fs %6.2f%%\n", + "", + "", + string_human_readable_number(num_dispatches).c_str(), + "", + total_time, + 100.0); + printf("%s\n\n", divider.c_str()); + } } int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const @@ -101,6 +206,19 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, DeviceKernelArguments const &args) { + if (kernel == capture_kernel) { + if (capture_dispatch < 0 || capture_dispatch == capture_dispatch_counter) { + /* Start gputrace capture. */ + if (mtlCommandBuffer) { + synchronize(); + } + [mtlCaptureScope beginScope]; + printf("[mtlCaptureScope beginScope]\n"); + is_capturing = true; + } + capture_dispatch_counter += 1; + } + if (metal_device->have_error()) { return false; } @@ -110,6 +228,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel); + if (timing_shared_event) { + command_encoder_labels.push_back({kernel, work_size, timing_shared_event_id}); + } + /* Determine size requirement for argument buffer. */ size_t arg_buffer_length = 0; for (size_t i = 0; i < args.count; i++) { @@ -189,6 +311,14 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, /* Encode KernelParamsMetal buffers */ [metal_device->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer offset:globals_offsets]; + if (verbose_tracing || timing_shared_event || is_capturing) { + /* Add human-readable labels if we're doing any form of debugging / profiling. */ + mtlComputeCommandEncoder.label = [[NSString alloc] + initWithFormat:@"Metal queue launch %s, work_size %d", + device_kernel_as_string(kernel), + work_size]; + } + /* this relies on IntegratorStateGPU layout being contiguous device_ptrs */ const size_t pointer_block_end = offsetof(KernelParamsMetal, __integrator_state) + sizeof(IntegratorStateGPU); @@ -196,7 +326,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, int pointer_index = offset / sizeof(device_ptr); MetalDevice::MetalMem *mmem = *( MetalDevice::MetalMem **)((uint8_t *)&metal_device->launch_params + offset); - if (mmem && (mmem->mtlBuffer || mmem->mtlTexture)) { + if (mmem && mmem->mem && (mmem->mtlBuffer || mmem->mtlTexture)) { [metal_device->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:pointer_index]; @@ -344,12 +474,53 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, } }]; + if (verbose_tracing || is_capturing) { + /* Force a sync we've enabled step-by-step verbose tracing or if we're capturing. */ + synchronize(); + + /* Show queue counters and dispatch timing. */ + if (verbose_tracing) { + if (kernel == DEVICE_KERNEL_INTEGRATOR_RESET) { + printf( + "_____________________________________.____________________.______________.___________" + "______________________________________\n"); + } + + printf("%-40s| %7d threads |%5.2fms | buckets [", + device_kernel_as_string(kernel), + work_size, + last_completion_time * 1000.0); + std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex); + for (auto &it : metal_device->metal_mem_map) { + const string c_integrator_queue_counter = "integrator_queue_counter"; + if (it.first->name == c_integrator_queue_counter) { + /* Workaround "device_copy_from" being protected. */ + struct MyDeviceMemory : device_memory { + void device_copy_from__IntegratorQueueCounter() + { + device_copy_from(0, data_width, 1, sizeof(IntegratorQueueCounter)); + } + }; + ((MyDeviceMemory *)it.first)->device_copy_from__IntegratorQueueCounter(); + + if (IntegratorQueueCounter *queue_counter = (IntegratorQueueCounter *) + it.first->host_pointer) { + for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) + printf("%s%d", i == 0 ? "" : ",", int(queue_counter->num_queued[i])); + } + break; + } + } + printf("]\n"); + } + } + return !(metal_device->have_error()); } bool MetalDeviceQueue::synchronize() { - if (metal_device->have_error()) { + if (has_captured_to_disk || metal_device->have_error()) { return false; } @@ -359,6 +530,28 @@ bool MetalDeviceQueue::synchronize() close_blit_encoder(); if (mtlCommandBuffer) { + scoped_timer timer; + if (timing_shared_event) { + /* For per-kernel timing, add event handlers to measure & accumulate dispatch times. */ + __block double completion_time = 0; + for (uint64_t i = command_buffer_start_timing_id; i < timing_shared_event_id; i++) { + [timing_shared_event notifyListener:shared_event_listener + atValue:i + block:^(id<MTLSharedEvent> sharedEvent, uint64_t value) { + completion_time = timer.get_time() - completion_time; + last_completion_time = completion_time; + for (auto label : command_encoder_labels) { + if (label.timing_id == value) { + TimingStats &stat = timing_stats[label.kernel]; + stat.num_dispatches++; + stat.total_time += completion_time; + stat.total_work_size += label.work_size; + } + } + }]; + } + } + uint64_t shared_event_id = this->shared_event_id++; if (@available(macos 10.14, *)) { @@ -374,6 +567,22 @@ bool MetalDeviceQueue::synchronize() dispatch_semaphore_wait(wait_semaphore, DISPATCH_TIME_FOREVER); } + if (is_capturing) { + [mtlCaptureScope endScope]; + is_capturing = false; + printf("[mtlCaptureScope endScope]\n"); + + if (is_capturing_to_disk) { + if (@available(macos 10.15, *)) { + [[MTLCaptureManager sharedCaptureManager] stopCapture]; + has_captured_to_disk = true; + is_capturing_to_disk = false; + is_capturing = false; + printf("Capture stopped\n"); + } + } + } + [mtlCommandBuffer release]; for (const CopyBack &mmem : copy_back_mem) { @@ -385,6 +594,7 @@ bool MetalDeviceQueue::synchronize() metal_device->flush_delayed_free_list(); mtlCommandBuffer = nil; + command_encoder_labels.clear(); } return !(metal_device->have_error()); @@ -530,6 +740,13 @@ id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel { bool concurrent = (kernel < DEVICE_KERNEL_INTEGRATOR_NUM); + if (timing_shared_event) { + /* Close the current encoder to ensure we're able to capture per-encoder timing data. */ + if (mtlComputeEncoder) { + close_compute_encoder(); + } + } + if (@available(macos 10.14, *)) { if (mtlComputeEncoder) { if (mtlComputeEncoder.dispatchType == concurrent ? MTLDispatchTypeConcurrent : @@ -575,6 +792,7 @@ id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder() if (!mtlCommandBuffer) { mtlCommandBuffer = [mtlCommandQueue commandBuffer]; [mtlCommandBuffer retain]; + command_buffer_start_timing_id = timing_shared_event_id; } mtlBlitEncoder = [mtlCommandBuffer blitCommandEncoder]; @@ -585,6 +803,10 @@ void MetalDeviceQueue::close_compute_encoder() { [mtlComputeEncoder endEncoding]; mtlComputeEncoder = nil; + + if (timing_shared_event) { + [mtlCommandBuffer encodeSignalEvent:timing_shared_event value:timing_shared_event_id++]; + } } void MetalDeviceQueue::close_blit_encoder() diff --git a/intern/cycles/device/metal/util.h b/intern/cycles/device/metal/util.h index cc653ab7e12..f728967835d 100644 --- a/intern/cycles/device/metal/util.h +++ b/intern/cycles/device/metal/util.h @@ -14,6 +14,8 @@ # include "util/thread.h" +# define metal_printf VLOG(4) << string_printf + CCL_NAMESPACE_BEGIN enum MetalGPUVendor { diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 6405e365847..d657571a5fa 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -241,7 +241,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } ccl_gpu_kernel_postfix -#ifdef __KERNEL_METAL__ +#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__) constant int __dummy_constant [[function_constant(0)]]; #endif @@ -256,7 +256,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; -#ifdef __KERNEL_METAL__ +#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__) KernelGlobals kg = NULL; /* Workaround Ambient Occlusion and Bevel nodes not working with Metal. * Dummy offset should not affect result, but somehow fixes bug! */ |