Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Jones <michael_jones>2022-06-07 13:08:21 +0300
committerMichael Jones <michael_p_jones@apple.com>2022-06-07 13:08:39 +0300
commit4412e14708c5625c3fe84bc75fce2ca6de6f58c9 (patch)
tree93efcc0fec90881989cab9b4c643b4a7f1e36a04 /intern/cycles/device/metal/queue.mm
parent4fc7e1a8800473eb67c5234ab9ec4f20713fe7c3 (diff)
Cycles: Useful Metal backend debug & profiling functionality
This patch adds some useful debugging & profiling env vars to the Metal backend: - `CYCLES_METAL_PROFILING`: output a per-kernel timing report at the end of the render - `CYCLES_METAL_DEBUG`: enable per-dispatch tracing (very verbose) - `CYCLES_DEBUG_METAL_CAPTURE_KERNEL`: enable programatic .gputrace capture for a specified kernel index Here's an example of the timing report with `CYCLES_METAL_PROFILING` enabled: ``` --------------------------------------------------------------------------------------------------- Kernel name Total threads Dispatches Avg. T/D Time Time% --------------------------------------------------------------------------------------------------- integrator_init_from_camera 657,407,232 161 4,083,274 0.24s 0.51% integrator_intersect_closest 1,629,288,440 681 2,392,494 15.18s 32.12% integrator_intersect_shadow 751,652,291 470 1,599,260 5.80s 12.28% integrator_shade_background 304,612,074 263 1,158,220 1.16s 2.45% integrator_shade_surface 1,159,764,041 676 1,715,627 20.57s 43.52% integrator_shade_shadow 598,885,847 418 1,432,741 1.27s 2.69% integrator_queued_paths_array 2,969,650,130 805 3,689,006 0.35s 0.74% integrator_queued_shadow_paths_array 593,936,619 379 1,567,115 0.14s 0.29% integrator_terminated_paths_array 22,205,417 155 143,260 0.05s 0.10% integrator_sorted_paths_array 2,517,140,043 676 3,723,579 1.65s 3.50% integrator_compact_paths_array 648,912,748 155 4,186,533 0.03s 0.07% integrator_compact_states 20,872,687 155 134,662 0.14s 0.29% integrator_terminated_shadow_paths_array 374,100,675 438 854,111 0.16s 0.33% integrator_compact_shadow_paths_array 503,768,657 438 1,150,156 0.05s 0.10% integrator_compact_shadow_states 37,664,941 202 186,460 0.23s 0.50% integrator_reset 25,165,824 6 4,194,304 0.06s 0.12% film_convert_combined_half_rgba 3,110,400 6 518,400 0.00s 0.01% prefix_sum 676 676 1 0.19s 0.40% --------------------------------------------------------------------------------------------------- 6,760 47.27s 100.00% --------------------------------------------------------------------------------------------------- ``` Reviewed By: brecht Differential Revision: https://developer.blender.org/D15044
Diffstat (limited to 'intern/cycles/device/metal/queue.mm')
-rw-r--r--intern/cycles/device/metal/queue.mm226
1 files changed, 224 insertions, 2 deletions
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index c1dab5b0d8f..8b2d5d81859 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -37,6 +37,61 @@ MetalDeviceQueue::MetalDeviceQueue(MetalDevice *device)
}
wait_semaphore = dispatch_semaphore_create(0);
+
+ if (@available(macos 10.14, *)) {
+ if (getenv("CYCLES_METAL_PROFILING")) {
+ /* Enable per-kernel timing breakdown (shown at end of render). */
+ timing_shared_event = [mtlDevice newSharedEvent];
+ }
+ if (getenv("CYCLES_METAL_DEBUG")) {
+ /* Enable very verbose tracing (shows every dispatch). */
+ verbose_tracing = true;
+ }
+ timing_shared_event_id = 1;
+ }
+
+ capture_kernel = DeviceKernel(-1);
+ if (auto capture_kernel_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) {
+ /* Enable .gputrace capture for the specified DeviceKernel. */
+ MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager];
+ mtlCaptureScope = [captureManager newCaptureScopeWithDevice:mtlDevice];
+ mtlCaptureScope.label = [NSString stringWithFormat:@"Cycles kernel dispatch"];
+ [captureManager setDefaultCaptureScope:mtlCaptureScope];
+
+ capture_dispatch = -1;
+ if (auto capture_dispatch_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_DISPATCH")) {
+ capture_dispatch = atoi(capture_dispatch_str);
+ capture_dispatch_counter = 0;
+ }
+
+ capture_kernel = DeviceKernel(atoi(capture_kernel_str));
+ printf("Capture kernel: %d = %s\n", capture_kernel, device_kernel_as_string(capture_kernel));
+
+ if (auto capture_url = getenv("CYCLES_DEBUG_METAL_CAPTURE_URL")) {
+ if (@available(macos 10.15, *)) {
+ if ([captureManager supportsDestination:MTLCaptureDestinationGPUTraceDocument]) {
+
+ MTLCaptureDescriptor *captureDescriptor = [[MTLCaptureDescriptor alloc] init];
+ captureDescriptor.captureObject = mtlCaptureScope;
+ captureDescriptor.destination = MTLCaptureDestinationGPUTraceDocument;
+ captureDescriptor.outputURL = [NSURL fileURLWithPath:@(capture_url)];
+
+ NSError *error;
+ if (![captureManager startCaptureWithDescriptor:captureDescriptor error:&error]) {
+ NSString *err = [error localizedDescription];
+ printf("Start capture failed: %s\n", [err UTF8String]);
+ }
+ else {
+ printf("Capture started (URL: %s)\n", capture_url);
+ is_capturing_to_disk = true;
+ }
+ }
+ else {
+ printf("Capture to file is not supported\n");
+ }
+ }
+ }
+ }
}
MetalDeviceQueue::~MetalDeviceQueue()
@@ -58,6 +113,56 @@ MetalDeviceQueue::~MetalDeviceQueue()
[mtlCommandQueue release];
mtlCommandQueue = nil;
}
+
+ if (mtlCaptureScope) {
+ [mtlCaptureScope release];
+ }
+
+ double total_time = 0.0;
+
+ /* Show per-kernel timings, if gathered (see CYCLES_METAL_PROFILING). */
+ int64_t total_work_size = 0;
+ int64_t num_dispatches = 0;
+ for (auto &stat : timing_stats) {
+ total_time += stat.total_time;
+ total_work_size += stat.total_work_size;
+ num_dispatches += stat.num_dispatches;
+ }
+
+ if (num_dispatches) {
+ printf("\nMetal dispatch stats:\n\n");
+ auto header = string_printf("%-40s %16s %12s %12s %7s %7s",
+ "Kernel name",
+ "Total threads",
+ "Dispatches",
+ "Avg. T/D",
+ "Time",
+ "Time%");
+ auto divider = string(header.length(), '-');
+ printf("%s\n%s\n%s\n", divider.c_str(), header.c_str(), divider.c_str());
+
+ for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) {
+ auto &stat = timing_stats[i];
+ if (stat.num_dispatches > 0) {
+ printf("%-40s %16s %12s %12s %6.2fs %6.2f%%\n",
+ device_kernel_as_string(DeviceKernel(i)),
+ string_human_readable_number(stat.total_work_size).c_str(),
+ string_human_readable_number(stat.num_dispatches).c_str(),
+ string_human_readable_number(stat.total_work_size / stat.num_dispatches).c_str(),
+ stat.total_time,
+ stat.total_time * 100.0 / total_time);
+ }
+ }
+ printf("%s\n", divider.c_str());
+ printf("%-40s %16s %12s %12s %6.2fs %6.2f%%\n",
+ "",
+ "",
+ string_human_readable_number(num_dispatches).c_str(),
+ "",
+ total_time,
+ 100.0);
+ printf("%s\n\n", divider.c_str());
+ }
}
int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
@@ -101,6 +206,19 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
const int work_size,
DeviceKernelArguments const &args)
{
+ if (kernel == capture_kernel) {
+ if (capture_dispatch < 0 || capture_dispatch == capture_dispatch_counter) {
+ /* Start gputrace capture. */
+ if (mtlCommandBuffer) {
+ synchronize();
+ }
+ [mtlCaptureScope beginScope];
+ printf("[mtlCaptureScope beginScope]\n");
+ is_capturing = true;
+ }
+ capture_dispatch_counter += 1;
+ }
+
if (metal_device->have_error()) {
return false;
}
@@ -110,6 +228,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);
+ if (timing_shared_event) {
+ command_encoder_labels.push_back({kernel, work_size, timing_shared_event_id});
+ }
+
/* Determine size requirement for argument buffer. */
size_t arg_buffer_length = 0;
for (size_t i = 0; i < args.count; i++) {
@@ -189,6 +311,14 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
/* Encode KernelParamsMetal buffers */
[metal_device->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer offset:globals_offsets];
+ if (verbose_tracing || timing_shared_event || is_capturing) {
+ /* Add human-readable labels if we're doing any form of debugging / profiling. */
+ mtlComputeCommandEncoder.label = [[NSString alloc]
+ initWithFormat:@"Metal queue launch %s, work_size %d",
+ device_kernel_as_string(kernel),
+ work_size];
+ }
+
/* this relies on IntegratorStateGPU layout being contiguous device_ptrs */
const size_t pointer_block_end = offsetof(KernelParamsMetal, __integrator_state) +
sizeof(IntegratorStateGPU);
@@ -196,7 +326,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
int pointer_index = offset / sizeof(device_ptr);
MetalDevice::MetalMem *mmem = *(
MetalDevice::MetalMem **)((uint8_t *)&metal_device->launch_params + offset);
- if (mmem && (mmem->mtlBuffer || mmem->mtlTexture)) {
+ if (mmem && mmem->mem && (mmem->mtlBuffer || mmem->mtlTexture)) {
[metal_device->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer
offset:0
atIndex:pointer_index];
@@ -344,12 +474,53 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
}
}];
+ if (verbose_tracing || is_capturing) {
+ /* Force a sync we've enabled step-by-step verbose tracing or if we're capturing. */
+ synchronize();
+
+ /* Show queue counters and dispatch timing. */
+ if (verbose_tracing) {
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_RESET) {
+ printf(
+ "_____________________________________.____________________.______________.___________"
+ "______________________________________\n");
+ }
+
+ printf("%-40s| %7d threads |%5.2fms | buckets [",
+ device_kernel_as_string(kernel),
+ work_size,
+ last_completion_time * 1000.0);
+ std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex);
+ for (auto &it : metal_device->metal_mem_map) {
+ const string c_integrator_queue_counter = "integrator_queue_counter";
+ if (it.first->name == c_integrator_queue_counter) {
+ /* Workaround "device_copy_from" being protected. */
+ struct MyDeviceMemory : device_memory {
+ void device_copy_from__IntegratorQueueCounter()
+ {
+ device_copy_from(0, data_width, 1, sizeof(IntegratorQueueCounter));
+ }
+ };
+ ((MyDeviceMemory *)it.first)->device_copy_from__IntegratorQueueCounter();
+
+ if (IntegratorQueueCounter *queue_counter = (IntegratorQueueCounter *)
+ it.first->host_pointer) {
+ for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++)
+ printf("%s%d", i == 0 ? "" : ",", int(queue_counter->num_queued[i]));
+ }
+ break;
+ }
+ }
+ printf("]\n");
+ }
+ }
+
return !(metal_device->have_error());
}
bool MetalDeviceQueue::synchronize()
{
- if (metal_device->have_error()) {
+ if (has_captured_to_disk || metal_device->have_error()) {
return false;
}
@@ -359,6 +530,28 @@ bool MetalDeviceQueue::synchronize()
close_blit_encoder();
if (mtlCommandBuffer) {
+ scoped_timer timer;
+ if (timing_shared_event) {
+ /* For per-kernel timing, add event handlers to measure & accumulate dispatch times. */
+ __block double completion_time = 0;
+ for (uint64_t i = command_buffer_start_timing_id; i < timing_shared_event_id; i++) {
+ [timing_shared_event notifyListener:shared_event_listener
+ atValue:i
+ block:^(id<MTLSharedEvent> sharedEvent, uint64_t value) {
+ completion_time = timer.get_time() - completion_time;
+ last_completion_time = completion_time;
+ for (auto label : command_encoder_labels) {
+ if (label.timing_id == value) {
+ TimingStats &stat = timing_stats[label.kernel];
+ stat.num_dispatches++;
+ stat.total_time += completion_time;
+ stat.total_work_size += label.work_size;
+ }
+ }
+ }];
+ }
+ }
+
uint64_t shared_event_id = this->shared_event_id++;
if (@available(macos 10.14, *)) {
@@ -374,6 +567,22 @@ bool MetalDeviceQueue::synchronize()
dispatch_semaphore_wait(wait_semaphore, DISPATCH_TIME_FOREVER);
}
+ if (is_capturing) {
+ [mtlCaptureScope endScope];
+ is_capturing = false;
+ printf("[mtlCaptureScope endScope]\n");
+
+ if (is_capturing_to_disk) {
+ if (@available(macos 10.15, *)) {
+ [[MTLCaptureManager sharedCaptureManager] stopCapture];
+ has_captured_to_disk = true;
+ is_capturing_to_disk = false;
+ is_capturing = false;
+ printf("Capture stopped\n");
+ }
+ }
+ }
+
[mtlCommandBuffer release];
for (const CopyBack &mmem : copy_back_mem) {
@@ -385,6 +594,7 @@ bool MetalDeviceQueue::synchronize()
metal_device->flush_delayed_free_list();
mtlCommandBuffer = nil;
+ command_encoder_labels.clear();
}
return !(metal_device->have_error());
@@ -530,6 +740,13 @@ id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel
{
bool concurrent = (kernel < DEVICE_KERNEL_INTEGRATOR_NUM);
+ if (timing_shared_event) {
+ /* Close the current encoder to ensure we're able to capture per-encoder timing data. */
+ if (mtlComputeEncoder) {
+ close_compute_encoder();
+ }
+ }
+
if (@available(macos 10.14, *)) {
if (mtlComputeEncoder) {
if (mtlComputeEncoder.dispatchType == concurrent ? MTLDispatchTypeConcurrent :
@@ -575,6 +792,7 @@ id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder()
if (!mtlCommandBuffer) {
mtlCommandBuffer = [mtlCommandQueue commandBuffer];
[mtlCommandBuffer retain];
+ command_buffer_start_timing_id = timing_shared_event_id;
}
mtlBlitEncoder = [mtlCommandBuffer blitCommandEncoder];
@@ -585,6 +803,10 @@ void MetalDeviceQueue::close_compute_encoder()
{
[mtlComputeEncoder endEncoding];
mtlComputeEncoder = nil;
+
+ if (timing_shared_event) {
+ [mtlCommandBuffer encodeSignalEvent:timing_shared_event value:timing_shared_event_id++];
+ }
}
void MetalDeviceQueue::close_blit_encoder()