diff options
Diffstat (limited to 'intern/cycles/device/metal/queue.mm')
-rw-r--r-- | intern/cycles/device/metal/queue.mm | 598 |
1 files changed, 455 insertions, 143 deletions
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm index ec10e091b25..5ac63a16c61 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -17,46 +17,250 @@ CCL_NAMESPACE_BEGIN /* MetalDeviceQueue */ MetalDeviceQueue::MetalDeviceQueue(MetalDevice *device) - : DeviceQueue(device), metal_device(device), stats(device->stats) + : DeviceQueue(device), metal_device_(device), stats_(device->stats) { if (@available(macos 11.0, *)) { - command_buffer_desc = [[MTLCommandBufferDescriptor alloc] init]; - command_buffer_desc.errorOptions = MTLCommandBufferErrorOptionEncoderExecutionStatus; + command_buffer_desc_ = [[MTLCommandBufferDescriptor alloc] init]; + command_buffer_desc_.errorOptions = MTLCommandBufferErrorOptionEncoderExecutionStatus; } - mtlDevice = device->mtlDevice; - mtlCommandQueue = [mtlDevice newCommandQueue]; + mtlDevice_ = device->mtlDevice; + mtlCommandQueue_ = [mtlDevice_ newCommandQueue]; if (@available(macos 10.14, *)) { - shared_event = [mtlDevice newSharedEvent]; - shared_event_id = 1; + shared_event_ = [mtlDevice_ newSharedEvent]; + shared_event_id_ = 1; /* Shareable event listener */ - event_queue = dispatch_queue_create("com.cycles.metal.event_queue", NULL); - shared_event_listener = [[MTLSharedEventListener alloc] initWithDispatchQueue:event_queue]; + event_queue_ = dispatch_queue_create("com.cycles.metal.event_queue", NULL); + shared_event_listener_ = [[MTLSharedEventListener alloc] initWithDispatchQueue:event_queue_]; } - wait_semaphore = dispatch_semaphore_create(0); + wait_semaphore_ = dispatch_semaphore_create(0); + + if (@available(macos 10.14, *)) { + if (getenv("CYCLES_METAL_PROFILING")) { + /* Enable per-kernel timing breakdown (shown at end of render). */ + timing_shared_event_ = [mtlDevice_ newSharedEvent]; + label_command_encoders_ = true; + } + if (getenv("CYCLES_METAL_DEBUG")) { + /* Enable very verbose tracing (shows every dispatch). */ + verbose_tracing_ = true; + label_command_encoders_ = true; + } + timing_shared_event_id_ = 1; + } + + setup_capture(); +} + +void MetalDeviceQueue::setup_capture() +{ + capture_kernel_ = DeviceKernel(-1); + + if (auto capture_kernel_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) { + /* CYCLES_DEBUG_METAL_CAPTURE_KERNEL captures a single dispatch of the specified kernel. */ + capture_kernel_ = DeviceKernel(atoi(capture_kernel_str)); + printf("Capture kernel: %d = %s\n", capture_kernel_, device_kernel_as_string(capture_kernel_)); + + capture_dispatch_counter_ = 0; + if (auto capture_dispatch_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_DISPATCH")) { + capture_dispatch_counter_ = atoi(capture_dispatch_str); + + printf("Capture dispatch number %d\n", capture_dispatch_counter_); + } + } + else if (auto capture_samples_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_SAMPLES")) { + /* CYCLES_DEBUG_METAL_CAPTURE_SAMPLES captures a block of dispatches from reset#(N) to + * reset#(N+1). */ + capture_samples_ = true; + capture_reset_counter_ = atoi(capture_samples_str); + + capture_dispatch_counter_ = INT_MAX; + if (auto capture_limit_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_LIMIT")) { + /* CYCLES_DEBUG_METAL_CAPTURE_LIMIT sets the maximum number of dispatches to capture. */ + capture_dispatch_counter_ = atoi(capture_limit_str); + } + + printf("Capturing sample block %d (dispatch limit: %d)\n", + capture_reset_counter_, + capture_dispatch_counter_); + } + else { + /* No capturing requested. */ + return; + } + + /* Enable .gputrace capture for the specified DeviceKernel. */ + MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager]; + mtlCaptureScope_ = [captureManager newCaptureScopeWithDevice:mtlDevice_]; + mtlCaptureScope_.label = [NSString stringWithFormat:@"Cycles kernel dispatch"]; + [captureManager setDefaultCaptureScope:mtlCaptureScope_]; + + label_command_encoders_ = true; + + if (auto capture_url = getenv("CYCLES_DEBUG_METAL_CAPTURE_URL")) { + if (@available(macos 10.15, *)) { + if ([captureManager supportsDestination:MTLCaptureDestinationGPUTraceDocument]) { + + MTLCaptureDescriptor *captureDescriptor = [[MTLCaptureDescriptor alloc] init]; + captureDescriptor.captureObject = mtlCaptureScope_; + captureDescriptor.destination = MTLCaptureDestinationGPUTraceDocument; + captureDescriptor.outputURL = [NSURL fileURLWithPath:@(capture_url)]; + + NSError *error; + if (![captureManager startCaptureWithDescriptor:captureDescriptor error:&error]) { + NSString *err = [error localizedDescription]; + printf("Start capture failed: %s\n", [err UTF8String]); + } + else { + printf("Capture started (URL: %s)\n", capture_url); + is_capturing_to_disk_ = true; + } + } + else { + printf("Capture to file is not supported\n"); + } + } + } +} + +void MetalDeviceQueue::update_capture(DeviceKernel kernel) +{ + /* Handle capture end triggers. */ + if (is_capturing_) { + capture_dispatch_counter_ -= 1; + if (capture_dispatch_counter_ <= 0 || kernel == DEVICE_KERNEL_INTEGRATOR_RESET) { + /* End capture if we've hit the dispatch limit or we hit a "reset". */ + end_capture(); + } + return; + } + + if (capture_dispatch_counter_ < 0) { + /* We finished capturing. */ + return; + } + + /* Handle single-capture start trigger. */ + if (kernel == capture_kernel_) { + /* Start capturing when the we hit the Nth dispatch of the specified kernel. */ + if (capture_dispatch_counter_ == 0) { + begin_capture(); + } + capture_dispatch_counter_ -= 1; + return; + } + + /* Handle multi-capture start trigger. */ + if (capture_samples_) { + /* Start capturing when the reset countdown is at 0. */ + if (capture_reset_counter_ == 0) { + begin_capture(); + } + + if (kernel == DEVICE_KERNEL_INTEGRATOR_RESET) { + capture_reset_counter_ -= 1; + } + return; + } +} + +void MetalDeviceQueue::begin_capture() +{ + /* Start gputrace capture. */ + if (mtlCommandBuffer_) { + synchronize(); + } + [mtlCaptureScope_ beginScope]; + printf("[mtlCaptureScope_ beginScope]\n"); + is_capturing_ = true; +} + +void MetalDeviceQueue::end_capture() +{ + [mtlCaptureScope_ endScope]; + is_capturing_ = false; + printf("[mtlCaptureScope_ endScope]\n"); + + if (is_capturing_to_disk_) { + if (@available(macos 10.15, *)) { + [[MTLCaptureManager sharedCaptureManager] stopCapture]; + has_captured_to_disk_ = true; + is_capturing_to_disk_ = false; + is_capturing_ = false; + printf("Capture stopped\n"); + } + } } MetalDeviceQueue::~MetalDeviceQueue() { /* Tidying up here isn't really practical - we should expect and require the work * queue to be empty here. */ - assert(mtlCommandBuffer == nil); - assert(command_buffers_submitted == command_buffers_completed); + assert(mtlCommandBuffer_ == nil); + assert(command_buffers_submitted_ == command_buffers_completed_); if (@available(macos 10.14, *)) { - [shared_event_listener release]; - [shared_event release]; + [shared_event_listener_ release]; + [shared_event_ release]; } if (@available(macos 11.0, *)) { - [command_buffer_desc release]; - } - if (mtlCommandQueue) { - [mtlCommandQueue release]; - mtlCommandQueue = nil; + [command_buffer_desc_ release]; + } + if (mtlCommandQueue_) { + [mtlCommandQueue_ release]; + mtlCommandQueue_ = nil; + } + + if (mtlCaptureScope_) { + [mtlCaptureScope_ release]; + } + + double total_time = 0.0; + + /* Show per-kernel timings, if gathered (see CYCLES_METAL_PROFILING). */ + int64_t num_dispatches = 0; + for (auto &stat : timing_stats_) { + total_time += stat.total_time; + num_dispatches += stat.num_dispatches; + } + + if (num_dispatches) { + printf("\nMetal dispatch stats:\n\n"); + auto header = string_printf("%-40s %16s %12s %12s %7s %7s", + "Kernel name", + "Total threads", + "Dispatches", + "Avg. T/D", + "Time", + "Time%"); + auto divider = string(header.length(), '-'); + printf("%s\n%s\n%s\n", divider.c_str(), header.c_str(), divider.c_str()); + + for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) { + auto &stat = timing_stats_[i]; + if (stat.num_dispatches > 0) { + printf("%-40s %16s %12s %12s %6.2fs %6.2f%%\n", + device_kernel_as_string(DeviceKernel(i)), + string_human_readable_number(stat.total_work_size).c_str(), + string_human_readable_number(stat.num_dispatches).c_str(), + string_human_readable_number(stat.total_work_size / stat.num_dispatches).c_str(), + stat.total_time, + stat.total_time * 100.0 / total_time); + } + } + printf("%s\n", divider.c_str()); + printf("%-40s %16s %12s %12s %6.2fs %6.2f%%\n", + "", + "", + string_human_readable_number(num_dispatches).c_str(), + "", + total_time, + 100.0); + printf("%s\n\n", divider.c_str()); } } @@ -66,10 +270,10 @@ int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const /* TODO: compute automatically. */ /* TODO: must have at least num_threads_per_block. */ int result = 1048576; - if (metal_device->device_vendor == METAL_GPU_AMD) { + if (metal_device_->device_vendor == METAL_GPU_AMD) { result *= 2; } - else if (metal_device->device_vendor == METAL_GPU_APPLE) { + else if (metal_device_->device_vendor == METAL_GPU_APPLE) { result *= 4; } return result; @@ -80,19 +284,24 @@ int MetalDeviceQueue::num_concurrent_busy_states() const /* METAL_WIP */ /* TODO: compute automatically. */ int result = 65536; - if (metal_device->device_vendor == METAL_GPU_AMD) { + if (metal_device_->device_vendor == METAL_GPU_AMD) { result *= 2; } - else if (metal_device->device_vendor == METAL_GPU_APPLE) { + else if (metal_device_->device_vendor == METAL_GPU_APPLE) { result *= 4; } return result; } +int MetalDeviceQueue::num_sort_partition_elements() const +{ + return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice); +} + void MetalDeviceQueue::init_execution() { /* Synchronize all textures and memory copies before executing task. */ - metal_device->load_texture_info(); + metal_device_->load_texture_info(); synchronize(); } @@ -101,15 +310,23 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, DeviceKernelArguments const &args) { - if (metal_device->have_error()) { + update_capture(kernel); + + if (metal_device_->have_error()) { return false; } - VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size " - << work_size; + VLOG_DEVICE_STATS << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size " + << work_size; id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel); + if (@available(macos 10.14, *)) { + if (timing_shared_event_) { + command_encoder_labels_.push_back({kernel, work_size, timing_shared_event_id_}); + } + } + /* Determine size requirement for argument buffer. */ size_t arg_buffer_length = 0; for (size_t i = 0; i < args.count; i++) { @@ -126,8 +343,8 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, /* Metal ancillary bindless pointers. */ size_t metal_offsets = arg_buffer_length; - arg_buffer_length += metal_device->mtlAncillaryArgEncoder.encodedLength; - arg_buffer_length = round_up(arg_buffer_length, metal_device->mtlAncillaryArgEncoder.alignment); + arg_buffer_length += metal_device_->mtlAncillaryArgEncoder.encodedLength; + arg_buffer_length = round_up(arg_buffer_length, metal_device_->mtlAncillaryArgEncoder.alignment); /* Temporary buffer used to prepare arg_buffer */ uint8_t *init_arg_buffer = (uint8_t *)alloca(arg_buffer_length); @@ -146,23 +363,27 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, /* Prepare any non-pointer (i.e. plain-old-data) KernelParamsMetal data */ /* The plain-old-data is contiguous, continuing to the end of KernelParamsMetal */ - size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, __integrator_state) + - sizeof(IntegratorStateGPU); + size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, integrator_state) + + offsetof(IntegratorStateGPU, sort_partition_divisor); size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset; memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset, - (uint8_t *)&metal_device->launch_params + plain_old_launch_data_offset, + (uint8_t *)&metal_device_->launch_params + plain_old_launch_data_offset, plain_old_launch_data_size); /* Allocate an argument buffer. */ MTLResourceOptions arg_buffer_options = MTLResourceStorageModeManaged; if (@available(macOS 11.0, *)) { - if ([mtlDevice hasUnifiedMemory]) { + if ([mtlDevice_ hasUnifiedMemory]) { arg_buffer_options = MTLResourceStorageModeShared; } } - id<MTLBuffer> arg_buffer = temp_buffer_pool.get_buffer( - mtlDevice, mtlCommandBuffer, arg_buffer_length, arg_buffer_options, init_arg_buffer, stats); + id<MTLBuffer> arg_buffer = temp_buffer_pool_.get_buffer(mtlDevice_, + mtlCommandBuffer_, + arg_buffer_length, + arg_buffer_options, + init_arg_buffer, + stats_); /* Encode the pointer "enqueue" arguments */ bytes_written = 0; @@ -170,16 +391,16 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, size_t size_in_bytes = args.sizes[i]; bytes_written = round_up(bytes_written, size_in_bytes); if (args.types[i] == DeviceKernelArguments::POINTER) { - [metal_device->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer - offset:bytes_written]; + [metal_device_->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer + offset:bytes_written]; if (MetalDevice::MetalMem *mmem = *(MetalDevice::MetalMem **)args.values[i]) { [mtlComputeCommandEncoder useResource:mmem->mtlBuffer usage:MTLResourceUsageRead | MTLResourceUsageWrite]; - [metal_device->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:0]; + [metal_device_->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:0]; } else { if (@available(macos 12.0, *)) { - [metal_device->mtlBufferKernelParamsEncoder setBuffer:nil offset:0 atIndex:0]; + [metal_device_->mtlBufferKernelParamsEncoder setBuffer:nil offset:0 atIndex:0]; } } } @@ -187,49 +408,58 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, } /* Encode KernelParamsMetal buffers */ - [metal_device->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer offset:globals_offsets]; + [metal_device_->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer + offset:globals_offsets]; + + if (label_command_encoders_) { + /* Add human-readable labels if we're doing any form of debugging / profiling. */ + mtlComputeCommandEncoder.label = [[NSString alloc] + initWithFormat:@"Metal queue launch %s, work_size %d", + device_kernel_as_string(kernel), + work_size]; + } /* this relies on IntegratorStateGPU layout being contiguous device_ptrs */ - const size_t pointer_block_end = offsetof(KernelParamsMetal, __integrator_state) + - sizeof(IntegratorStateGPU); + const size_t pointer_block_end = offsetof(KernelParamsMetal, integrator_state) + + offsetof(IntegratorStateGPU, sort_partition_divisor); for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) { - int pointer_index = offset / sizeof(device_ptr); + int pointer_index = int(offset / sizeof(device_ptr)); MetalDevice::MetalMem *mmem = *( - MetalDevice::MetalMem **)((uint8_t *)&metal_device->launch_params + offset); - if (mmem && (mmem->mtlBuffer || mmem->mtlTexture)) { - [metal_device->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer - offset:0 - atIndex:pointer_index]; + MetalDevice::MetalMem **)((uint8_t *)&metal_device_->launch_params + offset); + if (mmem && mmem->mem && (mmem->mtlBuffer || mmem->mtlTexture)) { + [metal_device_->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer + offset:0 + atIndex:pointer_index]; } else { if (@available(macos 12.0, *)) { - [metal_device->mtlBufferKernelParamsEncoder setBuffer:nil offset:0 atIndex:pointer_index]; + [metal_device_->mtlBufferKernelParamsEncoder setBuffer:nil offset:0 atIndex:pointer_index]; } } } bytes_written = globals_offsets + sizeof(KernelParamsMetal); - const MetalKernelPipeline *metal_kernel_pso = MetalDeviceKernels::get_best_pipeline(metal_device, - kernel); + const MetalKernelPipeline *metal_kernel_pso = MetalDeviceKernels::get_best_pipeline( + metal_device_, kernel); if (!metal_kernel_pso) { - metal_device->set_error( + metal_device_->set_error( string_printf("No MetalKernelPipeline for %s\n", device_kernel_as_string(kernel))); return false; } /* Encode ancillaries */ - [metal_device->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets]; - [metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_2d - offset:0 - atIndex:0]; - [metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_3d - offset:0 - atIndex:1]; + [metal_device_->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets]; + [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->texture_bindings_2d + offset:0 + atIndex:0]; + [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->texture_bindings_3d + offset:0 + atIndex:1]; if (@available(macos 12.0, *)) { - if (metal_device->use_metalrt) { - if (metal_device->bvhMetalRT) { - id<MTLAccelerationStructure> accel_struct = metal_device->bvhMetalRT->accel_struct; - [metal_device->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2]; + if (metal_device_->use_metalrt) { + if (metal_device_->bvhMetalRT) { + id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct; + [metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2]; } for (int table = 0; table < METALRT_TABLE_NUM; table++) { @@ -237,19 +467,19 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, [metal_kernel_pso->intersection_func_table[table] setBuffer:arg_buffer offset:globals_offsets atIndex:1]; - [metal_device->mtlAncillaryArgEncoder + [metal_device_->mtlAncillaryArgEncoder setIntersectionFunctionTable:metal_kernel_pso->intersection_func_table[table] atIndex:3 + table]; [mtlComputeCommandEncoder useResource:metal_kernel_pso->intersection_func_table[table] usage:MTLResourceUsageRead]; } else { - [metal_device->mtlAncillaryArgEncoder setIntersectionFunctionTable:nil - atIndex:3 + table]; + [metal_device_->mtlAncillaryArgEncoder setIntersectionFunctionTable:nil + atIndex:3 + table]; } } } - bytes_written = metal_offsets + metal_device->mtlAncillaryArgEncoder.encodedLength; + bytes_written = metal_offsets + metal_device_->mtlAncillaryArgEncoder.encodedLength; } if (arg_buffer.storageMode == MTLStorageModeManaged) { @@ -260,16 +490,17 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, [mtlComputeCommandEncoder setBuffer:arg_buffer offset:globals_offsets atIndex:1]; [mtlComputeCommandEncoder setBuffer:arg_buffer offset:metal_offsets atIndex:2]; - if (metal_device->use_metalrt) { + if (metal_device_->use_metalrt) { if (@available(macos 12.0, *)) { - auto bvhMetalRT = metal_device->bvhMetalRT; + auto bvhMetalRT = metal_device_->bvhMetalRT; switch (kernel) { case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE: break; default: bvhMetalRT = nil; @@ -304,7 +535,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY: /* See parallel_active_index.h for why this amount of shared memory is needed. * Rounded up to 16 bytes for Metal */ - shared_mem_bytes = round_up((num_threads_per_block + 1) * sizeof(int), 16); + shared_mem_bytes = (int)round_up((num_threads_per_block + 1) * sizeof(int), 16); [mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0]; break; @@ -318,13 +549,13 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, [mtlComputeCommandEncoder dispatchThreadgroups:size_threadgroups_per_dispatch threadsPerThreadgroup:size_threads_per_threadgroup]; - [mtlCommandBuffer addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) { + [mtlCommandBuffer_ addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) { NSString *kernel_name = metal_kernel_pso->function.label; /* Enhanced command buffer errors are only available in 11.0+ */ if (@available(macos 11.0, *)) { if (command_buffer.status == MTLCommandBufferStatusError && command_buffer.error != nil) { - printf("CommandBuffer Failed: %s\n", [kernel_name UTF8String]); + metal_device_->set_error(string("CommandBuffer Failed: ") + [kernel_name UTF8String]); NSArray<id<MTLCommandBufferEncoderInfo>> *encoderInfos = [command_buffer.error.userInfo valueForKey:MTLCommandBufferEncoderInfoErrorKey]; if (encoderInfos != nil) { @@ -338,55 +569,122 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, } } else if (command_buffer.error) { - printf("CommandBuffer Failed: %s\n", [kernel_name UTF8String]); + metal_device_->set_error(string("CommandBuffer Failed: ") + [kernel_name UTF8String]); } } }]; - return !(metal_device->have_error()); + if (verbose_tracing_ || is_capturing_) { + /* Force a sync we've enabled step-by-step verbose tracing or if we're capturing. */ + synchronize(); + + /* Show queue counters and dispatch timing. */ + if (verbose_tracing_) { + if (kernel == DEVICE_KERNEL_INTEGRATOR_RESET) { + printf( + "_____________________________________.____________________.______________.___________" + "______________________________________\n"); + } + + printf("%-40s| %7d threads |%5.2fms | buckets [", + device_kernel_as_string(kernel), + work_size, + last_completion_time_ * 1000.0); + std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex); + for (auto &it : metal_device_->metal_mem_map) { + const string c_integrator_queue_counter = "integrator_queue_counter"; + if (it.first->name == c_integrator_queue_counter) { + /* Workaround "device_copy_from" being protected. */ + struct MyDeviceMemory : device_memory { + void device_copy_from__IntegratorQueueCounter() + { + device_copy_from(0, data_width, 1, sizeof(IntegratorQueueCounter)); + } + }; + ((MyDeviceMemory *)it.first)->device_copy_from__IntegratorQueueCounter(); + + if (IntegratorQueueCounter *queue_counter = (IntegratorQueueCounter *) + it.first->host_pointer) { + for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) + printf("%s%d", i == 0 ? "" : ",", int(queue_counter->num_queued[i])); + } + break; + } + } + printf("]\n"); + } + } + + return !(metal_device_->have_error()); } bool MetalDeviceQueue::synchronize() { - if (metal_device->have_error()) { + if (has_captured_to_disk_ || metal_device_->have_error()) { return false; } - if (mtlComputeEncoder) { + if (mtlComputeEncoder_) { close_compute_encoder(); } close_blit_encoder(); - if (mtlCommandBuffer) { - uint64_t shared_event_id = this->shared_event_id++; + if (mtlCommandBuffer_) { + scoped_timer timer; if (@available(macos 10.14, *)) { - __block dispatch_semaphore_t block_sema = wait_semaphore; - [shared_event notifyListener:shared_event_listener - atValue:shared_event_id - block:^(id<MTLSharedEvent> sharedEvent, uint64_t value) { - dispatch_semaphore_signal(block_sema); - }]; + if (timing_shared_event_) { + /* For per-kernel timing, add event handlers to measure & accumulate dispatch times. */ + __block double completion_time = 0; + for (uint64_t i = command_buffer_start_timing_id_; i < timing_shared_event_id_; i++) { + [timing_shared_event_ notifyListener:shared_event_listener_ + atValue:i + block:^(id<MTLSharedEvent> sharedEvent, uint64_t value) { + completion_time = timer.get_time() - completion_time; + last_completion_time_ = completion_time; + for (auto label : command_encoder_labels_) { + if (label.timing_id == value) { + TimingStats &stat = timing_stats_[label.kernel]; + stat.num_dispatches++; + stat.total_time += completion_time; + stat.total_work_size += label.work_size; + } + } + }]; + } + } + } - [mtlCommandBuffer encodeSignalEvent:shared_event value:shared_event_id]; - [mtlCommandBuffer commit]; - dispatch_semaphore_wait(wait_semaphore, DISPATCH_TIME_FOREVER); + uint64_t shared_event_id_ = this->shared_event_id_++; + + if (@available(macos 10.14, *)) { + __block dispatch_semaphore_t block_sema = wait_semaphore_; + [shared_event_ notifyListener:shared_event_listener_ + atValue:shared_event_id_ + block:^(id<MTLSharedEvent> sharedEvent, uint64_t value) { + dispatch_semaphore_signal(block_sema); + }]; + + [mtlCommandBuffer_ encodeSignalEvent:shared_event_ value:shared_event_id_]; + [mtlCommandBuffer_ commit]; + dispatch_semaphore_wait(wait_semaphore_, DISPATCH_TIME_FOREVER); } - [mtlCommandBuffer release]; + [mtlCommandBuffer_ release]; - for (const CopyBack &mmem : copy_back_mem) { + for (const CopyBack &mmem : copy_back_mem_) { memcpy((uchar *)mmem.host_pointer, (uchar *)mmem.gpu_mem, mmem.size); } - copy_back_mem.clear(); + copy_back_mem_.clear(); - temp_buffer_pool.process_command_buffer_completion(mtlCommandBuffer); - metal_device->flush_delayed_free_list(); + temp_buffer_pool_.process_command_buffer_completion(mtlCommandBuffer_); + metal_device_->flush_delayed_free_list(); - mtlCommandBuffer = nil; + mtlCommandBuffer_ = nil; + command_encoder_labels_.clear(); } - return !(metal_device->have_error()); + return !(metal_device_->have_error()); } void MetalDeviceQueue::zero_to_device(device_memory &mem) @@ -399,20 +697,20 @@ void MetalDeviceQueue::zero_to_device(device_memory &mem) /* Allocate on demand. */ if (mem.device_pointer == 0) { - metal_device->mem_alloc(mem); + metal_device_->mem_alloc(mem); } /* Zero memory on device. */ assert(mem.device_pointer != 0); - std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex); - MetalDevice::MetalMem &mmem = *metal_device->metal_mem_map.at(&mem); + std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex); + MetalDevice::MetalMem &mmem = *metal_device_->metal_mem_map.at(&mem); if (mmem.mtlBuffer) { id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder(); [blitEncoder fillBuffer:mmem.mtlBuffer range:NSMakeRange(mmem.offset, mmem.size) value:0]; } else { - metal_device->mem_zero(mem); + metal_device_->mem_zero(mem); } } @@ -424,15 +722,15 @@ void MetalDeviceQueue::copy_to_device(device_memory &mem) /* Allocate on demand. */ if (mem.device_pointer == 0) { - metal_device->mem_alloc(mem); + metal_device_->mem_alloc(mem); } assert(mem.device_pointer != 0); assert(mem.host_pointer != nullptr); - std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex); - auto result = metal_device->metal_mem_map.find(&mem); - if (result != metal_device->metal_mem_map.end()) { + std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex); + auto result = metal_device_->metal_mem_map.find(&mem); + if (result != metal_device_->metal_mem_map.end()) { if (mem.host_pointer == mem.shared_pointer) { return; } @@ -440,12 +738,12 @@ void MetalDeviceQueue::copy_to_device(device_memory &mem) MetalDevice::MetalMem &mmem = *result->second; id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder(); - id<MTLBuffer> buffer = temp_buffer_pool.get_buffer(mtlDevice, - mtlCommandBuffer, - mmem.size, - MTLResourceStorageModeShared, - mem.host_pointer, - stats); + id<MTLBuffer> buffer = temp_buffer_pool_.get_buffer(mtlDevice_, + mtlCommandBuffer_, + mmem.size, + MTLResourceStorageModeShared, + mem.host_pointer, + stats_); [blitEncoder copyFromBuffer:buffer sourceOffset:0 @@ -454,7 +752,7 @@ void MetalDeviceQueue::copy_to_device(device_memory &mem) size:mmem.size]; } else { - metal_device->mem_copy_to(mem); + metal_device_->mem_copy_to(mem); } } @@ -469,8 +767,8 @@ void MetalDeviceQueue::copy_from_device(device_memory &mem) assert(mem.device_pointer != 0); assert(mem.host_pointer != nullptr); - std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex); - MetalDevice::MetalMem &mmem = *metal_device->metal_mem_map.at(&mem); + std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex); + MetalDevice::MetalMem &mmem = *metal_device_->metal_mem_map.at(&mem); if (mmem.mtlBuffer) { const size_t size = mem.memory_size(); @@ -480,8 +778,8 @@ void MetalDeviceQueue::copy_from_device(device_memory &mem) [blitEncoder synchronizeResource:mmem.mtlBuffer]; } if (mem.host_pointer != mmem.hostPtr) { - if (mtlCommandBuffer) { - copy_back_mem.push_back({mem.host_pointer, mmem.hostPtr, size}); + if (mtlCommandBuffer_) { + copy_back_mem_.push_back({mem.host_pointer, mmem.hostPtr, size}); } else { memcpy((uchar *)mem.host_pointer, (uchar *)mmem.hostPtr, size); @@ -493,16 +791,16 @@ void MetalDeviceQueue::copy_from_device(device_memory &mem) } } else { - metal_device->mem_copy_from(mem); + metal_device_->mem_copy_from(mem); } } void MetalDeviceQueue::prepare_resources(DeviceKernel kernel) { - std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex); + std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex); /* declare resource usage */ - for (auto &it : metal_device->metal_mem_map) { + for (auto &it : metal_device_->metal_mem_map) { device_memory *mem = it.first; MTLResourceUsage usage = MTLResourceUsageRead; @@ -512,17 +810,17 @@ void MetalDeviceQueue::prepare_resources(DeviceKernel kernel) if (it.second->mtlBuffer) { /* METAL_WIP - use array version (i.e. useResources) */ - [mtlComputeEncoder useResource:it.second->mtlBuffer usage:usage]; + [mtlComputeEncoder_ useResource:it.second->mtlBuffer usage:usage]; } else if (it.second->mtlTexture) { /* METAL_WIP - use array version (i.e. useResources) */ - [mtlComputeEncoder useResource:it.second->mtlTexture usage:usage | MTLResourceUsageSample]; + [mtlComputeEncoder_ useResource:it.second->mtlTexture usage:usage | MTLResourceUsageSample]; } } /* ancillaries */ - [mtlComputeEncoder useResource:metal_device->texture_bindings_2d usage:MTLResourceUsageRead]; - [mtlComputeEncoder useResource:metal_device->texture_bindings_3d usage:MTLResourceUsageRead]; + [mtlComputeEncoder_ useResource:metal_device_->texture_bindings_2d usage:MTLResourceUsageRead]; + [mtlComputeEncoder_ useResource:metal_device_->texture_bindings_3d usage:MTLResourceUsageRead]; } id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel kernel) @@ -530,67 +828,81 @@ id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel bool concurrent = (kernel < DEVICE_KERNEL_INTEGRATOR_NUM); if (@available(macos 10.14, *)) { - if (mtlComputeEncoder) { - if (mtlComputeEncoder.dispatchType == concurrent ? MTLDispatchTypeConcurrent : - MTLDispatchTypeSerial) { + if (timing_shared_event_) { + /* Close the current encoder to ensure we're able to capture per-encoder timing data. */ + if (mtlComputeEncoder_) { + close_compute_encoder(); + } + } + + if (mtlComputeEncoder_) { + if (mtlComputeEncoder_.dispatchType == concurrent ? MTLDispatchTypeConcurrent : + MTLDispatchTypeSerial) { /* declare usage of MTLBuffers etc */ prepare_resources(kernel); - return mtlComputeEncoder; + return mtlComputeEncoder_; } close_compute_encoder(); } close_blit_encoder(); - if (!mtlCommandBuffer) { - mtlCommandBuffer = [mtlCommandQueue commandBuffer]; - [mtlCommandBuffer retain]; + if (!mtlCommandBuffer_) { + mtlCommandBuffer_ = [mtlCommandQueue_ commandBuffer]; + [mtlCommandBuffer_ retain]; } - mtlComputeEncoder = [mtlCommandBuffer + mtlComputeEncoder_ = [mtlCommandBuffer_ computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial]; - [mtlComputeEncoder setLabel:@(device_kernel_as_string(kernel))]; + [mtlComputeEncoder_ setLabel:@(device_kernel_as_string(kernel))]; /* declare usage of MTLBuffers etc */ prepare_resources(kernel); } - return mtlComputeEncoder; + return mtlComputeEncoder_; } id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder() { - if (mtlBlitEncoder) { - return mtlBlitEncoder; + if (mtlBlitEncoder_) { + return mtlBlitEncoder_; } - if (mtlComputeEncoder) { + if (mtlComputeEncoder_) { close_compute_encoder(); } - if (!mtlCommandBuffer) { - mtlCommandBuffer = [mtlCommandQueue commandBuffer]; - [mtlCommandBuffer retain]; + if (!mtlCommandBuffer_) { + mtlCommandBuffer_ = [mtlCommandQueue_ commandBuffer]; + [mtlCommandBuffer_ retain]; + command_buffer_start_timing_id_ = timing_shared_event_id_; } - mtlBlitEncoder = [mtlCommandBuffer blitCommandEncoder]; - return mtlBlitEncoder; + mtlBlitEncoder_ = [mtlCommandBuffer_ blitCommandEncoder]; + return mtlBlitEncoder_; } void MetalDeviceQueue::close_compute_encoder() { - [mtlComputeEncoder endEncoding]; - mtlComputeEncoder = nil; + [mtlComputeEncoder_ endEncoding]; + mtlComputeEncoder_ = nil; + + if (@available(macos 10.14, *)) { + if (timing_shared_event_) { + [mtlCommandBuffer_ encodeSignalEvent:timing_shared_event_ value:timing_shared_event_id_++]; + } + } } void MetalDeviceQueue::close_blit_encoder() { - if (mtlBlitEncoder) { - [mtlBlitEncoder endEncoding]; - mtlBlitEncoder = nil; + if (mtlBlitEncoder_) { + [mtlBlitEncoder_ endEncoding]; + mtlBlitEncoder_ = nil; } } |