diff options
author | Nikita Sirgienko <nikita.sirgienko@intel.com> | 2022-09-14 16:55:56 +0300 |
---|---|---|
committer | Nikita Sirgienko <nikita.sirgienko@intel.com> | 2022-09-27 23:15:00 +0300 |
commit | 2ead05d73878721703de5d2fe6a07eb9053168aa (patch) | |
tree | 72b51851a8c5e7f9995607ca20c94c9cdf022ff3 /intern/cycles | |
parent | b145cc9d361e21da3a8a0ff2ef3bad1f8e8fbae6 (diff) |
Cycles: Add optional per-kernel performance statistics
When verbose level 4 is enabled, Blender prints kernel performance
data for Cycles on GPU backends (except Metal, which doesn't use
debug_enqueue_* methods) for groups of kernels.
These changes introduce a new CYCLES_DEBUG_PER_KERNEL_PERFORMANCE
environment variable to allow getting timings for each kernel
separately and not grouped with others. This is done by adding
explicit synchronization after each kernel execution.
Differential Revision: https://developer.blender.org/D15971
Diffstat (limited to 'intern/cycles')
-rw-r--r-- | intern/cycles/device/cuda/queue.cpp | 4 | ||||
-rw-r--r-- | intern/cycles/device/hip/queue.cpp | 4 | ||||
-rw-r--r-- | intern/cycles/device/oneapi/queue.cpp | 4 | ||||
-rw-r--r-- | intern/cycles/device/optix/queue.cpp | 4 | ||||
-rw-r--r-- | intern/cycles/device/queue.cpp | 27 | ||||
-rw-r--r-- | intern/cycles/device/queue.h | 6 |
6 files changed, 41 insertions, 8 deletions
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp index 5912e68a92b..84b0a1e0dd6 100644 --- a/intern/cycles/device/cuda/queue.cpp +++ b/intern/cycles/device/cuda/queue.cpp @@ -79,7 +79,7 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, return false; } - debug_enqueue(kernel, work_size); + debug_enqueue_begin(kernel, work_size); const CUDAContextScope scope(cuda_device_); const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel); @@ -121,6 +121,8 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, 0), "enqueue"); + debug_enqueue_end(); + return !(cuda_device_->have_error()); } diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp index 8b3d963a32f..3f8b6267100 100644 --- a/intern/cycles/device/hip/queue.cpp +++ b/intern/cycles/device/hip/queue.cpp @@ -79,7 +79,7 @@ bool HIPDeviceQueue::enqueue(DeviceKernel kernel, return false; } - debug_enqueue(kernel, work_size); + debug_enqueue_begin(kernel, work_size); const HIPContextScope scope(hip_device_); const HIPDeviceKernel &hip_kernel = hip_device_->kernels.get(kernel); @@ -120,6 +120,8 @@ bool HIPDeviceQueue::enqueue(DeviceKernel kernel, 0), "enqueue"); + debug_enqueue_end(); + return !(hip_device_->have_error()); } diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp index 1e822e25f1a..f865b27f976 100644 --- a/intern/cycles/device/oneapi/queue.cpp +++ b/intern/cycles/device/oneapi/queue.cpp @@ -77,7 +77,7 @@ bool OneapiDeviceQueue::enqueue(DeviceKernel kernel, void **args = const_cast<void **>(_args.values); - debug_enqueue(kernel, signed_kernel_work_size); + debug_enqueue_begin(kernel, signed_kernel_work_size); assert(signed_kernel_work_size >= 0); size_t kernel_work_size = (size_t)signed_kernel_work_size; @@ -97,6 +97,8 @@ bool OneapiDeviceQueue::enqueue(DeviceKernel kernel, oneapi_device_->oneapi_error_message() + "\""); } + debug_enqueue_end(); + return is_finished_ok; } diff --git 
a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp index f0d49ad6f6c..3bc547ed11d 100644 --- a/intern/cycles/device/optix/queue.cpp +++ b/intern/cycles/device/optix/queue.cpp @@ -46,7 +46,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, return false; } - debug_enqueue(kernel, work_size); + debug_enqueue_begin(kernel, work_size); const CUDAContextScope scope(cuda_device_); @@ -131,6 +131,8 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, 1, 1)); + debug_enqueue_end(); + return !(optix_device->have_error()); } diff --git a/intern/cycles/device/queue.cpp b/intern/cycles/device/queue.cpp index cc0cf0ccf84..8c997b184b6 100644 --- a/intern/cycles/device/queue.cpp +++ b/intern/cycles/device/queue.cpp @@ -12,9 +12,13 @@ CCL_NAMESPACE_BEGIN DeviceQueue::DeviceQueue(Device *device) - : device(device), last_kernels_enqueued_(0), last_sync_time_(0.0) + : device(device), + last_kernels_enqueued_(0), + last_sync_time_(0.0), + is_per_kernel_performance_(false) { DCHECK_NE(device, nullptr); + is_per_kernel_performance_ = getenv("CYCLES_DEBUG_PER_KERNEL_PERFORMANCE"); } DeviceQueue::~DeviceQueue() @@ -33,11 +37,17 @@ DeviceQueue::~DeviceQueue() }); VLOG_DEVICE_STATS << "GPU queue stats:"; + double total_time = 0.0; for (const auto &[mask, time] : stats_sorted) { + total_time += time; VLOG_DEVICE_STATS << " " << std::setfill(' ') << std::setw(10) << std::fixed << std::setprecision(5) << std::right << time << "s: " << device_kernel_mask_as_string(mask); } + + if (is_per_kernel_performance_) + VLOG_DEVICE_STATS << "GPU queue total time: " << std::fixed << std::setprecision(5) + << total_time; } } @@ -50,7 +60,7 @@ void DeviceQueue::debug_init_execution() last_kernels_enqueued_ = 0; } -void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size) +void DeviceQueue::debug_enqueue_begin(DeviceKernel kernel, const int work_size) { if (VLOG_DEVICE_STATS_IS_ON) { VLOG_DEVICE_STATS << "GPU queue launch " << device_kernel_as_string(kernel) 
<< ", work_size " @@ -60,6 +70,13 @@ void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size) last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel); } +void DeviceQueue::debug_enqueue_end() +{ + if (VLOG_DEVICE_STATS_IS_ON && is_per_kernel_performance_) { + synchronize(); + } +} + void DeviceQueue::debug_synchronize() { if (VLOG_DEVICE_STATS_IS_ON) { @@ -67,7 +84,11 @@ void DeviceQueue::debug_synchronize() const double elapsed_time = new_time - last_sync_time_; VLOG_DEVICE_STATS << "GPU queue synchronize, elapsed " << std::setw(10) << elapsed_time << "s"; - stats_kernel_time_[last_kernels_enqueued_] += elapsed_time; + /* There is no sense to have an entries in the performance data + * container without related kernel information. */ + if (last_kernels_enqueued_ != 0) { + stats_kernel_time_[last_kernels_enqueued_] += elapsed_time; + } last_sync_time_ = new_time; } diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h index 808431af401..e9a7a9d8701 100644 --- a/intern/cycles/device/queue.h +++ b/intern/cycles/device/queue.h @@ -162,7 +162,8 @@ class DeviceQueue { /* Implementations call these from the corresponding methods to generate debugging logs. */ void debug_init_execution(); - void debug_enqueue(DeviceKernel kernel, const int work_size); + void debug_enqueue_begin(DeviceKernel kernel, const int work_size); + void debug_enqueue_end(); void debug_synchronize(); string debug_active_kernels(); @@ -172,6 +173,9 @@ class DeviceQueue { double last_sync_time_; /* Accumulated execution time for combinations of kernels launched together. */ map<DeviceKernelMask, double> stats_kernel_time_; + /* If it is true, then a performance statistics in the debugging logs will have focus on kernels + * and an explicitqueue synchronization will be added after each kernel execution. */ + bool is_per_kernel_performance_; }; CCL_NAMESPACE_END |