Cycles: Add optional per-kernel performance statistics

When verbose level 4 is enabled, Blender prints kernel performance data for Cycles on GPU backends (except Metal that doesn't use debug_enqueue_* methods) for groups of kernels. These changes introduce a new CYCLES_DEBUG_PER_KERNEL_PERFORMANCE environment variable to allow getting timings for each kernels separately and not grouped with others. This is done by adding explicit synchronization after each kernel execution. Differential Revision: https://developer.blender.org/D15971
author: Nikita Sirgienko <nikita.sirgienko@intel.com> 2022-09-14 16:55:56 +0300
committer: Nikita Sirgienko <nikita.sirgienko@intel.com> 2022-09-27 23:15:00 +0300
commit: 2ead05d73878721703de5d2fe6a07eb9053168aa (patch)
tree: 72b51851a8c5e7f9995607ca20c94c9cdf022ff3
parent: b145cc9d361e21da3a8a0ff2ef3bad1f8e8fbae6 (diff)
6 files changed, 41 insertions, 8 deletions
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index 5912e68a92b..84b0a1e0dd6 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -79,7 +79,7 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel,
     return false;
   }
 
-  debug_enqueue(kernel, work_size);
+  debug_enqueue_begin(kernel, work_size);
 
   const CUDAContextScope scope(cuda_device_);
   const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel);
@@ -121,6 +121,8 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel,
                                 0),
                  "enqueue");
 
+  debug_enqueue_end();
+
   return !(cuda_device_->have_error());
 }
 
diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp
index 8b3d963a32f..3f8b6267100 100644
--- a/intern/cycles/device/hip/queue.cpp
+++ b/intern/cycles/device/hip/queue.cpp
@@ -79,7 +79,7 @@ bool HIPDeviceQueue::enqueue(DeviceKernel kernel,
     return false;
   }
 
-  debug_enqueue(kernel, work_size);
+  debug_enqueue_begin(kernel, work_size);
 
   const HIPContextScope scope(hip_device_);
   const HIPDeviceKernel &hip_kernel = hip_device_->kernels.get(kernel);
@@ -120,6 +120,8 @@ bool HIPDeviceQueue::enqueue(DeviceKernel kernel,
                                        0),
                  "enqueue");
 
+  debug_enqueue_end();
+
   return !(hip_device_->have_error());
 }
 
diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp
index 1e822e25f1a..f865b27f976 100644
--- a/intern/cycles/device/oneapi/queue.cpp
+++ b/intern/cycles/device/oneapi/queue.cpp
@@ -77,7 +77,7 @@ bool OneapiDeviceQueue::enqueue(DeviceKernel kernel,
 
   void **args = const_cast<void **>(_args.values);
 
-  debug_enqueue(kernel, signed_kernel_work_size);
+  debug_enqueue_begin(kernel, signed_kernel_work_size);
   assert(signed_kernel_work_size >= 0);
   size_t kernel_work_size = (size_t)signed_kernel_work_size;
 
@@ -97,6 +97,8 @@ bool OneapiDeviceQueue::enqueue(DeviceKernel kernel,
                               oneapi_device_->oneapi_error_message() + "\"");
   }
 
+  debug_enqueue_end();
+
   return is_finished_ok;
 }
 
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
index f0d49ad6f6c..3bc547ed11d 100644
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -46,7 +46,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
     return false;
   }
 
-  debug_enqueue(kernel, work_size);
+  debug_enqueue_begin(kernel, work_size);
 
   const CUDAContextScope scope(cuda_device_);
 
@@ -131,6 +131,8 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
                                   1,
                                   1));
 
+  debug_enqueue_end();
+
   return !(optix_device->have_error());
 }
 
diff --git a/intern/cycles/device/queue.cpp b/intern/cycles/device/queue.cpp
index cc0cf0ccf84..8c997b184b6 100644
--- a/intern/cycles/device/queue.cpp
+++ b/intern/cycles/device/queue.cpp
@@ -12,9 +12,13 @@
 CCL_NAMESPACE_BEGIN
 
 DeviceQueue::DeviceQueue(Device *device)
-    : device(device), last_kernels_enqueued_(0), last_sync_time_(0.0)
+    : device(device),
+      last_kernels_enqueued_(0),
+      last_sync_time_(0.0),
+      is_per_kernel_performance_(false)
 {
   DCHECK_NE(device, nullptr);
+  is_per_kernel_performance_ = getenv("CYCLES_DEBUG_PER_KERNEL_PERFORMANCE");
 }
 
 DeviceQueue::~DeviceQueue()
@@ -33,11 +37,17 @@ DeviceQueue::~DeviceQueue()
          });
 
     VLOG_DEVICE_STATS << "GPU queue stats:";
+    double total_time = 0.0;
     for (const auto &[mask, time] : stats_sorted) {
+      total_time += time;
       VLOG_DEVICE_STATS << "  " << std::setfill(' ') << std::setw(10) << std::fixed
                         << std::setprecision(5) << std::right << time
                         << "s: " << device_kernel_mask_as_string(mask);
     }
+
+    if (is_per_kernel_performance_)
+      VLOG_DEVICE_STATS << "GPU queue total time: " << std::fixed << std::setprecision(5)
+                        << total_time;
   }
 }
 
@@ -50,7 +60,7 @@ void DeviceQueue::debug_init_execution()
   last_kernels_enqueued_ = 0;
 }
 
-void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
+void DeviceQueue::debug_enqueue_begin(DeviceKernel kernel, const int work_size)
 {
   if (VLOG_DEVICE_STATS_IS_ON) {
     VLOG_DEVICE_STATS << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size "
@@ -60,6 +70,13 @@ void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
   last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
 }
 
+void DeviceQueue::debug_enqueue_end()
+{
+  if (VLOG_DEVICE_STATS_IS_ON && is_per_kernel_performance_) {
+    synchronize();
+  }
+}
+
 void DeviceQueue::debug_synchronize()
 {
   if (VLOG_DEVICE_STATS_IS_ON) {
@@ -67,7 +84,11 @@ void DeviceQueue::debug_synchronize()
     const double elapsed_time = new_time - last_sync_time_;
     VLOG_DEVICE_STATS << "GPU queue synchronize, elapsed " << std::setw(10) << elapsed_time << "s";
 
-    stats_kernel_time_[last_kernels_enqueued_] += elapsed_time;
+    /* There is no sense to have an entries in the performance data
+     * container without related kernel information. */
+    if (last_kernels_enqueued_ != 0) {
+      stats_kernel_time_[last_kernels_enqueued_] += elapsed_time;
+    }
 
     last_sync_time_ = new_time;
   }
diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h
index 808431af401..e9a7a9d8701 100644
--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -162,7 +162,8 @@ class DeviceQueue {
 
   /* Implementations call these from the corresponding methods to generate debugging logs. */
   void debug_init_execution();
-  void debug_enqueue(DeviceKernel kernel, const int work_size);
+  void debug_enqueue_begin(DeviceKernel kernel, const int work_size);
+  void debug_enqueue_end();
   void debug_synchronize();
   string debug_active_kernels();
 
@@ -172,6 +173,9 @@ class DeviceQueue {
   double last_sync_time_;
   /* Accumulated execution time for combinations of kernels launched together. */
   map<DeviceKernelMask, double> stats_kernel_time_;
+  /* If it is true, then a performance statistics in the debugging logs will have focus on kernels
+   * and an explicitqueue synchronization will be added after each kernel execution. */
+  bool is_per_kernel_performance_;
 };
 
 CCL_NAMESPACE_END
author	Nikita Sirgienko <nikita.sirgienko@intel.com>	2022-09-14 16:55:56 +0300
committer	Nikita Sirgienko <nikita.sirgienko@intel.com>	2022-09-27 23:15:00 +0300
commit	2ead05d73878721703de5d2fe6a07eb9053168aa (patch)
tree	72b51851a8c5e7f9995607ca20c94c9cdf022ff3
parent	b145cc9d361e21da3a8a0ff2ef3bad1f8e8fbae6 (diff)