From ba4e227def13b4b953775ff4a2fd9c154bb07751 Mon Sep 17 00:00:00 2001
From: Sayak Biswas
Date: Wed, 20 Oct 2021 13:37:39 +0200
Subject: HIP device code cleanup and fix for high VRAM usage

This patch cleans up the HIP device code and makes it more consistent
with the CUDA code. It also fixes the high VRAM usage on AMD cards
using HIP, allowing better performance and usability on cards like the
6600 XT.

Also adds a check in intern/cycles/kernel/bvh/bvh_util.h to prevent a
compiler error with hipcc.

Reviewed By: brecht, leesonw

Maniphest Tasks: T92124

Differential Revision: https://developer.blender.org/D12834
---
 intern/cycles/device/hip/device_impl.cpp | 142 ++-----------------------------
 intern/cycles/device/hip/queue.cpp       |  82 ++++++++++++------
 intern/cycles/device/hip/queue.h         |   3 +-
 3 files changed, 62 insertions(+), 165 deletions(-)

diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp
index 0e5ac6ce401..964783a08bf 100644
--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -108,6 +108,9 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
     return;
   }
 
+  /* hipDeviceMapHost for mapping host memory when out of device memory.
+   * hipDeviceLmemResizeToMax for reserving local memory ahead of render,
+   * so we can predict which memory to map to host. */
   hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice));
 
   hip_assert(
@@ -657,7 +660,8 @@ HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_pad
   }
   else if (map_host_used + size < map_host_limit) {
     /* Allocate host memory ourselves. */
-    mem_alloc_result = hipHostMalloc(&shared_pointer, size);
+    mem_alloc_result = hipHostMalloc(
+        &shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);
 
     assert((mem_alloc_result == hipSuccess && shared_pointer != 0) ||
            (mem_alloc_result != hipSuccess && shared_pointer == 0));
@@ -874,7 +878,6 @@ void HIPDevice::const_copy_to(const char *name, void *host, size_t size)
   size_t bytes;
 
   hip_assert(hipModuleGetGlobal(&mem, &bytes, hipModule, name));
-  assert(bytes == size);
   hip_assert(hipMemcpyHtoD(mem, host, size));
 }
 
@@ -1142,141 +1145,6 @@ void HIPDevice::tex_free(device_texture &mem)
   }
 }
 
-# if 0
-void HIPDevice::render(DeviceTask &task,
-                       RenderTile &rtile,
-                       device_vector<KernelWorkTile> &work_tiles)
-{
-  scoped_timer timer(&rtile.buffers->render_time);
-
-  if (have_error())
-    return;
-
-  HIPContextScope scope(this);
-  hipFunction_t hipRender;
-
-  /* Get kernel function. */
-  if (rtile.task == RenderTile::BAKE) {
-    hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_bake"));
-  }
-  else {
-    hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_path_trace"));
-  }
-
-  if (have_error()) {
-    return;
-  }
-
-  hip_assert(hipFuncSetCacheConfig(hipRender, hipFuncCachePreferL1));
-
-  /* Allocate work tile. */
-  work_tiles.alloc(1);
-
-  KernelWorkTile *wtile = work_tiles.data();
-  wtile->x = rtile.x;
-  wtile->y = rtile.y;
-  wtile->w = rtile.w;
-  wtile->h = rtile.h;
-  wtile->offset = rtile.offset;
-  wtile->stride = rtile.stride;
-  wtile->buffer = (float *)(hipDeviceptr_t)rtile.buffer;
-
-  /* Prepare work size. More step samples render faster, but for now we
-   * remain conservative for GPUs connected to a display to avoid driver
-   * timeouts and display freezing. */
-  int min_blocks, num_threads_per_block;
-  hip_assert(
-      hipModuleOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, hipRender, NULL, 0, 0));
-  if (!info.display_device) {
-    min_blocks *= 8;
-  }
-
-  uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
-
-  /* Render all samples. */
-  uint start_sample = rtile.start_sample;
-  uint end_sample = rtile.start_sample + rtile.num_samples;
-
-  for (int sample = start_sample; sample < end_sample;) {
-    /* Setup and copy work tile to device. */
-    wtile->start_sample = sample;
-    wtile->num_samples = step_samples;
-    if (task.adaptive_sampling.use) {
-      wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
-    }
-    wtile->num_samples = min(wtile->num_samples, end_sample - sample);
-    work_tiles.copy_to_device();
-
-    hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer;
-    uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
-    uint num_blocks = divide_up(total_work_size, num_threads_per_block);
-
-    /* Launch kernel. */
-    void *args[] = {&d_work_tiles, &total_work_size};
-
-    hip_assert(
-        hipModuleLaunchKernel(hipRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
-    /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
-    uint filter_sample = sample + wtile->num_samples - 1;
-    if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
-      adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
-    }
-
-    hip_assert(hipDeviceSynchronize());
-
-    /* Update progress. */
-    sample += wtile->num_samples;
-    rtile.sample = sample;
-    task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-
-    if (task.get_cancel()) {
-      if (task.need_finish_queue == false)
-        break;
-    }
-  }
-
-  /* Finalize adaptive sampling. */
-  if (task.adaptive_sampling.use) {
-    hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer;
-    adaptive_sampling_post(rtile, wtile, d_work_tiles);
-    hip_assert(hipDeviceSynchronize());
-    task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-  }
-}
-
-void HIPDevice::thread_run(DeviceTask &task)
-{
-  HIPContextScope scope(this);
-
-  if (task.type == DeviceTask::RENDER) {
-    device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
-
-    /* keep rendering tiles until done */
-    RenderTile tile;
-    DenoisingTask denoising(this, task);
-
-    while (task.acquire_tile(this, tile, task.tile_types)) {
-      if (tile.task == RenderTile::PATH_TRACE) {
-        render(task, tile, work_tiles);
-      }
-      else if (tile.task == RenderTile::BAKE) {
-        render(task, tile, work_tiles);
-      }
-
-      task.release_tile(tile);
-
-      if (task.get_cancel()) {
-        if (task.need_finish_queue == false)
-          break;
-      }
-    }
-
-    work_tiles.free();
-  }
-}
-# endif
-
 unique_ptr<DeviceQueue> HIPDevice::gpu_queue_create()
 {
   return make_unique<HIPDeviceQueue>(this);
diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp
index 78c77e5fdae..0d9f5916d30 100644
--- a/intern/cycles/device/hip/queue.cpp
+++ b/intern/cycles/device/hip/queue.cpp
@@ -39,11 +39,30 @@ HIPDeviceQueue::~HIPDeviceQueue()
   hipStreamDestroy(hip_stream_);
 }
 
-int HIPDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
+int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const
 {
-  /* TODO: compute automatically. */
-  /* TODO: must have at least num_threads_per_block. */
-  return 14416128;
+  int num_states = 0;
+  const int max_num_threads = hip_device_->get_num_multiprocessors() *
+                              hip_device_->get_max_num_threads_per_multiprocessor();
+  if (max_num_threads == 0) {
+    num_states = 1048576;  // 65536 * 16
+  }
+  else {
+    num_states = max_num_threads * 16;
+  }
+
+  const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR");
+  if (factor_str) {
+    float factor = atof(factor_str);
+    if (!factor)
+      VLOG(3) << "CYCLES_CONCURRENT_STATES_FACTOR evaluated to 0";
+    num_states = max((int)(num_states * factor), 1024);
+  }
+
+  VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to "
+          << string_human_readable_size(num_states * state_size);
+
+  return num_states;
 }
 
 int HIPDeviceQueue::num_concurrent_busy_states() const
@@ -105,18 +124,19 @@ bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *arg
   }
 
   /* Launch kernel. */
-  hip_device_assert(hip_device_,
-                    hipModuleLaunchKernel(hip_kernel.function,
-                                          num_blocks,
-                                          1,
-                                          1,
-                                          num_threads_per_block,
-                                          1,
-                                          1,
-                                          shared_mem_bytes,
-                                          hip_stream_,
-                                          args,
-                                          0));
+  assert_success(hipModuleLaunchKernel(hip_kernel.function,
+                                       num_blocks,
+                                       1,
+                                       1,
+                                       num_threads_per_block,
+                                       1,
+                                       1,
+                                       shared_mem_bytes,
+                                       hip_stream_,
+                                       args,
+                                       0),
+                 "enqueue");
+
   return !(hip_device_->have_error());
 }
 
@@ -127,7 +147,7 @@ bool HIPDeviceQueue::synchronize()
   }
 
   const HIPContextScope scope(hip_device_);
-  hip_device_assert(hip_device_, hipStreamSynchronize(hip_stream_));
+  assert_success(hipStreamSynchronize(hip_stream_), "synchronize");
   debug_synchronize();
 
   return !(hip_device_->have_error());
@@ -150,9 +170,9 @@
   assert(mem.device_pointer != 0);
 
   const HIPContextScope scope(hip_device_);
-  hip_device_assert(
-      hip_device_,
-      hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_));
+  assert_success(
+      hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_),
+      "zero_to_device");
 }
 
 void HIPDeviceQueue::copy_to_device(device_memory &mem)
@@ -173,10 +193,10 @@
 
   /* Copy memory to device. */
   const HIPContextScope scope(hip_device_);
-  hip_device_assert(
-      hip_device_,
+  assert_success(
       hipMemcpyHtoDAsync(
-          (hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_));
+          (hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_),
+      "copy_to_device");
 }
 
 void HIPDeviceQueue::copy_from_device(device_memory &mem)
@@ -192,13 +212,21 @@
 
   /* Copy memory from device. */
   const HIPContextScope scope(hip_device_);
-  hip_device_assert(
-      hip_device_,
+  assert_success(
       hipMemcpyDtoHAsync(
-          mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_));
+          mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_),
+      "copy_from_device");
+}
+
+void HIPDeviceQueue::assert_success(hipError_t result, const char *operation)
+{
+  if (result != hipSuccess) {
+    const char *name = hipewErrorString(result);
+    hip_device_->set_error(
+        string_printf("%s in HIP queue %s (%s)", name, operation, debug_active_kernels().c_str()));
+  }
 }
 
-// TODO : (Arya) Enable this after stabilizing dev branch
 unique_ptr<DeviceGraphicsInterop> HIPDeviceQueue::graphics_interop_create()
 {
   return make_unique<HIPDeviceGraphicsInterop>(this);
diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h
index 04c8a5982ce..b92f7de7e4b 100644
--- a/intern/cycles/device/hip/queue.h
+++ b/intern/cycles/device/hip/queue.h
@@ -55,12 +55,13 @@ class HIPDeviceQueue : public DeviceQueue {
     return hip_stream_;
  }
 
-  // TODO : (Arya) Enable this after stabilizing the dev branch
   virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override;
 
  protected:
   HIPDevice *hip_device_;
   hipStream_t hip_stream_;
+
+  void assert_success(hipError_t result, const char *operation);
 };
 
 CCL_NAMESPACE_END
-- 
cgit v1.2.3
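
A note on the generic_alloc() hunk: when device memory runs out, Cycles falls back to allocating the buffer in host RAM and letting the GPU read it across the bus. Plain hipHostMalloc() gives pinned memory without a guaranteed device mapping; the added flags request a device-visible, write-combined mapping, matching the flags the CUDA backend passes to cuMemHostAlloc. Below is a self-contained sketch of the pattern using only the public HIP runtime API; it is illustrative, not Cycles code, and the 64 MiB size is an arbitrary example value.

  #include <hip/hip_runtime.h>
  #include <cstdio>

  int main()
  {
    void *host_ptr = nullptr;
    const size_t size = 64 * 1024 * 1024;

    /* Pinned, device-mappable, write-combined: suits buffers the host
     * fills once and the GPU then reads, which is the usage here. */
    hipError_t result = hipHostMalloc(
        &host_ptr, size, hipHostMallocMapped | hipHostMallocWriteCombined);
    if (result != hipSuccess) {
      fprintf(stderr, "hipHostMalloc: %s\n", hipGetErrorString(result));
      return 1;
    }

    /* Device-side address of the same physical memory (zero-copy access). */
    void *device_ptr = nullptr;
    result = hipHostGetDevicePointer(&device_ptr, host_ptr, 0);
    printf("host %p -> device %p (%s)\n", host_ptr, device_ptr, hipGetErrorString(result));

    hipHostFree(host_ptr);
    return 0;
  }

Write-combined memory is fast for sequential host writes but slow for host reads, so it is only appropriate for one-way host-to-device traffic like this fallback path.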
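On the num_concurrent_states() hunk: the queue previously allocated a hard-coded 14,416,128 path-tracing states regardless of the GPU, which is the main source of the high VRAM usage this commit fixes; the new code sizes the state pool from occupancy instead, at 16 states per resident hardware thread. A minimal standalone sketch of that heuristic, with Cycles' device wrappers and VLOG stripped out (function and parameter names here are illustrative):

  #include <algorithm>
  #include <cstdlib>

  static int num_concurrent_states(int num_multiprocessors, int max_threads_per_multiprocessor)
  {
    /* 16 in-flight path states per resident hardware thread, with a fixed
     * fallback when the device attributes cannot be queried. */
    const int max_num_threads = num_multiprocessors * max_threads_per_multiprocessor;
    int num_states = (max_num_threads == 0) ? 1048576 /* 65536 * 16 */ : max_num_threads * 16;

    /* Same override hook as the patch: scale by CYCLES_CONCURRENT_STATES_FACTOR,
     * clamped so the queue keeps a workable minimum. */
    if (const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR")) {
      num_states = std::max(static_cast<int>(num_states * atof(factor_str)), 1024);
    }
    return num_states;
  }

A device reporting 65,536 resident threads now gets 1,048,576 states, roughly 14 times fewer than the old constant; since the pool costs num_states * state_size bytes (see the VLOG line in the patch), the VRAM saving scales in direct proportion.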
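Finally, the new assert_success() helper replaces the hip_device_assert() macro throughout the queue so that a failing stream operation surfaces as one formatted device error naming the operation. A sketch of the same pattern outside Cycles; the patch resolves error names through Blender's hipew wrapper (hipewErrorString), while this sketch uses the public hipGetErrorString equivalent:

  #include <hip/hip_runtime.h>
  #include <cstdio>

  /* Report a failed queue operation with its name attached, rather than
   * asserting at every call site. */
  static bool assert_success(hipError_t result, const char *operation)
  {
    if (result != hipSuccess) {
      fprintf(stderr, "%s in HIP queue %s\n", hipGetErrorString(result), operation);
      return false;
    }
    return true;
  }

  int main()
  {
    hipStream_t stream = nullptr;
    assert_success(hipStreamCreate(&stream), "create");
    assert_success(hipStreamSynchronize(stream), "synchronize");
    assert_success(hipStreamDestroy(stream), "destroy");
    return 0;
  }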