git.blender.org/blender.git

author    Sayak Biswas <sayakAMD> 2021-10-20 14:37:39 +0300
committer William Leeson <william@blender.org> 2021-10-20 15:04:28 +0300
commit    ba4e227def13b4b953775ff4a2fd9c154bb07751 (patch)
tree      2ebf929123c0e1cec40fda60c4e8b86f1c38a28c
parent    d28aaf6139c8cfa8555542f4f228f390485dd7ed (diff)
HIP device code cleanup and fix for high VRAM usage

This patch cleans up the HIP device code and makes it more consistent with the CUDA code. It also fixes the high VRAM usage on AMD cards using HIP, allowing better performance and usability on cards like the 6600 XT.

A check was added in intern/cycles/kernel/bvh/bvh_util.h to prevent a compiler error with hipcc.

Reviewed By: brecht, leesonw
Maniphest Tasks: T92124
Differential Revision: https://developer.blender.org/D12834

-rw-r--r--  CMakeLists.txt                            |   2
-rw-r--r--  extern/hipew/include/hipew.h              |   6
-rw-r--r--  intern/cycles/device/hip/device_impl.cpp  | 142
-rw-r--r--  intern/cycles/device/hip/queue.cpp        |  82
-rw-r--r--  intern/cycles/device/hip/queue.h          |   3
-rw-r--r--  intern/cycles/kernel/bvh/bvh_util.h       |   2

6 files changed, 69 insertions(+), 168 deletions(-)
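
For context, the core of the VRAM fix is the hipHostMalloc change visible in the diff below: host memory is now allocated with explicit flags so it is pinned, device-visible, and write-combined instead of consuming regular VRAM. A minimal standalone sketch of that allocation pattern (not Blender code; sizes and messages are illustrative):

/* Minimal sketch: pinned, device-visible, write-combined host allocation.
 * Assumes a HIP toolchain; error handling abbreviated. */
#include <hip/hip_runtime.h>
#include <cstdio>

int main()
{
  void *host_ptr = nullptr;
  const size_t size = 64 * 1024 * 1024; /* 64 MiB, arbitrary */

  /* Mapped: the GPU can address this memory directly. WriteCombined: faster
   * host-to-device streaming at the cost of slow host reads. */
  hipError_t err = hipHostMalloc(
      &host_ptr, size, hipHostMallocMapped | hipHostMallocWriteCombined);
  if (err != hipSuccess) {
    fprintf(stderr, "hipHostMalloc failed: %s\n", hipGetErrorString(err));
    return 1;
  }

  /* Device-side address of the same allocation. */
  void *device_ptr = nullptr;
  err = hipHostGetDevicePointer(&device_ptr, host_ptr, 0);
  if (err == hipSuccess) {
    printf("host %p -> device %p\n", host_ptr, device_ptr);
  }

  hipHostFree(host_ptr);
  return 0;
}
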
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 544e22f342b..ac11b495948 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -430,7 +430,7 @@ mark_as_advanced(WITH_CYCLES_NATIVE_ONLY)
option(WITH_CYCLES_DEVICE_CUDA "Enable Cycles CUDA compute support" ON)
option(WITH_CYCLES_DEVICE_OPTIX "Enable Cycles OptiX support" ON)
-option(WITH_CYCLES_DEVICE_HIP "Enable Cycles HIP support" OFF)
+option(WITH_CYCLES_DEVICE_HIP "Enable Cycles HIP support" ON)
mark_as_advanced(WITH_CYCLES_DEVICE_HIP)
mark_as_advanced(WITH_CYCLES_DEVICE_CUDA)
diff --git a/extern/hipew/include/hipew.h b/extern/hipew/include/hipew.h
index 02fffc331bf..aa42fdf8ecd 100644
--- a/extern/hipew/include/hipew.h
+++ b/extern/hipew/include/hipew.h
@@ -24,9 +24,13 @@ extern "C" {
#include <stdlib.h>
#define HIP_IPC_HANDLE_SIZE 64
+#define hipHostMallocDefault 0x00
#define hipHostMallocPortable 0x01
#define hipHostMallocMapped 0x02
#define hipHostMallocWriteCombined 0x04
+#define hipHostMallocNumaUser 0x20000000
+#define hipHostMallocCoherent 0x40000000
+#define hipHostMallocNonCoherent 0x80000000
#define hipHostRegisterPortable 0x01
#define hipHostRegisterMapped 0x02
#define hipHostRegisterIoMemory 0x04
@@ -989,7 +993,7 @@ typedef hipError_t HIPAPI thipMalloc(hipDeviceptr_t* dptr, size_t bytesize);
typedef hipError_t HIPAPI thipMemAllocPitch(hipDeviceptr_t* dptr, size_t* pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
typedef hipError_t HIPAPI thipFree(hipDeviceptr_t dptr);
typedef hipError_t HIPAPI thipMemGetAddressRange(hipDeviceptr_t* pbase, size_t* psize, hipDeviceptr_t dptr);
-typedef hipError_t HIPAPI thipHostMalloc(void** pp, size_t bytesize);
+typedef hipError_t HIPAPI thipHostMalloc(void** pp, size_t bytesize, unsigned int flags);
typedef hipError_t HIPAPI thipHostFree(void* p);
typedef hipError_t HIPAPI thipMemHostAlloc(void** pp, size_t bytesize, unsigned int Flags);
typedef hipError_t HIPAPI thipHostGetDevicePointer(hipDeviceptr_t* pdptr, void* p, unsigned int Flags);
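
The hipew header change matters because hipew loads the HIP runtime dynamically through function-pointer typedefs like thipHostMalloc above, and the typedef must match the real entry point's signature, including the new flags parameter. A rough sketch of that loading pattern; the library name, error type, and loader function are assumptions for illustration, not hipew's actual code:

/* Illustrative sketch of a hipew-style dynamic-loading wrapper. */
#include <dlfcn.h>
#include <cstddef>

typedef int hip_error_compat; /* stand-in for hipError_t */
typedef hip_error_compat (*HostMallocFn)(void **pp, size_t bytesize,
                                         unsigned int flags);

static HostMallocFn hip_host_malloc = nullptr;

bool load_hip_runtime()
{
  /* libamdhip64.so is the usual Linux runtime name; an assumption here. */
  void *lib = dlopen("libamdhip64.so", RTLD_NOW);
  if (!lib) {
    return false;
  }
  /* The typedef must carry the flags parameter so this cast matches the
   * real hipHostMalloc entry point -- the point of the header change. */
  hip_host_malloc = reinterpret_cast<HostMallocFn>(dlsym(lib, "hipHostMalloc"));
  return hip_host_malloc != nullptr;
}
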
diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp
index 0e5ac6ce401..964783a08bf 100644
--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -108,6 +108,9 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
return;
}
+ /* hipDeviceMapHost for mapping host memory when out of device memory.
+ * hipDeviceLmemResizeToMax for reserving local memory ahead of render,
+ * so we can predict which memory to map to host. */
hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice));
hip_assert(
@@ -657,7 +660,8 @@ HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_pad
}
else if (map_host_used + size < map_host_limit) {
/* Allocate host memory ourselves. */
- mem_alloc_result = hipHostMalloc(&shared_pointer, size);
+ mem_alloc_result = hipHostMalloc(
+ &shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);
assert((mem_alloc_result == hipSuccess && shared_pointer != 0) ||
(mem_alloc_result != hipSuccess && shared_pointer == 0));
@@ -874,7 +878,6 @@ void HIPDevice::const_copy_to(const char *name, void *host, size_t size)
size_t bytes;
hip_assert(hipModuleGetGlobal(&mem, &bytes, hipModule, name));
- assert(bytes == size);
hip_assert(hipMemcpyHtoD(mem, host, size));
}
@@ -1142,141 +1145,6 @@ void HIPDevice::tex_free(device_texture &mem)
}
}
-# if 0
-void HIPDevice::render(DeviceTask &task,
- RenderTile &rtile,
- device_vector<KernelWorkTile> &work_tiles)
-{
- scoped_timer timer(&rtile.buffers->render_time);
-
- if (have_error())
- return;
-
- HIPContextScope scope(this);
- hipFunction_t hipRender;
-
- /* Get kernel function. */
- if (rtile.task == RenderTile::BAKE) {
- hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_bake"));
- }
- else {
- hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_path_trace"));
- }
-
- if (have_error()) {
- return;
- }
-
- hip_assert(hipFuncSetCacheConfig(hipRender, hipFuncCachePreferL1));
-
- /* Allocate work tile. */
- work_tiles.alloc(1);
-
- KernelWorkTile *wtile = work_tiles.data();
- wtile->x = rtile.x;
- wtile->y = rtile.y;
- wtile->w = rtile.w;
- wtile->h = rtile.h;
- wtile->offset = rtile.offset;
- wtile->stride = rtile.stride;
- wtile->buffer = (float *)(hipDeviceptr_t)rtile.buffer;
-
- /* Prepare work size. More step samples render faster, but for now we
- * remain conservative for GPUs connected to a display to avoid driver
- * timeouts and display freezing. */
- int min_blocks, num_threads_per_block;
- hip_assert(
- hipModuleOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, hipRender, NULL, 0, 0));
- if (!info.display_device) {
- min_blocks *= 8;
- }
-
- uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
-
- /* Render all samples. */
- uint start_sample = rtile.start_sample;
- uint end_sample = rtile.start_sample + rtile.num_samples;
-
- for (int sample = start_sample; sample < end_sample;) {
- /* Setup and copy work tile to device. */
- wtile->start_sample = sample;
- wtile->num_samples = step_samples;
- if (task.adaptive_sampling.use) {
- wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
- }
- wtile->num_samples = min(wtile->num_samples, end_sample - sample);
- work_tiles.copy_to_device();
-
- hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer;
- uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
- uint num_blocks = divide_up(total_work_size, num_threads_per_block);
-
- /* Launch kernel. */
- void *args[] = {&d_work_tiles, &total_work_size};
-
- hip_assert(
- hipModuleLaunchKernel(hipRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
- /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
- uint filter_sample = sample + wtile->num_samples - 1;
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
- adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
- }
-
- hip_assert(hipDeviceSynchronize());
-
- /* Update progress. */
- sample += wtile->num_samples;
- rtile.sample = sample;
- task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
-
- /* Finalize adaptive sampling. */
- if (task.adaptive_sampling.use) {
- hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer;
- adaptive_sampling_post(rtile, wtile, d_work_tiles);
- hip_assert(hipDeviceSynchronize());
- task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
- }
-}
-
-void HIPDevice::thread_run(DeviceTask &task)
-{
- HIPContextScope scope(this);
-
- if (task.type == DeviceTask::RENDER) {
- device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
-
- /* keep rendering tiles until done */
- RenderTile tile;
- DenoisingTask denoising(this, task);
-
- while (task.acquire_tile(this, tile, task.tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE) {
- render(task, tile, work_tiles);
- }
- else if (tile.task == RenderTile::BAKE) {
- render(task, tile, work_tiles);
- }
-
- task.release_tile(tile);
-
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
-
- work_tiles.free();
- }
-}
-# endif
-
unique_ptr<DeviceQueue> HIPDevice::gpu_queue_create()
{
return make_unique<HIPDeviceQueue>(this);
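
The generic_alloc() hunk above is one piece of a larger fallback strategy: allocate VRAM first, and only map pinned host memory when the device is full and a budget allows it. A simplified, hypothetical sketch of that strategy; only the HIP API calls are real, the budget variables and function name are illustrative:

/* Simplified sketch of the allocation fallback in generic_alloc(). */
#include <hip/hip_runtime.h>
#include <cstddef>

static size_t map_host_used = 0;
static size_t map_host_limit = size_t(4) << 30; /* assumed 4 GiB budget */

void *alloc_device_or_mapped_host(size_t size)
{
  /* Preferred path: a regular VRAM allocation. */
  void *device_ptr = nullptr;
  if (hipMalloc(&device_ptr, size) == hipSuccess) {
    return device_ptr;
  }

  /* VRAM exhausted: fall back to pinned, device-visible host memory while
   * the mapped-host budget allows it. */
  if (map_host_used + size < map_host_limit) {
    void *shared_ptr = nullptr;
    if (hipHostMalloc(&shared_ptr, size,
                      hipHostMallocMapped | hipHostMallocWriteCombined) ==
        hipSuccess) {
      map_host_used += size;
      void *mapped = nullptr;
      if (hipHostGetDevicePointer(&mapped, shared_ptr, 0) == hipSuccess) {
        return mapped;
      }
    }
  }
  return nullptr; /* Both paths failed. */
}
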
diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp
index 78c77e5fdae..0d9f5916d30 100644
--- a/intern/cycles/device/hip/queue.cpp
+++ b/intern/cycles/device/hip/queue.cpp
@@ -39,11 +39,30 @@ HIPDeviceQueue::~HIPDeviceQueue()
hipStreamDestroy(hip_stream_);
}
-int HIPDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
+int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const
{
- /* TODO: compute automatically. */
- /* TODO: must have at least num_threads_per_block. */
- return 14416128;
+ int num_states = 0;
+ const int max_num_threads = hip_device_->get_num_multiprocessors() *
+ hip_device_->get_max_num_threads_per_multiprocessor();
+ if (max_num_threads == 0) {
+ num_states = 1048576; // 65536 * 16
+ }
+ else {
+ num_states = max_num_threads * 16;
+ }
+
+ const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR");
+ if (factor_str) {
+ float factor = atof(factor_str);
+ if (!factor)
+ VLOG(3) << "CYCLES_CONCURRENT_STATES_FACTOR evaluated to 0";
+ num_states = max((int)(num_states * factor), 1024);
+ }
+
+ VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to "
+ << string_human_readable_size(num_states * state_size);
+
+ return num_states;
}
int HIPDeviceQueue::num_concurrent_busy_states() const
@@ -105,18 +124,19 @@ bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *arg
}
/* Launch kernel. */
- hip_device_assert(hip_device_,
- hipModuleLaunchKernel(hip_kernel.function,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- shared_mem_bytes,
- hip_stream_,
- args,
- 0));
+ assert_success(hipModuleLaunchKernel(hip_kernel.function,
+ num_blocks,
+ 1,
+ 1,
+ num_threads_per_block,
+ 1,
+ 1,
+ shared_mem_bytes,
+ hip_stream_,
+ args,
+ 0),
+ "enqueue");
+
return !(hip_device_->have_error());
}
@@ -127,7 +147,7 @@ bool HIPDeviceQueue::synchronize()
}
const HIPContextScope scope(hip_device_);
- hip_device_assert(hip_device_, hipStreamSynchronize(hip_stream_));
+ assert_success(hipStreamSynchronize(hip_stream_), "synchronize");
debug_synchronize();
return !(hip_device_->have_error());
@@ -150,9 +170,9 @@ void HIPDeviceQueue::zero_to_device(device_memory &mem)
assert(mem.device_pointer != 0);
const HIPContextScope scope(hip_device_);
- hip_device_assert(
- hip_device_,
- hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_));
+ assert_success(
+ hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_),
+ "zero_to_device");
}
void HIPDeviceQueue::copy_to_device(device_memory &mem)
@@ -173,10 +193,10 @@ void HIPDeviceQueue::copy_to_device(device_memory &mem)
/* Copy memory to device. */
const HIPContextScope scope(hip_device_);
- hip_device_assert(
- hip_device_,
+ assert_success(
hipMemcpyHtoDAsync(
- (hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_));
+ (hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_),
+ "copy_to_device");
}
void HIPDeviceQueue::copy_from_device(device_memory &mem)
@@ -192,13 +212,21 @@ void HIPDeviceQueue::copy_from_device(device_memory &mem)
/* Copy memory from device. */
const HIPContextScope scope(hip_device_);
- hip_device_assert(
- hip_device_,
+ assert_success(
hipMemcpyDtoHAsync(
- mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_));
+ mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_),
+ "copy_from_device");
+}
+
+void HIPDeviceQueue::assert_success(hipError_t result, const char *operation)
+{
+ if (result != hipSuccess) {
+ const char *name = hipewErrorString(result);
+ hip_device_->set_error(
+ string_printf("%s in HIP queue %s (%s)", name, operation, debug_active_kernels().c_str()));
+ }
}
-// TODO : (Arya) Enable this after stabilizing dev branch
unique_ptr<DeviceGraphicsInterop> HIPDeviceQueue::graphics_interop_create()
{
return make_unique<HIPDeviceGraphicsInterop>(this);
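
The replacement for the hard-coded 14416128 above sizes the state pool from the device's occupancy and allows an environment override. A self-contained restatement of the same heuristic, with the device limits passed as plain parameters instead of queried from HIPDevice:

/* Standalone sketch of the state-count heuristic in num_concurrent_states(). */
#include <algorithm>
#include <cstdlib>

int num_concurrent_states(int num_multiprocessors,
                          int max_threads_per_multiprocessor)
{
  const int max_num_threads = num_multiprocessors *
                              max_threads_per_multiprocessor;

  /* 16 in-flight states per hardware thread; a fixed pool if the device
   * query returned nothing. */
  int num_states = (max_num_threads == 0) ? 65536 * 16 : max_num_threads * 16;

  /* Optional tuning knob, clamped so the pool never gets degenerate. */
  if (const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR")) {
    const float factor = static_cast<float>(atof(factor_str));
    if (factor > 0.0f) {
      num_states = std::max(static_cast<int>(num_states * factor), 1024);
    }
  }
  return num_states;
}

Setting CYCLES_CONCURRENT_STATES_FACTOR=0.5, for example, halves the pool, which trades path-tracing throughput for lower memory pressure on smaller cards.
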
diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h
index 04c8a5982ce..b92f7de7e4b 100644
--- a/intern/cycles/device/hip/queue.h
+++ b/intern/cycles/device/hip/queue.h
@@ -55,12 +55,13 @@ class HIPDeviceQueue : public DeviceQueue {
return hip_stream_;
}
- // TODO : (Arya) Enable this after stabilizing the dev branch
virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override;
protected:
HIPDevice *hip_device_;
hipStream_t hip_stream_;
+
+ void assert_success(hipError_t result, const char *operation);
};
CCL_NAMESPACE_END
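
The new assert_success() helper declared above centralizes error reporting: a failing HIP call becomes a stored device error string instead of a hip_device_assert macro repeated at every call site. A simplified sketch of the pattern; FakeDevice stands in for HIPDevice, and the message format mirrors, but is not, the committed code:

/* Sketch of the centralized error-reporting pattern. */
#include <hip/hip_runtime.h>
#include <cstdio>
#include <string>

struct FakeDevice { /* stand-in for HIPDevice */
  std::string error;
  void set_error(const std::string &msg) { error = msg; }
};

void assert_success(FakeDevice *device, hipError_t result, const char *operation)
{
  if (result != hipSuccess) {
    char msg[256];
    snprintf(msg, sizeof(msg), "%s in HIP queue %s",
             hipGetErrorString(result), operation);
    device->set_error(msg);
  }
}

/* Usage: assert_success(&device, hipStreamSynchronize(stream), "synchronize"); */
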
diff --git a/intern/cycles/kernel/bvh/bvh_util.h b/intern/cycles/kernel/bvh/bvh_util.h
index 309f0eeb1e2..8686f887021 100644
--- a/intern/cycles/kernel/bvh/bvh_util.h
+++ b/intern/cycles/kernel/bvh/bvh_util.h
@@ -98,7 +98,7 @@ ccl_device_inline void sort_intersections_and_normals(ccl_private Intersection *
for (int j = 0; j < num_hits - 1; ++j) {
if (hits[j].t > hits[j + 1].t) {
struct Intersection tmp_hit = hits[j];
- struct float3 tmp_Ng = Ng[j];
+ float3 tmp_Ng = Ng[j];
hits[j] = hits[j + 1];
Ng[j] = Ng[j + 1];
hits[j + 1] = tmp_hit;
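
The bvh_util.h fix drops the struct keyword because, under hipcc, float3 is defined as a typedef of a vector type rather than a struct with that tag, so the elaborated-type-specifier "struct float3" fails to compile. A minimal illustration, not Blender code:

/* "struct float3" only resolves when float3 is declared with the struct tag. */
struct vec3 {
  float x, y, z;
};
typedef vec3 float3; /* alias, similar in spirit to HIP's vector typedefs */

int main()
{
  float3 a{1.0f, 2.0f, 3.0f}; /* OK: uses the alias */
  /* struct float3 b = a; */  /* would not compile: no struct tag "float3" */
  return static_cast<int>(a.x);
}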