Cycles: add HIP device support for AMD GPUs

NOTE: this feature is not ready for user testing, and not yet enabled in daily builds. It is being merged now for easier collaboration on development. HIP is a heterogeneous compute interface allowing C++ code to be executed on GPUs, similar to CUDA. It is intended to bring back AMD GPU rendering support on Windows and Linux. See https://github.com/ROCm-Developer-Tools/HIP for details. As of the time of writing, it should compile and run on Linux with existing HIP compilers and driver runtimes. Publicly available compilers and drivers for Windows will come later. See task T91571 for more details on the current status and work remaining to be done. Credits: Sayak Biswas (AMD), Arya Rafii (AMD), Brian Savery (AMD). Differential Revision: https://developer.blender.org/D12578
author: Brian Savery <bsavery> 2021-09-28 17:51:14 +0300
committer: Brecht Van Lommel <brecht@blender.org> 2021-09-28 20:18:55 +0300
commit: 044a77352f8a8a0e1f60190369d69ef26587b65f (patch)
tree: 22096da4d5214cbd7419d1a5e0dadc70e6cacea3 /intern/cycles/util
parent: 262b2118565826177133013c324212c66d882456 (diff)
5 files changed, 66 insertions, 8 deletions
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index de17efafcf2..faba411c769 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -34,7 +34,7 @@
 
 #else /* __KERNEL_GPU__ */
 
-#  ifdef __KERNEL_CUDA__
+#  if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
 
 #    define atomic_add_and_fetch_float(p, x) (atomicAdd((float *)(p), (float)(x)) + (float)(x))
 
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index 1d598725c84..2245668d02f 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -59,12 +59,23 @@ DebugFlags::CUDA::CUDA() : adaptive_compile(false)
   reset();
 }
 
+DebugFlags::HIP::HIP() : adaptive_compile(false)
+{
+  reset();
+}
+
 void DebugFlags::CUDA::reset()
 {
   if (getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
     adaptive_compile = true;
 }
 
+void DebugFlags::HIP::reset()
+{
+  if (getenv("CYCLES_HIP_ADAPTIVE_COMPILE") != NULL)
+    adaptive_compile = true;
+}
+
 DebugFlags::OptiX::OptiX()
 {
   reset();
@@ -103,6 +114,10 @@ std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags)
 
   os << "OptiX flags:\n"
      << "  Debug : " << string_from_bool(debug_flags.optix.use_debug) << "\n";
+
+  os << "HIP flags:\n"
+     << "  HIP streams : " << string_from_bool(debug_flags.hip.adaptive_compile) << "\n";
+
   return os;
 }
 
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index a2acaea5675..81677201790 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -93,6 +93,17 @@ class DebugFlags {
     bool adaptive_compile;
   };
 
+  /* Descriptor of HIP feature-set to be used. */
+  struct HIP {
+    HIP();
+
+    /* Reset flags to their defaults. */
+    void reset();
+
+    /* Whether adaptive feature based runtime compile is enabled or not.*/
+    bool adaptive_compile;
+  };
+
   /* Descriptor of OptiX feature-set to be used. */
   struct OptiX {
     OptiX();
@@ -124,6 +135,9 @@ class DebugFlags {
   /* Requested OptiX flags. */
   OptiX optix;
 
+  /* Requested HIP flags. */
+  HIP hip;
+
  private:
   DebugFlags();
 
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index d9edfec5da3..f36a492a1b0 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN
 /* Half Floats */
 
 /* CUDA has its own half data type, no need to define then */
-#ifndef __KERNEL_CUDA__
+#if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__)
 /* Implementing this as a class rather than a typedef so that the compiler can tell it apart from
  * unsigned shorts. */
 class half {
@@ -59,7 +59,7 @@ struct half4 {
   half x, y, z, w;
 };
 
-#ifdef __KERNEL_CUDA__
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
 
 ccl_device_inline void float4_store_half(half *h, float4 f)
 {
@@ -73,6 +73,7 @@ ccl_device_inline void float4_store_half(half *h, float4 f)
 
 ccl_device_inline void float4_store_half(half *h, float4 f)
 {
+
 #  ifndef __KERNEL_SSE2__
   for (int i = 0; i < 4; i++) {
     /* optimized float to half for pixels:
@@ -109,6 +110,8 @@ ccl_device_inline void float4_store_half(half *h, float4 f)
 #  endif
 }
 
+#  ifndef __KERNEL_HIP__
+
 ccl_device_inline float half_to_float(half h)
 {
   float f;
@@ -117,6 +120,23 @@ ccl_device_inline float half_to_float(half h)
 
   return f;
 }
+#  else
+
+ccl_device_inline float half_to_float(std::uint32_t a) noexcept
+{
+
+  std::uint32_t u = ((a << 13) + 0x70000000U) & 0x8fffe000U;
+
+  std::uint32_t v = __float_as_uint(__uint_as_float(u) *
+                                    __uint_as_float(0x77800000U) /*0x1.0p+112f*/) +
+                    0x38000000U;
+
+  u = (a & 0x7fff) != 0 ? v : u;
+
+  return __uint_as_float(u) * __uint_as_float(0x07800000U) /*0x1.0p-112f*/;
+}
+
+#  endif /* __KERNEL_HIP__ */
 
 ccl_device_inline float4 half4_to_float4(half4 h)
 {
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 6d728dde679..cb1e94c838c 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -26,6 +26,10 @@
 #  include <cmath>
 #endif
 
+#ifdef __HIP__
+#  include <hip/hip_vector_types.h>
+#endif
+
 #include <float.h>
 #include <math.h>
 #include <stdio.h>
@@ -83,7 +87,8 @@ CCL_NAMESPACE_BEGIN
 
 /* Scalar */
 
-#ifdef _WIN32
+#ifndef __HIP__
+#  ifdef _WIN32
 ccl_device_inline float fmaxf(float a, float b)
 {
   return (a > b) ? a : b;
@@ -93,7 +98,9 @@ ccl_device_inline float fminf(float a, float b)
 {
   return (a < b) ? a : b;
 }
-#endif /* _WIN32 */
+
+#  endif /* _WIN32 */
+#endif   /* __HIP__ */
 
 #ifndef __KERNEL_GPU__
 using std::isfinite;
@@ -199,6 +206,7 @@ ccl_device_inline uint as_uint(float f)
   return u.i;
 }
 
+#ifndef __HIP__
 ccl_device_inline int __float_as_int(float f)
 {
   union {
@@ -238,6 +246,7 @@ ccl_device_inline float __uint_as_float(uint i)
   u.i = i;
   return u.f;
 }
+#endif
 
 ccl_device_inline int4 __float4_as_int4(float4 f)
 {
@@ -669,7 +678,7 @@ ccl_device float bits_to_01(uint bits)
 
 ccl_device_inline uint count_leading_zeros(uint x)
 {
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
   return __clz(x);
 #else
   assert(x != 0);
@@ -685,7 +694,7 @@ ccl_device_inline uint count_leading_zeros(uint x)
 
 ccl_device_inline uint count_trailing_zeros(uint x)
 {
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
   return (__ffs(x) - 1);
 #else
   assert(x != 0);
@@ -701,7 +710,7 @@ ccl_device_inline uint count_trailing_zeros(uint x)
 
 ccl_device_inline uint find_first_set(uint x)
 {
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
   return __ffs(x);
 #else
 #  ifdef _MSC_VER
author	Brian Savery <bsavery>	2021-09-28 17:51:14 +0300
committer	Brecht Van Lommel <brecht@blender.org>	2021-09-28 20:18:55 +0300
commit	044a77352f8a8a0e1f60190369d69ef26587b65f (patch)
tree	22096da4d5214cbd7419d1a5e0dadc70e6cacea3 /intern/cycles/util
parent	262b2118565826177133013c324212c66d882456 (diff)