git.blender.org/blender.git
commit    044a77352f8a8a0e1f60190369d69ef26587b65f (patch)
tree      22096da4d5214cbd7419d1a5e0dadc70e6cacea3
parent    262b2118565826177133013c324212c66d882456 (diff)
author    Brian Savery <bsavery>  2021-09-28 17:51:14 +0300
committer Brecht Van Lommel <brecht@blender.org>  2021-09-28 20:18:55 +0300
Cycles: add HIP device support for AMD GPUs
NOTE: this feature is not ready for user testing, and not yet enabled in daily builds. It is being merged now for easier collaboration on development.

HIP is a heterogeneous compute interface allowing C++ code to be executed on GPUs, similar to CUDA. It is intended to bring back AMD GPU rendering support on Windows and Linux.
https://github.com/ROCm-Developer-Tools/HIP

As of the time of writing, it should compile and run on Linux with existing HIP compilers and driver runtimes. Publicly available compilers and drivers for Windows will come later.

See task T91571 for more details on the current status and the work remaining to be done.

Credits:
Sayak Biswas (AMD)
Arya Rafii (AMD)
Brian Savery (AMD)

Differential Revision: https://developer.blender.org/D12578
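For readers new to HIP: the model follows the CUDA driver API closely, which is why the wrangler added below mirrors the existing cuew one. A minimal sketch of host-side device enumeration, assuming only the entry points declared in extern/hipew/include/hipew.h later in this commit (error handling abbreviated; this is illustrative, not part of the patch):

  #include "hipew.h"
  #include <stdio.h>

  int main(void) {
    /* Resolve the HIP driver library at runtime via the wrangler. */
    if (hipewInit(HIPEW_INIT_HIP) != HIPEW_SUCCESS) {
      fprintf(stderr, "HIP driver runtime not found\n");
      return 1;
    }
    hipInit(0);

    int count = 0;
    hipGetDeviceCount(&count);
    for (int i = 0; i < count; i++) {
      hipDevice_t dev;
      char name[256];
      hipGetDevice(&dev, i);                      /* device handle by ordinal */
      hipDeviceGetName(name, sizeof(name), dev);  /* human-readable name */
      printf("HIP device %d: %s\n", i, name);
    }
    return 0;
  }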
-rw-r--r--  CMakeLists.txt | 10
-rw-r--r--  extern/CMakeLists.txt | 3
-rw-r--r--  extern/hipew/CMakeLists.txt | 39
-rw-r--r--  extern/hipew/include/hipew.h | 1207
-rw-r--r--  extern/hipew/src/hipew.c | 533
-rw-r--r--  intern/cycles/CMakeLists.txt | 1
-rw-r--r--  intern/cycles/blender/CMakeLists.txt | 3
-rw-r--r--  intern/cycles/blender/addon/engine.py | 2
-rw-r--r--  intern/cycles/blender/addon/properties.py | 13
-rw-r--r--  intern/cycles/blender/addon/ui.py | 5
-rw-r--r--  intern/cycles/blender/blender_device.cpp | 4
-rw-r--r--  intern/cycles/blender/blender_python.cpp | 9
-rw-r--r--  intern/cycles/cmake/external_libs.cmake | 4
-rw-r--r--  intern/cycles/cmake/macros.cmake | 4
-rw-r--r--  intern/cycles/device/CMakeLists.txt | 32
-rw-r--r--  intern/cycles/device/device.cpp | 42
-rw-r--r--  intern/cycles/device/device.h | 3
-rw-r--r--  intern/cycles/device/device_memory.h | 1
-rw-r--r--  intern/cycles/device/hip/device.cpp | 276
-rw-r--r--  intern/cycles/device/hip/device.h | 37
-rw-r--r--  intern/cycles/device/hip/device_impl.cpp | 1343
-rw-r--r--  intern/cycles/device/hip/device_impl.h | 153
-rw-r--r--  intern/cycles/device/hip/graphics_interop.cpp | 93
-rw-r--r--  intern/cycles/device/hip/graphics_interop.h | 61
-rw-r--r--  intern/cycles/device/hip/kernel.cpp | 69
-rw-r--r--  intern/cycles/device/hip/kernel.h | 54
-rw-r--r--  intern/cycles/device/hip/queue.cpp | 209
-rw-r--r--  intern/cycles/device/hip/queue.h | 68
-rw-r--r--  intern/cycles/device/hip/util.cpp | 61
-rw-r--r--  intern/cycles/device/hip/util.h | 63
-rw-r--r--  intern/cycles/integrator/path_trace.cpp | 2
-rw-r--r--  intern/cycles/kernel/CMakeLists.txt | 116
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_active_index.h | 6
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_prefix_sum.h | 6
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_reduce.h | 6
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_sorted_index.h | 6
-rw-r--r--  intern/cycles/kernel/device/hip/compat.h | 121
-rw-r--r--  intern/cycles/kernel/device/hip/config.h | 57
-rw-r--r--  intern/cycles/kernel/device/hip/globals.h | 49
-rw-r--r--  intern/cycles/kernel/device/hip/kernel.cpp | 28
-rw-r--r--  intern/cycles/util/util_atomic.h | 2
-rw-r--r--  intern/cycles/util/util_debug.cpp | 15
-rw-r--r--  intern/cycles/util/util_debug.h | 14
-rw-r--r--  intern/cycles/util/util_half.h | 24
-rw-r--r--  intern/cycles/util/util_math.h | 19
45 files changed, 4854 insertions(+), 19 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8e807b84e22..c4b8bf6dcd4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -419,6 +419,8 @@ mark_as_advanced(WITH_CYCLES_NATIVE_ONLY)
option(WITH_CYCLES_DEVICE_CUDA "Enable Cycles CUDA compute support" ON)
option(WITH_CYCLES_DEVICE_OPTIX "Enable Cycles OptiX support" ON)
+option(WITH_CYCLES_DEVICE_HIP "Enable Cycles HIP support" OFF)
+mark_as_advanced(WITH_CYCLES_DEVICE_HIP)
mark_as_advanced(WITH_CYCLES_DEVICE_CUDA)
option(WITH_CUDA_DYNLOAD "Dynamically load CUDA libraries at runtime" ON)
@@ -821,6 +823,11 @@ if(NOT WITH_CUDA_DYNLOAD)
endif()
endif()
+if(WITH_CYCLES_DEVICE_HIP)
+ # Currently HIP must be dynamically loaded; this may change in future toolkits.
+ set(WITH_HIP_DYNLOAD ON)
+endif()
+
#-----------------------------------------------------------------------------
# Check if submodules are cloned
@@ -1850,6 +1857,9 @@ elseif(WITH_CYCLES_STANDALONE)
if(WITH_CUDA_DYNLOAD)
add_subdirectory(extern/cuew)
endif()
+ if(WITH_HIP_DYNLOAD)
+ add_subdirectory(extern/hipew)
+ endif()
if(NOT WITH_SYSTEM_GLEW)
add_subdirectory(extern/glew)
endif()
diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt
index 7f7d91f0765..2b2cca04503 100644
--- a/extern/CMakeLists.txt
+++ b/extern/CMakeLists.txt
@@ -70,6 +70,9 @@ if(WITH_CYCLES OR WITH_COMPOSITOR OR WITH_OPENSUBDIV)
if(WITH_CUDA_DYNLOAD)
add_subdirectory(cuew)
endif()
+ if(WITH_HIP_DYNLOAD)
+ add_subdirectory(hipew)
+ endif()
endif()
if(WITH_GHOST_X11 AND WITH_GHOST_XDND)
diff --git a/extern/hipew/CMakeLists.txt b/extern/hipew/CMakeLists.txt
new file mode 100644
index 00000000000..d215ea8c691
--- /dev/null
+++ b/extern/hipew/CMakeLists.txt
@@ -0,0 +1,39 @@
+# ***** BEGIN GPL LICENSE BLOCK *****
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+#
+# The Original Code is Copyright (C) 2021, Blender Foundation
+# All rights reserved.
+# ***** END GPL LICENSE BLOCK *****
+
+set(INC
+ .
+ include
+)
+
+set(INC_SYS
+
+)
+
+set(SRC
+ src/hipew.c
+
+ include/hipew.h
+)
+
+set(LIB
+)
+
+blender_add_lib(extern_hipew "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
diff --git a/extern/hipew/include/hipew.h b/extern/hipew/include/hipew.h
new file mode 100644
index 00000000000..02fffc331bf
--- /dev/null
+++ b/extern/hipew/include/hipew.h
@@ -0,0 +1,1207 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+#ifndef __HIPEW_H__
+#define __HIPEW_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdlib.h>
+
+#define HIP_IPC_HANDLE_SIZE 64
+#define hipHostMallocPortable 0x01
+#define hipHostMallocMapped 0x02
+#define hipHostMallocWriteCombined 0x04
+#define hipHostRegisterPortable 0x01
+#define hipHostRegisterMapped 0x02
+#define hipHostRegisterIoMemory 0x04
+#define hipCooperativeLaunchMultiDeviceNoPreSync 0x01
+#define hipCooperativeLaunchMultiDeviceNoPostSync 0x02
+#define hipArrayLayered 0x01
+#define hipArraySurfaceLoadStore 0x02
+#define hipArrayCubemap 0x04
+#define hipArrayTextureGather 0x08
+#define HIP_TRSA_OVERRIDE_FORMAT 0x01
+#define HIP_TRSF_READ_AS_INTEGER 0x01
+#define HIP_TRSF_NORMALIZED_COORDINATES 0x02
+#define HIP_LAUNCH_PARAM_END ((void*)0x00)
+#define HIP_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)
+#define HIP_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02)
+
+/* Functions that changed between CUDA 3.1 and 3.2 for 64-bit support;
+ * the CUDA library keeps the old names for compatibility and adds new
+ * ones with a _v2 suffix.
+ */
+#define hipModuleGetGlobal hipModuleGetGlobal
+#define hipMemGetInfo hipMemGetInfo
+#define hipMemAllocPitch hipMemAllocPitch
+#define hipMemGetAddressRange hipMemGetAddressRange
+#define hipMemcpyHtoD hipMemcpyHtoD
+#define hipMemcpyDtoH hipMemcpyDtoH
+#define hipMemcpyDtoD hipMemcpyDtoD
+#define hipMemcpyHtoA hipMemcpyHtoA
+#define hipMemcpyAtoH hipMemcpyAtoH
+#define hipMemcpyHtoDAsync hipMemcpyHtoDAsync
+#define hipMemcpyDtoHAsync hipMemcpyDtoHAsync
+#define hipMemcpyDtoDAsync hipMemcpyDtoDAsync
+#define hipMemsetD8 hipMemsetD8
+#define hipMemsetD16 hipMemsetD16
+#define hipMemsetD32 hipMemsetD32
+#define hipArrayCreate hipArrayCreate
+#define hipArray3DCreate hipArray3DCreate
+#define hipTexRefSetAddress hipTexRefSetAddress
+#define hipTexRefGetAddress hipTexRefGetAddress
+#define hipStreamDestroy hipStreamDestroy
+#define hipEventDestroy hipEventDestroy
+#define hipTexRefSetAddress2D hipTexRefSetAddress2D
+
+/* Types. */
+#ifdef _MSC_VER
+typedef unsigned __int32 hipuint32_t;
+typedef unsigned __int64 hipuint64_t;
+#else
+#include <stdint.h>
+typedef uint32_t hipuint32_t;
+typedef uint64_t hipuint64_t;
+#endif
+
+#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined (__aarch64__)
+typedef unsigned long long hipDeviceptr_t;
+#else
+typedef unsigned int hipDeviceptr_t;
+#endif
+
+
+#ifdef _WIN32
+# define HIPAPI __stdcall
+# define HIP_CB __stdcall
+#else
+# define HIPAPI
+# define HIP_CB
+#endif
+
+typedef int hipDevice_t;
+typedef struct ihipCtx_t* hipCtx_t;
+typedef struct ihipModule_t* hipModule_t;
+typedef struct ihipModuleSymbol_t* hipFunction_t;
+typedef struct hipArray* hArray;
+typedef struct hipMipmappedArray_st* hipMipmappedArray_t;
+typedef struct ihipEvent_t* hipEvent_t;
+typedef struct ihipStream_t* hipStream_t;
+typedef unsigned long long hipTextureObject_t;
+
+typedef struct HIPuuid_st {
+ char bytes[16];
+} HIPuuid;
+
+typedef enum hipChannelFormatKind {
+ hipChannelFormatKindSigned = 0,
+ hipChannelFormatKindUnsigned = 1,
+ hipChannelFormatKindFloat = 2,
+ hipChannelFormatKindNone = 3,
+} hipChannelFormatKind;
+
+typedef struct hipChannelFormatDesc {
+ int x;
+ int y;
+ int z;
+ int w;
+ enum hipChannelFormatKind f;
+} hipChannelFormatDesc;
+
+typedef enum hipTextureFilterMode {
+ hipFilterModePoint = 0,
+ hipFilterModeLinear = 1,
+} hipTextureFilterMode;
+
+typedef enum hipArray_Format {
+ HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+ HIP_AD_FORMAT_SIGNED_INT8 = 0x08,
+ HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+ HIP_AD_FORMAT_SIGNED_INT16 = 0x09,
+ HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+ HIP_AD_FORMAT_SIGNED_INT32 = 0x0a,
+ HIP_AD_FORMAT_HALF = 0x10,
+ HIP_AD_FORMAT_FLOAT = 0x20,
+} hipArray_Format;
+
+typedef enum hipTextureAddressMode {
+ hipAddressModeWrap = 0,
+ hipAddressModeClamp = 1,
+ hipAddressModeMirror = 2,
+ hipAddressModeBorder = 3,
+} hipTextureAddressMode;
+
+/**
+ * hip texture reference
+ */
+typedef struct textureReference {
+ int normalized;
+ //enum hipTextureReadMode readMode; // used only for the driver API
+ enum hipTextureFilterMode filterMode;
+ enum hipTextureAddressMode addressMode[3]; // Texture address mode for up to 3 dimensions
+ struct hipChannelFormatDesc channelDesc;
+ int sRGB; // Perform sRGB->linear conversion during texture read
+ unsigned int maxAnisotropy; // Limit to the anisotropy ratio
+ enum hipTextureFilterMode mipmapFilterMode;
+ float mipmapLevelBias;
+ float minMipmapLevelClamp;
+ float maxMipmapLevelClamp;
+
+ hipTextureObject_t textureObject;
+ int numChannels;
+ enum hipArray_Format format;
+} textureReference;
+
+typedef textureReference* hipTexRef;
+
+typedef enum hipMemoryType {
+ hipMemoryTypeHost = 0x00,
+ hipMemoryTypeDevice = 0x01,
+ hipMemoryTypeArray = 0x02,
+ hipMemoryTypeUnified = 0x03,
+} hipMemoryType;
+
+/**
+ * Pointer attributes
+ */
+typedef struct hipPointerAttribute_t {
+ enum hipMemoryType memoryType;
+ int device;
+ void* devicePointer;
+ void* hostPointer;
+ int isManaged;
+ unsigned allocationFlags; /* flags specified when memory was allocated*/
+ /* peers? */
+} hipPointerAttribute_t;
+
+typedef struct ihipIpcEventHandle_t {
+ char reserved[HIP_IPC_HANDLE_SIZE];
+} ihipIpcEventHandle_t;
+
+typedef struct hipIpcMemHandle_st {
+ char reserved[HIP_IPC_HANDLE_SIZE];
+} hipIpcMemHandle_t;
+
+typedef enum HIPipcMem_flags_enum {
+ hipIpcMemLazyEnablePeerAccess = 0x1,
+} HIPipcMem_flags;
+
+typedef enum HIPmemAttach_flags_enum {
+ hipMemAttachGlobal = 0x1,
+ hipMemAttachHost = 0x2,
+ HIP_MEM_ATTACH_SINGLE = 0x4,
+} HIPmemAttach_flags;
+
+typedef enum HIPctx_flags_enum {
+ hipDeviceScheduleAuto = 0x00,
+ hipDeviceScheduleSpin = 0x01,
+ hipDeviceScheduleYield = 0x02,
+ hipDeviceScheduleBlockingSync = 0x04,
+ hipDeviceScheduleMask = 0x07,
+ hipDeviceMapHost = 0x08,
+ hipDeviceLmemResizeToMax = 0x10,
+} HIPctx_flags;
+
+typedef enum HIPstream_flags_enum {
+ hipStreamDefault = 0x0,
+ hipStreamNonBlocking = 0x1,
+} HIPstream_flags;
+
+typedef enum HIPevent_flags_enum {
+ hipEventDefault = 0x0,
+ hipEventBlockingSync = 0x1,
+ hipEventDisableTiming = 0x2,
+ hipEventInterprocess = 0x4,
+} HIPevent_flags;
+
+typedef enum HIPstreamWaitValue_flags_enum {
+ HIP_STREAM_WAIT_VALUE_GEQ = 0x0,
+ HIP_STREAM_WAIT_VALUE_EQ = 0x1,
+ HIP_STREAM_WAIT_VALUE_AND = 0x2,
+ HIP_STREAM_WAIT_VALUE_NOR = 0x3,
+ HIP_STREAM_WAIT_VALUE_FLUSH = (1 << 30),
+} HIPstreamWaitValue_flags;
+
+typedef enum HIPstreamWriteValue_flags_enum {
+ HIP_STREAM_WRITE_VALUE_DEFAULT = 0x0,
+ HIP_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1,
+} HIPstreamWriteValue_flags;
+
+typedef enum HIPstreamBatchMemOpType_enum {
+ HIP_STREAM_MEM_OP_WAIT_VALUE_32 = 1,
+ HIP_STREAM_MEM_OP_WRITE_VALUE_32 = 2,
+ HIP_STREAM_MEM_OP_WAIT_VALUE_64 = 4,
+ HIP_STREAM_MEM_OP_WRITE_VALUE_64 = 5,
+ HIP_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3,
+} HIPstreamBatchMemOpType;
+
+
+typedef union HIPstreamBatchMemOpParams_union {
+ HIPstreamBatchMemOpType operation;
+ struct HIPstreamMemOpWaitValueParams_st {
+ HIPstreamBatchMemOpType operation;
+ hipDeviceptr_t address;
+ union {
+ hipuint32_t value;
+ hipuint64_t value64;
+ };
+ unsigned int flags;
+ hipDeviceptr_t alias;
+ } waitValue;
+ struct HIPstreamMemOpWriteValueParams_st {
+ HIPstreamBatchMemOpType operation;
+ hipDeviceptr_t address;
+ union {
+ hipuint32_t value;
+ hipuint64_t value64;
+ };
+ unsigned int flags;
+ hipDeviceptr_t alias;
+ } writeValue;
+ struct HIPstreamMemOpFlushRemoteWritesParams_st {
+ HIPstreamBatchMemOpType operation;
+ unsigned int flags;
+ } flushRemoteWrites;
+ hipuint64_t pad[6];
+} HIPstreamBatchMemOpParams;
+
+typedef enum HIPoccupancy_flags_enum {
+ hipOccupancyDefault = 0x0,
+ HIP_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1,
+} HIPoccupancy_flags;
+
+typedef enum hipDeviceAttribute_t {
+ hipDeviceAttributeCudaCompatibleBegin = 0,
+ hipDeviceAttributeEccEnabled = hipDeviceAttributeCudaCompatibleBegin, ///< Whether ECC support is enabled.
+ hipDeviceAttributeAccessPolicyMaxWindowSize, ///< Cuda only. The maximum size of the window policy in bytes.
+ hipDeviceAttributeAsyncEngineCount, ///< Cuda only. Asynchronous engines number.
+ hipDeviceAttributeCanMapHostMemory, ///< Whether host memory can be mapped into device address space
+ hipDeviceAttributeCanUseHostPointerForRegisteredMem,///< Cuda only. Device can access host registered memory
+ ///< at the same virtual address as the CPU
+ hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz.
+ hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in.
+ hipDeviceAttributeComputePreemptionSupported, ///< Cuda only. Device supports Compute Preemption.
+ hipDeviceAttributeConcurrentKernels, ///< Device can possibly execute multiple kernels concurrently.
+ hipDeviceAttributeConcurrentManagedAccess, ///< Device can coherently access managed memory concurrently with the CPU
+ hipDeviceAttributeCooperativeLaunch, ///< Support cooperative launch
+ hipDeviceAttributeCooperativeMultiDeviceLaunch, ///< Support cooperative launch on multiple devices
+ hipDeviceAttributeDeviceOverlap, ///< Cuda only. Device can concurrently copy memory and execute a kernel.
+ ///< Deprecated; use asyncEngineCount instead.
+ hipDeviceAttributeDirectManagedMemAccessFromHost, ///< Host can directly access managed memory on
+ ///< the device without migration
+ hipDeviceAttributeGlobalL1CacheSupported, ///< Cuda only. Device supports caching globals in L1
+ hipDeviceAttributeHostNativeAtomicSupported, ///< Cuda only. Link between the device and the host supports native atomic operations
+ hipDeviceAttributeIntegrated, ///< Device is integrated GPU
+ hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices.
+ hipDeviceAttributeKernelExecTimeout, ///< Run time limit for kernels executed on the device
+ hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache.
+ hipDeviceAttributeLocalL1CacheSupported, ///< caching locals in L1 is supported
+ hipDeviceAttributeLuid, ///< Cuda only. 8-byte locally unique identifier. Undefined on TCC and non-Windows platforms
+ hipDeviceAttributeLuidDeviceNodeMask, ///< Cuda only. Luid device node mask. Undefined on TCC and non-Windows platforms
+ hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number.
+ hipDeviceAttributeManagedMemory, ///< Device supports allocating managed memory on this system
+ hipDeviceAttributeMaxBlocksPerMultiProcessor, ///< Cuda only. Max block size per multiprocessor
+ hipDeviceAttributeMaxBlockDimX, ///< Max block size in width.
+ hipDeviceAttributeMaxBlockDimY, ///< Max block size in height.
+ hipDeviceAttributeMaxBlockDimZ, ///< Max block size in depth.
+ hipDeviceAttributeMaxGridDimX, ///< Max grid size in width.
+ hipDeviceAttributeMaxGridDimY, ///< Max grid size in height.
+ hipDeviceAttributeMaxGridDimZ, ///< Max grid size in depth.
+ hipDeviceAttributeMaxSurface1D, ///< Maximum size of 1D surface.
+ hipDeviceAttributeMaxSurface1DLayered, ///< Cuda only. Maximum dimensions of 1D layered surface.
+ hipDeviceAttributeMaxSurface2D, ///< Maximum dimension (width, height) of 2D surface.
+ hipDeviceAttributeMaxSurface2DLayered, ///< Cuda only. Maximum dimensions of 2D layered surface.
+ hipDeviceAttributeMaxSurface3D, ///< Maximum dimension (width, height, depth) of 3D surface.
+ hipDeviceAttributeMaxSurfaceCubemap, ///< Cuda only. Maximum dimensions of Cubemap surface.
+ hipDeviceAttributeMaxSurfaceCubemapLayered, ///< Cuda only. Maximum dimension of Cubemap layered surface.
+ hipDeviceAttributeMaxTexture1DWidth, ///< Maximum size of 1D texture.
+ hipDeviceAttributeMaxTexture1DLayered, ///< Cuda only. Maximum dimensions of 1D layered texture.
+ hipDeviceAttributeMaxTexture1DLinear, ///< Maximum number of elements allocatable in a 1D linear texture.
+ ///< Use cudaDeviceGetTexture1DLinearMaxWidth() instead on Cuda.
+ hipDeviceAttributeMaxTexture1DMipmap, ///< Cuda only. Maximum size of 1D mipmapped texture.
+ hipDeviceAttributeMaxTexture2DWidth, ///< Maximum dimension width of 2D texture.
+ hipDeviceAttributeMaxTexture2DHeight, ///< Maximum dimension height of 2D texture.
+ hipDeviceAttributeMaxTexture2DGather, ///< Cuda only. Maximum dimensions of 2D texture if gather operations performed.
+ hipDeviceAttributeMaxTexture2DLayered, ///< Cuda only. Maximum dimensions of 2D layered texture.
+ hipDeviceAttributeMaxTexture2DLinear, ///< Cuda only. Maximum dimensions (width, height, pitch) of 2D textures bound to pitched memory.
+ hipDeviceAttributeMaxTexture2DMipmap, ///< Cuda only. Maximum dimensions of 2D mipmapped texture.
+ hipDeviceAttributeMaxTexture3DWidth, ///< Maximum dimension width of 3D texture.
+ hipDeviceAttributeMaxTexture3DHeight, ///< Maximum dimension height of 3D texture.
+ hipDeviceAttributeMaxTexture3DDepth, ///< Maximum dimension depth of 3D texture.
+ hipDeviceAttributeMaxTexture3DAlt, ///< Cuda only. Maximum dimensions of alternate 3D texture.
+ hipDeviceAttributeMaxTextureCubemap, ///< Cuda only. Maximum dimensions of Cubemap texture
+ hipDeviceAttributeMaxTextureCubemapLayered, ///< Cuda only. Maximum dimensions of Cubemap layered texture.
+ hipDeviceAttributeMaxThreadsDim, ///< Maximum dimension of a block
+ hipDeviceAttributeMaxThreadsPerBlock, ///< Maximum number of threads per block.
+ hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per multiprocessor.
+ hipDeviceAttributeMaxPitch, ///< Maximum pitch in bytes allowed by memory copies
+ hipDeviceAttributeMemoryBusWidth, ///< Global memory bus width in bits.
+ hipDeviceAttributeMemoryClockRate, ///< Peak memory clock frequency in kilohertz.
+ hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number.
+ hipDeviceAttributeMultiGpuBoardGroupID, ///< Cuda only. Unique ID of device group on the same multi-GPU board
+ hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device.
+ hipDeviceAttributeName, ///< Device name.
+ hipDeviceAttributePageableMemoryAccess, ///< Device supports coherently accessing pageable memory
+ ///< without calling hipHostRegister on it
+ hipDeviceAttributePageableMemoryAccessUsesHostPageTables, ///< Device accesses pageable memory via the host's page tables
+ hipDeviceAttributePciBusId, ///< PCI Bus ID.
+ hipDeviceAttributePciDeviceId, ///< PCI Device ID.
+ hipDeviceAttributePciDomainID, ///< PCI Domain ID.
+ hipDeviceAttributePersistingL2CacheMaxSize, ///< Cuda11 only. Maximum l2 persisting lines capacity in bytes
+ hipDeviceAttributeMaxRegistersPerBlock, ///< 32-bit registers available to a thread block. This number is shared
+ ///< by all thread blocks simultaneously resident on a multiprocessor.
+ hipDeviceAttributeMaxRegistersPerMultiprocessor, ///< 32-bit registers available per multiprocessor.
+ hipDeviceAttributeReservedSharedMemPerBlock, ///< Cuda11 only. Shared memory reserved by CUDA driver per block.
+ hipDeviceAttributeMaxSharedMemoryPerBlock, ///< Maximum shared memory available per block in bytes.
+ hipDeviceAttributeSharedMemPerBlockOptin, ///< Cuda only. Maximum shared memory per block usable by special opt in.
+ hipDeviceAttributeSharedMemPerMultiprocessor, ///< Cuda only. Shared memory available per multiprocessor.
+ hipDeviceAttributeSingleToDoublePrecisionPerfRatio, ///< Cuda only. Performance ratio of single precision to double precision.
+ hipDeviceAttributeStreamPrioritiesSupported, ///< Cuda only. Whether to support stream priorities.
+ hipDeviceAttributeSurfaceAlignment, ///< Cuda only. Alignment requirement for surfaces
+ hipDeviceAttributeTccDriver, ///< Cuda only. Whether device is a Tesla device using TCC driver
+ hipDeviceAttributeTextureAlignment, ///< Alignment requirement for textures
+ hipDeviceAttributeTexturePitchAlignment, ///< Pitch alignment requirement for 2D texture references bound to pitched memory;
+ hipDeviceAttributeTotalConstantMemory, ///< Constant memory size in bytes.
+ hipDeviceAttributeTotalGlobalMem, ///< Global memory available on device.
+ hipDeviceAttributeUnifiedAddressing, ///< Cuda only. A unified address space shared with the host.
+ hipDeviceAttributeUuid, ///< Cuda only. Unique ID in 16 bytes.
+ hipDeviceAttributeWarpSize, ///< Warp size in threads.
+ hipDeviceAttributeCudaCompatibleEnd = 9999,
+ hipDeviceAttributeAmdSpecificBegin = 10000,
+ hipDeviceAttributeClockInstructionRate = hipDeviceAttributeAmdSpecificBegin, ///< Frequency in khz of the timer used by the device-side "clock*"
+ hipDeviceAttributeArch, ///< Device architecture
+ hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, ///< Maximum Shared Memory PerMultiprocessor.
+ hipDeviceAttributeGcnArch, ///< Device gcn architecture
+ hipDeviceAttributeGcnArchName, ///< Device gcnArch name in 256 bytes
+ hipDeviceAttributeHdpMemFlushCntl, ///< Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register
+ hipDeviceAttributeHdpRegFlushCntl, ///< Address of the HDP_REG_COHERENCY_FLUSH_CNTL register
+ hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc, ///< Supports cooperative launch on multiple
+ ///< devices with unmatched functions
+ hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim, ///< Supports cooperative launch on multiple
+ ///< devices with unmatched grid dimensions
+ hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim, ///< Supports cooperative launch on multiple
+ ///< devices with unmatched block dimensions
+ hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem, ///< Supports cooperative launch on multiple
+ ///< devices with unmatched shared memories
+ hipDeviceAttributeIsLargeBar, ///< Whether it is LargeBar
+ hipDeviceAttributeAsicRevision, ///< Revision of the GPU in this device
+ hipDeviceAttributeCanUseStreamWaitValue, ///< '1' if Device supports hipStreamWaitValue32() and
+ ///< hipStreamWaitValue64() , '0' otherwise.
+ hipDeviceAttributeAmdSpecificEnd = 19999,
+ hipDeviceAttributeVendorSpecificBegin = 20000,
+ // Extended attributes for vendors
+} hipDeviceAttribute_t;
+
+typedef struct HIPdevprop_st {
+ int maxThreadsPerBlock;
+ int maxThreadsDim[3];
+ int maxGridSize[3];
+ int sharedMemPerBlock;
+ int totalConstantMemory;
+ int SIMDWidth;
+ int memPitch;
+ int regsPerBlock;
+ int clockRate;
+ int textureAlign;
+} HIPdevprop;
+
+typedef enum HIPpointer_attribute_enum {
+ HIP_POINTER_ATTRIBUTE_CONTEXT = 1,
+ HIP_POINTER_ATTRIBUTE_MEMORY_TYPE = 2,
+ HIP_POINTER_ATTRIBUTE_DEVICE_POINTER = 3,
+ HIP_POINTER_ATTRIBUTE_HOST_POINTER = 4,
+ HIP_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6,
+ HIP_POINTER_ATTRIBUTE_BUFFER_ID = 7,
+ HIP_POINTER_ATTRIBUTE_IS_MANAGED = 8,
+ HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9,
+} HIPpointer_attribute;
+
+typedef enum hipFunction_attribute {
+ HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
+ HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
+ HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,
+ HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,
+ HIP_FUNC_ATTRIBUTE_NUM_REGS = 4,
+ HIP_FUNC_ATTRIBUTE_PTX_VERSION = 5,
+ HIP_FUNC_ATTRIBUTE_BINARY_VERSION = 6,
+ HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7,
+ HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,
+ HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9,
+ HIP_FUNC_ATTRIBUTE_MAX,
+} hipFunction_attribute;
+
+typedef enum hipFuncCache_t {
+ hipFuncCachePreferNone = 0x00,
+ hipFuncCachePreferShared = 0x01,
+ hipFuncCachePreferL1 = 0x02,
+ hipFuncCachePreferEqual = 0x03,
+} hipFuncCache_t;
+
+typedef enum hipSharedMemConfig {
+ hipSharedMemBankSizeDefault = 0x00,
+ hipSharedMemBankSizeFourByte = 0x01,
+ hipSharedMemBankSizeEightByte = 0x02,
+} hipSharedMemConfig;
+
+typedef enum HIPshared_carveout_enum {
+ HIP_SHAREDMEM_CARVEOUT_DEFAULT,
+ HIP_SHAREDMEM_CARVEOUT_MAX_SHARED = 100,
+ HIP_SHAREDMEM_CARVEOUT_MAX_L1 = 0,
+} HIPshared_carveout;
+
+
+
+typedef enum hipComputeMode {
+ hipComputeModeDefault = 0,
+ hipComputeModeProhibited = 2,
+ hipComputeModeExclusiveProcess = 3,
+} hipComputeMode;
+
+typedef enum HIPmem_advise_enum {
+ HIP_MEM_ADVISE_SET_READ_MOSTLY = 1,
+ HIP_MEM_ADVISE_UNSET_READ_MOSTLY = 2,
+ HIP_MEM_ADVISE_SET_PREFERRED_LOCATION = 3,
+ HIP_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4,
+ HIP_MEM_ADVISE_SET_ACCESSED_BY = 5,
+ HIP_MEM_ADVISE_UNSET_ACCESSED_BY = 6,
+} HIPmem_advise;
+
+typedef enum HIPmem_range_attribute_enum {
+ HIP_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1,
+ HIP_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2,
+ HIP_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3,
+ HIP_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4,
+} HIPmem_range_attribute;
+
+typedef enum hipJitOption {
+ hipJitOptionMaxRegisters = 0,
+ hipJitOptionThreadsPerBlock,
+ hipJitOptionWallTime,
+ hipJitOptionInfoLogBuffer,
+ hipJitOptionInfoLogBufferSizeBytes,
+ hipJitOptionErrorLogBuffer,
+ hipJitOptionErrorLogBufferSizeBytes,
+ hipJitOptionOptimizationLevel,
+ hipJitOptionTargetFromContext,
+ hipJitOptionTarget,
+ hipJitOptionFallbackStrategy,
+ hipJitOptionGenerateDebugInfo,
+ hipJitOptionLogVerbose,
+ hipJitOptionGenerateLineInfo,
+ hipJitOptionCacheMode,
+ hipJitOptionSm3xOpt,
+ hipJitOptionFastCompile,
+ hipJitOptionNumOptions,
+} hipJitOption;
+
+typedef enum HIPjit_target_enum {
+ HIP_TARGET_COMPUTE_20 = 20,
+ HIP_TARGET_COMPUTE_21 = 21,
+ HIP_TARGET_COMPUTE_30 = 30,
+ HIP_TARGET_COMPUTE_32 = 32,
+ HIP_TARGET_COMPUTE_35 = 35,
+ HIP_TARGET_COMPUTE_37 = 37,
+ HIP_TARGET_COMPUTE_50 = 50,
+ HIP_TARGET_COMPUTE_52 = 52,
+ HIP_TARGET_COMPUTE_53 = 53,
+ HIP_TARGET_COMPUTE_60 = 60,
+ HIP_TARGET_COMPUTE_61 = 61,
+ HIP_TARGET_COMPUTE_62 = 62,
+ HIP_TARGET_COMPUTE_70 = 70,
+ HIP_TARGET_COMPUTE_73 = 73,
+ HIP_TARGET_COMPUTE_75 = 75,
+} HIPjit_target;
+
+typedef enum HIPjit_fallback_enum {
+ HIP_PREFER_PTX = 0,
+ HIP_PREFER_BINARY,
+} HIPjit_fallback;
+
+typedef enum HIPjit_cacheMode_enum {
+ HIP_JIT_CACHE_OPTION_NONE = 0,
+ HIP_JIT_CACHE_OPTION_CG,
+ HIP_JIT_CACHE_OPTION_CA,
+} HIPjit_cacheMode;
+
+typedef enum HIPjitInputType_enum {
+ HIP_JIT_INPUT_HIPBIN = 0,
+ HIP_JIT_INPUT_PTX,
+ HIP_JIT_INPUT_FATBINARY,
+ HIP_JIT_INPUT_OBJECT,
+ HIP_JIT_INPUT_LIBRARY,
+ HIP_JIT_NUM_INPUT_TYPES,
+} HIPjitInputType;
+
+typedef struct HIPlinkState_st* HIPlinkState;
+
+typedef enum hipGLDeviceList {
+ hipGLDeviceListAll = 1, ///< All hip devices used by current OpenGL context.
+ hipGLDeviceListCurrentFrame = 2, ///< Hip devices used by current OpenGL context in current
+ ///< frame
+ hipGLDeviceListNextFrame = 3 ///< Hip devices used by current OpenGL context in next
+ ///< frame.
+} hipGLDeviceList;
+
+typedef enum hipGraphicsRegisterFlags {
+ hipGraphicsRegisterFlagsNone = 0,
+ hipGraphicsRegisterFlagsReadOnly = 1, ///< HIP will not write to this registered resource
+ hipGraphicsRegisterFlagsWriteDiscard =
+ 2, ///< HIP will only write and will not read from this registered resource
+ hipGraphicsRegisterFlagsSurfaceLoadStore = 4, ///< HIP will bind this resource to a surface
+ hipGraphicsRegisterFlagsTextureGather =
+ 8 ///< HIP will perform texture gather operations on this registered resource
+} hipGraphicsRegisterFlags;
+
+typedef enum HIPgraphicsRegisterFlags_enum {
+ HIP_GRAPHICS_REGISTER_FLAGS_NONE = 0x00,
+ HIP_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01,
+ HIP_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02,
+ HIP_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04,
+ HIP_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08,
+} HIPgraphicsRegisterFlags;
+
+typedef enum HIPgraphicsMapResourceFlags_enum {
+ HIP_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00,
+ HIP_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01,
+ HIP_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02,
+} HIPgraphicsMapResourceFlags;
+
+typedef enum HIParray_cubemap_face_enum {
+ HIP_HIPBEMAP_FACE_POSITIVE_X = 0x00,
+ HIP_HIPBEMAP_FACE_NEGATIVE_X = 0x01,
+ HIP_HIPBEMAP_FACE_POSITIVE_Y = 0x02,
+ HIP_HIPBEMAP_FACE_NEGATIVE_Y = 0x03,
+ HIP_HIPBEMAP_FACE_POSITIVE_Z = 0x04,
+ HIP_HIPBEMAP_FACE_NEGATIVE_Z = 0x05,
+} HIParray_cubemap_face;
+
+typedef enum hipLimit_t {
+ HIP_LIMIT_STACK_SIZE = 0x00,
+ HIP_LIMIT_PRINTF_FIFO_SIZE = 0x01,
+ hipLimitMallocHeapSize = 0x02,
+ HIP_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x03,
+ HIP_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04,
+ HIP_LIMIT_MAX,
+} hipLimit_t;
+
+typedef enum hipResourceType {
+ hipResourceTypeArray = 0x00,
+ hipResourceTypeMipmappedArray = 0x01,
+ hipResourceTypeLinear = 0x02,
+ hipResourceTypePitch2D = 0x03,
+} hipResourceType;
+
+typedef enum hipError_t {
+ hipSuccess = 0,
+ hipErrorInvalidValue = 1,
+ hipErrorOutOfMemory = 2,
+ hipErrorNotInitialized = 3,
+ hipErrorDeinitialized = 4,
+ hipErrorProfilerDisabled = 5,
+ hipErrorProfilerNotInitialized = 6,
+ hipErrorProfilerAlreadyStarted = 7,
+ hipErrorProfilerAlreadyStopped = 8,
+ hipErrorNoDevice = 100,
+ hipErrorInvalidDevice = 101,
+ hipErrorInvalidImage = 200,
+ hipErrorInvalidContext = 201,
+ hipErrorContextAlreadyCurrent = 202,
+ hipErrorMapFailed = 205,
+ hipErrorUnmapFailed = 206,
+ hipErrorArrayIsMapped = 207,
+ hipErrorAlreadyMapped = 208,
+ hipErrorNoBinaryForGpu = 209,
+ hipErrorAlreadyAcquired = 210,
+ hipErrorNotMapped = 211,
+ hipErrorNotMappedAsArray = 212,
+ hipErrorNotMappedAsPointer = 213,
+ hipErrorECCNotCorrectable = 214,
+ hipErrorUnsupportedLimit = 215,
+ hipErrorContextAlreadyInUse = 216,
+ hipErrorPeerAccessUnsupported = 217,
+ hipErrorInvalidKernelFile = 218,
+ hipErrorInvalidGraphicsContext = 219,
+ hipErrorInvalidSource = 300,
+ hipErrorFileNotFound = 301,
+ hipErrorSharedObjectSymbolNotFound = 302,
+ hipErrorSharedObjectInitFailed = 303,
+ hipErrorOperatingSystem = 304,
+ hipErrorInvalidHandle = 400,
+ hipErrorNotFound = 500,
+ hipErrorNotReady = 600,
+ hipErrorIllegalAddress = 700,
+ hipErrorLaunchOutOfResources = 701,
+ hipErrorLaunchTimeOut = 702,
+ hipErrorPeerAccessAlreadyEnabled = 704,
+ hipErrorPeerAccessNotEnabled = 705,
+ hipErrorSetOnActiveProcess = 708,
+ hipErrorAssert = 710,
+ hipErrorHostMemoryAlreadyRegistered = 712,
+ hipErrorHostMemoryNotRegistered = 713,
+ hipErrorLaunchFailure = 719,
+ hipErrorCooperativeLaunchTooLarge = 720,
+ hipErrorNotSupported = 801,
+ hipErrorUnknown = 999,
+} hipError_t;
+
+/**
+ * Stream CallBack struct
+ */
+typedef void (*hipStreamCallback_t)(hipStream_t stream, hipError_t status, void* userData);
+
+typedef enum HIPdevice_P2PAttribute_enum {
+ HIP_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01,
+ HIP_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02,
+ HIP_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03,
+ HIP_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED = 0x04,
+} HIPdevice_P2PAttribute;
+
+typedef struct hipGraphicsResource_st* hipGraphicsResource;
+
+typedef struct hip_Memcpy2D {
+ size_t srcXInBytes;
+ size_t srcY;
+ hipMemoryType srcMemoryType;
+ const void* srcHost;
+ hipDeviceptr_t srcDevice;
+ hArray * srcArray;
+ size_t srcPitch;
+ size_t dstXInBytes;
+ size_t dstY;
+ hipMemoryType dstMemoryType;
+ void* dstHost;
+ hipDeviceptr_t dstDevice;
+ hArray * dstArray;
+ size_t dstPitch;
+ size_t WidthInBytes;
+ size_t Height;
+} hip_Memcpy2D;
+
+typedef enum hipDeviceP2PAttr {
+ hipDevP2PAttrPerformanceRank = 0,
+ hipDevP2PAttrAccessSupported,
+ hipDevP2PAttrNativeAtomicSupported,
+ hipDevP2PAttrHipArrayAccessSupported
+} hipDeviceP2PAttr;
+
+typedef struct HIP_MEMCPY3D {
+ size_t srcXInBytes;
+ size_t srcY;
+ size_t srcZ;
+ size_t srcLOD;
+ hipMemoryType srcMemoryType;
+ const void* srcHost;
+ hipDeviceptr_t srcDevice;
+ hArray * srcArray;
+ void* reserved0;
+ size_t srcPitch;
+ size_t srcHeight;
+ size_t dstXInBytes;
+ size_t dstY;
+ size_t dstZ;
+ size_t dstLOD;
+ hipMemoryType dstMemoryType;
+ void* dstHost;
+ hipDeviceptr_t dstDevice;
+ hArray * dstArray;
+ void* reserved1;
+ size_t dstPitch;
+ size_t dstHeight;
+ size_t WidthInBytes;
+ size_t Height;
+ size_t Depth;
+} HIP_MEMCPY3D;
+
+typedef struct HIP_MEMCPY3D_PEER_st {
+ size_t srcXInBytes;
+ size_t srcY;
+ size_t srcZ;
+ size_t srcLOD;
+ hipMemoryType srcMemoryType;
+ const void* srcHost;
+ hipDeviceptr_t srcDevice;
+ hArray * srcArray;
+ hipCtx_t srcContext;
+ size_t srcPitch;
+ size_t srcHeight;
+ size_t dstXInBytes;
+ size_t dstY;
+ size_t dstZ;
+ size_t dstLOD;
+ hipMemoryType dstMemoryType;
+ void* dstHost;
+ hipDeviceptr_t dstDevice;
+ hArray * dstArray;
+ hipCtx_t dstContext;
+ size_t dstPitch;
+ size_t dstHeight;
+ size_t WidthInBytes;
+ size_t Height;
+ size_t Depth;
+} HIP_MEMCPY3D_PEER;
+
+typedef struct HIP_ARRAY_DESCRIPTOR {
+ size_t Width;
+ size_t Height;
+ hipArray_Format Format;
+ unsigned int NumChannels;
+} HIP_ARRAY_DESCRIPTOR;
+
+typedef struct HIP_ARRAY3D_DESCRIPTOR {
+ size_t Width;
+ size_t Height;
+ size_t Depth;
+ hipArray_Format Format;
+ unsigned int NumChannels;
+ unsigned int Flags;
+} HIP_ARRAY3D_DESCRIPTOR;
+
+typedef struct HIP_RESOURCE_DESC_st {
+ hipResourceType resType;
+ union {
+ struct {
+ hArray * h_Array;
+ } array;
+ struct {
+ hipMipmappedArray_t hMipmappedArray;
+ } mipmap;
+ struct {
+ hipDeviceptr_t devPtr;
+ hipArray_Format format;
+ unsigned int numChannels;
+ size_t sizeInBytes;
+ } linear;
+ struct {
+ hipDeviceptr_t devPtr;
+ hipArray_Format format;
+ unsigned int numChannels;
+ size_t width;
+ size_t height;
+ size_t pitchInBytes;
+ } pitch2D;
+ struct {
+ int reserved[32];
+ } reserved;
+ } res;
+ unsigned int flags;
+} hipResourceDesc;
+
+/**
+ * hip texture resource view formats
+ */
+typedef enum hipResourceViewFormat {
+ hipResViewFormatNone = 0x00,
+ hipResViewFormatUnsignedChar1 = 0x01,
+ hipResViewFormatUnsignedChar2 = 0x02,
+ hipResViewFormatUnsignedChar4 = 0x03,
+ hipResViewFormatSignedChar1 = 0x04,
+ hipResViewFormatSignedChar2 = 0x05,
+ hipResViewFormatSignedChar4 = 0x06,
+ hipResViewFormatUnsignedShort1 = 0x07,
+ hipResViewFormatUnsignedShort2 = 0x08,
+ hipResViewFormatUnsignedShort4 = 0x09,
+ hipResViewFormatSignedShort1 = 0x0a,
+ hipResViewFormatSignedShort2 = 0x0b,
+ hipResViewFormatSignedShort4 = 0x0c,
+ hipResViewFormatUnsignedInt1 = 0x0d,
+ hipResViewFormatUnsignedInt2 = 0x0e,
+ hipResViewFormatUnsignedInt4 = 0x0f,
+ hipResViewFormatSignedInt1 = 0x10,
+ hipResViewFormatSignedInt2 = 0x11,
+ hipResViewFormatSignedInt4 = 0x12,
+ hipResViewFormatHalf1 = 0x13,
+ hipResViewFormatHalf2 = 0x14,
+ hipResViewFormatHalf4 = 0x15,
+ hipResViewFormatFloat1 = 0x16,
+ hipResViewFormatFloat2 = 0x17,
+ hipResViewFormatFloat4 = 0x18,
+ hipResViewFormatUnsignedBlockCompressed1 = 0x19,
+ hipResViewFormatUnsignedBlockCompressed2 = 0x1a,
+ hipResViewFormatUnsignedBlockCompressed3 = 0x1b,
+ hipResViewFormatUnsignedBlockCompressed4 = 0x1c,
+ hipResViewFormatSignedBlockCompressed4 = 0x1d,
+ hipResViewFormatUnsignedBlockCompressed5 = 0x1e,
+ hipResViewFormatSignedBlockCompressed5 = 0x1f,
+ hipResViewFormatUnsignedBlockCompressed6H = 0x20,
+ hipResViewFormatSignedBlockCompressed6H = 0x21,
+ hipResViewFormatUnsignedBlockCompressed7 = 0x22
+} hipResourceViewFormat;
+
+typedef enum HIPresourceViewFormat_enum
+{
+ HIP_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */
+ HIP_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */
+ HIP_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */
+ HIP_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */
+ HIP_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */
+ HIP_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */
+ HIP_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */
+ HIP_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */
+ HIP_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */
+ HIP_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */
+ HIP_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */
+ HIP_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */
+ HIP_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */
+ HIP_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */
+ HIP_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */
+ HIP_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */
+ HIP_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */
+ HIP_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */
+ HIP_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */
+ HIP_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */
+ HIP_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */
+ HIP_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */
+ HIP_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */
+ HIP_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */
+ HIP_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */
+ HIP_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */
+ HIP_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */
+ HIP_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */
+ HIP_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */
+ HIP_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */
+ HIP_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */
+ HIP_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */
+ HIP_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */
+ HIP_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */
+ HIP_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */
+} HIPresourceViewFormat;
+
+/**
+ * hip resource view descriptor
+ */
+struct hipResourceViewDesc {
+ enum hipResourceViewFormat format;
+ size_t width;
+ size_t height;
+ size_t depth;
+ unsigned int firstMipmapLevel;
+ unsigned int lastMipmapLevel;
+ unsigned int firstLayer;
+ unsigned int lastLayer;
+};
+
+
+typedef struct hipTextureDesc_st {
+ hipTextureAddressMode addressMode[3];
+ hipTextureFilterMode filterMode;
+ unsigned int flags;
+ unsigned int maxAnisotropy;
+ hipTextureFilterMode mipmapFilterMode;
+ float mipmapLevelBias;
+ float minMipmapLevelClamp;
+ float maxMipmapLevelClamp;
+ float borderColor[4];
+ int reserved[12];
+} hipTextureDesc;
+
+/**
+ * Resource view descriptor
+ */
+typedef struct HIP_RESOURCE_VIEW_DESC_st {
+ hipResourceViewFormat format;
+ size_t width;
+ size_t height;
+ size_t depth;
+ unsigned int firstMipmapLevel;
+ unsigned int lastMipmapLevel;
+ unsigned int firstLayer;
+ unsigned int lastLayer;
+ unsigned int reserved[16];
+} HIP_RESOURCE_VIEW_DESC;
+
+typedef struct HIP_POINTER_ATTRIBUTE_P2P_TOKENS_st {
+ unsigned long long p2pToken;
+ unsigned int vaSpaceToken;
+} HIP_POINTER_ATTRIBUTE_P2P_TOKENS;
+
+
+typedef unsigned int GLenum;
+typedef unsigned int GLuint;
+typedef int GLint;
+
+typedef enum HIPGLDeviceList_enum {
+ HIP_GL_DEVICE_LIST_ALL = 0x01,
+ HIP_GL_DEVICE_LIST_CURRENT_FRAME = 0x02,
+ HIP_GL_DEVICE_LIST_NEXT_FRAME = 0x03,
+} HIPGLDeviceList;
+
+typedef enum HIPGLmap_flags_enum {
+ HIP_GL_MAP_RESOURCE_FLAGS_NONE = 0x00,
+ HIP_GL_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01,
+ HIP_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02,
+} HIPGLmap_flags;
+
+
+/* Function types. */
+typedef hipError_t HIPAPI thipGetErrorName(hipError_t error, const char** pStr);
+typedef hipError_t HIPAPI thipInit(unsigned int Flags);
+typedef hipError_t HIPAPI thipDriverGetVersion(int* driverVersion);
+typedef hipError_t HIPAPI thipGetDevice(hipDevice_t* device, int ordinal);
+typedef hipError_t HIPAPI thipGetDeviceCount(int* count);
+typedef hipError_t HIPAPI thipDeviceGetName(char* name, int len, hipDevice_t dev);
+typedef hipError_t HIPAPI thipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attrib, hipDevice_t dev);
+typedef hipError_t HIPAPI thipDeviceComputeCapability(int* major, int* minor, hipDevice_t dev);
+typedef hipError_t HIPAPI thipDevicePrimaryCtxRetain(hipCtx_t* pctx, hipDevice_t dev);
+typedef hipError_t HIPAPI thipDevicePrimaryCtxRelease(hipDevice_t dev);
+typedef hipError_t HIPAPI thipDevicePrimaryCtxSetFlags(hipDevice_t dev, unsigned int flags);
+typedef hipError_t HIPAPI thipDevicePrimaryCtxGetState(hipDevice_t dev, unsigned int* flags, int* active);
+typedef hipError_t HIPAPI thipDevicePrimaryCtxReset(hipDevice_t dev);
+typedef hipError_t HIPAPI thipCtxCreate(hipCtx_t* pctx, unsigned int flags, hipDevice_t dev);
+typedef hipError_t HIPAPI thipCtxDestroy(hipCtx_t ctx);
+typedef hipError_t HIPAPI thipCtxPushCurrent(hipCtx_t ctx);
+typedef hipError_t HIPAPI thipCtxPopCurrent(hipCtx_t* pctx);
+typedef hipError_t HIPAPI thipCtxSetCurrent(hipCtx_t ctx);
+typedef hipError_t HIPAPI thipCtxGetCurrent(hipCtx_t* pctx);
+typedef hipError_t HIPAPI thipCtxGetDevice(hipDevice_t* device);
+typedef hipError_t HIPAPI thipCtxGetFlags(unsigned int* flags);
+typedef hipError_t HIPAPI thipCtxSynchronize(void);
+typedef hipError_t HIPAPI thipDeviceSynchronize(void);
+typedef hipError_t HIPAPI thipCtxGetCacheConfig(hipFuncCache_t* pconfig);
+typedef hipError_t HIPAPI thipCtxSetCacheConfig(hipFuncCache_t config);
+typedef hipError_t HIPAPI thipCtxGetSharedMemConfig(hipSharedMemConfig* pConfig);
+typedef hipError_t HIPAPI thipCtxSetSharedMemConfig(hipSharedMemConfig config);
+typedef hipError_t HIPAPI thipCtxGetApiVersion(hipCtx_t ctx, unsigned int* version);
+typedef hipError_t HIPAPI thipModuleLoad(hipModule_t* module, const char* fname);
+typedef hipError_t HIPAPI thipModuleLoadData(hipModule_t* module, const void* image);
+typedef hipError_t HIPAPI thipModuleLoadDataEx(hipModule_t* module, const void* image, unsigned int numOptions, hipJitOption* options, void** optionValues);
+typedef hipError_t HIPAPI thipModuleUnload(hipModule_t hmod);
+typedef hipError_t HIPAPI thipModuleGetFunction(hipFunction_t* hfunc, hipModule_t hmod, const char* name);
+typedef hipError_t HIPAPI thipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod, const char* name);
+typedef hipError_t HIPAPI thipModuleGetTexRef(textureReference** pTexRef, hipModule_t hmod, const char* name);
+typedef hipError_t HIPAPI thipMemGetInfo(size_t* free, size_t* total);
+typedef hipError_t HIPAPI thipMalloc(hipDeviceptr_t* dptr, size_t bytesize);
+typedef hipError_t HIPAPI thipMemAllocPitch(hipDeviceptr_t* dptr, size_t* pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
+typedef hipError_t HIPAPI thipFree(hipDeviceptr_t dptr);
+typedef hipError_t HIPAPI thipMemGetAddressRange(hipDeviceptr_t* pbase, size_t* psize, hipDeviceptr_t dptr);
+typedef hipError_t HIPAPI thipHostMalloc(void** pp, size_t bytesize);
+typedef hipError_t HIPAPI thipHostFree(void* p);
+typedef hipError_t HIPAPI thipMemHostAlloc(void** pp, size_t bytesize, unsigned int Flags);
+typedef hipError_t HIPAPI thipHostGetDevicePointer(hipDeviceptr_t* pdptr, void* p, unsigned int Flags);
+typedef hipError_t HIPAPI thipHostGetFlags(unsigned int* pFlags, void* p);
+typedef hipError_t HIPAPI thipMallocManaged(hipDeviceptr_t* dptr, size_t bytesize, unsigned int flags);
+typedef hipError_t HIPAPI thipDeviceGetByPCIBusId(hipDevice_t* dev, const char* pciBusId);
+typedef hipError_t HIPAPI thipDeviceGetPCIBusId(char* pciBusId, int len, hipDevice_t dev);
+typedef hipError_t HIPAPI thipMemHostUnregister(void* p);
+typedef hipError_t HIPAPI thipMemcpy(hipDeviceptr_t dst, hipDeviceptr_t src, size_t ByteCount);
+typedef hipError_t HIPAPI thipMemcpyPeer(hipDeviceptr_t dstDevice, hipCtx_t dstContext, hipDeviceptr_t srcDevice, hipCtx_t srcContext, size_t ByteCount);
+typedef hipError_t HIPAPI thipMemcpyHtoD(hipDeviceptr_t dstDevice, void* srcHost, size_t ByteCount);
+typedef hipError_t HIPAPI thipMemcpyDtoH(void* dstHost, hipDeviceptr_t srcDevice, size_t ByteCount);
+typedef hipError_t HIPAPI thipMemcpyDtoD(hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount);
+typedef hipError_t HIPAPI thipDrvMemcpy2DUnaligned(const hip_Memcpy2D* pCopy);
+typedef hipError_t HIPAPI thipMemcpyParam2D(const hip_Memcpy2D* pCopy);
+typedef hipError_t HIPAPI thipDrvMemcpy3D(const HIP_MEMCPY3D* pCopy);
+typedef hipError_t HIPAPI thipMemcpyHtoDAsync(hipDeviceptr_t dstDevice, const void* srcHost, size_t ByteCount, hipStream_t hStream);
+typedef hipError_t HIPAPI thipMemcpyDtoHAsync(void* dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream);
+typedef hipError_t HIPAPI thipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, hipStream_t hStream);
+typedef hipError_t HIPAPI thipDrvMemcpy3DAsync(const HIP_MEMCPY3D* pCopy, hipStream_t hStream);
+typedef hipError_t HIPAPI thipMemsetD8(hipDeviceptr_t dstDevice, unsigned char uc, size_t N);
+typedef hipError_t HIPAPI thipMemsetD16(hipDeviceptr_t dstDevice, unsigned short us, size_t N);
+typedef hipError_t HIPAPI thipMemsetD32(hipDeviceptr_t dstDevice, unsigned int ui, size_t N);
+typedef hipError_t HIPAPI thipMemsetD8Async(hipDeviceptr_t dstDevice, unsigned char uc, size_t N, hipStream_t hStream);
+typedef hipError_t HIPAPI thipMemsetD16Async(hipDeviceptr_t dstDevice, unsigned short us, size_t N, hipStream_t hStream);
+typedef hipError_t HIPAPI thipMemsetD32Async(hipDeviceptr_t dstDevice, unsigned int ui, size_t N, hipStream_t hStream);
+typedef hipError_t HIPAPI thipMemsetD2D8Async(hipDeviceptr_t dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, hipStream_t hStream);
+typedef hipError_t HIPAPI thipMemsetD2D16Async(hipDeviceptr_t dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, hipStream_t hStream);
+typedef hipError_t HIPAPI thipMemsetD2D32Async(hipDeviceptr_t dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, hipStream_t hStream);
+typedef hipError_t HIPAPI thipArrayCreate(hArray ** pHandle, const HIP_ARRAY_DESCRIPTOR* pAllocateArray);
+typedef hipError_t HIPAPI thipArrayDestroy(hArray hArray);
+typedef hipError_t HIPAPI thipArray3DCreate(hArray * pHandle, const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray);
+typedef hipError_t HIPAPI thipPointerGetAttributes(hipPointerAttribute_t* attributes, const void* ptr);
+typedef hipError_t HIPAPI thipStreamCreateWithFlags(hipStream_t* phStream, unsigned int Flags);
+typedef hipError_t HIPAPI thipStreamCreateWithPriority(hipStream_t* phStream, unsigned int flags, int priority);
+typedef hipError_t HIPAPI thipStreamGetPriority(hipStream_t hStream, int* priority);
+typedef hipError_t HIPAPI thipStreamGetFlags(hipStream_t hStream, unsigned int* flags);
+typedef hipError_t HIPAPI thipStreamWaitEvent(hipStream_t hStream, hipEvent_t hEvent, unsigned int Flags);
+typedef hipError_t HIPAPI thipStreamAddCallback(hipStream_t hStream, hipStreamCallback_t callback, void* userData, unsigned int flags);
+typedef hipError_t HIPAPI thipStreamQuery(hipStream_t hStream);
+typedef hipError_t HIPAPI thipStreamSynchronize(hipStream_t hStream);
+typedef hipError_t HIPAPI thipStreamDestroy(hipStream_t hStream);
+typedef hipError_t HIPAPI thipEventCreateWithFlags(hipEvent_t* phEvent, unsigned int Flags);
+typedef hipError_t HIPAPI thipEventRecord(hipEvent_t hEvent, hipStream_t hStream);
+typedef hipError_t HIPAPI thipEventQuery(hipEvent_t hEvent);
+typedef hipError_t HIPAPI thipEventSynchronize(hipEvent_t hEvent);
+typedef hipError_t HIPAPI thipEventDestroy(hipEvent_t hEvent);
+typedef hipError_t HIPAPI thipEventElapsedTime(float* pMilliseconds, hipEvent_t hStart, hipEvent_t hEnd);
+typedef hipError_t HIPAPI thipFuncGetAttribute(int* pi, hipFunction_attribute attrib, hipFunction_t hfunc);
+typedef hipError_t HIPAPI thipFuncSetCacheConfig(hipFunction_t hfunc, hipFuncCache_t config);
+typedef hipError_t HIPAPI thipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra);
+typedef hipError_t HIPAPI thipDrvOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, hipFunction_t func, int blockSize, size_t dynamicSMemSize);
+typedef hipError_t HIPAPI thipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, hipFunction_t func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
+typedef hipError_t HIPAPI thipModuleOccupancyMaxPotentialBlockSize(int* minGridSize, int* blockSize, hipFunction_t func, size_t dynamicSMemSize, int blockSizeLimit);
+typedef hipError_t HIPAPI thipTexRefSetArray(hipTexRef hTexRef, hArray * hArray, unsigned int Flags);
+typedef hipError_t HIPAPI thipTexRefSetAddress(size_t* ByteOffset, hipTexRef hTexRef, hipDeviceptr_t dptr, size_t bytes);
+typedef hipError_t HIPAPI thipTexRefSetAddress2D(hipTexRef hTexRef, const HIP_ARRAY_DESCRIPTOR* desc, hipDeviceptr_t dptr, size_t Pitch);
+typedef hipError_t HIPAPI thipTexRefSetFormat(hipTexRef hTexRef, hipArray_Format fmt, int NumPackedComponents);
+typedef hipError_t HIPAPI thipTexRefSetAddressMode(hipTexRef hTexRef, int dim, hipTextureAddressMode am);
+typedef hipError_t HIPAPI thipTexRefSetFilterMode(hipTexRef hTexRef, hipTextureFilterMode fm);
+typedef hipError_t HIPAPI thipTexRefSetFlags(hipTexRef hTexRef, unsigned int Flags);
+typedef hipError_t HIPAPI thipTexRefGetAddress(hipDeviceptr_t* pdptr, hipTexRef hTexRef);
+typedef hipError_t HIPAPI thipTexRefGetArray(hArray ** phArray, hipTexRef hTexRef);
+typedef hipError_t HIPAPI thipTexRefGetAddressMode(hipTextureAddressMode* pam, hipTexRef hTexRef, int dim);
+typedef hipError_t HIPAPI thipTexObjectCreate(hipTextureObject_t* pTexObject, const hipResourceDesc* pResDesc, const hipTextureDesc* pTexDesc, const HIP_RESOURCE_VIEW_DESC* pResViewDesc);
+typedef hipError_t HIPAPI thipTexObjectDestroy(hipTextureObject_t texObject);
+typedef hipError_t HIPAPI thipDeviceCanAccessPeer(int* canAccessPeer, hipDevice_t dev, hipDevice_t peerDev);
+typedef hipError_t HIPAPI thipCtxEnablePeerAccess(hipCtx_t peerContext, unsigned int Flags);
+typedef hipError_t HIPAPI thipCtxDisablePeerAccess(hipCtx_t peerContext);
+typedef hipError_t HIPAPI thipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attrib, hipDevice_t srcDevice, hipDevice_t dstDevice);
+typedef hipError_t HIPAPI thipGraphicsUnregisterResource(hipGraphicsResource resource);
+typedef hipError_t HIPAPI thipGraphicsResourceGetMappedMipmappedArray(hipMipmappedArray_t* pMipmappedArray, hipGraphicsResource resource);
+typedef hipError_t HIPAPI thipGraphicsResourceGetMappedPointer(hipDeviceptr_t* pDevPtr, size_t* pSize, hipGraphicsResource resource);
+typedef hipError_t HIPAPI thipGraphicsMapResources(unsigned int count, hipGraphicsResource* resources, hipStream_t hStream);
+typedef hipError_t HIPAPI thipGraphicsUnmapResources(unsigned int count, hipGraphicsResource* resources, hipStream_t hStream);
+typedef hipError_t HIPAPI thipGraphicsGLRegisterBuffer(hipGraphicsResource* pCudaResource, GLuint buffer, unsigned int Flags);
+typedef hipError_t HIPAPI thipGLGetDevices(unsigned int* pHipDeviceCount, int* pHipDevices, unsigned int hipDeviceCount, hipGLDeviceList deviceList);
+
+
+/* Function declarations. */
+extern thipGetErrorName *hipGetErrorName;
+extern thipInit *hipInit;
+extern thipDriverGetVersion *hipDriverGetVersion;
+extern thipGetDevice *hipGetDevice;
+extern thipGetDeviceCount *hipGetDeviceCount;
+extern thipDeviceGetName *hipDeviceGetName;
+extern thipDeviceGetAttribute *hipDeviceGetAttribute;
+extern thipDeviceComputeCapability *hipDeviceComputeCapability;
+extern thipDevicePrimaryCtxRetain *hipDevicePrimaryCtxRetain;
+extern thipDevicePrimaryCtxRelease *hipDevicePrimaryCtxRelease;
+extern thipDevicePrimaryCtxSetFlags *hipDevicePrimaryCtxSetFlags;
+extern thipDevicePrimaryCtxGetState *hipDevicePrimaryCtxGetState;
+extern thipDevicePrimaryCtxReset *hipDevicePrimaryCtxReset;
+extern thipCtxCreate *hipCtxCreate;
+extern thipCtxDestroy *hipCtxDestroy;
+extern thipCtxPushCurrent *hipCtxPushCurrent;
+extern thipCtxPopCurrent *hipCtxPopCurrent;
+extern thipCtxSetCurrent *hipCtxSetCurrent;
+extern thipCtxGetCurrent *hipCtxGetCurrent;
+extern thipCtxGetDevice *hipCtxGetDevice;
+extern thipCtxGetFlags *hipCtxGetFlags;
+extern thipCtxSynchronize *hipCtxSynchronize;
+extern thipDeviceSynchronize *hipDeviceSynchronize;
+extern thipCtxGetCacheConfig *hipCtxGetCacheConfig;
+extern thipCtxSetCacheConfig *hipCtxSetCacheConfig;
+extern thipCtxGetSharedMemConfig *hipCtxGetSharedMemConfig;
+extern thipCtxSetSharedMemConfig *hipCtxSetSharedMemConfig;
+extern thipCtxGetApiVersion *hipCtxGetApiVersion;
+extern thipModuleLoad *hipModuleLoad;
+extern thipModuleLoadData *hipModuleLoadData;
+extern thipModuleLoadDataEx *hipModuleLoadDataEx;
+extern thipModuleUnload *hipModuleUnload;
+extern thipModuleGetFunction *hipModuleGetFunction;
+extern thipModuleGetGlobal *hipModuleGetGlobal;
+extern thipModuleGetTexRef *hipModuleGetTexRef;
+extern thipMemGetInfo *hipMemGetInfo;
+extern thipMalloc *hipMalloc;
+extern thipMemAllocPitch *hipMemAllocPitch;
+extern thipFree *hipFree;
+extern thipMemGetAddressRange *hipMemGetAddressRange;
+extern thipHostMalloc *hipHostMalloc;
+extern thipHostFree *hipHostFree;
+extern thipHostGetDevicePointer *hipHostGetDevicePointer;
+extern thipHostGetFlags *hipHostGetFlags;
+extern thipMallocManaged *hipMallocManaged;
+extern thipDeviceGetByPCIBusId *hipDeviceGetByPCIBusId;
+extern thipDeviceGetPCIBusId *hipDeviceGetPCIBusId;
+extern thipMemcpyPeer *hipMemcpyPeer;
+extern thipMemcpyHtoD *hipMemcpyHtoD;
+extern thipMemcpyDtoH *hipMemcpyDtoH;
+extern thipMemcpyDtoD *hipMemcpyDtoD;
+extern thipDrvMemcpy2DUnaligned *hipDrvMemcpy2DUnaligned;
+extern thipMemcpyParam2D *hipMemcpyParam2D;
+extern thipDrvMemcpy3D *hipDrvMemcpy3D;
+extern thipMemcpyHtoDAsync *hipMemcpyHtoDAsync;
+extern thipMemcpyDtoHAsync *hipMemcpyDtoHAsync;
+extern thipMemcpyParam2DAsync *hipMemcpyParam2DAsync;
+extern thipDrvMemcpy3DAsync *hipDrvMemcpy3DAsync;
+extern thipMemsetD8 *hipMemsetD8;
+extern thipMemsetD16 *hipMemsetD16;
+extern thipMemsetD32 *hipMemsetD32;
+extern thipMemsetD8Async *hipMemsetD8Async;
+extern thipMemsetD16Async *hipMemsetD16Async;
+extern thipMemsetD32Async *hipMemsetD32Async;
+extern thipArrayCreate *hipArrayCreate;
+extern thipArrayDestroy *hipArrayDestroy;
+extern thipArray3DCreate *hipArray3DCreate;
+extern thipStreamCreateWithFlags *hipStreamCreateWithFlags;
+extern thipStreamCreateWithPriority *hipStreamCreateWithPriority;
+extern thipStreamGetPriority *hipStreamGetPriority;
+extern thipStreamGetFlags *hipStreamGetFlags;
+extern thipStreamWaitEvent *hipStreamWaitEvent;
+extern thipStreamAddCallback *hipStreamAddCallback;
+extern thipStreamQuery *hipStreamQuery;
+extern thipStreamSynchronize *hipStreamSynchronize;
+extern thipStreamDestroy *hipStreamDestroy;
+extern thipEventCreateWithFlags *hipEventCreateWithFlags;
+extern thipEventRecord *hipEventRecord;
+extern thipEventQuery *hipEventQuery;
+extern thipEventSynchronize *hipEventSynchronize;
+extern thipEventDestroy *hipEventDestroy;
+extern thipEventElapsedTime *hipEventElapsedTime;
+extern thipFuncGetAttribute *hipFuncGetAttribute;
+extern thipFuncSetCacheConfig *hipFuncSetCacheConfig;
+extern thipModuleLaunchKernel *hipModuleLaunchKernel;
+extern thipDrvOccupancyMaxActiveBlocksPerMultiprocessor *hipDrvOccupancyMaxActiveBlocksPerMultiprocessor;
+extern thipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags *hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags;
+extern thipModuleOccupancyMaxPotentialBlockSize *hipModuleOccupancyMaxPotentialBlockSize;
+extern thipTexRefSetArray *hipTexRefSetArray;
+extern thipTexRefSetAddress *hipTexRefSetAddress;
+extern thipTexRefSetAddress2D *hipTexRefSetAddress2D;
+extern thipTexRefSetFormat *hipTexRefSetFormat;
+extern thipTexRefSetAddressMode *hipTexRefSetAddressMode;
+extern thipTexRefSetFilterMode *hipTexRefSetFilterMode;
+extern thipTexRefSetFlags *hipTexRefSetFlags;
+extern thipTexRefGetAddress *hipTexRefGetAddress;
+extern thipTexRefGetArray *hipTexRefGetArray;
+extern thipTexRefGetAddressMode *hipTexRefGetAddressMode;
+extern thipTexObjectCreate *hipTexObjectCreate;
+extern thipTexObjectDestroy *hipTexObjectDestroy;
+extern thipDeviceCanAccessPeer *hipDeviceCanAccessPeer;
+extern thipCtxEnablePeerAccess *hipCtxEnablePeerAccess;
+extern thipCtxDisablePeerAccess *hipCtxDisablePeerAccess;
+extern thipDeviceGetP2PAttribute *hipDeviceGetP2PAttribute;
+extern thipGraphicsUnregisterResource *hipGraphicsUnregisterResource;
+extern thipGraphicsResourceGetMappedMipmappedArray *hipGraphicsResourceGetMappedMipmappedArray;
+extern thipGraphicsResourceGetMappedPointer *hipGraphicsResourceGetMappedPointer;
+extern thipGraphicsMapResources *hipGraphicsMapResources;
+extern thipGraphicsUnmapResources *hipGraphicsUnmapResources;
+
+extern thipGraphicsGLRegisterBuffer *hipGraphicsGLRegisterBuffer;
+extern thipGLGetDevices *hipGLGetDevices;
+
+enum {
+ HIPEW_SUCCESS = 0,
+ HIPEW_ERROR_OPEN_FAILED = -1,
+ HIPEW_ERROR_ATEXIT_FAILED = -2,
+};
+
+enum {
+ HIPEW_INIT_HIP = 1,
+};
+
+int hipewInit(hipuint32_t flags);
+const char *hipewErrorString(hipError_t result);
+const char *hipewCompilerPath(void);
+int hipewCompilerVersion(void);
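+
+/* Usage sketch (illustrative only; assumes just the HIP driver is needed):
+ *
+ *   if (hipewInit(HIPEW_INIT_HIP) == HIPEW_SUCCESS) {
+ *     hipInit(0);
+ *   }
+ */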
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __HIPEW_H__ */
diff --git a/extern/hipew/src/hipew.c b/extern/hipew/src/hipew.c
new file mode 100644
index 00000000000..9d5a63f869a
--- /dev/null
+++ b/extern/hipew/src/hipew.c
@@ -0,0 +1,533 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+#ifdef _MSC_VER
+# if _MSC_VER < 1900
+# define snprintf _snprintf
+# endif
+# define popen _popen
+# define pclose _pclose
+# define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include <hipew.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+
+#ifdef _WIN32
+# define WIN32_LEAN_AND_MEAN
+# define VC_EXTRALEAN
+# include <windows.h>
+
+/* Utility macros. */
+
+typedef HMODULE DynamicLibrary;
+
+# define dynamic_library_open(path) LoadLibraryA(path)
+# define dynamic_library_close(lib) FreeLibrary(lib)
+# define dynamic_library_find(lib, symbol) GetProcAddress(lib, symbol)
+#else
+# include <dlfcn.h>
+
+typedef void* DynamicLibrary;
+
+# define dynamic_library_open(path) dlopen(path, RTLD_NOW)
+# define dynamic_library_close(lib) dlclose(lib)
+# define dynamic_library_find(lib, symbol) dlsym(lib, symbol)
+#endif
+
+#define _LIBRARY_FIND_CHECKED(lib, name) \
+ name = (t##name *)dynamic_library_find(lib, #name); \
+ assert(name);
+
+#define _LIBRARY_FIND(lib, name) \
+ name = (t##name *)dynamic_library_find(lib, #name);
+
+#define HIP_LIBRARY_FIND_CHECKED(name) \
+ _LIBRARY_FIND_CHECKED(hip_lib, name)
+#define HIP_LIBRARY_FIND(name) _LIBRARY_FIND(hip_lib, name)
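+
+/* For example, HIP_LIBRARY_FIND_CHECKED(hipInit) expands to:
+ *
+ *   hipInit = (thipInit *)dynamic_library_find(hip_lib, "hipInit");
+ *   assert(hipInit);
+ */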
+
+static DynamicLibrary hip_lib;
+
+/* Function definitions. */
+thipGetErrorName *hipGetErrorName;
+thipInit *hipInit;
+thipDriverGetVersion *hipDriverGetVersion;
+thipGetDevice *hipGetDevice;
+thipGetDeviceCount *hipGetDeviceCount;
+thipDeviceGetName *hipDeviceGetName;
+thipDeviceGetAttribute *hipDeviceGetAttribute;
+thipDeviceComputeCapability *hipDeviceComputeCapability;
+thipDevicePrimaryCtxRetain *hipDevicePrimaryCtxRetain;
+thipDevicePrimaryCtxRelease *hipDevicePrimaryCtxRelease;
+thipDevicePrimaryCtxSetFlags *hipDevicePrimaryCtxSetFlags;
+thipDevicePrimaryCtxGetState *hipDevicePrimaryCtxGetState;
+thipDevicePrimaryCtxReset *hipDevicePrimaryCtxReset;
+thipCtxCreate *hipCtxCreate;
+thipCtxDestroy *hipCtxDestroy;
+thipCtxPushCurrent *hipCtxPushCurrent;
+thipCtxPopCurrent *hipCtxPopCurrent;
+thipCtxSetCurrent *hipCtxSetCurrent;
+thipCtxGetCurrent *hipCtxGetCurrent;
+thipCtxGetDevice *hipCtxGetDevice;
+thipCtxGetFlags *hipCtxGetFlags;
+thipCtxSynchronize *hipCtxSynchronize;
+thipDeviceSynchronize *hipDeviceSynchronize;
+thipCtxGetCacheConfig *hipCtxGetCacheConfig;
+thipCtxSetCacheConfig *hipCtxSetCacheConfig;
+thipCtxGetSharedMemConfig *hipCtxGetSharedMemConfig;
+thipCtxSetSharedMemConfig *hipCtxSetSharedMemConfig;
+thipCtxGetApiVersion *hipCtxGetApiVersion;
+thipModuleLoad *hipModuleLoad;
+thipModuleLoadData *hipModuleLoadData;
+thipModuleLoadDataEx *hipModuleLoadDataEx;
+thipModuleUnload *hipModuleUnload;
+thipModuleGetFunction *hipModuleGetFunction;
+thipModuleGetGlobal *hipModuleGetGlobal;
+thipModuleGetTexRef *hipModuleGetTexRef;
+thipMemGetInfo *hipMemGetInfo;
+thipMalloc *hipMalloc;
+thipMemAllocPitch *hipMemAllocPitch;
+thipFree *hipFree;
+thipMemGetAddressRange *hipMemGetAddressRange;
+thipHostMalloc *hipHostMalloc;
+thipHostFree *hipHostFree;
+thipHostGetDevicePointer *hipHostGetDevicePointer;
+thipHostGetFlags *hipHostGetFlags;
+thipMallocManaged *hipMallocManaged;
+thipDeviceGetByPCIBusId *hipDeviceGetByPCIBusId;
+thipDeviceGetPCIBusId *hipDeviceGetPCIBusId;
+thipMemcpyPeer *hipMemcpyPeer;
+thipMemcpyHtoD *hipMemcpyHtoD;
+thipMemcpyDtoH *hipMemcpyDtoH;
+thipMemcpyDtoD *hipMemcpyDtoD;
+thipDrvMemcpy2DUnaligned *hipDrvMemcpy2DUnaligned;
+thipMemcpyParam2D *hipMemcpyParam2D;
+thipDrvMemcpy3D *hipDrvMemcpy3D;
+thipMemcpyHtoDAsync *hipMemcpyHtoDAsync;
+thipMemcpyDtoHAsync *hipMemcpyDtoHAsync;
+thipMemcpyParam2DAsync *hipMemcpyParam2DAsync;
+thipDrvMemcpy3DAsync *hipDrvMemcpy3DAsync;
+thipMemsetD8 *hipMemsetD8;
+thipMemsetD16 *hipMemsetD16;
+thipMemsetD32 *hipMemsetD32;
+thipMemsetD8Async *hipMemsetD8Async;
+thipMemsetD16Async *hipMemsetD16Async;
+thipMemsetD32Async *hipMemsetD32Async;
+thipArrayCreate *hipArrayCreate;
+thipArrayDestroy *hipArrayDestroy;
+thipArray3DCreate *hipArray3DCreate;
+thipStreamCreateWithFlags *hipStreamCreateWithFlags;
+thipStreamCreateWithPriority *hipStreamCreateWithPriority;
+thipStreamGetPriority *hipStreamGetPriority;
+thipStreamGetFlags *hipStreamGetFlags;
+thipStreamWaitEvent *hipStreamWaitEvent;
+thipStreamAddCallback *hipStreamAddCallback;
+thipStreamQuery *hipStreamQuery;
+thipStreamSynchronize *hipStreamSynchronize;
+thipStreamDestroy *hipStreamDestroy;
+thipEventCreateWithFlags *hipEventCreateWithFlags;
+thipEventRecord *hipEventRecord;
+thipEventQuery *hipEventQuery;
+thipEventSynchronize *hipEventSynchronize;
+thipEventDestroy *hipEventDestroy;
+thipEventElapsedTime *hipEventElapsedTime;
+thipFuncGetAttribute *hipFuncGetAttribute;
+thipFuncSetCacheConfig *hipFuncSetCacheConfig;
+thipModuleLaunchKernel *hipModuleLaunchKernel;
+thipDrvOccupancyMaxActiveBlocksPerMultiprocessor *hipDrvOccupancyMaxActiveBlocksPerMultiprocessor;
+thipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags *hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags;
+thipModuleOccupancyMaxPotentialBlockSize *hipModuleOccupancyMaxPotentialBlockSize;
+thipTexRefSetArray *hipTexRefSetArray;
+thipTexRefSetAddress *hipTexRefSetAddress;
+thipTexRefSetAddress2D *hipTexRefSetAddress2D;
+thipTexRefSetFormat *hipTexRefSetFormat;
+thipTexRefSetAddressMode *hipTexRefSetAddressMode;
+thipTexRefSetFilterMode *hipTexRefSetFilterMode;
+thipTexRefSetFlags *hipTexRefSetFlags;
+thipTexRefGetAddress *hipTexRefGetAddress;
+thipTexRefGetArray *hipTexRefGetArray;
+thipTexRefGetAddressMode *hipTexRefGetAddressMode;
+thipTexObjectCreate *hipTexObjectCreate;
+thipTexObjectDestroy *hipTexObjectDestroy;
+thipDeviceCanAccessPeer *hipDeviceCanAccessPeer;
+
+thipCtxEnablePeerAccess *hipCtxEnablePeerAccess;
+thipCtxDisablePeerAccess *hipCtxDisablePeerAccess;
+thipDeviceGetP2PAttribute *hipDeviceGetP2PAttribute;
+thipGraphicsUnregisterResource *hipGraphicsUnregisterResource;
+thipGraphicsMapResources *hipGraphicsMapResources;
+thipGraphicsUnmapResources *hipGraphicsUnmapResources;
+thipGraphicsResourceGetMappedPointer *hipGraphicsResourceGetMappedPointer;
+
+thipGraphicsGLRegisterBuffer *hipGraphicsGLRegisterBuffer;
+thipGLGetDevices *hipGLGetDevices;
+
+static DynamicLibrary dynamic_library_open_find(const char **paths) {
+ int i = 0;
+ while (paths[i] != NULL) {
+ DynamicLibrary lib = dynamic_library_open(paths[i]);
+ if (lib != NULL) {
+ return lib;
+ }
+ ++i;
+ }
+ return NULL;
+}
+
+/* Implementation functions. */
+static void hipewHipExit(void) {
+ if (hip_lib != NULL) {
+ /* Ignore errors. */
+ dynamic_library_close(hip_lib);
+ hip_lib = NULL;
+ }
+}
+
+static int hipewHipInit(void) {
+ /* Library paths. */
+#ifdef _WIN32
+ /* Expected in c:/windows/system or similar, no path needed. */
+ const char *hip_paths[] = {"amdhip64.dll", NULL};
+#elif defined(__APPLE__)
+  /* HIP is not supported on macOS; there is no library to search for. */
+  const char *hip_paths[] = {NULL};
+#else
+ const char *hip_paths[] = {"/opt/rocm/hip/lib/libamdhip64.so", NULL};
+#endif
+ static int initialized = 0;
+ static int result = 0;
+  int error;
+
+ if (initialized) {
+ return result;
+ }
+
+ initialized = 1;
+
+ error = atexit(hipewHipExit);
+ if (error) {
+ result = HIPEW_ERROR_ATEXIT_FAILED;
+ return result;
+ }
+
+ /* Load library. */
+ hip_lib = dynamic_library_open_find(hip_paths);
+
+ if (hip_lib == NULL) {
+ result = HIPEW_ERROR_OPEN_FAILED;
+ return result;
+ }
+
+ /* Fetch all function pointers. */
+ HIP_LIBRARY_FIND_CHECKED(hipGetErrorName);
+ HIP_LIBRARY_FIND_CHECKED(hipInit);
+ HIP_LIBRARY_FIND_CHECKED(hipDriverGetVersion);
+ HIP_LIBRARY_FIND_CHECKED(hipGetDevice);
+ HIP_LIBRARY_FIND_CHECKED(hipGetDeviceCount);
+ HIP_LIBRARY_FIND_CHECKED(hipDeviceGetName);
+ HIP_LIBRARY_FIND_CHECKED(hipDeviceGetAttribute);
+ HIP_LIBRARY_FIND_CHECKED(hipDeviceComputeCapability);
+ HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxRetain);
+ HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxRelease);
+ HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxSetFlags);
+ HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxGetState);
+ HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxReset);
+ HIP_LIBRARY_FIND_CHECKED(hipCtxCreate);
+ HIP_LIBRARY_FIND_CHECKED(hipCtxDestroy);
+ HIP_LIBRARY_FIND_CHECKED(hipCtxPushCurrent);
+ HIP_LIBRARY_FIND_CHECKED(hipCtxPopCurrent);
+ HIP_LIBRARY_FIND_CHECKED(hipCtxSetCurrent);
+ HIP_LIBRARY_FIND_CHECKED(hipCtxGetCurrent);
+ HIP_LIBRARY_FIND_CHECKED(hipCtxGetDevice);
+ HIP_LIBRARY_FIND_CHECKED(hipCtxGetFlags);
+ HIP_LIBRARY_FIND_CHECKED(hipCtxSynchronize);
+ HIP_LIBRARY_FIND_CHECKED(hipDeviceSynchronize);
+ HIP_LIBRARY_FIND_CHECKED(hipCtxGetCacheConfig);
+ HIP_LIBRARY_FIND_CHECKED(hipCtxSetCacheConfig);
+ HIP_LIBRARY_FIND_CHECKED(hipCtxGetSharedMemConfig);
+ HIP_LIBRARY_FIND_CHECKED(hipCtxSetSharedMemConfig);
+ HIP_LIBRARY_FIND_CHECKED(hipCtxGetApiVersion);
+ HIP_LIBRARY_FIND_CHECKED(hipModuleLoad);
+ HIP_LIBRARY_FIND_CHECKED(hipModuleLoadData);
+ HIP_LIBRARY_FIND_CHECKED(hipModuleLoadDataEx);
+ HIP_LIBRARY_FIND_CHECKED(hipModuleUnload);
+ HIP_LIBRARY_FIND_CHECKED(hipModuleGetFunction);
+ HIP_LIBRARY_FIND_CHECKED(hipModuleGetGlobal);
+ HIP_LIBRARY_FIND_CHECKED(hipModuleGetTexRef);
+ HIP_LIBRARY_FIND_CHECKED(hipMemGetInfo);
+ HIP_LIBRARY_FIND_CHECKED(hipMalloc);
+ HIP_LIBRARY_FIND_CHECKED(hipMemAllocPitch);
+ HIP_LIBRARY_FIND_CHECKED(hipFree);
+ HIP_LIBRARY_FIND_CHECKED(hipMemGetAddressRange);
+ HIP_LIBRARY_FIND_CHECKED(hipHostMalloc);
+ HIP_LIBRARY_FIND_CHECKED(hipHostFree);
+ HIP_LIBRARY_FIND_CHECKED(hipHostGetDevicePointer);
+ HIP_LIBRARY_FIND_CHECKED(hipHostGetFlags);
+ HIP_LIBRARY_FIND_CHECKED(hipMallocManaged);
+ HIP_LIBRARY_FIND_CHECKED(hipDeviceGetByPCIBusId);
+ HIP_LIBRARY_FIND_CHECKED(hipDeviceGetPCIBusId);
+ HIP_LIBRARY_FIND_CHECKED(hipMemcpyPeer);
+ HIP_LIBRARY_FIND_CHECKED(hipMemcpyHtoD);
+ HIP_LIBRARY_FIND_CHECKED(hipMemcpyDtoH);
+ HIP_LIBRARY_FIND_CHECKED(hipMemcpyDtoD);
+ HIP_LIBRARY_FIND_CHECKED(hipMemcpyParam2D);
+ HIP_LIBRARY_FIND_CHECKED(hipDrvMemcpy3D);
+ HIP_LIBRARY_FIND_CHECKED(hipMemcpyHtoDAsync);
+ HIP_LIBRARY_FIND_CHECKED(hipMemcpyDtoHAsync);
+ HIP_LIBRARY_FIND_CHECKED(hipDrvMemcpy2DUnaligned);
+ HIP_LIBRARY_FIND_CHECKED(hipMemcpyParam2DAsync);
+ HIP_LIBRARY_FIND_CHECKED(hipDrvMemcpy3DAsync);
+ HIP_LIBRARY_FIND_CHECKED(hipMemsetD8);
+ HIP_LIBRARY_FIND_CHECKED(hipMemsetD16);
+ HIP_LIBRARY_FIND_CHECKED(hipMemsetD32);
+ HIP_LIBRARY_FIND_CHECKED(hipMemsetD8Async);
+ HIP_LIBRARY_FIND_CHECKED(hipMemsetD16Async);
+ HIP_LIBRARY_FIND_CHECKED(hipMemsetD32Async);
+ HIP_LIBRARY_FIND_CHECKED(hipArrayCreate);
+ HIP_LIBRARY_FIND_CHECKED(hipArrayDestroy);
+ HIP_LIBRARY_FIND_CHECKED(hipArray3DCreate);
+ HIP_LIBRARY_FIND_CHECKED(hipStreamCreateWithFlags);
+ HIP_LIBRARY_FIND_CHECKED(hipStreamCreateWithPriority);
+ HIP_LIBRARY_FIND_CHECKED(hipStreamGetPriority);
+ HIP_LIBRARY_FIND_CHECKED(hipStreamGetFlags);
+ HIP_LIBRARY_FIND_CHECKED(hipStreamWaitEvent);
+ HIP_LIBRARY_FIND_CHECKED(hipStreamAddCallback);
+ HIP_LIBRARY_FIND_CHECKED(hipStreamQuery);
+ HIP_LIBRARY_FIND_CHECKED(hipStreamSynchronize);
+ HIP_LIBRARY_FIND_CHECKED(hipStreamDestroy);
+ HIP_LIBRARY_FIND_CHECKED(hipEventCreateWithFlags);
+ HIP_LIBRARY_FIND_CHECKED(hipEventRecord);
+ HIP_LIBRARY_FIND_CHECKED(hipEventQuery);
+ HIP_LIBRARY_FIND_CHECKED(hipEventSynchronize);
+ HIP_LIBRARY_FIND_CHECKED(hipEventDestroy);
+ HIP_LIBRARY_FIND_CHECKED(hipEventElapsedTime);
+ HIP_LIBRARY_FIND_CHECKED(hipFuncGetAttribute);
+ HIP_LIBRARY_FIND_CHECKED(hipFuncSetCacheConfig);
+ HIP_LIBRARY_FIND_CHECKED(hipModuleLaunchKernel);
+ HIP_LIBRARY_FIND_CHECKED(hipModuleOccupancyMaxPotentialBlockSize);
+ HIP_LIBRARY_FIND_CHECKED(hipTexRefSetArray);
+ HIP_LIBRARY_FIND_CHECKED(hipTexRefSetAddress);
+ HIP_LIBRARY_FIND_CHECKED(hipTexRefSetAddress2D);
+ HIP_LIBRARY_FIND_CHECKED(hipTexRefSetFormat);
+ HIP_LIBRARY_FIND_CHECKED(hipTexRefSetAddressMode);
+ HIP_LIBRARY_FIND_CHECKED(hipTexRefSetFilterMode);
+ HIP_LIBRARY_FIND_CHECKED(hipTexRefSetFlags);
+ HIP_LIBRARY_FIND_CHECKED(hipTexRefGetAddress);
+ HIP_LIBRARY_FIND_CHECKED(hipTexRefGetAddressMode);
+ HIP_LIBRARY_FIND_CHECKED(hipTexObjectCreate);
+ HIP_LIBRARY_FIND_CHECKED(hipTexObjectDestroy);
+ HIP_LIBRARY_FIND_CHECKED(hipDeviceCanAccessPeer);
+ HIP_LIBRARY_FIND_CHECKED(hipCtxEnablePeerAccess);
+ HIP_LIBRARY_FIND_CHECKED(hipCtxDisablePeerAccess);
+ HIP_LIBRARY_FIND_CHECKED(hipDeviceGetP2PAttribute);
+#ifdef _WIN32
+ HIP_LIBRARY_FIND_CHECKED(hipGraphicsUnregisterResource);
+ HIP_LIBRARY_FIND_CHECKED(hipGraphicsMapResources);
+ HIP_LIBRARY_FIND_CHECKED(hipGraphicsUnmapResources);
+ HIP_LIBRARY_FIND_CHECKED(hipGraphicsResourceGetMappedPointer);
+ HIP_LIBRARY_FIND_CHECKED(hipGraphicsGLRegisterBuffer);
+ HIP_LIBRARY_FIND_CHECKED(hipGLGetDevices);
+#endif
+ result = HIPEW_SUCCESS;
+ return result;
+}
+
+int hipewInit(hipuint32_t flags) {
+ int result = HIPEW_SUCCESS;
+
+ if (flags & HIPEW_INIT_HIP) {
+ result = hipewHipInit();
+ if (result != HIPEW_SUCCESS) {
+ return result;
+ }
+ }
+
+ return result;
+}
+
+const char *hipewErrorString(hipError_t result) {
+ switch (result) {
+ case hipSuccess: return "No errors";
+ case hipErrorInvalidValue: return "Invalid value";
+ case hipErrorOutOfMemory: return "Out of memory";
+ case hipErrorNotInitialized: return "Driver not initialized";
+ case hipErrorDeinitialized: return "Driver deinitialized";
+ case hipErrorProfilerDisabled: return "Profiler disabled";
+ case hipErrorProfilerNotInitialized: return "Profiler not initialized";
+ case hipErrorProfilerAlreadyStarted: return "Profiler already started";
+ case hipErrorProfilerAlreadyStopped: return "Profiler already stopped";
+ case hipErrorNoDevice: return "No HIP-capable device available";
+ case hipErrorInvalidDevice: return "Invalid device";
+ case hipErrorInvalidImage: return "Invalid kernel image";
+ case hipErrorInvalidContext: return "Invalid context";
+ case hipErrorContextAlreadyCurrent: return "Context already current";
+ case hipErrorMapFailed: return "Map failed";
+ case hipErrorUnmapFailed: return "Unmap failed";
+ case hipErrorArrayIsMapped: return "Array is mapped";
+ case hipErrorAlreadyMapped: return "Already mapped";
+ case hipErrorNoBinaryForGpu: return "No binary for GPU";
+ case hipErrorAlreadyAcquired: return "Already acquired";
+ case hipErrorNotMapped: return "Not mapped";
+ case hipErrorNotMappedAsArray: return "Mapped resource not available for access as an array";
+ case hipErrorNotMappedAsPointer: return "Mapped resource not available for access as a pointer";
+ case hipErrorECCNotCorrectable: return "Uncorrectable ECC error detected";
+ case hipErrorUnsupportedLimit: return "hipLimit_t not supported by device";
+ case hipErrorContextAlreadyInUse: return "Context already in use";
+ case hipErrorPeerAccessUnsupported: return "Peer access unsupported";
+    case hipErrorInvalidKernelFile: return "Invalid kernel file";
+ case hipErrorInvalidGraphicsContext: return "Invalid graphics context";
+ case hipErrorInvalidSource: return "Invalid source";
+ case hipErrorFileNotFound: return "File not found";
+ case hipErrorSharedObjectSymbolNotFound: return "Link to a shared object failed to resolve";
+ case hipErrorSharedObjectInitFailed: return "Shared object initialization failed";
+ case hipErrorOperatingSystem: return "Operating system";
+ case hipErrorInvalidHandle: return "Invalid handle";
+ case hipErrorNotFound: return "Not found";
+ case hipErrorNotReady: return "HIP not ready";
+ case hipErrorIllegalAddress: return "Illegal address";
+ case hipErrorLaunchOutOfResources: return "Launch exceeded resources";
+ case hipErrorLaunchTimeOut: return "Launch exceeded timeout";
+ case hipErrorPeerAccessAlreadyEnabled: return "Peer access already enabled";
+ case hipErrorPeerAccessNotEnabled: return "Peer access not enabled";
+ case hipErrorSetOnActiveProcess: return "Primary context active";
+ case hipErrorAssert: return "Assert";
+ case hipErrorHostMemoryAlreadyRegistered: return "Host memory already registered";
+ case hipErrorHostMemoryNotRegistered: return "Host memory not registered";
+ case hipErrorLaunchFailure: return "Launch failed";
+ case hipErrorCooperativeLaunchTooLarge: return "Cooperative launch too large";
+ case hipErrorNotSupported: return "Not supported";
+ case hipErrorUnknown: return "Unknown error";
+ default: return "Unknown HIP error value";
+ }
+}
+
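+/* Join two path components with the platform separator. For example,
+ * path_join("/opt/rocm", "bin", sizeof(buf), buf) writes "/opt/rocm/bin"
+ * (using '\\' as the separator on Windows). */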
+static void path_join(const char *path1,
+ const char *path2,
+ int maxlen,
+ char *result) {
+#if defined(WIN32) || defined(_WIN32)
+ const char separator = '\\';
+#else
+ const char separator = '/';
+#endif
+ int n = snprintf(result, maxlen, "%s%c%s", path1, separator, path2);
+  if (n < 0 || n >= maxlen) {
+    /* Truncated output or encoding error: ensure NUL termination. */
+    result[maxlen - 1] = '\0';
+  }
+}
+
+static int path_exists(const char *path) {
+ struct stat st;
+ if (stat(path, &st)) {
+ return 0;
+ }
+ return 1;
+}
+
+const char *hipewCompilerPath(void) {
+#ifdef _WIN32
+  const char *hipPath = getenv("HIP_ROCCLR_HOME");
+  const char *windowsCommand = "perl";
+  const char *executable = "bin/hipcc";
+
+  static char hipcc[65536];
+  static char finalCommand[65536];
+  if (hipPath) {
+    path_join(hipPath, executable, sizeof(hipcc), hipcc);
+    if (path_exists(hipcc)) {
+      snprintf(finalCommand, sizeof(finalCommand), "%s %s", windowsCommand, hipcc);
+      return finalCommand;
+    }
+    else {
+      fprintf(stderr,
+              "Could not find hipcc. Make sure HIP_ROCCLR_HOME points to the "
+              "directory holding bin/hipcc.\n");
+    }
+  }
+#else
+  const char *hipPath = "/opt/rocm/hip/bin";
+  const char *executable = "hipcc";
+
+  static char hipcc[65536];
+  path_join(hipPath, executable, sizeof(hipcc), hipcc);
+  if (path_exists(hipcc)) {
+    return hipcc;
+  }
+#endif
+
+ {
+#ifdef _WIN32
+ FILE *handle = popen("where hipcc", "r");
+#else
+ FILE *handle = popen("which hipcc", "r");
+#endif
+ if (handle) {
+ char buffer[4096] = {0};
+      const size_t len = fread(buffer, 1, sizeof(buffer) - 1, handle);
+ buffer[len] = '\0';
+ pclose(handle);
+ if (buffer[0]) {
+ return "hipcc";
+ }
+ }
+ }
+
+ return NULL;
+}
+
+int hipewCompilerVersion(void) {
+  const char *path = hipewCompilerPath();
+  FILE *pipe;
+  char buf[128];
+  char output[65536] = "\0";
+  char command[65536] = "\0";
+
+  if (path == NULL) {
+    return 0;
+  }
+
+  /* Get --version output. */
+  snprintf(command, sizeof(command), "\"%s\" --version", path);
+  pipe = popen(command, "r");
+  if (!pipe) {
+    fprintf(stderr, "HIP: failed to run compiler to retrieve version.\n");
+    return 0;
+  }
+
+  while (!feof(pipe)) {
+    if (fgets(buf, sizeof(buf), pipe) != NULL) {
+      strncat(output, buf, sizeof(output) - strlen(output) - 1);
+    }
+  }
+
+  pclose(pipe);
+
+  /* Parsing the hipcc version output is not implemented yet; assume the
+   * minimum supported toolkit, HIP 4.0. */
+  return 40;
+}
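+
+/* Illustrative sketch only: once hipcc reports a stable version string, the
+ * hardcoded return above could be replaced by parsing along these lines
+ * (the "HIP version: " marker is an assumption, not a documented format):
+ *
+ *   int major, minor;
+ *   if (sscanf(output, "HIP version: %d.%d", &major, &minor) == 2) {
+ *     return 10 * major + minor;
+ *   }
+ */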
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 17096d441f0..2018c1d9648 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -297,6 +297,7 @@ endif()
if(WITH_CYCLES_STANDALONE)
set(WITH_CYCLES_DEVICE_CUDA TRUE)
+ set(WITH_CYCLES_DEVICE_HIP TRUE)
endif()
# TODO(sergey): Consider removing it, only causes confusion in interface.
set(WITH_CYCLES_DEVICE_MULTI TRUE)
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index 5bdcfd56a4d..64d226cb9ec 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -95,6 +95,9 @@ set(ADDON_FILES
add_definitions(${GL_DEFINITIONS})
+if(WITH_CYCLES_DEVICE_HIP)
+ add_definitions(-DWITH_HIP)
+endif()
if(WITH_MOD_FLUID)
add_definitions(-DWITH_FLUID)
endif()
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index e0e8ca10bef..d729cb1ee69 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -28,7 +28,7 @@ def _configure_argument_parser():
action='store_true')
parser.add_argument("--cycles-device",
help="Set the device to use for Cycles, overriding user preferences and the scene setting."
- "Valid options are 'CPU', 'CUDA' or 'OPTIX'."
+ "Valid options are 'CPU', 'CUDA', 'OPTIX' or 'HIP'."
"Additionally, you can append '+CPU' to any GPU type for hybrid rendering.",
default=None)
return parser
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index e2b671848d0..67207874431 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -111,6 +111,7 @@ enum_device_type = (
('CPU', "CPU", "CPU", 0),
('CUDA', "CUDA", "CUDA", 1),
('OPTIX', "OptiX", "OptiX", 3),
+ ('HIP', "HIP", "HIP", 4),
)
enum_texture_limit = (
@@ -1266,12 +1267,16 @@ class CyclesPreferences(bpy.types.AddonPreferences):
def get_device_types(self, context):
import _cycles
- has_cuda, has_optix = _cycles.get_device_types()
+ has_cuda, has_optix, has_hip = _cycles.get_device_types()
+
list = [('NONE', "None", "Don't use compute device", 0)]
if has_cuda:
list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1))
if has_optix:
list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3))
+ if has_hip:
+ list.append(('HIP', "HIP", "Use HIP for GPU acceleration", 4))
+
return list
compute_device_type: EnumProperty(
@@ -1296,7 +1301,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
def update_device_entries(self, device_list):
for device in device_list:
- if not device[1] in {'CUDA', 'OPTIX', 'CPU'}:
+ if device[1] not in {'CUDA', 'OPTIX', 'CPU', 'HIP'}:
continue
# Try to find existing Device entry
entry = self.find_existing_device_entry(device)
@@ -1330,7 +1335,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
elif entry.type == 'CPU':
cpu_devices.append(entry)
# Extend all GPU devices with CPU.
- if compute_device_type != 'CPU':
+ if compute_device_type != 'CPU' and compute_device_type != 'HIP':
devices.extend(cpu_devices)
return devices
@@ -1340,7 +1345,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
import _cycles
# Ensure `self.devices` is not re-allocated when the second call to
# get_devices_for_type is made, freeing items from the first list.
- for device_type in ('CUDA', 'OPTIX', 'OPENCL'):
+ for device_type in ('CUDA', 'OPTIX', 'HIP'):
self.update_device_entries(_cycles.available_devices(device_type))
# Deprecated: use refresh_devices instead.
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index ac96ddf5e10..c4a1844480c 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -99,6 +99,11 @@ def use_cuda(context):
return (get_device_type(context) == 'CUDA' and cscene.device == 'GPU')
+def use_hip(context):
+ cscene = context.scene.cycles
+
+ return (get_device_type(context) == 'HIP' and cscene.device == 'GPU')
+
def use_optix(context):
cscene = context.scene.cycles
diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp
index ce1770f18a3..7bed33855c2 100644
--- a/intern/cycles/blender/blender_device.cpp
+++ b/intern/cycles/blender/blender_device.cpp
@@ -26,6 +26,7 @@ enum ComputeDevice {
COMPUTE_DEVICE_CPU = 0,
COMPUTE_DEVICE_CUDA = 1,
COMPUTE_DEVICE_OPTIX = 3,
+ COMPUTE_DEVICE_HIP = 4,
COMPUTE_DEVICE_NUM
};
@@ -81,6 +82,9 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
else if (compute_device == COMPUTE_DEVICE_OPTIX) {
mask |= DEVICE_MASK_OPTIX;
}
+ else if (compute_device == COMPUTE_DEVICE_HIP) {
+ mask |= DEVICE_MASK_HIP;
+ }
vector<DeviceInfo> devices = Device::available_devices(mask);
/* Match device preferences and available devices. */
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 694d8454422..d681517c9e1 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -911,14 +911,16 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*
static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
{
vector<DeviceType> device_types = Device::available_types();
- bool has_cuda = false, has_optix = false;
+ bool has_cuda = false, has_optix = false, has_hip = false;
foreach (DeviceType device_type, device_types) {
has_cuda |= (device_type == DEVICE_CUDA);
has_optix |= (device_type == DEVICE_OPTIX);
+ has_hip |= (device_type == DEVICE_HIP);
}
- PyObject *list = PyTuple_New(2);
+ PyObject *list = PyTuple_New(3);
PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda));
PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix));
+ PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_hip));
return list;
}
@@ -944,6 +946,9 @@ static PyObject *set_device_override_func(PyObject * /*self*/, PyObject *arg)
else if (override == "OPTIX") {
BlenderSession::device_override = DEVICE_MASK_OPTIX;
}
+ else if (override == "HIP") {
+ BlenderSession::device_override = DEVICE_MASK_HIP;
+ }
else {
printf("\nError: %s is not a valid Cycles device.\n", override.c_str());
Py_RETURN_FALSE;
diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake
index da259171844..5653f51ec05 100644
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -531,5 +531,9 @@ if(WITH_CYCLES_CUDA_BINARIES OR NOT WITH_CUDA_DYNLOAD)
endif()
endif()
endif()
+if(NOT WITH_HIP_DYNLOAD)
+ message(STATUS "Setting up HIP Dynamic Load")
+ set(WITH_HIP_DYNLOAD ON)
+endif()
unset(_cycles_lib_dir)
diff --git a/intern/cycles/cmake/macros.cmake b/intern/cycles/cmake/macros.cmake
index 47196dfd1ce..172ae280cea 100644
--- a/intern/cycles/cmake/macros.cmake
+++ b/intern/cycles/cmake/macros.cmake
@@ -162,6 +162,10 @@ macro(cycles_target_link_libraries target)
target_link_libraries(${target} ${CUDA_CUDA_LIBRARY})
endif()
+ if(WITH_HIP_DYNLOAD)
+ target_link_libraries(${target} extern_hipew)
+ endif()
+
if(CYCLES_STANDALONE_REPOSITORY)
target_link_libraries(${target} extern_numaapi)
else()
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index d18f4360aef..9af68550d17 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -34,6 +34,13 @@ else()
add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}")
endif()
+if(WITH_HIP_DYNLOAD)
+ list(APPEND INC
+ ../../../extern/hipew/include
+ )
+ add_definitions(-DWITH_HIP_DYNLOAD)
+endif()
+
set(SRC
device.cpp
device_denoise.cpp
@@ -70,6 +77,21 @@ set(SRC_CUDA
cuda/util.h
)
+set(SRC_HIP
+ hip/device.cpp
+ hip/device.h
+ hip/device_impl.cpp
+ hip/device_impl.h
+ hip/graphics_interop.cpp
+ hip/graphics_interop.h
+ hip/kernel.cpp
+ hip/kernel.h
+ hip/queue.cpp
+ hip/queue.h
+ hip/util.cpp
+ hip/util.h
+)
+
set(SRC_DUMMY
dummy/device.cpp
dummy/device.h
@@ -115,11 +137,20 @@ else()
)
endif()
+if(WITH_HIP_DYNLOAD)
+ list(APPEND LIB
+ extern_hipew
+ )
+endif()
+
add_definitions(${GL_DEFINITIONS})
if(WITH_CYCLES_DEVICE_CUDA)
add_definitions(-DWITH_CUDA)
endif()
+if(WITH_CYCLES_DEVICE_HIP)
+ add_definitions(-DWITH_HIP)
+endif()
if(WITH_CYCLES_DEVICE_OPTIX)
add_definitions(-DWITH_OPTIX)
endif()
@@ -140,6 +171,7 @@ cycles_add_library(cycles_device "${LIB}"
${SRC}
${SRC_CPU}
${SRC_CUDA}
+ ${SRC_HIP}
${SRC_DUMMY}
${SRC_MULTI}
${SRC_OPTIX}
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 70935598437..81574e8b184 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -25,6 +25,7 @@
#include "device/cpu/device.h"
#include "device/cuda/device.h"
#include "device/dummy/device.h"
+#include "device/hip/device.h"
#include "device/multi/device.h"
#include "device/optix/device.h"
@@ -46,6 +47,7 @@ thread_mutex Device::device_mutex;
vector<DeviceInfo> Device::cuda_devices;
vector<DeviceInfo> Device::optix_devices;
vector<DeviceInfo> Device::cpu_devices;
+vector<DeviceInfo> Device::hip_devices;
uint Device::devices_initialized_mask = 0;
/* Device */
@@ -96,6 +98,14 @@ Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
device = device_optix_create(info, stats, profiler);
break;
#endif
+
+#ifdef WITH_HIP
+ case DEVICE_HIP:
+ if (device_hip_init())
+ device = device_hip_create(info, stats, profiler);
+ break;
+#endif
+
default:
break;
}
@@ -117,6 +127,8 @@ DeviceType Device::type_from_string(const char *name)
return DEVICE_OPTIX;
else if (strcmp(name, "MULTI") == 0)
return DEVICE_MULTI;
+ else if (strcmp(name, "HIP") == 0)
+ return DEVICE_HIP;
return DEVICE_NONE;
}
@@ -131,6 +143,8 @@ string Device::string_from_type(DeviceType type)
return "OPTIX";
else if (type == DEVICE_MULTI)
return "MULTI";
+ else if (type == DEVICE_HIP)
+ return "HIP";
return "";
}
@@ -145,6 +159,10 @@ vector<DeviceType> Device::available_types()
#ifdef WITH_OPTIX
types.push_back(DEVICE_OPTIX);
#endif
+#ifdef WITH_HIP
+ types.push_back(DEVICE_HIP);
+#endif
+
return types;
}
@@ -186,6 +204,20 @@ vector<DeviceInfo> Device::available_devices(uint mask)
}
#endif
+#ifdef WITH_HIP
+ if (mask & DEVICE_MASK_HIP) {
+ if (!(devices_initialized_mask & DEVICE_MASK_HIP)) {
+ if (device_hip_init()) {
+ device_hip_info(hip_devices);
+ }
+ devices_initialized_mask |= DEVICE_MASK_HIP;
+ }
+ foreach (DeviceInfo &info, hip_devices) {
+ devices.push_back(info);
+ }
+ }
+#endif
+
if (mask & DEVICE_MASK_CPU) {
if (!(devices_initialized_mask & DEVICE_MASK_CPU)) {
device_cpu_info(cpu_devices);
@@ -226,6 +258,15 @@ string Device::device_capabilities(uint mask)
}
#endif
+#ifdef WITH_HIP
+ if (mask & DEVICE_MASK_HIP) {
+ if (device_hip_init()) {
+ capabilities += "\nHIP device capabilities:\n";
+ capabilities += device_hip_capabilities();
+ }
+ }
+#endif
+
return capabilities;
}
@@ -314,6 +355,7 @@ void Device::free_memory()
devices_initialized_mask = 0;
cuda_devices.free_memory();
optix_devices.free_memory();
+ hip_devices.free_memory();
cpu_devices.free_memory();
}
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 3bbad179f52..c73d74cdccc 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -51,6 +51,7 @@ enum DeviceType {
DEVICE_CUDA,
DEVICE_MULTI,
DEVICE_OPTIX,
+ DEVICE_HIP,
DEVICE_DUMMY,
};
@@ -58,6 +59,7 @@ enum DeviceTypeMask {
DEVICE_MASK_CPU = (1 << DEVICE_CPU),
DEVICE_MASK_CUDA = (1 << DEVICE_CUDA),
DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX),
+ DEVICE_MASK_HIP = (1 << DEVICE_HIP),
DEVICE_MASK_ALL = ~0
};
@@ -284,6 +286,7 @@ class Device {
static vector<DeviceInfo> cuda_devices;
static vector<DeviceInfo> optix_devices;
static vector<DeviceInfo> cpu_devices;
+ static vector<DeviceInfo> hip_devices;
static uint devices_initialized_mask;
};
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index a854cc9b693..be6123e09b2 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -277,6 +277,7 @@ class device_memory {
protected:
friend class CUDADevice;
friend class OptiXDevice;
+ friend class HIPDevice;
/* Only create through subclasses. */
device_memory(Device *device, const char *name, MemoryType type);
diff --git a/intern/cycles/device/hip/device.cpp b/intern/cycles/device/hip/device.cpp
new file mode 100644
index 00000000000..90028ac7f10
--- /dev/null
+++ b/intern/cycles/device/hip/device.cpp
@@ -0,0 +1,276 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/hip/device.h"
+
+#include "util/util_logging.h"
+
+#ifdef WITH_HIP
+# include "device/device.h"
+# include "device/hip/device_impl.h"
+
+# include "util/util_string.h"
+# include "util/util_windows.h"
+#endif /* WITH_HIP */
+
+CCL_NAMESPACE_BEGIN
+
+bool device_hip_init()
+{
+#if !defined(WITH_HIP)
+ return false;
+#elif defined(WITH_HIP_DYNLOAD)
+ static bool initialized = false;
+ static bool result = false;
+
+ if (initialized)
+ return result;
+
+ initialized = true;
+ int hipew_result = hipewInit(HIPEW_INIT_HIP);
+ if (hipew_result == HIPEW_SUCCESS) {
+ VLOG(1) << "HIPEW initialization succeeded";
+ if (HIPDevice::have_precompiled_kernels()) {
+ VLOG(1) << "Found precompiled kernels";
+ result = true;
+ }
+ else if (hipewCompilerPath() != NULL) {
+ VLOG(1) << "Found HIPCC " << hipewCompilerPath();
+ result = true;
+ }
+ else {
+ VLOG(1) << "Neither precompiled kernels nor HIPCC was found,"
+ << " unable to use HIP";
+ }
+ }
+ else {
+ VLOG(1) << "HIPEW initialization failed: "
+ << ((hipew_result == HIPEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" :
+ "Error opening the library");
+ }
+
+ return result;
+#else /* WITH_HIP_DYNLOAD */
+ return true;
+#endif /* WITH_HIP_DYNLOAD */
+}
+
+Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+#ifdef WITH_HIP
+ return new HIPDevice(info, stats, profiler);
+#else
+ (void)info;
+ (void)stats;
+ (void)profiler;
+
+ LOG(FATAL) << "Request to create HIP device without compiled-in support. Should never happen.";
+
+ return nullptr;
+#endif
+}
+
+#ifdef WITH_HIP
+static hipError_t device_hip_safe_init()
+{
+# ifdef _WIN32
+ __try {
+ return hipInit(0);
+ }
+ __except (EXCEPTION_EXECUTE_HANDLER) {
+ /* Ignore crashes inside the HIP driver and hope we can
+ * survive even with corrupted HIP installs. */
+ fprintf(stderr, "Cycles HIP: driver crashed, continuing without HIP.\n");
+ }
+
+ return hipErrorNoDevice;
+# else
+ return hipInit(0);
+# endif
+}
+#endif /* WITH_HIP */
+
+void device_hip_info(vector<DeviceInfo> &devices)
+{
+#ifdef WITH_HIP
+ hipError_t result = device_hip_safe_init();
+ if (result != hipSuccess) {
+ if (result != hipErrorNoDevice)
+ fprintf(stderr, "HIP hipInit: %s\n", hipewErrorString(result));
+ return;
+ }
+
+ int count = 0;
+ result = hipGetDeviceCount(&count);
+ if (result != hipSuccess) {
+ fprintf(stderr, "HIP hipGetDeviceCount: %s\n", hipewErrorString(result));
+ return;
+ }
+
+ vector<DeviceInfo> display_devices;
+
+ for (int num = 0; num < count; num++) {
+ char name[256];
+
+ result = hipDeviceGetName(name, 256, num);
+ if (result != hipSuccess) {
+      fprintf(stderr, "HIP hipDeviceGetName: %s\n", hipewErrorString(result));
+ continue;
+ }
+
+ int major;
+ hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, num);
+    /* TODO(Arya): what is the lowest major version we support? */
+
+ DeviceInfo info;
+
+ info.type = DEVICE_HIP;
+ info.description = string(name);
+ info.num = num;
+
+ info.has_half_images = (major >= 3);
+ info.has_nanovdb = true;
+ info.denoisers = 0;
+
+ info.has_gpu_queue = true;
+ /* Check if the device has P2P access to any other device in the system. */
+ for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) {
+ if (num != peer_num) {
+ int can_access = 0;
+ hipDeviceCanAccessPeer(&can_access, num, peer_num);
+ info.has_peer_memory = (can_access != 0);
+ }
+ }
+
+ int pci_location[3] = {0, 0, 0};
+ hipDeviceGetAttribute(&pci_location[0], hipDeviceAttributePciDomainID, num);
+ hipDeviceGetAttribute(&pci_location[1], hipDeviceAttributePciBusId, num);
+ hipDeviceGetAttribute(&pci_location[2], hipDeviceAttributePciDeviceId, num);
+ info.id = string_printf("HIP_%s_%04x:%02x:%02x",
+ name,
+ (unsigned int)pci_location[0],
+ (unsigned int)pci_location[1],
+ (unsigned int)pci_location[2]);
+
+    /* If the device has a kernel timeout and no compute preemption, we assume
+     * it is connected to a display and will freeze the display while doing
+     * computations. */
+ int timeout_attr = 0, preempt_attr = 0;
+ hipDeviceGetAttribute(&timeout_attr, hipDeviceAttributeKernelExecTimeout, num);
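+    /* No compute preemption attribute is queried here, so preempt_attr stays
+     * 0 and the check below effectively tests the timeout attribute alone. */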
+
+ if (timeout_attr && !preempt_attr) {
+ VLOG(1) << "Device is recognized as display.";
+ info.description += " (Display)";
+ info.display_device = true;
+ display_devices.push_back(info);
+ }
+ else {
+ VLOG(1) << "Device has compute preemption or is not used for display.";
+ devices.push_back(info);
+ }
+ VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\".";
+ }
+
+ if (!display_devices.empty())
+ devices.insert(devices.end(), display_devices.begin(), display_devices.end());
+#else /* WITH_HIP */
+ (void)devices;
+#endif /* WITH_HIP */
+}
+
+string device_hip_capabilities()
+{
+#ifdef WITH_HIP
+ hipError_t result = device_hip_safe_init();
+ if (result != hipSuccess) {
+ if (result != hipErrorNoDevice) {
+ return string("Error initializing HIP: ") + hipewErrorString(result);
+ }
+ return "No HIP device found\n";
+ }
+
+ int count;
+ result = hipGetDeviceCount(&count);
+ if (result != hipSuccess) {
+ return string("Error getting devices: ") + hipewErrorString(result);
+ }
+
+ string capabilities = "";
+ for (int num = 0; num < count; num++) {
+ char name[256];
+ if (hipDeviceGetName(name, 256, num) != hipSuccess) {
+ continue;
+ }
+ capabilities += string("\t") + name + "\n";
+ int value;
+# define GET_ATTR(attr) \
+ { \
+ if (hipDeviceGetAttribute(&value, hipDeviceAttribute##attr, num) == hipSuccess) { \
+ capabilities += string_printf("\t\thipDeviceAttribute" #attr "\t\t\t%d\n", value); \
+ } \
+ } \
+ (void)0
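+    /* For example, GET_ATTR(WarpSize) queries hipDeviceAttributeWarpSize and,
+     * on success, appends a "hipDeviceAttributeWarpSize <value>" line. */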
+    /* TODO(sergey): Strip all attributes which are not useful for us
+     * or do not depend on the driver.
+     */
+ GET_ATTR(MaxThreadsPerBlock);
+ GET_ATTR(MaxBlockDimX);
+ GET_ATTR(MaxBlockDimY);
+ GET_ATTR(MaxBlockDimZ);
+ GET_ATTR(MaxGridDimX);
+ GET_ATTR(MaxGridDimY);
+ GET_ATTR(MaxGridDimZ);
+ GET_ATTR(MaxSharedMemoryPerBlock);
+ GET_ATTR(TotalConstantMemory);
+ GET_ATTR(WarpSize);
+ GET_ATTR(MaxPitch);
+ GET_ATTR(MaxRegistersPerBlock);
+ GET_ATTR(ClockRate);
+ GET_ATTR(TextureAlignment);
+ GET_ATTR(MultiprocessorCount);
+ GET_ATTR(KernelExecTimeout);
+ GET_ATTR(Integrated);
+ GET_ATTR(CanMapHostMemory);
+ GET_ATTR(ComputeMode);
+ GET_ATTR(MaxTexture1DWidth);
+ GET_ATTR(MaxTexture2DWidth);
+ GET_ATTR(MaxTexture2DHeight);
+ GET_ATTR(MaxTexture3DWidth);
+ GET_ATTR(MaxTexture3DHeight);
+ GET_ATTR(MaxTexture3DDepth);
+ GET_ATTR(ConcurrentKernels);
+ GET_ATTR(EccEnabled);
+ GET_ATTR(MemoryClockRate);
+ GET_ATTR(MemoryBusWidth);
+ GET_ATTR(L2CacheSize);
+ GET_ATTR(MaxThreadsPerMultiProcessor);
+ GET_ATTR(ComputeCapabilityMajor);
+ GET_ATTR(ComputeCapabilityMinor);
+ GET_ATTR(MaxSharedMemoryPerMultiprocessor);
+ GET_ATTR(ManagedMemory);
+ GET_ATTR(IsMultiGpuBoard);
+# undef GET_ATTR
+ capabilities += "\n";
+ }
+
+ return capabilities;
+
+#else /* WITH_HIP */
+ return "";
+#endif /* WITH_HIP */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/hip/device.h b/intern/cycles/device/hip/device.h
new file mode 100644
index 00000000000..76fa8995bed
--- /dev/null
+++ b/intern/cycles/device/hip/device.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+bool device_hip_init();
+
+Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_hip_info(vector<DeviceInfo> &devices);
+
+string device_hip_capabilities();
+
+CCL_NAMESPACE_END \ No newline at end of file
diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp
new file mode 100644
index 00000000000..0e5ac6ce401
--- /dev/null
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -0,0 +1,1343 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_HIP
+
+# include <climits>
+# include <stdio.h>
+# include <stdlib.h>
+# include <string.h>
+
+# include "device/hip/device_impl.h"
+
+# include "render/buffers.h"
+
+# include "util/util_debug.h"
+# include "util/util_foreach.h"
+# include "util/util_logging.h"
+# include "util/util_map.h"
+# include "util/util_md5.h"
+# include "util/util_opengl.h"
+# include "util/util_path.h"
+# include "util/util_string.h"
+# include "util/util_system.h"
+# include "util/util_time.h"
+# include "util/util_types.h"
+# include "util/util_windows.h"
+
+CCL_NAMESPACE_BEGIN
+
+class HIPDevice;
+
+bool HIPDevice::have_precompiled_kernels()
+{
+ string fatbins_path = path_get("lib");
+ return path_exists(fatbins_path);
+}
+
+bool HIPDevice::show_samples() const
+{
+ /* The HIPDevice only processes one tile at a time, so showing samples is fine. */
+ return true;
+}
+
+BVHLayoutMask HIPDevice::get_bvh_layout_mask() const
+{
+ return BVH_LAYOUT_BVH2;
+}
+
+void HIPDevice::set_error(const string &error)
+{
+ Device::set_error(error);
+
+ if (first_error) {
+ fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
+ fprintf(stderr,
+ "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
+ first_error = false;
+ }
+}
+
+HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+ : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL)
+{
+ first_error = true;
+
+ hipDevId = info.num;
+ hipDevice = 0;
+ hipContext = 0;
+
+ hipModule = 0;
+
+ need_texture_info = false;
+
+ device_texture_headroom = 0;
+ device_working_headroom = 0;
+ move_texture_to_host = false;
+ map_host_limit = 0;
+ map_host_used = 0;
+ can_map_host = 0;
+ pitch_alignment = 0;
+
+ /* Initialize HIP. */
+ hipError_t result = hipInit(0);
+ if (result != hipSuccess) {
+ set_error(string_printf("Failed to initialize HIP runtime (%s)", hipewErrorString(result)));
+ return;
+ }
+
+ /* Setup device and context. */
+ result = hipGetDevice(&hipDevice, hipDevId);
+ if (result != hipSuccess) {
+ set_error(string_printf("Failed to get HIP device handle from ordinal (%s)",
+ hipewErrorString(result)));
+ return;
+ }
+
+ hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice));
+
+ hip_assert(
+ hipDeviceGetAttribute(&pitch_alignment, hipDeviceAttributeTexturePitchAlignment, hipDevice));
+
+ unsigned int ctx_flags = hipDeviceLmemResizeToMax;
+ if (can_map_host) {
+ ctx_flags |= hipDeviceMapHost;
+ init_host_memory();
+ }
+
+ /* Create context. */
+ result = hipCtxCreate(&hipContext, ctx_flags, hipDevice);
+
+ if (result != hipSuccess) {
+ set_error(string_printf("Failed to create HIP context (%s)", hipewErrorString(result)));
+ return;
+ }
+
+ int major, minor;
+ hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId);
+ hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId);
+ hipDevArchitecture = major * 100 + minor * 10;
+
+ /* Pop context set by hipCtxCreate. */
+ hipCtxPopCurrent(NULL);
+}
+
+HIPDevice::~HIPDevice()
+{
+ texture_info.free();
+
+ hip_assert(hipCtxDestroy(hipContext));
+}
+
+bool HIPDevice::support_device(const uint /*kernel_features*/)
+{
+ int major, minor;
+ hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId);
+ hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId);
+
+  /* TODO(Arya): which versions do we plan to support? */
+ return true;
+}
+
+bool HIPDevice::check_peer_access(Device *peer_device)
+{
+ if (peer_device == this) {
+ return false;
+ }
+  /* Peer access is only implemented between HIP devices. */
+  if (peer_device->info.type != DEVICE_HIP) {
+    return false;
+  }
+
+ HIPDevice *const peer_device_hip = static_cast<HIPDevice *>(peer_device);
+
+ int can_access = 0;
+ hip_assert(hipDeviceCanAccessPeer(&can_access, hipDevice, peer_device_hip->hipDevice));
+ if (can_access == 0) {
+ return false;
+ }
+
+ // Ensure array access over the link is possible as well (for 3D textures)
+ hip_assert(hipDeviceGetP2PAttribute(
+ &can_access, hipDevP2PAttrHipArrayAccessSupported, hipDevice, peer_device_hip->hipDevice));
+ if (can_access == 0) {
+ return false;
+ }
+
+ // Enable peer access in both directions
+ {
+ const HIPContextScope scope(this);
+ hipError_t result = hipCtxEnablePeerAccess(peer_device_hip->hipContext, 0);
+ if (result != hipSuccess) {
+ set_error(string_printf("Failed to enable peer access on HIP context (%s)",
+ hipewErrorString(result)));
+ return false;
+ }
+ }
+ {
+ const HIPContextScope scope(peer_device_hip);
+ hipError_t result = hipCtxEnablePeerAccess(hipContext, 0);
+ if (result != hipSuccess) {
+ set_error(string_printf("Failed to enable peer access on HIP context (%s)",
+ hipewErrorString(result)));
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool HIPDevice::use_adaptive_compilation()
+{
+ return DebugFlags().hip.adaptive_compile;
+}
+
+/* Common hipcc flags which stay the same regardless of shading model or
+ * kernel sources md5, and depend only on the compiler and compilation
+ * settings. */
+string HIPDevice::compile_kernel_get_common_cflags(const uint kernel_features)
+{
+ const int machine = system_cpu_bits();
+ const string source_path = path_get("source");
+ const string include_path = source_path;
+ string cflags = string_printf(
+ "-m%d "
+ "--ptxas-options=\"-v\" "
+ "--use_fast_math "
+ "-DHIPCC "
+ "-I\"%s\"",
+ machine,
+ include_path.c_str());
+ if (use_adaptive_compilation()) {
+ cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
+ }
+ return cflags;
+}
+
+string HIPDevice::compile_kernel(const uint kernel_features,
+ const char *name,
+ const char *base,
+ bool force_ptx)
+{
+ /* Compute kernel name. */
+ int major, minor;
+ hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId);
+ hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId);
+
+ /* Attempt to use kernel provided with Blender. */
+ if (!use_adaptive_compilation()) {
+ if (!force_ptx) {
+ const string fatbin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
+ VLOG(1) << "Testing for pre-compiled kernel " << fatbin << ".";
+ if (path_exists(fatbin)) {
+ VLOG(1) << "Using precompiled kernel.";
+ return fatbin;
+ }
+ }
+
+ /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
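+    /* For example, starting from a 9.0 device this tries compute_90,
+     * compute_89, ... down to compute_30 before giving up. */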
+ int ptx_major = major, ptx_minor = minor;
+ while (ptx_major >= 3) {
+ const string ptx = path_get(
+ string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
+ VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
+ if (path_exists(ptx)) {
+ VLOG(1) << "Using precompiled kernel.";
+ return ptx;
+ }
+
+ if (ptx_minor > 0) {
+ ptx_minor--;
+ }
+ else {
+ ptx_major--;
+ ptx_minor = 9;
+ }
+ }
+ }
+
+ /* Try to use locally compiled kernel. */
+ string source_path = path_get("source");
+ const string source_md5 = path_files_md5_hash(source_path);
+
+  /* We include cflags in the md5 so that changing the HIP toolkit or other
+   * compiler command line arguments triggers a re-build of the fatbin.
+   */
+ string common_cflags = compile_kernel_get_common_cflags(kernel_features);
+ const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
+
+ const char *const kernel_ext = "genco";
+# ifdef _WIN32
+ const char *const options =
+ "save-temps -Wno-parentheses-equality -Wno-unused-value --hipcc-func-supp";
+# else
+ const char *const options =
+ "save-temps -Wno-parentheses-equality -Wno-unused-value --hipcc-func-supp -O3 -ggdb";
+# endif
+ const string include_path = source_path;
+ const char *const kernel_arch = force_ptx ? "compute" : "sm";
+ const string fatbin_file = string_printf(
+ "cycles_%s_%s_%d%d_%s", name, kernel_arch, major, minor, kernel_md5.c_str());
+ const string fatbin = path_cache_get(path_join("kernels", fatbin_file));
+ VLOG(1) << "Testing for locally compiled kernel " << fatbin << ".";
+ if (path_exists(fatbin)) {
+ VLOG(1) << "Using locally compiled kernel.";
+ return fatbin;
+ }
+
+# ifdef _WIN32
+ if (!use_adaptive_compilation() && have_precompiled_kernels()) {
+ if (major < 3) {
+ set_error(
+ string_printf("HIP backend requires compute capability 3.0 or up, but found %d.%d. "
+ "Your GPU is not supported.",
+ major,
+ minor));
+ }
+ else {
+ set_error(
+ string_printf("HIP binary kernel for this graphics card compute "
+ "capability (%d.%d) not found.",
+ major,
+ minor));
+ }
+ return string();
+ }
+# endif
+
+ /* Compile. */
+ const char *const hipcc = hipewCompilerPath();
+ if (hipcc == NULL) {
+ set_error(
+ "HIP hipcc compiler not found. "
+ "Install HIP toolkit in default location.");
+ return string();
+ }
+
+ const int hipcc_hip_version = hipewCompilerVersion();
+ VLOG(1) << "Found hipcc " << hipcc << ", HIP version " << hipcc_hip_version << ".";
+ if (hipcc_hip_version < 40) {
+ printf(
+ "Unsupported HIP version %d.%d detected, "
+ "you need HIP 4.0 or newer.\n",
+ hipcc_hip_version / 10,
+ hipcc_hip_version % 10);
+ return string();
+ }
+
+ double starttime = time_dt();
+
+ path_create_directories(fatbin);
+
+ source_path = path_join(path_join(source_path, "kernel"),
+ path_join("device", path_join(base, string_printf("%s.cpp", name))));
+
+ string command = string_printf("%s -%s -I %s --%s %s -o \"%s\"",
+ hipcc,
+ options,
+ include_path.c_str(),
+ kernel_ext,
+ source_path.c_str(),
+ fatbin.c_str());
+
+ printf("Compiling HIP kernel ...\n%s\n", command.c_str());
+
+# ifdef _WIN32
+ command = "call " + command;
+# endif
+ if (system(command.c_str()) != 0) {
+ set_error(
+ "Failed to execute compilation command, "
+ "see console for details.");
+ return string();
+ }
+
+ /* Verify if compilation succeeded */
+ if (!path_exists(fatbin)) {
+ set_error(
+ "HIP kernel compilation failed, "
+ "see console for details.");
+ return string();
+ }
+
+ printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
+
+ return fatbin;
+}
+
+bool HIPDevice::load_kernels(const uint kernel_features)
+{
+ /* TODO(sergey): Support kernels re-load for HIP devices.
+ *
+ * Currently re-loading kernel will invalidate memory pointers,
+ * causing problems in hipCtxSynchronize.
+ */
+ if (hipModule) {
+ VLOG(1) << "Skipping kernel reload, not currently supported.";
+ return true;
+ }
+
+ /* check if hip init succeeded */
+ if (hipContext == 0)
+ return false;
+
+ /* check if GPU is supported */
+ if (!support_device(kernel_features))
+ return false;
+
+ /* get kernel */
+ const char *kernel_name = "kernel";
+ string fatbin = compile_kernel(kernel_features, kernel_name);
+ if (fatbin.empty())
+ return false;
+
+ /* open module */
+ HIPContextScope scope(this);
+
+ string fatbin_data;
+ hipError_t result;
+
+ if (path_read_text(fatbin, fatbin_data))
+ result = hipModuleLoadData(&hipModule, fatbin_data.c_str());
+ else
+ result = hipErrorFileNotFound;
+
+ if (result != hipSuccess)
+ set_error(string_printf(
+ "Failed to load HIP kernel from '%s' (%s)", fatbin.c_str(), hipewErrorString(result)));
+
+ if (result == hipSuccess) {
+ kernels.load(this);
+ reserve_local_memory(kernel_features);
+ }
+
+ return (result == hipSuccess);
+}
+
+void HIPDevice::reserve_local_memory(const uint)
+{
+ /* Together with hipDeviceLmemResizeToMax, this reserves local memory
+ * needed for kernel launches, so that we can reliably figure out when
+ * to allocate scene data in mapped host memory. */
+ size_t total = 0, free_before = 0, free_after = 0;
+
+ {
+ HIPContextScope scope(this);
+ hipMemGetInfo(&free_before, &total);
+ }
+
+ {
+ /* Use the biggest kernel for estimation. */
+ const DeviceKernel test_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE;
+
+    /* Launch the kernel; using just 1 block appears sufficient to reserve
+     * memory for all multiprocessors. It would still be good to do this in
+     * parallel for the multi-GPU case to make it faster. */
+ HIPDeviceQueue queue(this);
+
+ void *d_path_index = nullptr;
+ void *d_render_buffer = nullptr;
+ int d_work_size = 0;
+ void *args[] = {&d_path_index, &d_render_buffer, &d_work_size};
+
+ queue.init_execution();
+ queue.enqueue(test_kernel, 1, args);
+ queue.synchronize();
+ }
+
+ {
+ HIPContextScope scope(this);
+ hipMemGetInfo(&free_after, &total);
+ }
+
+ VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
+ << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
+
+# if 0
+ /* For testing mapped host memory, fill up device memory. */
+ const size_t keep_mb = 1024;
+
+ while (free_after > keep_mb * 1024 * 1024LL) {
+ hipDeviceptr_t tmp;
+ hip_assert(hipMalloc(&tmp, 10 * 1024 * 1024LL));
+ hipMemGetInfo(&free_after, &total);
+ }
+# endif
+}
+
+void HIPDevice::init_host_memory()
+{
+ /* Limit amount of host mapped memory, because allocating too much can
+ * cause system instability. Leave at least half or 4 GB of system
+ * memory free, whichever is smaller. */
+ size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+ size_t system_ram = system_physical_ram();
+
+ if (system_ram > 0) {
+ if (system_ram / 2 > default_limit) {
+ map_host_limit = system_ram - default_limit;
+ }
+ else {
+ map_host_limit = system_ram / 2;
+ }
+ }
+ else {
+ VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+ map_host_limit = 0;
+ }
+
+  /* Amount of device memory to keep free after texture memory
+   * and working memory allocations, respectively. The working
+   * memory headroom is set lower so that some space is left after all
+   * texture memory allocations. */
+ device_working_headroom = 32 * 1024 * 1024LL; // 32MB
+ device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
+
+ VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
+ << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
+}
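+
+/* Worked example (illustrative numbers): with 32 GB of system RAM, half (16 GB)
+ * exceeds the 4 GB default limit, so map_host_limit = 32 GB - 4 GB = 28 GB,
+ * leaving 4 GB free. With 6 GB of RAM, half (3 GB) is below the limit, so
+ * map_host_limit = 3 GB, leaving half of the RAM free. */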
+
+void HIPDevice::load_texture_info()
+{
+ if (need_texture_info) {
+ /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
+ * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
+ need_texture_info = false;
+ texture_info.copy_to_device();
+ }
+}
+
+void HIPDevice::move_textures_to_host(size_t size, bool for_texture)
+{
+ /* Break out of recursive call, which can happen when moving memory on a multi device. */
+ static bool any_device_moving_textures_to_host = false;
+ if (any_device_moving_textures_to_host) {
+ return;
+ }
+
+ /* Signal to reallocate textures in host memory only. */
+ move_texture_to_host = true;
+
+ while (size > 0) {
+ /* Find suitable memory allocation to move. */
+ device_memory *max_mem = NULL;
+ size_t max_size = 0;
+ bool max_is_image = false;
+
+ thread_scoped_lock lock(hip_mem_map_mutex);
+ foreach (HIPMemMap::value_type &pair, hip_mem_map) {
+ device_memory &mem = *pair.first;
+ HIPMem *cmem = &pair.second;
+
+      /* Can only move textures allocated on this device (and not those from peer devices),
+       * and need to ignore memory that is already on the host. */
+ if (!mem.is_resident(this) || cmem->use_mapped_host) {
+ continue;
+ }
+
+ bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
+ (&mem != &texture_info);
+ bool is_image = is_texture && (mem.data_height > 1);
+
+ /* Can't move this type of memory. */
+ if (!is_texture || cmem->array) {
+ continue;
+ }
+
+ /* For other textures, only move image textures. */
+ if (for_texture && !is_image) {
+ continue;
+ }
+
+ /* Try to move largest allocation, prefer moving images. */
+ if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
+ max_is_image = is_image;
+ max_size = mem.device_size;
+ max_mem = &mem;
+ }
+ }
+ lock.unlock();
+
+ /* Move to host memory. This part is mutex protected since
+ * multiple HIP devices could be moving the memory. The
+ * first one will do it, and the rest will adopt the pointer. */
+ if (max_mem) {
+ VLOG(1) << "Move memory from device to host: " << max_mem->name;
+
+ static thread_mutex move_mutex;
+ thread_scoped_lock lock(move_mutex);
+
+ any_device_moving_textures_to_host = true;
+
+      /* Potentially need to call back into multi device, so pointer mapping
+       * and peer devices are updated. This is also necessary since the device
+       * pointer may just be a key here, so cannot be accessed and freed directly.
+       * Unfortunately it does mean that memory is reallocated on all other
+       * devices as well, which is potentially dangerous when still in use (since
+       * a thread rendering on another device would only be caught in this mutex
+       * if it happens to do an allocation at the same time as well). */
+ max_mem->device_copy_to();
+ size = (max_size >= size) ? 0 : size - max_size;
+
+ any_device_moving_textures_to_host = false;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* Unset flag before texture info is reloaded, since it should stay in device memory. */
+ move_texture_to_host = false;
+
+ /* Update texture info array with new pointers. */
+ load_texture_info();
+}
+
+HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+{
+ HIPContextScope scope(this);
+
+ hipDeviceptr_t device_pointer = 0;
+ size_t size = mem.memory_size() + pitch_padding;
+
+ hipError_t mem_alloc_result = hipErrorOutOfMemory;
+ const char *status = "";
+
+ /* First try allocating in device memory, respecting headroom. We make
+ * an exception for texture info. It is small and frequently accessed,
+ * so treat it as working memory.
+ *
+ * If there is not enough room for working memory, we will try to move
+ * textures to host memory, assuming the performance impact would have
+ * been worse for working memory. */
+ bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
+ bool is_image = is_texture && (mem.data_height > 1);
+
+ size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
+
+ size_t total = 0, free = 0;
+ hipMemGetInfo(&free, &total);
+
+ /* Move textures to host memory if needed. */
+ if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
+ move_textures_to_host(size + headroom - free, is_texture);
+ hipMemGetInfo(&free, &total);
+ }
+
+ /* Allocate in device memory. */
+ if (!move_texture_to_host && (size + headroom) < free) {
+ mem_alloc_result = hipMalloc(&device_pointer, size);
+ if (mem_alloc_result == hipSuccess) {
+ status = " in device memory";
+ }
+ }
+
+ /* Fall back to mapped host memory if needed and possible. */
+
+ void *shared_pointer = 0;
+
+ if (mem_alloc_result != hipSuccess && can_map_host) {
+ if (mem.shared_pointer) {
+ /* Another device already allocated host memory. */
+ mem_alloc_result = hipSuccess;
+ shared_pointer = mem.shared_pointer;
+ }
+ else if (map_host_used + size < map_host_limit) {
+ /* Allocate host memory ourselves. */
+ mem_alloc_result = hipHostMalloc(&shared_pointer, size);
+
+ assert((mem_alloc_result == hipSuccess && shared_pointer != 0) ||
+ (mem_alloc_result != hipSuccess && shared_pointer == 0));
+ }
+
+ if (mem_alloc_result == hipSuccess) {
+ hip_assert(hipHostGetDevicePointer(&device_pointer, shared_pointer, 0));
+ map_host_used += size;
+ status = " in host memory";
+ }
+ }
+
+ if (mem_alloc_result != hipSuccess) {
+ status = " failed, out of device and host memory";
+ set_error("System is out of GPU and shared host memory");
+ }
+
+ if (mem.name) {
+ VLOG(1) << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")" << status;
+ }
+
+ mem.device_pointer = (device_ptr)device_pointer;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ if (!mem.device_pointer) {
+ return NULL;
+ }
+
+ /* Insert into map of allocations. */
+ thread_scoped_lock lock(hip_mem_map_mutex);
+ HIPMem *cmem = &hip_mem_map[&mem];
+ if (shared_pointer != 0) {
+ /* Replace host pointer with our host allocation. Only works if
+ * HIP memory layout is the same and has no pitch padding. Also
+ * does not work if we move textures to host during a render,
+ * since other devices might be using the memory. */
+
+ if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+ mem.host_pointer != shared_pointer) {
+ memcpy(shared_pointer, mem.host_pointer, size);
+
+      /* A call to device_memory::host_free() should be preceded by
+       * a call to device_memory::device_free() for host memory
+       * allocated by a device to be handled properly. Two exceptions
+       * are here and a call in OptiXDevice::generic_alloc(), where
+       * the current host memory can be assumed to be allocated by
+       * device_memory::host_alloc(), not by a device. */
+
+ mem.host_free();
+ mem.host_pointer = shared_pointer;
+ }
+ mem.shared_pointer = shared_pointer;
+ mem.shared_counter++;
+ cmem->use_mapped_host = true;
+ }
+ else {
+ cmem->use_mapped_host = false;
+ }
+
+ return cmem;
+}
+
+void HIPDevice::generic_copy_to(device_memory &mem)
+{
+ if (!mem.host_pointer || !mem.device_pointer) {
+ return;
+ }
+
+ /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
+ * hipMalloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
+ * mem.host_pointer. */
+ thread_scoped_lock lock(hip_mem_map_mutex);
+ if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+ const HIPContextScope scope(this);
+ hip_assert(
+ hipMemcpyHtoD((hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size()));
+ }
+}
+
+void HIPDevice::generic_free(device_memory &mem)
+{
+ if (mem.device_pointer) {
+ HIPContextScope scope(this);
+ thread_scoped_lock lock(hip_mem_map_mutex);
+ const HIPMem &cmem = hip_mem_map[&mem];
+
+    /* If cmem.use_mapped_host is true, reference counting is used
+     * to safely free mapped host memory. */
+
+ if (cmem.use_mapped_host) {
+ assert(mem.shared_pointer);
+ if (mem.shared_pointer) {
+ assert(mem.shared_counter > 0);
+ if (--mem.shared_counter == 0) {
+ if (mem.host_pointer == mem.shared_pointer) {
+ mem.host_pointer = 0;
+ }
+ hipHostFree(mem.shared_pointer);
+ mem.shared_pointer = 0;
+ }
+ }
+ map_host_used -= mem.device_size;
+ }
+ else {
+ /* Free device memory. */
+ hip_assert(hipFree(mem.device_pointer));
+ }
+
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ hip_mem_map.erase(hip_mem_map.find(&mem));
+ }
+}
+
+void HIPDevice::mem_alloc(device_memory &mem)
+{
+ if (mem.type == MEM_TEXTURE) {
+ assert(!"mem_alloc not supported for textures.");
+ }
+ else if (mem.type == MEM_GLOBAL) {
+ assert(!"mem_alloc not supported for global memory.");
+ }
+ else {
+ generic_alloc(mem);
+ }
+}
+
+void HIPDevice::mem_copy_to(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ global_alloc(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ tex_alloc((device_texture &)mem);
+ }
+ else {
+ if (!mem.device_pointer) {
+ generic_alloc(mem);
+ }
+ generic_copy_to(mem);
+ }
+}
+
+void HIPDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem)
+{
+ if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
+ assert(!"mem_copy_from not supported for textures.");
+ }
+ else if (mem.host_pointer) {
+ const size_t size = elem * w * h;
+ const size_t offset = elem * y * w;
+
+ if (mem.device_pointer) {
+ const HIPContextScope scope(this);
+ hip_assert(hipMemcpyDtoH(
+ (char *)mem.host_pointer + offset, (hipDeviceptr_t)mem.device_pointer + offset, size));
+ }
+ else {
+ memset((char *)mem.host_pointer + offset, 0, size);
+ }
+ }
+}
+
+void HIPDevice::mem_zero(device_memory &mem)
+{
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+ if (!mem.device_pointer) {
+ return;
+ }
+
+ /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
+ * regardless of mem.host_pointer and mem.shared_pointer. */
+ thread_scoped_lock lock(hip_mem_map_mutex);
+ if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+ const HIPContextScope scope(this);
+ hip_assert(hipMemsetD8((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size()));
+ }
+ else if (mem.host_pointer) {
+ memset(mem.host_pointer, 0, mem.memory_size());
+ }
+}
+
+void HIPDevice::mem_free(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ }
+ else {
+ generic_free(mem);
+ }
+}
+
+device_ptr HIPDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/)
+{
+ return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
+}
+
+void HIPDevice::const_copy_to(const char *name, void *host, size_t size)
+{
+ HIPContextScope scope(this);
+ hipDeviceptr_t mem;
+ size_t bytes;
+
+ hip_assert(hipModuleGetGlobal(&mem, &bytes, hipModule, name));
+ assert(bytes == size);
+ hip_assert(hipMemcpyHtoD(mem, host, size));
+}
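+
+/* Usage sketch (illustrative, not part of the patch): assumes the loaded module
+ * declares a matching __constant__ symbol; "__data" is the name Cycles uses for
+ * KernelData, shown here only as an example:
+ *
+ *   KernelData kdata = ...;
+ *   const_copy_to("__data", &kdata, sizeof(kdata));
+ */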
+
+void HIPDevice::global_alloc(device_memory &mem)
+{
+ if (mem.is_resident(this)) {
+ generic_alloc(mem);
+ generic_copy_to(mem);
+ }
+
+ const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
+}
+
+void HIPDevice::global_free(device_memory &mem)
+{
+ if (mem.is_resident(this) && mem.device_pointer) {
+ generic_free(mem);
+ }
+}
+
+void HIPDevice::tex_alloc(device_texture &mem)
+{
+ HIPContextScope scope(this);
+
+  /* General variables. */
+ string bind_name = mem.name;
+ size_t dsize = datatype_size(mem.data_type);
+ size_t size = mem.memory_size();
+
+ hipTextureAddressMode address_mode = hipAddressModeWrap;
+ switch (mem.info.extension) {
+ case EXTENSION_REPEAT:
+ address_mode = hipAddressModeWrap;
+ break;
+ case EXTENSION_EXTEND:
+ address_mode = hipAddressModeClamp;
+ break;
+ case EXTENSION_CLIP:
+      /* TODO(Arya): Using clamp address mode instead of border mode,
+       * since border is unsupported in HIP. */
+ address_mode = hipAddressModeClamp;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ hipTextureFilterMode filter_mode;
+ if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
+ filter_mode = hipFilterModePoint;
+ }
+ else {
+ filter_mode = hipFilterModeLinear;
+ }
+
+ /* Image Texture Storage */
+ hipArray_Format format;
+ switch (mem.data_type) {
+ case TYPE_UCHAR:
+ format = HIP_AD_FORMAT_UNSIGNED_INT8;
+ break;
+ case TYPE_UINT16:
+ format = HIP_AD_FORMAT_UNSIGNED_INT16;
+ break;
+ case TYPE_UINT:
+ format = HIP_AD_FORMAT_UNSIGNED_INT32;
+ break;
+ case TYPE_INT:
+ format = HIP_AD_FORMAT_SIGNED_INT32;
+ break;
+ case TYPE_FLOAT:
+ format = HIP_AD_FORMAT_FLOAT;
+ break;
+ case TYPE_HALF:
+ format = HIP_AD_FORMAT_HALF;
+ break;
+ default:
+ assert(0);
+ return;
+ }
+
+ HIPMem *cmem = NULL;
+ hArray array_3d = NULL;
+ size_t src_pitch = mem.data_width * dsize * mem.data_elements;
+ size_t dst_pitch = src_pitch;
+
+ if (!mem.is_resident(this)) {
+ thread_scoped_lock lock(hip_mem_map_mutex);
+ cmem = &hip_mem_map[&mem];
+ cmem->texobject = 0;
+
+ if (mem.data_depth > 1) {
+ array_3d = (hArray)mem.device_pointer;
+ cmem->array = array_3d;
+ }
+ else if (mem.data_height > 0) {
+ dst_pitch = align_up(src_pitch, pitch_alignment);
+ }
+ }
+ else if (mem.data_depth > 1) {
+    /* 3D texture, using an array since there is no API for 3D linear memory. */
+ HIP_ARRAY3D_DESCRIPTOR desc;
+
+ desc.Width = mem.data_width;
+ desc.Height = mem.data_height;
+ desc.Depth = mem.data_depth;
+ desc.Format = format;
+ desc.NumChannels = mem.data_elements;
+ desc.Flags = 0;
+
+ VLOG(1) << "Array 3D allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ hip_assert(hipArray3DCreate(&array_3d, &desc));
+
+ if (!array_3d) {
+ return;
+ }
+
+ HIP_MEMCPY3D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = hipMemoryTypeArray;
+ param.dstArray = &array_3d;
+ param.srcMemoryType = hipMemoryTypeHost;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+ param.Depth = mem.data_depth;
+
+ hip_assert(hipDrvMemcpy3D(&param));
+
+ mem.device_pointer = (device_ptr)array_3d;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ thread_scoped_lock lock(hip_mem_map_mutex);
+ cmem = &hip_mem_map[&mem];
+ cmem->texobject = 0;
+ cmem->array = array_3d;
+ }
+ else if (mem.data_height > 0) {
+ /* 2D texture, using pitch aligned linear memory. */
+ dst_pitch = align_up(src_pitch, pitch_alignment);
+ size_t dst_size = dst_pitch * mem.data_height;
+
+ cmem = generic_alloc(mem, dst_size - mem.memory_size());
+ if (!cmem) {
+ return;
+ }
+
+ hip_Memcpy2D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = hipMemoryTypeDevice;
+ param.dstDevice = mem.device_pointer;
+ param.dstPitch = dst_pitch;
+ param.srcMemoryType = hipMemoryTypeHost;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+
+ hip_assert(hipDrvMemcpy2DUnaligned(&param));
+ }
+ else {
+ /* 1D texture, using linear memory. */
+ cmem = generic_alloc(mem);
+ if (!cmem) {
+ return;
+ }
+
+ hip_assert(hipMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
+ }
+
+ /* Resize once */
+ const uint slot = mem.slot;
+ if (slot >= texture_info.size()) {
+    /* Allocate some slots in advance, to reduce the number
+     * of re-allocations. */
+ texture_info.resize(slot + 128);
+ }
+
+  /* Set mapping and tag that we need to (re-)upload to the device. */
+ texture_info[slot] = mem.info;
+ need_texture_info = true;
+
+ if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
+ mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
+    /* Bindless textures. */
+ hipResourceDesc resDesc;
+ memset(&resDesc, 0, sizeof(resDesc));
+
+ if (array_3d) {
+ resDesc.resType = hipResourceTypeArray;
+ resDesc.res.array.h_Array = &array_3d;
+ resDesc.flags = 0;
+ }
+ else if (mem.data_height > 0) {
+ resDesc.resType = hipResourceTypePitch2D;
+ resDesc.res.pitch2D.devPtr = mem.device_pointer;
+ resDesc.res.pitch2D.format = format;
+ resDesc.res.pitch2D.numChannels = mem.data_elements;
+ resDesc.res.pitch2D.height = mem.data_height;
+ resDesc.res.pitch2D.width = mem.data_width;
+ resDesc.res.pitch2D.pitchInBytes = dst_pitch;
+ }
+ else {
+ resDesc.resType = hipResourceTypeLinear;
+ resDesc.res.linear.devPtr = mem.device_pointer;
+ resDesc.res.linear.format = format;
+ resDesc.res.linear.numChannels = mem.data_elements;
+ resDesc.res.linear.sizeInBytes = mem.device_size;
+ }
+
+ hipTextureDesc texDesc;
+ memset(&texDesc, 0, sizeof(texDesc));
+ texDesc.addressMode[0] = address_mode;
+ texDesc.addressMode[1] = address_mode;
+ texDesc.addressMode[2] = address_mode;
+ texDesc.filterMode = filter_mode;
+ texDesc.flags = HIP_TRSF_NORMALIZED_COORDINATES;
+
+ thread_scoped_lock lock(hip_mem_map_mutex);
+ cmem = &hip_mem_map[&mem];
+
+ hip_assert(hipTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
+
+ texture_info[slot].data = (uint64_t)cmem->texobject;
+ }
+ else {
+ texture_info[slot].data = (uint64_t)mem.device_pointer;
+ }
+}
+
+void HIPDevice::tex_free(device_texture &mem)
+{
+ if (mem.device_pointer) {
+ HIPContextScope scope(this);
+ thread_scoped_lock lock(hip_mem_map_mutex);
+ const HIPMem &cmem = hip_mem_map[&mem];
+
+ if (cmem.texobject) {
+ /* Free bindless texture. */
+ hipTexObjectDestroy(cmem.texobject);
+ }
+
+ if (!mem.is_resident(this)) {
+ /* Do not free memory here, since it was allocated on a different device. */
+ hip_mem_map.erase(hip_mem_map.find(&mem));
+ }
+ else if (cmem.array) {
+ /* Free array. */
+ hipArrayDestroy(cmem.array);
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ hip_mem_map.erase(hip_mem_map.find(&mem));
+ }
+ else {
+ lock.unlock();
+ generic_free(mem);
+ }
+ }
+}
+
+# if 0
+void HIPDevice::render(DeviceTask &task,
+ RenderTile &rtile,
+ device_vector<KernelWorkTile> &work_tiles)
+{
+ scoped_timer timer(&rtile.buffers->render_time);
+
+ if (have_error())
+ return;
+
+ HIPContextScope scope(this);
+ hipFunction_t hipRender;
+
+ /* Get kernel function. */
+ if (rtile.task == RenderTile::BAKE) {
+ hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_bake"));
+ }
+ else {
+ hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_path_trace"));
+ }
+
+ if (have_error()) {
+ return;
+ }
+
+ hip_assert(hipFuncSetCacheConfig(hipRender, hipFuncCachePreferL1));
+
+ /* Allocate work tile. */
+ work_tiles.alloc(1);
+
+ KernelWorkTile *wtile = work_tiles.data();
+ wtile->x = rtile.x;
+ wtile->y = rtile.y;
+ wtile->w = rtile.w;
+ wtile->h = rtile.h;
+ wtile->offset = rtile.offset;
+ wtile->stride = rtile.stride;
+ wtile->buffer = (float *)(hipDeviceptr_t)rtile.buffer;
+
+ /* Prepare work size. More step samples render faster, but for now we
+ * remain conservative for GPUs connected to a display to avoid driver
+ * timeouts and display freezing. */
+ int min_blocks, num_threads_per_block;
+ hip_assert(
+ hipModuleOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, hipRender, NULL, 0, 0));
+ if (!info.display_device) {
+ min_blocks *= 8;
+ }
+
+ uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
+
+ /* Render all samples. */
+ uint start_sample = rtile.start_sample;
+ uint end_sample = rtile.start_sample + rtile.num_samples;
+
+ for (int sample = start_sample; sample < end_sample;) {
+ /* Setup and copy work tile to device. */
+ wtile->start_sample = sample;
+ wtile->num_samples = step_samples;
+ if (task.adaptive_sampling.use) {
+ wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
+ }
+ wtile->num_samples = min(wtile->num_samples, end_sample - sample);
+ work_tiles.copy_to_device();
+
+ hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer;
+ uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+ uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+
+ /* Launch kernel. */
+ void *args[] = {&d_work_tiles, &total_work_size};
+
+ hip_assert(
+ hipModuleLaunchKernel(hipRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+
+ /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
+ uint filter_sample = sample + wtile->num_samples - 1;
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
+ adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
+ }
+
+ hip_assert(hipDeviceSynchronize());
+
+ /* Update progress. */
+ sample += wtile->num_samples;
+ rtile.sample = sample;
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+
+ if (task.get_cancel()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ /* Finalize adaptive sampling. */
+ if (task.adaptive_sampling.use) {
+ hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer;
+ adaptive_sampling_post(rtile, wtile, d_work_tiles);
+ hip_assert(hipDeviceSynchronize());
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+ }
+}
+
+void HIPDevice::thread_run(DeviceTask &task)
+{
+ HIPContextScope scope(this);
+
+ if (task.type == DeviceTask::RENDER) {
+ device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
+
+ /* keep rendering tiles until done */
+ RenderTile tile;
+ DenoisingTask denoising(this, task);
+
+ while (task.acquire_tile(this, tile, task.tile_types)) {
+ if (tile.task == RenderTile::PATH_TRACE) {
+ render(task, tile, work_tiles);
+ }
+ else if (tile.task == RenderTile::BAKE) {
+ render(task, tile, work_tiles);
+ }
+
+ task.release_tile(tile);
+
+ if (task.get_cancel()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ work_tiles.free();
+ }
+}
+# endif
+
+unique_ptr<DeviceQueue> HIPDevice::gpu_queue_create()
+{
+ return make_unique<HIPDeviceQueue>(this);
+}
+
+bool HIPDevice::should_use_graphics_interop()
+{
+  /* Check whether this device is part of the OpenGL context.
+   *
+   * Using a HIP device for graphics interoperability when it is not part of the OpenGL context
+   * is possible, but empirical measurements show it can be considerably slower than a naive
+   * pixel copy. */
+
+ HIPContextScope scope(this);
+
+ int num_all_devices = 0;
+ hip_assert(hipGetDeviceCount(&num_all_devices));
+
+ if (num_all_devices == 0) {
+ return false;
+ }
+
+ vector<hipDevice_t> gl_devices(num_all_devices);
+ uint num_gl_devices = 0;
+ hipGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, hipGLDeviceListAll);
+
+ for (hipDevice_t gl_device : gl_devices) {
+ if (gl_device == hipDevice) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+int HIPDevice::get_num_multiprocessors()
+{
+ return get_device_default_attribute(hipDeviceAttributeMultiprocessorCount, 0);
+}
+
+int HIPDevice::get_max_num_threads_per_multiprocessor()
+{
+ return get_device_default_attribute(hipDeviceAttributeMaxThreadsPerMultiProcessor, 0);
+}
+
+bool HIPDevice::get_device_attribute(hipDeviceAttribute_t attribute, int *value)
+{
+ HIPContextScope scope(this);
+
+ return hipDeviceGetAttribute(value, attribute, hipDevice) == hipSuccess;
+}
+
+int HIPDevice::get_device_default_attribute(hipDeviceAttribute_t attribute, int default_value)
+{
+ int value = 0;
+ if (!get_device_attribute(attribute, &value)) {
+ return default_value;
+ }
+ return value;
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/hip/device_impl.h b/intern/cycles/device/hip/device_impl.h
new file mode 100644
index 00000000000..1d138ee9856
--- /dev/null
+++ b/intern/cycles/device/hip/device_impl.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_HIP
+
+# include "device/device.h"
+# include "device/hip/kernel.h"
+# include "device/hip/queue.h"
+# include "device/hip/util.h"
+
+# include "util/util_map.h"
+
+# ifdef WITH_HIP_DYNLOAD
+# include "hipew.h"
+# else
+# include "util/util_opengl.h"
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class DeviceQueue;
+
+class HIPDevice : public Device {
+
+ friend class HIPContextScope;
+
+ public:
+ hipDevice_t hipDevice;
+ hipCtx_t hipContext;
+ hipModule_t hipModule;
+ size_t device_texture_headroom;
+ size_t device_working_headroom;
+ bool move_texture_to_host;
+ size_t map_host_used;
+ size_t map_host_limit;
+ int can_map_host;
+ int pitch_alignment;
+ int hipDevId;
+ int hipDevArchitecture;
+ bool first_error;
+
+ struct HIPMem {
+ HIPMem() : texobject(0), array(0), use_mapped_host(false)
+ {
+ }
+
+ hipTextureObject_t texobject;
+ hArray array;
+
+    /* If true, mapped host memory in shared_pointer is being used. */
+ bool use_mapped_host;
+ };
+ typedef map<device_memory *, HIPMem> HIPMemMap;
+ HIPMemMap hip_mem_map;
+ thread_mutex hip_mem_map_mutex;
+
+ /* Bindless Textures */
+ device_vector<TextureInfo> texture_info;
+ bool need_texture_info;
+
+ HIPDeviceKernels kernels;
+
+ static bool have_precompiled_kernels();
+
+ virtual bool show_samples() const override;
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const override;
+
+ void set_error(const string &error) override;
+
+ HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+ virtual ~HIPDevice();
+
+ bool support_device(const uint /*kernel_features*/);
+
+ bool check_peer_access(Device *peer_device) override;
+
+ bool use_adaptive_compilation();
+
+ virtual string compile_kernel_get_common_cflags(const uint kernel_features);
+
+ string compile_kernel(const uint kernel_features,
+ const char *name,
+ const char *base = "hip",
+ bool force_ptx = false);
+
+ virtual bool load_kernels(const uint kernel_features) override;
+ void reserve_local_memory(const uint kernel_features);
+
+ void init_host_memory();
+
+ void load_texture_info();
+
+ void move_textures_to_host(size_t size, bool for_texture);
+
+ HIPMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
+
+ void generic_copy_to(device_memory &mem);
+
+ void generic_free(device_memory &mem);
+
+ void mem_alloc(device_memory &mem) override;
+
+ void mem_copy_to(device_memory &mem) override;
+
+ void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override;
+
+ void mem_zero(device_memory &mem) override;
+
+ void mem_free(device_memory &mem) override;
+
+ device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override;
+
+ virtual void const_copy_to(const char *name, void *host, size_t size) override;
+
+ void global_alloc(device_memory &mem);
+
+ void global_free(device_memory &mem);
+
+ void tex_alloc(device_texture &mem);
+
+ void tex_free(device_texture &mem);
+
+ /* Graphics resources interoperability. */
+ virtual bool should_use_graphics_interop() override;
+
+ virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
+
+ int get_num_multiprocessors();
+ int get_max_num_threads_per_multiprocessor();
+
+ protected:
+ bool get_device_attribute(hipDeviceAttribute_t attribute, int *value);
+ int get_device_default_attribute(hipDeviceAttribute_t attribute, int default_value);
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/hip/graphics_interop.cpp b/intern/cycles/device/hip/graphics_interop.cpp
new file mode 100644
index 00000000000..add6dbed5e1
--- /dev/null
+++ b/intern/cycles/device/hip/graphics_interop.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_HIP
+
+# include "device/hip/graphics_interop.h"
+
+# include "device/hip/device_impl.h"
+# include "device/hip/util.h"
+
+CCL_NAMESPACE_BEGIN
+
+HIPDeviceGraphicsInterop::HIPDeviceGraphicsInterop(HIPDeviceQueue *queue)
+ : queue_(queue), device_(static_cast<HIPDevice *>(queue->device))
+{
+}
+
+HIPDeviceGraphicsInterop::~HIPDeviceGraphicsInterop()
+{
+ HIPContextScope scope(device_);
+
+ if (hip_graphics_resource_) {
+ hip_device_assert(device_, hipGraphicsUnregisterResource(hip_graphics_resource_));
+ }
+}
+
+void HIPDeviceGraphicsInterop::set_destination(const DeviceGraphicsInteropDestination &destination)
+{
+ const int64_t new_buffer_area = int64_t(destination.buffer_width) * destination.buffer_height;
+
+ if (opengl_pbo_id_ == destination.opengl_pbo_id && buffer_area_ == new_buffer_area) {
+ return;
+ }
+
+ HIPContextScope scope(device_);
+
+ if (hip_graphics_resource_) {
+ hip_device_assert(device_, hipGraphicsUnregisterResource(hip_graphics_resource_));
+ }
+
+ const hipError_t result = hipGraphicsGLRegisterBuffer(
+ &hip_graphics_resource_, destination.opengl_pbo_id, hipGraphicsRegisterFlagsNone);
+ if (result != hipSuccess) {
+ LOG(ERROR) << "Error registering OpenGL buffer: " << hipewErrorString(result);
+ }
+
+ opengl_pbo_id_ = destination.opengl_pbo_id;
+ buffer_area_ = new_buffer_area;
+}
+
+device_ptr HIPDeviceGraphicsInterop::map()
+{
+ if (!hip_graphics_resource_) {
+ return 0;
+ }
+
+ HIPContextScope scope(device_);
+
+ hipDeviceptr_t hip_buffer;
+ size_t bytes;
+
+ hip_device_assert(device_,
+ hipGraphicsMapResources(1, &hip_graphics_resource_, queue_->stream()));
+ hip_device_assert(
+ device_, hipGraphicsResourceGetMappedPointer(&hip_buffer, &bytes, hip_graphics_resource_));
+
+ return static_cast<device_ptr>(hip_buffer);
+}
+
+void HIPDeviceGraphicsInterop::unmap()
+{
+ HIPContextScope scope(device_);
+
+ hip_device_assert(device_,
+ hipGraphicsUnmapResources(1, &hip_graphics_resource_, queue_->stream()));
+}
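+
+/* Usage sketch (illustrative): map() exposes the registered PBO as a device
+ * pointer for the duration of the interop, e.g.
+ *
+ *   interop.set_destination(destination);
+ *   device_ptr d_pixels = interop.map();
+ *   // ... enqueue kernels writing display pixels to d_pixels on the queue's stream ...
+ *   interop.unmap();
+ */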
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/hip/graphics_interop.h b/intern/cycles/device/hip/graphics_interop.h
new file mode 100644
index 00000000000..adcaa13a2d7
--- /dev/null
+++ b/intern/cycles/device/hip/graphics_interop.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_HIP
+
+# include "device/device_graphics_interop.h"
+
+# ifdef WITH_HIP_DYNLOAD
+# include "hipew.h"
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class HIPDevice;
+class HIPDeviceQueue;
+
+class HIPDeviceGraphicsInterop : public DeviceGraphicsInterop {
+ public:
+ explicit HIPDeviceGraphicsInterop(HIPDeviceQueue *queue);
+
+ HIPDeviceGraphicsInterop(const HIPDeviceGraphicsInterop &other) = delete;
+ HIPDeviceGraphicsInterop(HIPDeviceGraphicsInterop &&other) noexcept = delete;
+
+ ~HIPDeviceGraphicsInterop();
+
+ HIPDeviceGraphicsInterop &operator=(const HIPDeviceGraphicsInterop &other) = delete;
+ HIPDeviceGraphicsInterop &operator=(HIPDeviceGraphicsInterop &&other) = delete;
+
+ virtual void set_destination(const DeviceGraphicsInteropDestination &destination) override;
+
+ virtual device_ptr map() override;
+ virtual void unmap() override;
+
+ protected:
+ HIPDeviceQueue *queue_ = nullptr;
+ HIPDevice *device_ = nullptr;
+
+  /* OpenGL PBO which is currently registered as the destination for the HIP buffer. */
+ uint opengl_pbo_id_ = 0;
+ /* Buffer area in pixels of the corresponding PBO. */
+ int64_t buffer_area_ = 0;
+
+ hipGraphicsResource hip_graphics_resource_ = nullptr;
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/hip/kernel.cpp b/intern/cycles/device/hip/kernel.cpp
new file mode 100644
index 00000000000..e0acd6f17c6
--- /dev/null
+++ b/intern/cycles/device/hip/kernel.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_HIP
+
+# include "device/hip/kernel.h"
+# include "device/hip/device_impl.h"
+
+CCL_NAMESPACE_BEGIN
+
+void HIPDeviceKernels::load(HIPDevice *device)
+{
+ hipModule_t hipModule = device->hipModule;
+
+ for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) {
+ HIPDeviceKernel &kernel = kernels_[i];
+
+ /* No megakernel used for GPU. */
+ if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+ continue;
+ }
+
+ const std::string function_name = std::string("kernel_gpu_") +
+ device_kernel_as_string((DeviceKernel)i);
+ hip_device_assert(device,
+ hipModuleGetFunction(&kernel.function, hipModule, function_name.c_str()));
+
+ if (kernel.function) {
+ hip_device_assert(device, hipFuncSetCacheConfig(kernel.function, hipFuncCachePreferL1));
+
+ hip_device_assert(
+ device,
+ hipModuleOccupancyMaxPotentialBlockSize(
+ &kernel.min_blocks, &kernel.num_threads_per_block, kernel.function, 0, 0));
+ }
+ else {
+ LOG(ERROR) << "Unable to load kernel " << function_name;
+ }
+ }
+
+ loaded = true;
+}
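+
+/* Example (illustrative): for DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA, the
+ * lookup above resolves the module function named
+ * "kernel_gpu_integrator_init_from_camera". */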
+
+const HIPDeviceKernel &HIPDeviceKernels::get(DeviceKernel kernel) const
+{
+ return kernels_[(int)kernel];
+}
+
+bool HIPDeviceKernels::available(DeviceKernel kernel) const
+{
+ return kernels_[(int)kernel].function != nullptr;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_HIP */
diff --git a/intern/cycles/device/hip/kernel.h b/intern/cycles/device/hip/kernel.h
new file mode 100644
index 00000000000..3301731f56e
--- /dev/null
+++ b/intern/cycles/device/hip/kernel.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_HIP
+
+# include "device/device_kernel.h"
+
+# ifdef WITH_HIP_DYNLOAD
+# include "hipew.h"
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class HIPDevice;
+
+/* HIP kernel and associated occupancy information. */
+class HIPDeviceKernel {
+ public:
+ hipFunction_t function = nullptr;
+
+ int num_threads_per_block = 0;
+ int min_blocks = 0;
+};
+
+/* Cache of HIP kernels for each DeviceKernel. */
+class HIPDeviceKernels {
+ public:
+ void load(HIPDevice *device);
+ const HIPDeviceKernel &get(DeviceKernel kernel) const;
+ bool available(DeviceKernel kernel) const;
+
+ protected:
+ HIPDeviceKernel kernels_[DEVICE_KERNEL_NUM];
+ bool loaded = false;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_HIP */
diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp
new file mode 100644
index 00000000000..78c77e5fdae
--- /dev/null
+++ b/intern/cycles/device/hip/queue.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_HIP
+
+# include "device/hip/queue.h"
+
+# include "device/hip/device_impl.h"
+# include "device/hip/graphics_interop.h"
+# include "device/hip/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* HIPDeviceQueue */
+
+HIPDeviceQueue::HIPDeviceQueue(HIPDevice *device)
+ : DeviceQueue(device), hip_device_(device), hip_stream_(nullptr)
+{
+ const HIPContextScope scope(hip_device_);
+ hip_device_assert(hip_device_, hipStreamCreateWithFlags(&hip_stream_, hipStreamNonBlocking));
+}
+
+HIPDeviceQueue::~HIPDeviceQueue()
+{
+ const HIPContextScope scope(hip_device_);
+ hipStreamDestroy(hip_stream_);
+}
+
+int HIPDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
+{
+ /* TODO: compute automatically. */
+ /* TODO: must have at least num_threads_per_block. */
+ return 14416128;
+}
+
+int HIPDeviceQueue::num_concurrent_busy_states() const
+{
+ const int max_num_threads = hip_device_->get_num_multiprocessors() *
+ hip_device_->get_max_num_threads_per_multiprocessor();
+
+ if (max_num_threads == 0) {
+ return 65536;
+ }
+
+ return 4 * max_num_threads;
+}
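+
+/* Illustrative numbers (hypothetical device): 60 multiprocessors with 2560
+ * threads each give 4 * 60 * 2560 = 614400 busy states; 65536 is the
+ * conservative fallback when the attributes cannot be queried. */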
+
+void HIPDeviceQueue::init_execution()
+{
+ /* Synchronize all textures and memory copies before executing task. */
+ HIPContextScope scope(hip_device_);
+ hip_device_->load_texture_info();
+ hip_device_assert(hip_device_, hipDeviceSynchronize());
+
+ debug_init_execution();
+}
+
+bool HIPDeviceQueue::kernel_available(DeviceKernel kernel) const
+{
+ return hip_device_->kernels.available(kernel);
+}
+
+bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
+{
+ if (hip_device_->have_error()) {
+ return false;
+ }
+
+ debug_enqueue(kernel, work_size);
+
+ const HIPContextScope scope(hip_device_);
+ const HIPDeviceKernel &hip_kernel = hip_device_->kernels.get(kernel);
+
+ /* Compute kernel launch parameters. */
+ const int num_threads_per_block = hip_kernel.num_threads_per_block;
+ const int num_blocks = divide_up(work_size, num_threads_per_block);
+
+ int shared_mem_bytes = 0;
+
+ switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+      /* See parallel_active_index.h for why this amount of shared memory is needed. */
+ shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
+ break;
+ default:
+ break;
+ }
+
+ /* Launch kernel. */
+ hip_device_assert(hip_device_,
+ hipModuleLaunchKernel(hip_kernel.function,
+ num_blocks,
+ 1,
+ 1,
+ num_threads_per_block,
+ 1,
+ 1,
+ shared_mem_bytes,
+ hip_stream_,
+ args,
+ 0));
+ return !(hip_device_->have_error());
+}
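+
+/* Usage sketch (mirrors HIPDevice::reserve_local_memory earlier in this patch):
+ *
+ *   void *args[] = {&d_path_index, &d_render_buffer, &d_work_size};
+ *   queue.enqueue(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, 1, args);
+ *   queue.synchronize();
+ */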
+
+bool HIPDeviceQueue::synchronize()
+{
+ if (hip_device_->have_error()) {
+ return false;
+ }
+
+ const HIPContextScope scope(hip_device_);
+ hip_device_assert(hip_device_, hipStreamSynchronize(hip_stream_));
+ debug_synchronize();
+
+ return !(hip_device_->have_error());
+}
+
+void HIPDeviceQueue::zero_to_device(device_memory &mem)
+{
+ assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ /* Allocate on demand. */
+ if (mem.device_pointer == 0) {
+ hip_device_->mem_alloc(mem);
+ }
+
+ /* Zero memory on device. */
+ assert(mem.device_pointer != 0);
+
+ const HIPContextScope scope(hip_device_);
+ hip_device_assert(
+ hip_device_,
+ hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_));
+}
+
+void HIPDeviceQueue::copy_to_device(device_memory &mem)
+{
+ assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ /* Allocate on demand. */
+ if (mem.device_pointer == 0) {
+ hip_device_->mem_alloc(mem);
+ }
+
+ assert(mem.device_pointer != 0);
+ assert(mem.host_pointer != nullptr);
+
+ /* Copy memory to device. */
+ const HIPContextScope scope(hip_device_);
+ hip_device_assert(
+ hip_device_,
+ hipMemcpyHtoDAsync(
+ (hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_));
+}
+
+void HIPDeviceQueue::copy_from_device(device_memory &mem)
+{
+ assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ assert(mem.device_pointer != 0);
+ assert(mem.host_pointer != nullptr);
+
+ /* Copy memory from device. */
+ const HIPContextScope scope(hip_device_);
+ hip_device_assert(
+ hip_device_,
+ hipMemcpyDtoHAsync(
+ mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_));
+}
+
+/* TODO(Arya): Enable this after stabilizing the dev branch. */
+unique_ptr<DeviceGraphicsInterop> HIPDeviceQueue::graphics_interop_create()
+{
+ return make_unique<HIPDeviceGraphicsInterop>(this);
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_HIP */
diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h
new file mode 100644
index 00000000000..04c8a5982ce
--- /dev/null
+++ b/intern/cycles/device/hip/queue.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_HIP
+
+# include "device/device_kernel.h"
+# include "device/device_memory.h"
+# include "device/device_queue.h"
+
+# include "device/hip/util.h"
+
+CCL_NAMESPACE_BEGIN
+
+class HIPDevice;
+class device_memory;
+
+/* Base class for HIP queues. */
+class HIPDeviceQueue : public DeviceQueue {
+ public:
+ HIPDeviceQueue(HIPDevice *device);
+ ~HIPDeviceQueue();
+
+ virtual int num_concurrent_states(const size_t state_size) const override;
+ virtual int num_concurrent_busy_states() const override;
+
+ virtual void init_execution() override;
+
+ virtual bool kernel_available(DeviceKernel kernel) const override;
+
+ virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
+
+ virtual bool synchronize() override;
+
+ virtual void zero_to_device(device_memory &mem) override;
+ virtual void copy_to_device(device_memory &mem) override;
+ virtual void copy_from_device(device_memory &mem) override;
+
+ virtual hipStream_t stream()
+ {
+ return hip_stream_;
+ }
+
+  /* TODO(Arya): Enable this after stabilizing the dev branch. */
+ virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override;
+
+ protected:
+ HIPDevice *hip_device_;
+ hipStream_t hip_stream_;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_HIP */
diff --git a/intern/cycles/device/hip/util.cpp b/intern/cycles/device/hip/util.cpp
new file mode 100644
index 00000000000..44f52c4e17b
--- /dev/null
+++ b/intern/cycles/device/hip/util.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_HIP
+
+# include "device/hip/util.h"
+# include "device/hip/device_impl.h"
+
+CCL_NAMESPACE_BEGIN
+
+HIPContextScope::HIPContextScope(HIPDevice *device) : device(device)
+{
+ hip_device_assert(device, hipCtxPushCurrent(device->hipContext));
+}
+
+HIPContextScope::~HIPContextScope()
+{
+ hip_device_assert(device, hipCtxPopCurrent(NULL));
+}
+
+# ifndef WITH_HIP_DYNLOAD
+const char *hipewErrorString(hipError_t result)
+{
+  /* We can only give the error code here without major code duplication; that
+   * should be enough, since dynamic loading is only disabled by folks
+   * who know what they're doing anyway.
+   *
+   * NOTE: Avoid calling this from several threads.
+   */
+ static string error;
+ error = string_printf("%d", result);
+ return error.c_str();
+}
+
+const char *hipewCompilerPath()
+{
+ return CYCLES_HIP_HIPCC_EXECUTABLE;
+}
+
+int hipewCompilerVersion()
+{
+ return (HIP_VERSION / 100) + (HIP_VERSION % 100 / 10);
+}
+# endif
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_HIP */
diff --git a/intern/cycles/device/hip/util.h b/intern/cycles/device/hip/util.h
new file mode 100644
index 00000000000..f8468e0dc5c
--- /dev/null
+++ b/intern/cycles/device/hip/util.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_HIP
+
+# ifdef WITH_HIP_DYNLOAD
+# include "hipew.h"
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class HIPDevice;
+
+/* Utility to push/pop HIP context. */
+class HIPContextScope {
+ public:
+ HIPContextScope(HIPDevice *device);
+ ~HIPContextScope();
+
+ private:
+ HIPDevice *device;
+};
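+
+/* Usage sketch: the scope pushes the device's context on construction and pops
+ * it on destruction, so HIP API calls are wrapped as
+ *
+ *   {
+ *     HIPContextScope scope(device);
+ *     hip_device_assert(device, hipMemGetInfo(&free, &total));
+ *   }
+ */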
+
+/* Utility for checking return values of HIP function calls. */
+# define hip_device_assert(hip_device, stmt) \
+ { \
+ hipError_t result = stmt; \
+ if (result != hipSuccess) { \
+ const char *name = hipewErrorString(result); \
+ hip_device->set_error( \
+ string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \
+ } \
+ } \
+ (void)0
+
+# define hip_assert(stmt) hip_device_assert(this, stmt)
+
+# ifndef WITH_HIP_DYNLOAD
+/* Transparently implement some functions, so the majority of the file does not
+ * need to worry about the difference between dynamically loaded and linked HIP. */
+const char *hipewErrorString(hipError_t result);
+const char *hipewCompilerPath();
+int hipewCompilerVersion();
+# endif /* WITH_HIP_DYNLOAD */
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_HIP */
\ No newline at end of file
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index 10d0c5e7b4c..9633d3b87d3 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -1035,6 +1035,8 @@ static const char *device_type_for_description(const DeviceType type)
return "CUDA";
case DEVICE_OPTIX:
return "OptiX";
+ case DEVICE_HIP:
+ return "HIP";
case DEVICE_DUMMY:
return "Dummy";
case DEVICE_MULTI:
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 4196539a9b1..8c2cb2c68de 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -35,6 +35,10 @@ set(SRC_DEVICE_CUDA
device/cuda/kernel.cu
)
+set(SRC_DEVICE_HIP
+ device/hip/kernel.cpp
+)
+
set(SRC_DEVICE_OPTIX
device/optix/kernel.cu
device/optix/kernel_shader_raytrace.cu
@@ -106,6 +110,12 @@ set(SRC_DEVICE_CUDA_HEADERS
device/cuda/globals.h
)
+set(SRC_DEVICE_HIP_HEADERS
+ device/hip/compat.h
+ device/hip/config.h
+ device/hip/globals.h
+)
+
set(SRC_DEVICE_OPTIX_HEADERS
device/optix/compat.h
device/optix/globals.h
@@ -458,6 +468,104 @@ if(WITH_CYCLES_CUDA_BINARIES)
cycles_set_solution_folder(cycles_kernel_cuda)
endif()
+####################################################### START
+
+# HIP module
+
+if(WITH_CYCLES_HIP_BINARIES)
+ # 64 bit only
+ set(HIP_BITS 64)
+
+ # HIP version
+ execute_process(COMMAND ${HIP_HIPCC_EXECUTABLE} "--version" OUTPUT_VARIABLE HIPCC_OUT)
+ string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1" HIP_VERSION_MAJOR "${HIPCC_OUT}")
+ string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2" HIP_VERSION_MINOR "${HIPCC_OUT}")
+ set(HIP_VERSION "${HIP_VERSION_MAJOR}${HIP_VERSION_MINOR}")
+
+
+ message(WARNING
+ "HIP version ${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR} detected")
+
+ # build for each arch
+ set(hip_sources device/hip/kernel.cpp
+ ${SRC_HEADERS}
+ ${SRC_DEVICE_HIP_HEADERS}
+ ${SRC_BVH_HEADERS}
+ ${SRC_SVM_HEADERS}
+ ${SRC_GEOM_HEADERS}
+ ${SRC_INTEGRATOR_HEADERS}
+ ${SRC_CLOSURE_HEADERS}
+ ${SRC_UTIL_HEADERS}
+ )
+ set(hip_fatbins)
+
+ macro(CYCLES_HIP_KERNEL_ADD arch prev_arch name flags sources experimental)
+ if(${arch} MATCHES "compute_.*")
+ set(format "ptx")
+ else()
+ set(format "fatbin")
+ endif()
+ set(hip_file ${name}_${arch}.${format})
+
+ set(kernel_sources ${sources})
+ if(NOT ${prev_arch} STREQUAL "none")
+ if(${prev_arch} MATCHES "compute_.*")
+ set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.ptx)
+ else()
+ set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.fatbin)
+ endif()
+ endif()
+
+ set(hip_kernel_src "/device/hip/${name}.cpp")
+
+ set(hip_flags ${flags}
+ -D CCL_NAMESPACE_BEGIN=
+ -D CCL_NAMESPACE_END=
+ -D HIPCC
+ -m ${HIP_BITS}
+ -I ${CMAKE_CURRENT_SOURCE_DIR}/..
+ -I ${CMAKE_CURRENT_SOURCE_DIR}/device/hip
+ --use_fast_math
+ -o ${CMAKE_CURRENT_BINARY_DIR}/${hip_file})
+
+ if(${experimental})
+ set(hip_flags ${hip_flags} -D __KERNEL_EXPERIMENTAL__)
+ set(name ${name}_experimental)
+ endif()
+
+ if(WITH_CYCLES_DEBUG)
+ set(hip_flags ${hip_flags} -D __KERNEL_DEBUG__)
+ endif()
+
+ if(WITH_NANOVDB)
+ set(hip_flags ${hip_flags}
+ -D WITH_NANOVDB
+ -I "${NANOVDB_INCLUDE_DIR}")
+ endif()
+
+
+ set(prev_arch "none")
+ foreach(arch ${CYCLES_HIP_BINARIES_ARCH})
+ set(hip_hipcc_executable ${HIP_HIPCC_EXECUTABLE})
+ set(hip_toolkit_root_dir ${HIP_TOOLKIT_ROOT_DIR})
+ if(DEFINED hip_hipcc_executable AND DEFINED hip_toolkit_root_dir)
+ # Compile regular kernel
+ CYCLES_HIP_KERNEL_ADD(${arch} ${prev_arch} kernel "" "${hip_sources}" FALSE)
+
+ if(WITH_CYCLES_HIP_BUILD_SERIAL)
+ set(prev_arch ${arch})
+ endif()
+
+ unset(hip_hipcc_executable)
+ unset(hip_toolkit_root_dir)
+ endif()
+ endforeach()
+
+ add_custom_target(cycles_kernel_hip ALL DEPENDS ${hip_fatbins})
+ cycles_set_solution_folder(cycles_kernel_hip)
+endif()
+
+####################################################### END
# OptiX PTX modules
if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
@@ -602,11 +710,13 @@ endif()
cycles_add_library(cycles_kernel "${LIB}"
${SRC_DEVICE_CPU}
${SRC_DEVICE_CUDA}
+ ${SRC_DEVICE_HIP}
${SRC_DEVICE_OPTIX}
${SRC_HEADERS}
${SRC_DEVICE_CPU_HEADERS}
${SRC_DEVICE_GPU_HEADERS}
${SRC_DEVICE_CUDA_HEADERS}
+ ${SRC_DEVICE_HIP_HEADERS}
${SRC_DEVICE_OPTIX_HEADERS}
${SRC_BVH_HEADERS}
${SRC_CLOSURE_HEADERS}
@@ -621,6 +731,7 @@ source_group("geom" FILES ${SRC_GEOM_HEADERS})
source_group("integrator" FILES ${SRC_INTEGRATOR_HEADERS})
source_group("kernel" FILES ${SRC_HEADERS})
source_group("device\\cpu" FILES ${SRC_DEVICE_CPU} ${SRC_DEVICE_CPU_HEADERS})
+source_group("device\\hip" FILES ${SRC_DEVICE_HIP} ${SRC_DEVICE_HIP_HEADERS})
source_group("device\\gpu" FILES ${SRC_DEVICE_GPU_HEADERS})
source_group("device\\cuda" FILES ${SRC_DEVICE_CUDA} ${SRC_DEVICE_CUDA_HEADERS})
source_group("device\\optix" FILES ${SRC_DEVICE_OPTIX} ${SRC_DEVICE_OPTIX_HEADERS})
@@ -632,14 +743,19 @@ endif()
if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
add_dependencies(cycles_kernel cycles_kernel_optix)
endif()
+if(WITH_CYCLES_HIP)
+ add_dependencies(cycles_kernel cycles_kernel_hip)
+endif()
# Install kernel source for runtime compilation
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_HIP}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_GPU_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/gpu)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_HIP_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index a68d1d80c7d..db4a4bf71e0 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -25,7 +25,11 @@ CCL_NAMESPACE_BEGIN
#include "util/util_atomic.h"
-#define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
+#ifdef __HIP__
+# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024
+#else
+# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
+#endif
template<uint blocksize, typename IsActiveOp>
__device__ void gpu_parallel_active_index_array(const uint num_states,
diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
index f609520b8b4..a1349e82efb 100644
--- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -27,7 +27,11 @@ CCL_NAMESPACE_BEGIN
#include "util/util_atomic.h"
-#define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
+#ifdef __HIP__
+# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024
+#else
+# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
+#endif
template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values)
{
diff --git a/intern/cycles/kernel/device/gpu/parallel_reduce.h b/intern/cycles/kernel/device/gpu/parallel_reduce.h
index 65b1990dbb8..b60dceb2ed0 100644
--- a/intern/cycles/kernel/device/gpu/parallel_reduce.h
+++ b/intern/cycles/kernel/device/gpu/parallel_reduce.h
@@ -26,7 +26,11 @@ CCL_NAMESPACE_BEGIN
* the overall cost of the algorithm while keeping the work complexity O(n) and
* the step complexity O(log n). (Brent's Theorem optimization) */
-#define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512
+#ifdef __HIP__
+# define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 1024
+#else
+# define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512
+#endif
template<uint blocksize, typename InputT, typename OutputT, typename ConvertOp>
__device__ void gpu_parallel_sum(
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
index 99b35468517..9bca1fad22f 100644
--- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -26,7 +26,11 @@ CCL_NAMESPACE_BEGIN
#include "util/util_atomic.h"
-#define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
+#ifdef __HIP__
+# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024
+#else
+# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
+#endif
#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
template<uint blocksize, typename GetKeyOp>
diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h
new file mode 100644
index 00000000000..3644925d5be
--- /dev/null
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define __KERNEL_GPU__
+#define __KERNEL_HIP__
+#define CCL_NAMESPACE_BEGIN
+#define CCL_NAMESPACE_END
+
+#ifndef ATTR_FALLTHROUGH
+# define ATTR_FALLTHROUGH
+#endif
+
+#ifdef __HIPCC_RTC__
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+#else
+# include <stdint.h>
+#endif
+
+#ifdef CYCLES_HIPBIN_CC
+# define FLT_MIN 1.175494350822287507969e-38f
+# define FLT_MAX 340282346638528859811704183484516925440.0f
+# define FLT_EPSILON 1.192092896e-07F
+#endif
+
+/* Qualifiers */
+
+#define ccl_device __device__ __inline__
+#define ccl_device_inline __device__ __inline__
+#define ccl_device_forceinline __device__ __forceinline__
+#define ccl_device_noinline __device__ __noinline__
+#define ccl_device_noinline_cpu ccl_device
+#define ccl_global
+#define ccl_static_constant __constant__
+#define ccl_device_constant __constant__ __device__
+#define ccl_constant const
+#define ccl_gpu_shared __shared__
+#define ccl_private
+#define ccl_may_alias
+#define ccl_addr_space
+#define ccl_restrict __restrict__
+#define ccl_loop_no_unroll
+#define ccl_align(n) __align__(n)
+#define ccl_optional_struct_init
+
+#define kernel_assert(cond)
+
+/* Types */
+#ifdef __HIP__
+# include "hip/hip_fp16.h"
+# include "hip/hip_runtime.h"
+#endif
+
+#ifdef _MSC_VER
+# include <immintrin.h>
+#endif
+
+#define ccl_gpu_thread_idx_x (threadIdx.x)
+#define ccl_gpu_block_dim_x (blockDim.x)
+#define ccl_gpu_block_idx_x (blockIdx.x)
+#define ccl_gpu_grid_dim_x (gridDim.x)
+#define ccl_gpu_warp_size (warpSize)
+
+#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
+#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
+
+/* GPU warp synchronization */
+
+#define ccl_gpu_syncthreads() __syncthreads()
+#define ccl_gpu_ballot(predicate) __ballot(predicate)
+#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down(var, delta)
+#define ccl_gpu_popc(x) __popc(x)
+
+/* GPU texture objects */
+typedef hipTextureObject_t ccl_gpu_tex_object;
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y)
+{
+ return tex2D<T>(texobj, x, y);
+}
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y,
+ const float z)
+{
+ return tex3D<T>(texobj, x, y, z);
+}
+
+/* Use fast math functions */
+
+#define cosf(x) __cosf(((float)(x)))
+#define sinf(x) __sinf(((float)(x)))
+#define powf(x, y) __powf(((float)(x)), ((float)(y)))
+#define tanf(x) __tanf(((float)(x)))
+#define logf(x) __logf(((float)(x)))
+#define expf(x) __expf(((float)(x)))
+
+/* Types */
+
+#include "util/util_half.h"
+#include "util/util_types.h"
diff --git a/intern/cycles/kernel/device/hip/config.h b/intern/cycles/kernel/device/hip/config.h
new file mode 100644
index 00000000000..2fde0d46015
--- /dev/null
+++ b/intern/cycles/kernel/device/hip/config.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Device data taken from the HIP occupancy calculator.
+ *
+ * Terminology
+ * - HIP GPUs have multiple streaming multiprocessors
+ * - Each multiprocessor executes multiple thread blocks
+ * - Each thread block contains a number of threads, also known as the block size
+ * - Multiprocessors have a fixed number of registers, and the number of registers
+ *   used by each thread limits the number of threads per block.
+ */
+
+/* Launch Bound Definitions */
+#define GPU_MULTIPROCESSOR_MAX_REGISTERS 65536
+#define GPU_MULTIPROCESSOR_MAX_BLOCKS 64
+#define GPU_BLOCK_MAX_THREADS 1024
+#define GPU_THREAD_MAX_REGISTERS 255
+
+#define GPU_KERNEL_BLOCK_NUM_THREADS 1024
+#define GPU_KERNEL_MAX_REGISTERS 64
+
+/* Compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread. */
+
+#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
+ extern "C" __global__ void __launch_bounds__(block_num_threads, \
+                                              GPU_MULTIPROCESSOR_MAX_REGISTERS / \
+ (block_num_threads * thread_num_registers))
+
+/* sanity checks */
+
+#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
+# error "Maximum number of threads per block exceeded"
+#endif
+
+#if GPU_MULTIPROCESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \
+ GPU_MULTIPROCESSOR_MAX_BLOCKS
+# error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS
+# error "Maximum number of registers per thread exceeded"
+#endif
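For the default configuration the launch-bound arithmetic works out to a single resident block per multiprocessor: 65536 / (1024 * 64) = 1. Spelled out with a hypothetical kernel name (illustration only, not part of the patch):

/* ccl_gpu_kernel(1024, 64) expands to
 *   extern "C" __global__ void __launch_bounds__(1024, 65536 / (1024 * 64))
 * i.e. __launch_bounds__(1024, 1). */
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
kernel_gpu_example(int *out)
{
  out[ccl_gpu_global_id_x()] = 0; /* ccl_gpu_global_id_x() comes from compat.h */
}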
diff --git a/intern/cycles/kernel/device/hip/globals.h b/intern/cycles/kernel/device/hip/globals.h
new file mode 100644
index 00000000000..39978ae7899
--- /dev/null
+++ b/intern/cycles/kernel/device/hip/globals.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/integrator/integrator_state.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Not actually used, just a NULL pointer that gets passed everywhere, which we
+ * hope gets optimized out by the compiler. */
+struct KernelGlobals {
+ /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */
+ int unused[1];
+};
+
+/* Global scene data and textures */
+__constant__ KernelData __data;
+#define KERNEL_TEX(type, name) __attribute__((used)) const __constant__ __device__ type *name;
+#include "kernel/kernel_textures.h"
+
+/* Integrator state */
+__constant__ IntegratorStateGPU __integrator_state;
+
+/* Abstraction macros */
+#define kernel_data __data
+#define kernel_tex_fetch(t, index) t[(index)]
+#define kernel_tex_array(t) (t)
+#define kernel_integrator_state __integrator_state
+
+CCL_NAMESPACE_END
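Each KERNEL_TEX entry in kernel/kernel_textures.h therefore becomes a plain __constant__ device pointer, and "texture fetches" are ordinary array indexing. With a hypothetical entry (illustration only, not part of the patch):

/* KERNEL_TEX(float4, __tex_example) expands to
 *   __attribute__((used)) const __constant__ __device__ float4 *__tex_example;
 * and a fetch is plain pointer indexing:
 *   float4 v = kernel_tex_fetch(__tex_example, i);   // -> __tex_example[i]
 */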
diff --git a/intern/cycles/kernel/device/hip/kernel.cpp b/intern/cycles/kernel/device/hip/kernel.cpp
new file mode 100644
index 00000000000..c801320a2e1
--- /dev/null
+++ b/intern/cycles/kernel/device/hip/kernel.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* HIP kernel entry points */
+
+#ifdef __HIP_DEVICE_COMPILE__
+
+# include "kernel/device/hip/compat.h"
+# include "kernel/device/hip/config.h"
+# include "kernel/device/hip/globals.h"
+
+# include "kernel/device/gpu/image.h"
+# include "kernel/device/gpu/kernel.h"
+
+#endif
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index de17efafcf2..faba411c769 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -34,7 +34,7 @@
#else /* __KERNEL_GPU__ */
-# ifdef __KERNEL_CUDA__
+# if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
# define atomic_add_and_fetch_float(p, x) (atomicAdd((float *)(p), (float)(x)) + (float)(x))
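Note that atomicAdd() returns the value held before the addition, so the macro adds x once more to return the post-add value, matching the CPU side's "add and fetch" semantics. A sketch, not part of the patch, with hypothetical names:

__device__ float accumulate(float *accum, float x)
{
  return atomic_add_and_fetch_float(accum, x); /* old value + x, i.e. the new value */
}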
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index 1d598725c84..2245668d02f 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -59,12 +59,23 @@ DebugFlags::CUDA::CUDA() : adaptive_compile(false)
reset();
}
+DebugFlags::HIP::HIP() : adaptive_compile(false)
+{
+ reset();
+}
+
void DebugFlags::CUDA::reset()
{
if (getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
adaptive_compile = true;
}
+void DebugFlags::HIP::reset()
+{
+ if (getenv("CYCLES_HIP_ADAPTIVE_COMPILE") != NULL)
+ adaptive_compile = true;
+}
+
DebugFlags::OptiX::OptiX()
{
reset();
@@ -103,6 +114,10 @@ std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags)
os << "OptiX flags:\n"
<< " Debug : " << string_from_bool(debug_flags.optix.use_debug) << "\n";
+
+  os << "HIP flags:\n"
+     << "  Adaptive Compile : " << string_from_bool(debug_flags.hip.adaptive_compile) << "\n";
+
return os;
}
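As with CUDA, adaptive compilation for HIP is toggled purely through the environment; this patch adds no UI switch for it. A minimal sketch of enabling it programmatically, assuming a POSIX environment (setenv) and a hypothetical wrapper:

#include <cstdlib>

int main()
{
  /* Equivalent to launching with CYCLES_HIP_ADAPTIVE_COMPILE=1;
   * DebugFlags::HIP::reset() above reads it at startup. */
  setenv("CYCLES_HIP_ADAPTIVE_COMPILE", "1", /*overwrite=*/1);
  /* ... continue with normal initialization ... */
  return 0;
}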
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index a2acaea5675..81677201790 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -93,6 +93,17 @@ class DebugFlags {
bool adaptive_compile;
};
+ /* Descriptor of HIP feature-set to be used. */
+ struct HIP {
+ HIP();
+
+ /* Reset flags to their defaults. */
+ void reset();
+
+    /* Whether adaptive feature-based runtime compilation is enabled or not. */
+ bool adaptive_compile;
+ };
+
/* Descriptor of OptiX feature-set to be used. */
struct OptiX {
OptiX();
@@ -124,6 +135,9 @@ class DebugFlags {
/* Requested OptiX flags. */
OptiX optix;
+ /* Requested HIP flags. */
+ HIP hip;
+
private:
DebugFlags();
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index d9edfec5da3..f36a492a1b0 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN
/* Half Floats */
 /* CUDA and HIP have their own half data type, no need to define it */
-#ifndef __KERNEL_CUDA__
+#if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__)
/* Implementing this as a class rather than a typedef so that the compiler can tell it apart from
* unsigned shorts. */
class half {
@@ -59,7 +59,7 @@ struct half4 {
half x, y, z, w;
};
-#ifdef __KERNEL_CUDA__
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
ccl_device_inline void float4_store_half(half *h, float4 f)
{
@@ -73,6 +73,7 @@ ccl_device_inline void float4_store_half(half *h, float4 f)
ccl_device_inline void float4_store_half(half *h, float4 f)
{
+
# ifndef __KERNEL_SSE2__
for (int i = 0; i < 4; i++) {
/* optimized float to half for pixels:
@@ -109,6 +110,8 @@ ccl_device_inline void float4_store_half(half *h, float4 f)
# endif
}
+# ifndef __KERNEL_HIP__
+
ccl_device_inline float half_to_float(half h)
{
float f;
@@ -117,6 +120,23 @@ ccl_device_inline float half_to_float(half h)
return f;
}
+# else
+
+ccl_device_inline float half_to_float(std::uint32_t a) noexcept
+{
+
+ std::uint32_t u = ((a << 13) + 0x70000000U) & 0x8fffe000U;
+
+ std::uint32_t v = __float_as_uint(__uint_as_float(u) *
+ __uint_as_float(0x77800000U) /*0x1.0p+112f*/) +
+ 0x38000000U;
+
+ u = (a & 0x7fff) != 0 ? v : u;
+
+ return __uint_as_float(u) * __uint_as_float(0x07800000U) /*0x1.0p-112f*/;
+}
+
+# endif /* __KERNEL_HIP__ */
ccl_device_inline float4 half4_to_float4(half4 h)
{
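The HIP branch decodes half floats with pure bit arithmetic rather than the union used elsewhere. A standalone host-side sketch, not part of the patch, that mirrors the math (memcpy stands in for the device __uint_as_float/__float_as_uint intrinsics):

#include <cstdint>
#include <cstdio>
#include <cstring>

static float uint_as_float(uint32_t u) { float f; std::memcpy(&f, &u, sizeof f); return f; }
static uint32_t float_as_uint(float f) { uint32_t u; std::memcpy(&u, &f, sizeof u); return u; }

/* Same arithmetic as the __KERNEL_HIP__ branch above. */
static float half_bits_to_float(uint32_t a)
{
  uint32_t u = ((a << 13) + 0x70000000U) & 0x8fffe000U;
  uint32_t v = float_as_uint(uint_as_float(u) * uint_as_float(0x77800000U) /* 0x1.0p+112f */) +
               0x38000000U;
  u = (a & 0x7fff) != 0 ? v : u;
  return uint_as_float(u) * uint_as_float(0x07800000U); /* 0x1.0p-112f */
}

int main()
{
  /* Expected output: 1 -2 0 */
  std::printf("%g %g %g\n",
              half_bits_to_float(0x3C00u),
              half_bits_to_float(0xC000u),
              half_bits_to_float(0x0000u));
  return 0;
}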
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 6d728dde679..cb1e94c838c 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -26,6 +26,10 @@
# include <cmath>
#endif
+#ifdef __HIP__
+# include <hip/hip_vector_types.h>
+#endif
+
#include <float.h>
#include <math.h>
#include <stdio.h>
@@ -83,7 +87,8 @@ CCL_NAMESPACE_BEGIN
/* Scalar */
-#ifdef _WIN32
+#ifndef __HIP__
+# ifdef _WIN32
ccl_device_inline float fmaxf(float a, float b)
{
return (a > b) ? a : b;
@@ -93,7 +98,9 @@ ccl_device_inline float fminf(float a, float b)
{
return (a < b) ? a : b;
}
-#endif /* _WIN32 */
+
+# endif /* _WIN32 */
+#endif /* __HIP__ */
#ifndef __KERNEL_GPU__
using std::isfinite;
@@ -199,6 +206,7 @@ ccl_device_inline uint as_uint(float f)
return u.i;
}
+#ifndef __HIP__
ccl_device_inline int __float_as_int(float f)
{
union {
@@ -238,6 +246,7 @@ ccl_device_inline float __uint_as_float(uint i)
u.i = i;
return u.f;
}
+#endif
ccl_device_inline int4 __float4_as_int4(float4 f)
{
@@ -669,7 +678,7 @@ ccl_device float bits_to_01(uint bits)
ccl_device_inline uint count_leading_zeros(uint x)
{
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
return __clz(x);
#else
assert(x != 0);
@@ -685,7 +694,7 @@ ccl_device_inline uint count_leading_zeros(uint x)
ccl_device_inline uint count_trailing_zeros(uint x)
{
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
return (__ffs(x) - 1);
#else
assert(x != 0);
@@ -701,7 +710,7 @@ ccl_device_inline uint count_trailing_zeros(uint x)
ccl_device_inline uint find_first_set(uint x)
{
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
return __ffs(x);
#else
# ifdef _MSC_VER