diff options
45 files changed, 4854 insertions, 19 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e807b84e22..c4b8bf6dcd4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -419,6 +419,8 @@ mark_as_advanced(WITH_CYCLES_NATIVE_ONLY) option(WITH_CYCLES_DEVICE_CUDA "Enable Cycles CUDA compute support" ON) option(WITH_CYCLES_DEVICE_OPTIX "Enable Cycles OptiX support" ON) +option(WITH_CYCLES_DEVICE_HIP "Enable Cycles HIP support" OFF) +mark_as_advanced(WITH_CYCLES_DEVICE_HIP) mark_as_advanced(WITH_CYCLES_DEVICE_CUDA) option(WITH_CUDA_DYNLOAD "Dynamically load CUDA libraries at runtime" ON) @@ -821,6 +823,11 @@ if(NOT WITH_CUDA_DYNLOAD) endif() endif() +if(WITH_CYCLES_DEVICE_HIP) + # Currently HIP must be dynamically loaded, this may change in future toolkits + set(WITH_HIP_DYNLOAD ON) +endif() + #----------------------------------------------------------------------------- # Check check if submodules are cloned @@ -1850,6 +1857,9 @@ elseif(WITH_CYCLES_STANDALONE) if(WITH_CUDA_DYNLOAD) add_subdirectory(extern/cuew) endif() + if(WITH_HIP_DYNLOAD) + add_subdirectory(extern/hipew) + endif() if(NOT WITH_SYSTEM_GLEW) add_subdirectory(extern/glew) endif() diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 7f7d91f0765..2b2cca04503 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -70,6 +70,9 @@ if(WITH_CYCLES OR WITH_COMPOSITOR OR WITH_OPENSUBDIV) if(WITH_CUDA_DYNLOAD) add_subdirectory(cuew) endif() + if(WITH_HIP_DYNLOAD) + add_subdirectory(hipew) + endif() endif() if(WITH_GHOST_X11 AND WITH_GHOST_XDND) diff --git a/extern/hipew/CMakeLists.txt b/extern/hipew/CMakeLists.txt new file mode 100644 index 00000000000..d215ea8c691 --- /dev/null +++ b/extern/hipew/CMakeLists.txt @@ -0,0 +1,39 @@ +# ***** BEGIN GPL LICENSE BLOCK ***** +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +# +# The Original Code is Copyright (C) 2021, Blender Foundation +# All rights reserved. +# ***** END GPL LICENSE BLOCK ***** + +set(INC + . + include +) + +set(INC_SYS + +) + +set(SRC + src/hipew.c + + include/hipew.h +) + +set(LIB +) + +blender_add_lib(extern_hipew "${SRC}" "${INC}" "${INC_SYS}" "${LIB}") diff --git a/extern/hipew/include/hipew.h b/extern/hipew/include/hipew.h new file mode 100644 index 00000000000..02fffc331bf --- /dev/null +++ b/extern/hipew/include/hipew.h @@ -0,0 +1,1207 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License + */ + +#ifndef __HIPEW_H__ +#define __HIPEW_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdlib.h> + +#define HIP_IPC_HANDLE_SIZE 64 +#define hipHostMallocPortable 0x01 +#define hipHostMallocMapped 0x02 +#define hipHostMallocWriteCombined 0x04 +#define hipHostRegisterPortable 0x01 +#define hipHostRegisterMapped 0x02 +#define hipHostRegisterIoMemory 0x04 +#define hipCooperativeLaunchMultiDeviceNoPreSync 0x01 +#define hipCooperativeLaunchMultiDeviceNoPostSync 0x02 +#define hipArrayLayered 0x01 +#define hipArraySurfaceLoadStore 0x02 +#define hipArrayCubemap 0x04 +#define hipArrayTextureGather 0x08 +#define HIP_TRSA_OVERRIDE_FORMAT 0x01 +#define HIP_TRSF_READ_AS_INTEGER 0x01 +#define HIP_TRSF_NORMALIZED_COORDINATES 0x02 +#define HIP_LAUNCH_PARAM_END ((void*)0x00) +#define HIP_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01) +#define HIP_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02) + +/* Functions which changed 3.1 -> 3.2 for 64 bit stuff, + * the cuda library has both the old ones for compatibility and new + * ones with _v2 postfix, + */ +#define hipModuleGetGlobal hipModuleGetGlobal +#define hipMemGetInfo hipMemGetInfo +#define hipMemAllocPitch hipMemAllocPitch +#define hipMemGetAddressRange hipMemGetAddressRange +#define hipMemcpyHtoD hipMemcpyHtoD +#define hipMemcpyDtoH hipMemcpyDtoH +#define hipMemcpyDtoD hipMemcpyDtoD +#define hipMemcpyHtoA hipMemcpyHtoA +#define hipMemcpyAtoH hipMemcpyAtoH +#define hipMemcpyHtoDAsync hipMemcpyHtoDAsync +#define hipMemcpyDtoHAsync hipMemcpyDtoHAsync +#define hipMemcpyDtoDAsync hipMemcpyDtoDAsync +#define hipMemsetD8 hipMemsetD8 +#define hipMemsetD16 hipMemsetD16 +#define hipMemsetD32 hipMemsetD32 +#define hipArrayCreate hipArrayCreate +#define hipArray3DCreate hipArray3DCreate +#define hipTexRefSetAddress hipTexRefSetAddress +#define hipTexRefGetAddress hipTexRefGetAddress +#define hipStreamDestroy hipStreamDestroy 
+#define hipEventDestroy hipEventDestroy +#define hipTexRefSetAddress2D hipTexRefSetAddress2D + +/* Types. */ +#ifdef _MSC_VER +typedef unsigned __int32 hipuint32_t; +typedef unsigned __int64 hipuint64_t; +#else +#include <stdint.h> +typedef uint32_t hipuint32_t; +typedef uint64_t hipuint64_t; +#endif + +#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined (__aarch64__) +typedef unsigned long long hipDeviceptr_t; +#else +typedef unsigned int hipDeviceptr_t; +#endif + + +#ifdef _WIN32 +# define HIPAPI __stdcall +# define HIP_CB __stdcall +#else +# define HIPAPI +# define HIP_CB +#endif + +typedef int hipDevice_t; +typedef struct ihipCtx_t* hipCtx_t; +typedef struct ihipModule_t* hipModule_t; +typedef struct ihipModuleSymbol_t* hipFunction_t; +typedef struct hipArray* hArray; +typedef struct hipMipmappedArray_st* hipMipmappedArray_t; +typedef struct ihipEvent_t* hipEvent_t; +typedef struct ihipStream_t* hipStream_t; +typedef unsigned long long hipTextureObject_t; + +typedef struct HIPuuid_st { + char bytes[16]; +} HIPuuid; + +typedef enum hipChannelFormatKind { + hipChannelFormatKindSigned = 0, + hipChannelFormatKindUnsigned = 1, + hipChannelFormatKindFloat = 2, + hipChannelFormatKindNone = 3, +}hipChannelFormatKind; + +typedef struct hipChannelFormatDesc { + int x; + int y; + int z; + int w; + enum hipChannelFormatKind f; +}hipChannelFormatDesc; + +typedef enum hipTextureFilterMode { + hipFilterModePoint = 0, + hipFilterModeLinear = 1, +} hipTextureFilterMode; + +typedef enum hipArray_Format { + HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01, + HIP_AD_FORMAT_SIGNED_INT8 = 0x08, + HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02, + HIP_AD_FORMAT_SIGNED_INT16 = 0x09, + HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03, + HIP_AD_FORMAT_SIGNED_INT32 = 0x0a, + HIP_AD_FORMAT_HALF = 0x10, + HIP_AD_FORMAT_FLOAT = 0x20, +} hipArray_Format; + +typedef enum hipTextureAddressMode { + hipAddressModeWrap = 0, + hipAddressModeClamp = 1, + hipAddressModeMirror = 2, + hipAddressModeBorder = 3, +} 
hipTextureAddressMode; + +/** + * hip texture reference + */ +typedef struct textureReference { + int normalized; + //enum hipTextureReadMode readMode;// used only for driver API's + enum hipTextureFilterMode filterMode; + enum hipTextureAddressMode addressMode[3]; // Texture address mode for up to 3 dimensions + struct hipChannelFormatDesc channelDesc; + int sRGB; // Perform sRGB->linear conversion during texture read + unsigned int maxAnisotropy; // Limit to the anisotropy ratio + enum hipTextureFilterMode mipmapFilterMode; + float mipmapLevelBias; + float minMipmapLevelClamp; + float maxMipmapLevelClamp; + + hipTextureObject_t textureObject; + int numChannels; + enum hipArray_Format format; +}textureReference; + +typedef textureReference* hipTexRef; + +typedef enum hipMemoryType { + hipMemoryTypeHost = 0x00, + hipMemoryTypeDevice = 0x01, + hipMemoryTypeArray = 0x02, + hipMemoryTypeUnified = 0x03, +} hipMemoryType; + +/** + * Pointer attributes + */ +typedef struct hipPointerAttribute_t { + enum hipMemoryType memoryType; + int device; + void* devicePointer; + void* hostPointer; + int isManaged; + unsigned allocationFlags; /* flags specified when memory was allocated*/ + /* peers? 
*/ +} hipPointerAttribute_t; + +typedef struct ihipIpcEventHandle_t { + char reserved[HIP_IPC_HANDLE_SIZE]; +} ihipIpcEventHandle_t; + +typedef struct hipIpcMemHandle_st { + char reserved[HIP_IPC_HANDLE_SIZE]; +} hipIpcMemHandle_t; + +typedef enum HIPipcMem_flags_enum { + hipIpcMemLazyEnablePeerAccess = 0x1, +} HIPipcMem_flags; + +typedef enum HIPmemAttach_flags_enum { + hipMemAttachGlobal = 0x1, + hipMemAttachHost = 0x2, + HIP_MEM_ATTACH_SINGLE = 0x4, +} HIPmemAttach_flags; + +typedef enum HIPctx_flags_enum { + hipDeviceScheduleAuto = 0x00, + hipDeviceScheduleSpin = 0x01, + hipDeviceScheduleYield = 0x02, + hipDeviceScheduleBlockingSync = 0x04, + hipDeviceScheduleMask = 0x07, + hipDeviceMapHost = 0x08, + hipDeviceLmemResizeToMax = 0x10, +} HIPctx_flags; + +typedef enum HIPstream_flags_enum { + hipStreamDefault = 0x0, + hipStreamNonBlocking = 0x1, +} HIPstream_flags; + +typedef enum HIPevent_flags_enum { + hipEventDefault = 0x0, + hipEventBlockingSync = 0x1, + hipEventDisableTiming = 0x2, + hipEventInterprocess = 0x4, +} HIPevent_flags; + +typedef enum HIPstreamWaitValue_flags_enum { + HIP_STREAM_WAIT_VALUE_GEQ = 0x0, + HIP_STREAM_WAIT_VALUE_EQ = 0x1, + HIP_STREAM_WAIT_VALUE_AND = 0x2, + HIP_STREAM_WAIT_VALUE_NOR = 0x3, + HIP_STREAM_WAIT_VALUE_FLUSH = (1 << 30), +} HIPstreamWaitValue_flags; + +typedef enum HIPstreamWriteValue_flags_enum { + HIP_STREAM_WRITE_VALUE_DEFAULT = 0x0, + HIP_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1, +} HIPstreamWriteValue_flags; + +typedef enum HIPstreamBatchMemOpType_enum { + HIP_STREAM_MEM_OP_WAIT_VALUE_32 = 1, + HIP_STREAM_MEM_OP_WRITE_VALUE_32 = 2, + HIP_STREAM_MEM_OP_WAIT_VALUE_64 = 4, + HIP_STREAM_MEM_OP_WRITE_VALUE_64 = 5, + HIP_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3, +} HIPstreamBatchMemOpType; + + +typedef union HIPstreamBatchMemOpParams_union { + HIPstreamBatchMemOpType operation; + struct HIPstreamMemOpWaitValueParams_st { + HIPstreamBatchMemOpType operation; + hipDeviceptr_t address; + union { + hipuint32_t value; + hipuint64_t 
value64; + }; + unsigned int flags; + hipDeviceptr_t alias; + } waitValue; + struct HIPstreamMemOpWriteValueParams_st { + HIPstreamBatchMemOpType operation; + hipDeviceptr_t address; + union { + hipuint32_t value; + hipuint64_t value64; + }; + unsigned int flags; + hipDeviceptr_t alias; + } writeValue; + struct HIPstreamMemOpFlushRemoteWritesParams_st { + HIPstreamBatchMemOpType operation; + unsigned int flags; + } flushRemoteWrites; + hipuint64_t pad[6]; +} HIPstreamBatchMemOpParams; + +typedef enum HIPoccupancy_flags_enum { + hipOccupancyDefault = 0x0, + HIP_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1, +} HIPoccupancy_flags; + +typedef enum hipDeviceAttribute_t { + hipDeviceAttributeCudaCompatibleBegin = 0, + hipDeviceAttributeEccEnabled = hipDeviceAttributeCudaCompatibleBegin, ///< Whether ECC support is enabled. + hipDeviceAttributeAccessPolicyMaxWindowSize, ///< Cuda only. The maximum size of the window policy in bytes. + hipDeviceAttributeAsyncEngineCount, ///< Cuda only. Asynchronous engines number. + hipDeviceAttributeCanMapHostMemory, ///< Whether host memory can be mapped into device address space + hipDeviceAttributeCanUseHostPointerForRegisteredMem,///< Cuda only. Device can access host registered memory + ///< at the same virtual address as the CPU + hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz. + hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in. + hipDeviceAttributeComputePreemptionSupported, ///< Cuda only. Device supports Compute Preemption. + hipDeviceAttributeConcurrentKernels, ///< Device can possibly execute multiple kernels concurrently. + hipDeviceAttributeConcurrentManagedAccess, ///< Device can coherently access managed memory concurrently with the CPU + hipDeviceAttributeCooperativeLaunch, ///< Support cooperative launch + hipDeviceAttributeCooperativeMultiDeviceLaunch, ///< Support cooperative launch on multiple devices + hipDeviceAttributeDeviceOverlap, ///< Cuda only. 
Device can concurrently copy memory and execute a kernel. + ///< Deprecated. Use instead asyncEngineCount. + hipDeviceAttributeDirectManagedMemAccessFromHost, ///< Host can directly access managed memory on + ///< the device without migration + hipDeviceAttributeGlobalL1CacheSupported, ///< Cuda only. Device supports caching globals in L1 + hipDeviceAttributeHostNativeAtomicSupported, ///< Cuda only. Link between the device and the host supports native atomic operations + hipDeviceAttributeIntegrated, ///< Device is integrated GPU + hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices. + hipDeviceAttributeKernelExecTimeout, ///< Run time limit for kernels executed on the device + hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache. + hipDeviceAttributeLocalL1CacheSupported, ///< caching locals in L1 is supported + hipDeviceAttributeLuid, ///< Cuda only. 8-byte locally unique identifier in 8 bytes. Undefined on TCC and non-Windows platforms + hipDeviceAttributeLuidDeviceNodeMask, ///< Cuda only. Luid device node mask. Undefined on TCC and non-Windows platforms + hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number. + hipDeviceAttributeManagedMemory, ///< Device supports allocating managed memory on this system + hipDeviceAttributeMaxBlocksPerMultiProcessor, ///< Cuda only. Max block size per multiprocessor + hipDeviceAttributeMaxBlockDimX, ///< Max block size in width. + hipDeviceAttributeMaxBlockDimY, ///< Max block size in height. + hipDeviceAttributeMaxBlockDimZ, ///< Max block size in depth. + hipDeviceAttributeMaxGridDimX, ///< Max grid size in width. + hipDeviceAttributeMaxGridDimY, ///< Max grid size in height. + hipDeviceAttributeMaxGridDimZ, ///< Max grid size in depth. + hipDeviceAttributeMaxSurface1D, ///< Maximum size of 1D surface. + hipDeviceAttributeMaxSurface1DLayered, ///< Cuda only. Maximum dimensions of 1D layered surface. 
+ hipDeviceAttributeMaxSurface2D, ///< Maximum dimension (width, height) of 2D surface. + hipDeviceAttributeMaxSurface2DLayered, ///< Cuda only. Maximum dimensions of 2D layered surface. + hipDeviceAttributeMaxSurface3D, ///< Maximum dimension (width, height, depth) of 3D surface. + hipDeviceAttributeMaxSurfaceCubemap, ///< Cuda only. Maximum dimensions of Cubemap surface. + hipDeviceAttributeMaxSurfaceCubemapLayered, ///< Cuda only. Maximum dimension of Cubemap layered surface. + hipDeviceAttributeMaxTexture1DWidth, ///< Maximum size of 1D texture. + hipDeviceAttributeMaxTexture1DLayered, ///< Cuda only. Maximum dimensions of 1D layered texture. + hipDeviceAttributeMaxTexture1DLinear, ///< Maximum number of elements allocatable in a 1D linear texture. + ///< Use cudaDeviceGetTexture1DLinearMaxWidth() instead on Cuda. + hipDeviceAttributeMaxTexture1DMipmap, ///< Cuda only. Maximum size of 1D mipmapped texture. + hipDeviceAttributeMaxTexture2DWidth, ///< Maximum dimension width of 2D texture. + hipDeviceAttributeMaxTexture2DHeight, ///< Maximum dimension hight of 2D texture. + hipDeviceAttributeMaxTexture2DGather, ///< Cuda only. Maximum dimensions of 2D texture if gather operations performed. + hipDeviceAttributeMaxTexture2DLayered, ///< Cuda only. Maximum dimensions of 2D layered texture. + hipDeviceAttributeMaxTexture2DLinear, ///< Cuda only. Maximum dimensions (width, height, pitch) of 2D textures bound to pitched memory. + hipDeviceAttributeMaxTexture2DMipmap, ///< Cuda only. Maximum dimensions of 2D mipmapped texture. + hipDeviceAttributeMaxTexture3DWidth, ///< Maximum dimension width of 3D texture. + hipDeviceAttributeMaxTexture3DHeight, ///< Maximum dimension height of 3D texture. + hipDeviceAttributeMaxTexture3DDepth, ///< Maximum dimension depth of 3D texture. + hipDeviceAttributeMaxTexture3DAlt, ///< Cuda only. Maximum dimensions of alternate 3D texture. + hipDeviceAttributeMaxTextureCubemap, ///< Cuda only. 
Maximum dimensions of Cubemap texture + hipDeviceAttributeMaxTextureCubemapLayered, ///< Cuda only. Maximum dimensions of Cubemap layered texture. + hipDeviceAttributeMaxThreadsDim, ///< Maximum dimension of a block + hipDeviceAttributeMaxThreadsPerBlock, ///< Maximum number of threads per block. + hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per multiprocessor. + hipDeviceAttributeMaxPitch, ///< Maximum pitch in bytes allowed by memory copies + hipDeviceAttributeMemoryBusWidth, ///< Global memory bus width in bits. + hipDeviceAttributeMemoryClockRate, ///< Peak memory clock frequency in kilohertz. + hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number. + hipDeviceAttributeMultiGpuBoardGroupID, ///< Cuda only. Unique ID of device group on the same multi-GPU board + hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device. + hipDeviceAttributeName, ///< Device name. + hipDeviceAttributePageableMemoryAccess, ///< Device supports coherently accessing pageable memory + ///< without calling hipHostRegister on it + hipDeviceAttributePageableMemoryAccessUsesHostPageTables, ///< Device accesses pageable memory via the host's page tables + hipDeviceAttributePciBusId, ///< PCI Bus ID. + hipDeviceAttributePciDeviceId, ///< PCI Device ID. + hipDeviceAttributePciDomainID, ///< PCI Domain ID. + hipDeviceAttributePersistingL2CacheMaxSize, ///< Cuda11 only. Maximum l2 persisting lines capacity in bytes + hipDeviceAttributeMaxRegistersPerBlock, ///< 32-bit registers available to a thread block. This number is shared + ///< by all thread blocks simultaneously resident on a multiprocessor. + hipDeviceAttributeMaxRegistersPerMultiprocessor, ///< 32-bit registers available per block. + hipDeviceAttributeReservedSharedMemPerBlock, ///< Cuda11 only. Shared memory reserved by CUDA driver per block. 
+ hipDeviceAttributeMaxSharedMemoryPerBlock, ///< Maximum shared memory available per block in bytes. + hipDeviceAttributeSharedMemPerBlockOptin, ///< Cuda only. Maximum shared memory per block usable by special opt in. + hipDeviceAttributeSharedMemPerMultiprocessor, ///< Cuda only. Shared memory available per multiprocessor. + hipDeviceAttributeSingleToDoublePrecisionPerfRatio, ///< Cuda only. Performance ratio of single precision to double precision. + hipDeviceAttributeStreamPrioritiesSupported, ///< Cuda only. Whether to support stream priorities. + hipDeviceAttributeSurfaceAlignment, ///< Cuda only. Alignment requirement for surfaces + hipDeviceAttributeTccDriver, ///< Cuda only. Whether device is a Tesla device using TCC driver + hipDeviceAttributeTextureAlignment, ///< Alignment requirement for textures + hipDeviceAttributeTexturePitchAlignment, ///< Pitch alignment requirement for 2D texture references bound to pitched memory; + hipDeviceAttributeTotalConstantMemory, ///< Constant memory size in bytes. + hipDeviceAttributeTotalGlobalMem, ///< Global memory available on devicice. + hipDeviceAttributeUnifiedAddressing, ///< Cuda only. An unified address space shared with the host. + hipDeviceAttributeUuid, ///< Cuda only. Unique ID in 16 byte. + hipDeviceAttributeWarpSize, ///< Warp size in threads. + hipDeviceAttributeCudaCompatibleEnd = 9999, + hipDeviceAttributeAmdSpecificBegin = 10000, + hipDeviceAttributeClockInstructionRate = hipDeviceAttributeAmdSpecificBegin, ///< Frequency in khz of the timer used by the device-side "clock*" + hipDeviceAttributeArch, ///< Device architecture + hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, ///< Maximum Shared Memory PerMultiprocessor. 
+ hipDeviceAttributeGcnArch, ///< Device gcn architecture + hipDeviceAttributeGcnArchName, ///< Device gcnArch name in 256 bytes + hipDeviceAttributeHdpMemFlushCntl, ///< Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register + hipDeviceAttributeHdpRegFlushCntl, ///< Address of the HDP_REG_COHERENCY_FLUSH_CNTL register + hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc, ///< Supports cooperative launch on multiple + ///< devices with unmatched functions + hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim, ///< Supports cooperative launch on multiple + ///< devices with unmatched grid dimensions + hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim, ///< Supports cooperative launch on multiple + ///< devices with unmatched block dimensions + hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem, ///< Supports cooperative launch on multiple + ///< devices with unmatched shared memories + hipDeviceAttributeIsLargeBar, ///< Whether it is LargeBar + hipDeviceAttributeAsicRevision, ///< Revision of the GPU in this device + hipDeviceAttributeCanUseStreamWaitValue, ///< '1' if Device supports hipStreamWaitValue32() and + ///< hipStreamWaitValue64() , '0' otherwise. 
+ hipDeviceAttributeAmdSpecificEnd = 19999, + hipDeviceAttributeVendorSpecificBegin = 20000, + // Extended attributes for vendors +} hipDeviceAttribute_t; + +typedef struct HIPdevprop_st { + int maxThreadsPerBlock; + int maxThreadsDim[3]; + int maxGridSize[3]; + int sharedMemPerBlock; + int totalConstantMemory; + int SIMDWidth; + int memPitch; + int regsPerBlock; + int clockRate; + int textureAlign; +} HIPdevprop; + +typedef enum HIPpointer_attribute_enum { + HIP_POINTER_ATTRIBUTE_CONTEXT = 1, + HIP_POINTER_ATTRIBUTE_MEMORY_TYPE = 2, + HIP_POINTER_ATTRIBUTE_DEVICE_POINTER = 3, + HIP_POINTER_ATTRIBUTE_HOST_POINTER = 4, + HIP_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6, + HIP_POINTER_ATTRIBUTE_BUFFER_ID = 7, + HIP_POINTER_ATTRIBUTE_IS_MANAGED = 8, + HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9, +} HIPpointer_attribute; + +typedef enum hipFunction_attribute { + HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, + HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, + HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, + HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, + HIP_FUNC_ATTRIBUTE_NUM_REGS = 4, + HIP_FUNC_ATTRIBUTE_PTX_VERSION = 5, + HIP_FUNC_ATTRIBUTE_BINARY_VERSION = 6, + HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7, + HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, + HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, + HIP_FUNC_ATTRIBUTE_MAX, +} hipFunction_attribute; + +typedef enum hipFuncCache_t { + hipFuncCachePreferNone = 0x00, + hipFuncCachePreferShared = 0x01, + hipFuncCachePreferL1 = 0x02, + hipFuncCachePreferEqual = 0x03, +} hipFuncCache_t; + +typedef enum hipSharedMemConfig { + hipSharedMemBankSizeDefault = 0x00, + hipSharedMemBankSizeFourByte = 0x01, + hipSharedMemBankSizeEightByte = 0x02, +} hipSharedMemConfig; + +typedef enum HIPshared_carveout_enum { + HIP_SHAREDMEM_CARVEOUT_DEFAULT, + HIP_SHAREDMEM_CARVEOUT_MAX_SHARED = 100, + HIP_SHAREDMEM_CARVEOUT_MAX_L1 = 0, +} HIPshared_carveout; + + + +typedef enum hipComputeMode { + hipComputeModeDefault = 0, + hipComputeModeProhibited = 2, + 
hipComputeModeExclusiveProcess = 3, +} hipComputeMode; + +typedef enum HIPmem_advise_enum { + HIP_MEM_ADVISE_SET_READ_MOSTLY = 1, + HIP_MEM_ADVISE_UNSET_READ_MOSTLY = 2, + HIP_MEM_ADVISE_SET_PREFERRED_LOCATION = 3, + HIP_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4, + HIP_MEM_ADVISE_SET_ACCESSED_BY = 5, + HIP_MEM_ADVISE_UNSET_ACCESSED_BY = 6, +} HIPmem_advise; + +typedef enum HIPmem_range_attribute_enum { + HIP_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1, + HIP_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2, + HIP_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3, + HIP_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4, +} HIPmem_range_attribute; + +typedef enum hipJitOption { + hipJitOptionMaxRegisters = 0, + hipJitOptionThreadsPerBlock, + hipJitOptionWallTime, + hipJitOptionInfoLogBuffer, + hipJitOptionInfoLogBufferSizeBytes, + hipJitOptionErrorLogBuffer, + hipJitOptionErrorLogBufferSizeBytes, + hipJitOptionOptimizationLevel, + hipJitOptionTargetFromContext, + hipJitOptionTarget, + hipJitOptionFallbackStrategy, + hipJitOptionGenerateDebugInfo, + hipJitOptionLogVerbose, + hipJitOptionGenerateLineInfo, + hipJitOptionCacheMode, + hipJitOptionSm3xOpt, + hipJitOptionFastCompile, + hipJitOptionNumOptions, +} hipJitOption; + +typedef enum HIPjit_target_enum { + HIP_TARGET_COMPUTE_20 = 20, + HIP_TARGET_COMPUTE_21 = 21, + HIP_TARGET_COMPUTE_30 = 30, + HIP_TARGET_COMPUTE_32 = 32, + HIP_TARGET_COMPUTE_35 = 35, + HIP_TARGET_COMPUTE_37 = 37, + HIP_TARGET_COMPUTE_50 = 50, + HIP_TARGET_COMPUTE_52 = 52, + HIP_TARGET_COMPUTE_53 = 53, + HIP_TARGET_COMPUTE_60 = 60, + HIP_TARGET_COMPUTE_61 = 61, + HIP_TARGET_COMPUTE_62 = 62, + HIP_TARGET_COMPUTE_70 = 70, + HIP_TARGET_COMPUTE_73 = 73, + HIP_TARGET_COMPUTE_75 = 75, +} HIPjit_target; + +typedef enum HIPjit_fallback_enum { + HIP_PREFER_PTX = 0, + HIP_PREFER_BINARY, +} HIPjit_fallback; + +typedef enum HIPjit_cacheMode_enum { + HIP_JIT_CACHE_OPTION_NONE = 0, + HIP_JIT_CACHE_OPTION_CG, + HIP_JIT_CACHE_OPTION_CA, +} HIPjit_cacheMode; + +typedef enum HIPjitInputType_enum { + 
HIP_JIT_INPUT_HIPBIN = 0, + HIP_JIT_INPUT_PTX, + HIP_JIT_INPUT_FATBINARY, + HIP_JIT_INPUT_OBJECT, + HIP_JIT_INPUT_LIBRARY, + HIP_JIT_NUM_INPUT_TYPES, +} HIPjitInputType; + +typedef struct HIPlinkState_st* HIPlinkState; + +typedef enum hipGLDeviceList { + hipGLDeviceListAll = 1, ///< All hip devices used by current OpenGL context. + hipGLDeviceListCurrentFrame = 2, ///< Hip devices used by current OpenGL context in current + ///< frame + hipGLDeviceListNextFrame = 3 ///< Hip devices used by current OpenGL context in next + ///< frame. +} hipGLDeviceList; + +typedef enum hipGraphicsRegisterFlags { + hipGraphicsRegisterFlagsNone = 0, + hipGraphicsRegisterFlagsReadOnly = 1, ///< HIP will not write to this registered resource + hipGraphicsRegisterFlagsWriteDiscard = + 2, ///< HIP will only write and will not read from this registered resource + hipGraphicsRegisterFlagsSurfaceLoadStore = 4, ///< HIP will bind this resource to a surface + hipGraphicsRegisterFlagsTextureGather = + 8 ///< HIP will perform texture gather operations on this registered resource +} hipGraphicsRegisterFlags; + +typedef enum HIPgraphicsRegisterFlags_enum { + HIP_GRAPHICS_REGISTER_FLAGS_NONE = 0x00, + HIP_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01, + HIP_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02, + HIP_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04, + HIP_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08, +} HIPgraphicsRegisterFlags; + +typedef enum HIPgraphicsMapResourceFlags_enum { + HIP_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00, + HIP_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, + HIP_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02, +} HIPgraphicsMapResourceFlags; + +typedef enum HIParray_cubemap_face_enum { + HIP_HIPBEMAP_FACE_POSITIVE_X = 0x00, + HIP_HIPBEMAP_FACE_NEGATIVE_X = 0x01, + HIP_HIPBEMAP_FACE_POSITIVE_Y = 0x02, + HIP_HIPBEMAP_FACE_NEGATIVE_Y = 0x03, + HIP_HIPBEMAP_FACE_POSITIVE_Z = 0x04, + HIP_HIPBEMAP_FACE_NEGATIVE_Z = 0x05, +} HIParray_cubemap_face; + +typedef enum hipLimit_t { + 
HIP_LIMIT_STACK_SIZE = 0x00, + HIP_LIMIT_PRINTF_FIFO_SIZE = 0x01, + hipLimitMallocHeapSize = 0x02, + HIP_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x03, + HIP_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, + HIP_LIMIT_MAX, +} hipLimit_t; + +typedef enum hipResourceType { + hipResourceTypeArray = 0x00, + hipResourceTypeMipmappedArray = 0x01, + hipResourceTypeLinear = 0x02, + hipResourceTypePitch2D = 0x03, +} hipResourceType; + +typedef enum hipError_t { + hipSuccess = 0, + hipErrorInvalidValue = 1, + hipErrorOutOfMemory = 2, + hipErrorNotInitialized = 3, + hipErrorDeinitialized = 4, + hipErrorProfilerDisabled = 5, + hipErrorProfilerNotInitialized = 6, + hipErrorProfilerAlreadyStarted = 7, + hipErrorProfilerAlreadyStopped = 8, + hipErrorNoDevice = 100, + hipErrorInvalidDevice = 101, + hipErrorInvalidImage = 200, + hipErrorInvalidContext = 201, + hipErrorContextAlreadyCurrent = 202, + hipErrorMapFailed = 205, + hipErrorUnmapFailed = 206, + hipErrorArrayIsMapped = 207, + hipErrorAlreadyMapped = 208, + hipErrorNoBinaryForGpu = 209, + hipErrorAlreadyAcquired = 210, + hipErrorNotMapped = 211, + hipErrorNotMappedAsArray = 212, + hipErrorNotMappedAsPointer = 213, + hipErrorECCNotCorrectable = 214, + hipErrorUnsupportedLimit = 215, + hipErrorContextAlreadyInUse = 216, + hipErrorPeerAccessUnsupported = 217, + hipErrorInvalidKernelFile = 218, + hipErrorInvalidGraphicsContext = 219, + hipErrorInvalidSource = 300, + hipErrorFileNotFound = 301, + hipErrorSharedObjectSymbolNotFound = 302, + hipErrorSharedObjectInitFailed = 303, + hipErrorOperatingSystem = 304, + hipErrorInvalidHandle = 400, + hipErrorNotFound = 500, + hipErrorNotReady = 600, + hipErrorIllegalAddress = 700, + hipErrorLaunchOutOfResources = 701, + hipErrorLaunchTimeOut = 702, + hipErrorPeerAccessAlreadyEnabled = 704, + hipErrorPeerAccessNotEnabled = 705, + hipErrorSetOnActiveProcess = 708, + hipErrorAssert = 710, + hipErrorHostMemoryAlreadyRegistered = 712, + hipErrorHostMemoryNotRegistered = 713, + hipErrorLaunchFailure = 
719, + hipErrorCooperativeLaunchTooLarge = 720, + hipErrorNotSupported = 801, + hipErrorUnknown = 999, +} hipError_t; + +/** + * Stream CallBack struct + */ +typedef void (*hipStreamCallback_t)(hipStream_t stream, hipError_t status, void* userData); + +typedef enum HIPdevice_P2PAttribute_enum { + HIP_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01, + HIP_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02, + HIP_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03, + HIP_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED = 0x04, +} HIPdevice_P2PAttribute; + +typedef struct hipGraphicsResource_st* hipGraphicsResource; + +typedef struct hip_Memcpy2D { + size_t srcXInBytes; + size_t srcY; + hipMemoryType srcMemoryType; + const void* srcHost; + hipDeviceptr_t srcDevice; + hArray * srcArray; + size_t srcPitch; + size_t dstXInBytes; + size_t dstY; + hipMemoryType dstMemoryType; + void* dstHost; + hipDeviceptr_t dstDevice; + hArray * dstArray; + size_t dstPitch; + size_t WidthInBytes; + size_t Height; +} hip_Memcpy2D; + +typedef enum hipDeviceP2PAttr { + hipDevP2PAttrPerformanceRank = 0, + hipDevP2PAttrAccessSupported, + hipDevP2PAttrNativeAtomicSupported, + hipDevP2PAttrHipArrayAccessSupported +} hipDeviceP2PAttr; + +typedef struct HIP_MEMCPY3D { + size_t srcXInBytes; + size_t srcY; + size_t srcZ; + size_t srcLOD; + hipMemoryType srcMemoryType; + const void* srcHost; + hipDeviceptr_t srcDevice; + hArray * srcArray; + void* reserved0; + size_t srcPitch; + size_t srcHeight; + size_t dstXInBytes; + size_t dstY; + size_t dstZ; + size_t dstLOD; + hipMemoryType dstMemoryType; + void* dstHost; + hipDeviceptr_t dstDevice; + hArray * dstArray; + void* reserved1; + size_t dstPitch; + size_t dstHeight; + size_t WidthInBytes; + size_t Height; + size_t Depth; +} HIP_MEMCPY3D; + +typedef struct HIP_MEMCPY3D_PEER_st { + size_t srcXInBytes; + size_t srcY; + size_t srcZ; + size_t srcLOD; + hipMemoryType srcMemoryType; + const void* srcHost; + hipDeviceptr_t srcDevice; + hArray * srcArray; + 
hipCtx_t srcContext; + size_t srcPitch; + size_t srcHeight; + size_t dstXInBytes; + size_t dstY; + size_t dstZ; + size_t dstLOD; + hipMemoryType dstMemoryType; + void* dstHost; + hipDeviceptr_t dstDevice; + hArray * dstArray; + hipCtx_t dstContext; + size_t dstPitch; + size_t dstHeight; + size_t WidthInBytes; + size_t Height; + size_t Depth; +} HIP_MEMCPY3D_PEER; + +typedef struct HIP_ARRAY_DESCRIPTOR { + size_t Width; + size_t Height; + hipArray_Format Format; + unsigned int NumChannels; +} HIP_ARRAY_DESCRIPTOR; + +typedef struct HIP_ARRAY3D_DESCRIPTOR { + size_t Width; + size_t Height; + size_t Depth; + hipArray_Format Format; + unsigned int NumChannels; + unsigned int Flags; +} HIP_ARRAY3D_DESCRIPTOR; + +typedef struct HIP_RESOURCE_DESC_st { + hipResourceType resType; + union { + struct { + hArray * h_Array; + } array; + struct { + hipMipmappedArray_t hMipmappedArray; + } mipmap; + struct { + hipDeviceptr_t devPtr; + hipArray_Format format; + unsigned int numChannels; + size_t sizeInBytes; + } linear; + struct { + hipDeviceptr_t devPtr; + hipArray_Format format; + unsigned int numChannels; + size_t width; + size_t height; + size_t pitchInBytes; + } pitch2D; + struct { + int reserved[32]; + } reserved; + } res; + unsigned int flags; +} hipResourceDesc; + +/** + * hip texture resource view formats + */ +typedef enum hipResourceViewFormat { + hipResViewFormatNone = 0x00, + hipResViewFormatUnsignedChar1 = 0x01, + hipResViewFormatUnsignedChar2 = 0x02, + hipResViewFormatUnsignedChar4 = 0x03, + hipResViewFormatSignedChar1 = 0x04, + hipResViewFormatSignedChar2 = 0x05, + hipResViewFormatSignedChar4 = 0x06, + hipResViewFormatUnsignedShort1 = 0x07, + hipResViewFormatUnsignedShort2 = 0x08, + hipResViewFormatUnsignedShort4 = 0x09, + hipResViewFormatSignedShort1 = 0x0a, + hipResViewFormatSignedShort2 = 0x0b, + hipResViewFormatSignedShort4 = 0x0c, + hipResViewFormatUnsignedInt1 = 0x0d, + hipResViewFormatUnsignedInt2 = 0x0e, + hipResViewFormatUnsignedInt4 = 0x0f, + 
hipResViewFormatSignedInt1 = 0x10, + hipResViewFormatSignedInt2 = 0x11, + hipResViewFormatSignedInt4 = 0x12, + hipResViewFormatHalf1 = 0x13, + hipResViewFormatHalf2 = 0x14, + hipResViewFormatHalf4 = 0x15, + hipResViewFormatFloat1 = 0x16, + hipResViewFormatFloat2 = 0x17, + hipResViewFormatFloat4 = 0x18, + hipResViewFormatUnsignedBlockCompressed1 = 0x19, + hipResViewFormatUnsignedBlockCompressed2 = 0x1a, + hipResViewFormatUnsignedBlockCompressed3 = 0x1b, + hipResViewFormatUnsignedBlockCompressed4 = 0x1c, + hipResViewFormatSignedBlockCompressed4 = 0x1d, + hipResViewFormatUnsignedBlockCompressed5 = 0x1e, + hipResViewFormatSignedBlockCompressed5 = 0x1f, + hipResViewFormatUnsignedBlockCompressed6H = 0x20, + hipResViewFormatSignedBlockCompressed6H = 0x21, + hipResViewFormatUnsignedBlockCompressed7 = 0x22 +}hipResourceViewFormat; + +typedef enum HIPresourceViewFormat_enum +{ + HIP_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */ + HIP_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 
channel unsigned 32-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */ + HIP_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */ + HIP_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */ + HIP_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */ + HIP_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */ + HIP_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */ + HIP_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */ + HIP_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */ + HIP_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */ + HIP_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */ + HIP_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */ + HIP_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */ + HIP_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */ +} HIPresourceViewFormat; + +/** + * hip resource view descriptor + */ +struct hipResourceViewDesc { + enum hipResourceViewFormat format; + size_t width; + size_t height; + size_t depth; + unsigned int firstMipmapLevel; + unsigned int lastMipmapLevel; + unsigned int firstLayer; + unsigned int 
lastLayer; +}; + + +typedef struct hipTextureDesc_st { + hipTextureAddressMode addressMode[3]; + hipTextureFilterMode filterMode; + unsigned int flags; + unsigned int maxAnisotropy; + hipTextureFilterMode mipmapFilterMode; + float mipmapLevelBias; + float minMipmapLevelClamp; + float maxMipmapLevelClamp; + float borderColor[4]; + int reserved[12]; +} hipTextureDesc; + +/** + * Resource view descriptor + */ +typedef struct HIP_RESOURCE_VIEW_DESC_st { + hipResourceViewFormat format; + size_t width; + size_t height; + size_t depth; + unsigned int firstMipmapLevel; + unsigned int lastMipmapLevel; + unsigned int firstLayer; + unsigned int lastLayer; + unsigned int reserved[16]; +} HIP_RESOURCE_VIEW_DESC; + +typedef struct HIP_POINTER_ATTRIBUTE_P2P_TOKENS_st { + unsigned long long p2pToken; + unsigned int vaSpaceToken; +} HIP_POINTER_ATTRIBUTE_P2P_TOKENS; + + +typedef unsigned int GLenum; +typedef unsigned int GLuint; +typedef int GLint; + +typedef enum HIPGLDeviceList_enum { + HIP_GL_DEVICE_LIST_ALL = 0x01, + HIP_GL_DEVICE_LIST_CURRENT_FRAME = 0x02, + HIP_GL_DEVICE_LIST_NEXT_FRAME = 0x03, +} HIPGLDeviceList; + +typedef enum HIPGLmap_flags_enum { + HIP_GL_MAP_RESOURCE_FLAGS_NONE = 0x00, + HIP_GL_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, + HIP_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02, +} HIPGLmap_flags; + + +/* Function types. 
*/ +typedef hipError_t HIPAPI thipGetErrorName(hipError_t error, const char** pStr); +typedef hipError_t HIPAPI thipInit(unsigned int Flags); +typedef hipError_t HIPAPI thipDriverGetVersion(int* driverVersion); +typedef hipError_t HIPAPI thipGetDevice(hipDevice_t* device, int ordinal); +typedef hipError_t HIPAPI thipGetDeviceCount(int* count); +typedef hipError_t HIPAPI thipDeviceGetName(char* name, int len, hipDevice_t dev); +typedef hipError_t HIPAPI thipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attrib, hipDevice_t dev); +typedef hipError_t HIPAPI thipDeviceComputeCapability(int* major, int* minor, hipDevice_t dev); +typedef hipError_t HIPAPI thipDevicePrimaryCtxRetain(hipCtx_t* pctx, hipDevice_t dev); +typedef hipError_t HIPAPI thipDevicePrimaryCtxRelease(hipDevice_t dev); +typedef hipError_t HIPAPI thipDevicePrimaryCtxSetFlags(hipDevice_t dev, unsigned int flags); +typedef hipError_t HIPAPI thipDevicePrimaryCtxGetState(hipDevice_t dev, unsigned int* flags, int* active); +typedef hipError_t HIPAPI thipDevicePrimaryCtxReset(hipDevice_t dev); +typedef hipError_t HIPAPI thipCtxCreate(hipCtx_t* pctx, unsigned int flags, hipDevice_t dev); +typedef hipError_t HIPAPI thipCtxDestroy(hipCtx_t ctx); +typedef hipError_t HIPAPI thipCtxPushCurrent(hipCtx_t ctx); +typedef hipError_t HIPAPI thipCtxPopCurrent(hipCtx_t* pctx); +typedef hipError_t HIPAPI thipCtxSetCurrent(hipCtx_t ctx); +typedef hipError_t HIPAPI thipCtxGetCurrent(hipCtx_t* pctx); +typedef hipError_t HIPAPI thipCtxGetDevice(hipDevice_t* device); +typedef hipError_t HIPAPI thipCtxGetFlags(unsigned int* flags); +typedef hipError_t HIPAPI thipCtxSynchronize(void); +typedef hipError_t HIPAPI thipDeviceSynchronize(void); +typedef hipError_t HIPAPI thipCtxGetCacheConfig(hipFuncCache_t* pconfig); +typedef hipError_t HIPAPI thipCtxSetCacheConfig(hipFuncCache_t config); +typedef hipError_t HIPAPI thipCtxGetSharedMemConfig(hipSharedMemConfig* pConfig); +typedef hipError_t HIPAPI 
thipCtxSetSharedMemConfig(hipSharedMemConfig config); +typedef hipError_t HIPAPI thipCtxGetApiVersion(hipCtx_t ctx, unsigned int* version); +typedef hipError_t HIPAPI thipModuleLoad(hipModule_t* module, const char* fname); +typedef hipError_t HIPAPI thipModuleLoadData(hipModule_t* module, const void* image); +typedef hipError_t HIPAPI thipModuleLoadDataEx(hipModule_t* module, const void* image, unsigned int numOptions, hipJitOption* options, void** optionValues); +typedef hipError_t HIPAPI thipModuleUnload(hipModule_t hmod); +typedef hipError_t HIPAPI thipModuleGetFunction(hipFunction_t* hfunc, hipModule_t hmod, const char* name); +typedef hipError_t HIPAPI thipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod, const char* name); +typedef hipError_t HIPAPI thipModuleGetTexRef(textureReference** pTexRef, hipModule_t hmod, const char* name); +typedef hipError_t HIPAPI thipMemGetInfo(size_t* free, size_t* total); +typedef hipError_t HIPAPI thipMalloc(hipDeviceptr_t* dptr, size_t bytesize); +typedef hipError_t HIPAPI thipMemAllocPitch(hipDeviceptr_t* dptr, size_t* pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes); +typedef hipError_t HIPAPI thipFree(hipDeviceptr_t dptr); +typedef hipError_t HIPAPI thipMemGetAddressRange(hipDeviceptr_t* pbase, size_t* psize, hipDeviceptr_t dptr); +typedef hipError_t HIPAPI thipHostMalloc(void** pp, size_t bytesize); +typedef hipError_t HIPAPI thipHostFree(void* p); +typedef hipError_t HIPAPI thipMemHostAlloc(void** pp, size_t bytesize, unsigned int Flags); +typedef hipError_t HIPAPI thipHostGetDevicePointer(hipDeviceptr_t* pdptr, void* p, unsigned int Flags); +typedef hipError_t HIPAPI thipHostGetFlags(unsigned int* pFlags, void* p); +typedef hipError_t HIPAPI thipMallocManaged(hipDeviceptr_t* dptr, size_t bytesize, unsigned int flags); +typedef hipError_t HIPAPI thipDeviceGetByPCIBusId(hipDevice_t* dev, const char* pciBusId); +typedef hipError_t HIPAPI thipDeviceGetPCIBusId(char* pciBusId, 
int len, hipDevice_t dev); +typedef hipError_t HIPAPI thipMemHostUnregister(void* p); +typedef hipError_t HIPAPI thipMemcpy(hipDeviceptr_t dst, hipDeviceptr_t src, size_t ByteCount); +typedef hipError_t HIPAPI thipMemcpyPeer(hipDeviceptr_t dstDevice, hipCtx_t dstContext, hipDeviceptr_t srcDevice, hipCtx_t srcContext, size_t ByteCount); +typedef hipError_t HIPAPI thipMemcpyHtoD(hipDeviceptr_t dstDevice, void* srcHost, size_t ByteCount); +typedef hipError_t HIPAPI thipMemcpyDtoH(void* dstHost, hipDeviceptr_t srcDevice, size_t ByteCount); +typedef hipError_t HIPAPI thipMemcpyDtoD(hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount); +typedef hipError_t HIPAPI thipDrvMemcpy2DUnaligned(const hip_Memcpy2D* pCopy); +typedef hipError_t HIPAPI thipMemcpyParam2D(const hip_Memcpy2D* pCopy); +typedef hipError_t HIPAPI thipDrvMemcpy3D(const HIP_MEMCPY3D* pCopy); +typedef hipError_t HIPAPI thipMemcpyHtoDAsync(hipDeviceptr_t dstDevice, const void* srcHost, size_t ByteCount, hipStream_t hStream); +typedef hipError_t HIPAPI thipMemcpyDtoHAsync(void* dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream); +typedef hipError_t HIPAPI thipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, hipStream_t hStream); +typedef hipError_t HIPAPI thipDrvMemcpy3DAsync(const HIP_MEMCPY3D* pCopy, hipStream_t hStream); +typedef hipError_t HIPAPI thipMemsetD8(hipDeviceptr_t dstDevice, unsigned char uc, size_t N); +typedef hipError_t HIPAPI thipMemsetD16(hipDeviceptr_t dstDevice, unsigned short us, size_t N); +typedef hipError_t HIPAPI thipMemsetD32(hipDeviceptr_t dstDevice, unsigned int ui, size_t N); +typedef hipError_t HIPAPI thipMemsetD8Async(hipDeviceptr_t dstDevice, unsigned char uc, size_t N, hipStream_t hStream); +typedef hipError_t HIPAPI thipMemsetD16Async(hipDeviceptr_t dstDevice, unsigned short us, size_t N, hipStream_t hStream); +typedef hipError_t HIPAPI thipMemsetD32Async(hipDeviceptr_t dstDevice, unsigned int ui, size_t N, hipStream_t hStream); +typedef 
hipError_t HIPAPI thipMemsetD2D8Async(hipDeviceptr_t dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, hipStream_t hStream); +typedef hipError_t HIPAPI thipMemsetD2D16Async(hipDeviceptr_t dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, hipStream_t hStream); +typedef hipError_t HIPAPI thipMemsetD2D32Async(hipDeviceptr_t dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, hipStream_t hStream); +typedef hipError_t HIPAPI thipArrayCreate(hArray ** pHandle, const HIP_ARRAY_DESCRIPTOR* pAllocateArray); +typedef hipError_t HIPAPI thipArrayDestroy(hArray hArray); +typedef hipError_t HIPAPI thipArray3DCreate(hArray * pHandle, const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray); +typedef hipError_t HIPAPI hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void* ptr); +typedef hipError_t HIPAPI thipStreamCreateWithFlags(hipStream_t* phStream, unsigned int Flags); +typedef hipError_t HIPAPI thipStreamCreateWithPriority(hipStream_t* phStream, unsigned int flags, int priority); +typedef hipError_t HIPAPI thipStreamGetPriority(hipStream_t hStream, int* priority); +typedef hipError_t HIPAPI thipStreamGetFlags(hipStream_t hStream, unsigned int* flags); +typedef hipError_t HIPAPI thipStreamWaitEvent(hipStream_t hStream, hipEvent_t hEvent, unsigned int Flags); +typedef hipError_t HIPAPI thipStreamAddCallback(hipStream_t hStream, hipStreamCallback_t callback, void* userData, unsigned int flags); +typedef hipError_t HIPAPI thipStreamQuery(hipStream_t hStream); +typedef hipError_t HIPAPI thipStreamSynchronize(hipStream_t hStream); +typedef hipError_t HIPAPI thipStreamDestroy(hipStream_t hStream); +typedef hipError_t HIPAPI thipEventCreateWithFlags(hipEvent_t* phEvent, unsigned int Flags); +typedef hipError_t HIPAPI thipEventRecord(hipEvent_t hEvent, hipStream_t hStream); +typedef hipError_t HIPAPI thipEventQuery(hipEvent_t hEvent); +typedef hipError_t HIPAPI thipEventSynchronize(hipEvent_t hEvent); 
+typedef hipError_t HIPAPI thipEventDestroy(hipEvent_t hEvent); +typedef hipError_t HIPAPI thipEventElapsedTime(float* pMilliseconds, hipEvent_t hStart, hipEvent_t hEnd); +typedef hipError_t HIPAPI thipFuncGetAttribute(int* pi, hipFunction_attribute attrib, hipFunction_t hfunc); +typedef hipError_t HIPAPI thipFuncSetCacheConfig(hipFunction_t hfunc, hipFuncCache_t config); +typedef hipError_t HIPAPI thipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra); +typedef hipError_t HIPAPI thipDrvOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, hipFunction_t func, int blockSize, size_t dynamicSMemSize); +typedef hipError_t HIPAPI thipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, hipFunction_t func, int blockSize, size_t dynamicSMemSize, unsigned int flags); +typedef hipError_t HIPAPI thipModuleOccupancyMaxPotentialBlockSize(int* minGridSize, int* blockSize, hipFunction_t func, size_t dynamicSMemSize, int blockSizeLimit); +typedef hipError_t HIPAPI thipTexRefSetArray(hipTexRef hTexRef, hArray * hArray, unsigned int Flags); +typedef hipError_t HIPAPI thipTexRefSetAddress(size_t* ByteOffset, hipTexRef hTexRef, hipDeviceptr_t dptr, size_t bytes); +typedef hipError_t HIPAPI thipTexRefSetAddress2D(hipTexRef hTexRef, const HIP_ARRAY_DESCRIPTOR* desc, hipDeviceptr_t dptr, size_t Pitch); +typedef hipError_t HIPAPI thipTexRefSetFormat(hipTexRef hTexRef, hipArray_Format fmt, int NumPackedComponents); +typedef hipError_t HIPAPI thipTexRefSetAddressMode(hipTexRef hTexRef, int dim, hipTextureAddressMode am); +typedef hipError_t HIPAPI thipTexRefSetFilterMode(hipTexRef hTexRef, hipTextureFilterMode fm); +typedef hipError_t HIPAPI thipTexRefSetFlags(hipTexRef hTexRef, unsigned int Flags); +typedef hipError_t HIPAPI thipTexRefGetAddress(hipDeviceptr_t* 
pdptr, hipTexRef hTexRef); +typedef hipError_t HIPAPI thipTexRefGetArray(hArray ** phArray, hipTexRef hTexRef); +typedef hipError_t HIPAPI thipTexRefGetAddressMode(hipTextureAddressMode* pam, hipTexRef hTexRef, int dim); +typedef hipError_t HIPAPI thipTexObjectCreate(hipTextureObject_t* pTexObject, const hipResourceDesc* pResDesc, const hipTextureDesc* pTexDesc, const HIP_RESOURCE_VIEW_DESC* pResViewDesc); +typedef hipError_t HIPAPI thipTexObjectDestroy(hipTextureObject_t texObject); +typedef hipError_t HIPAPI thipDeviceCanAccessPeer(int* canAccessPeer, hipDevice_t dev, hipDevice_t peerDev); +typedef hipError_t HIPAPI thipCtxEnablePeerAccess(hipCtx_t peerContext, unsigned int Flags); +typedef hipError_t HIPAPI thipCtxDisablePeerAccess(hipCtx_t peerContext); +typedef hipError_t HIPAPI thipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attrib, hipDevice_t srcDevice, hipDevice_t dstDevice); +typedef hipError_t HIPAPI thipGraphicsUnregisterResource(hipGraphicsResource resource); +typedef hipError_t HIPAPI thipGraphicsResourceGetMappedMipmappedArray(hipMipmappedArray_t* pMipmappedArray, hipGraphicsResource resource); +typedef hipError_t HIPAPI thipGraphicsResourceGetMappedPointer(hipDeviceptr_t* pDevPtr, size_t* pSize, hipGraphicsResource resource); +typedef hipError_t HIPAPI thipGraphicsMapResources(unsigned int count, hipGraphicsResource* resources, hipStream_t hStream); +typedef hipError_t HIPAPI thipGraphicsUnmapResources(unsigned int count, hipGraphicsResource* resources, hipStream_t hStream); +typedef hipError_t HIPAPI thipGraphicsGLRegisterBuffer(hipGraphicsResource* pCudaResource, GLuint buffer, unsigned int Flags); +typedef hipError_t HIPAPI thipGLGetDevices(unsigned int* pHipDeviceCount, int* pHipDevices, unsigned int hipDeviceCount, hipGLDeviceList deviceList); + + +/* Function declarations. 
*/ +extern thipGetErrorName *hipGetErrorName; +extern thipInit *hipInit; +extern thipDriverGetVersion *hipDriverGetVersion; +extern thipGetDevice *hipGetDevice; +extern thipGetDeviceCount *hipGetDeviceCount; +extern thipDeviceGetName *hipDeviceGetName; +extern thipDeviceGetAttribute *hipDeviceGetAttribute; +extern thipDeviceComputeCapability *hipDeviceComputeCapability; +extern thipDevicePrimaryCtxRetain *hipDevicePrimaryCtxRetain; +extern thipDevicePrimaryCtxRelease *hipDevicePrimaryCtxRelease; +extern thipDevicePrimaryCtxSetFlags *hipDevicePrimaryCtxSetFlags; +extern thipDevicePrimaryCtxGetState *hipDevicePrimaryCtxGetState; +extern thipDevicePrimaryCtxReset *hipDevicePrimaryCtxReset; +extern thipCtxCreate *hipCtxCreate; +extern thipCtxDestroy *hipCtxDestroy; +extern thipCtxPushCurrent *hipCtxPushCurrent; +extern thipCtxPopCurrent *hipCtxPopCurrent; +extern thipCtxSetCurrent *hipCtxSetCurrent; +extern thipCtxGetCurrent *hipCtxGetCurrent; +extern thipCtxGetDevice *hipCtxGetDevice; +extern thipCtxGetFlags *hipCtxGetFlags; +extern thipCtxSynchronize *hipCtxSynchronize; +extern thipDeviceSynchronize *hipDeviceSynchronize; +extern thipCtxGetCacheConfig *hipCtxGetCacheConfig; +extern thipCtxSetCacheConfig *hipCtxSetCacheConfig; +extern thipCtxGetSharedMemConfig *hipCtxGetSharedMemConfig; +extern thipCtxSetSharedMemConfig *hipCtxSetSharedMemConfig; +extern thipCtxGetApiVersion *hipCtxGetApiVersion; +extern thipModuleLoad *hipModuleLoad; +extern thipModuleLoadData *hipModuleLoadData; +extern thipModuleLoadDataEx *hipModuleLoadDataEx; +extern thipModuleUnload *hipModuleUnload; +extern thipModuleGetFunction *hipModuleGetFunction; +extern thipModuleGetGlobal *hipModuleGetGlobal; +extern thipModuleGetTexRef *hipModuleGetTexRef; +extern thipMemGetInfo *hipMemGetInfo; +extern thipMalloc *hipMalloc; +extern thipMemAllocPitch *hipMemAllocPitch; +extern thipFree *hipFree; +extern thipMemGetAddressRange *hipMemGetAddressRange; +extern thipHostMalloc *hipHostMalloc; +extern 
thipHostFree *hipHostFree; +extern thipHostGetDevicePointer *hipHostGetDevicePointer; +extern thipHostGetFlags *hipHostGetFlags; +extern thipMallocManaged *hipMallocManaged; +extern thipDeviceGetByPCIBusId *hipDeviceGetByPCIBusId; +extern thipDeviceGetPCIBusId *hipDeviceGetPCIBusId; +extern thipMemcpyPeer *hipMemcpyPeer; +extern thipMemcpyHtoD *hipMemcpyHtoD; +extern thipMemcpyDtoH *hipMemcpyDtoH; +extern thipMemcpyDtoD *hipMemcpyDtoD; +extern thipDrvMemcpy2DUnaligned *hipDrvMemcpy2DUnaligned; +extern thipMemcpyParam2D *hipMemcpyParam2D; +extern thipDrvMemcpy3D *hipDrvMemcpy3D; +extern thipMemcpyHtoDAsync *hipMemcpyHtoDAsync; +extern thipMemcpyDtoHAsync *hipMemcpyDtoHAsync; +extern thipMemcpyParam2DAsync *hipMemcpyParam2DAsync; +extern thipDrvMemcpy3DAsync *hipDrvMemcpy3DAsync; +extern thipMemsetD8 *hipMemsetD8; +extern thipMemsetD16 *hipMemsetD16; +extern thipMemsetD32 *hipMemsetD32; +extern thipMemsetD8Async *hipMemsetD8Async; +extern thipMemsetD16Async *hipMemsetD16Async; +extern thipMemsetD32Async *hipMemsetD32Async; +extern thipArrayCreate *hipArrayCreate; +extern thipArrayDestroy *hipArrayDestroy; +extern thipArray3DCreate *hipArray3DCreate; +extern thipStreamCreateWithFlags *hipStreamCreateWithFlags; +extern thipStreamCreateWithPriority *hipStreamCreateWithPriority; +extern thipStreamGetPriority *hipStreamGetPriority; +extern thipStreamGetFlags *hipStreamGetFlags; +extern thipStreamWaitEvent *hipStreamWaitEvent; +extern thipStreamAddCallback *hipStreamAddCallback; +extern thipStreamQuery *hipStreamQuery; +extern thipStreamSynchronize *hipStreamSynchronize; +extern thipStreamDestroy *hipStreamDestroy; +extern thipEventCreateWithFlags *hipEventCreateWithFlags; +extern thipEventRecord *hipEventRecord; +extern thipEventQuery *hipEventQuery; +extern thipEventSynchronize *hipEventSynchronize; +extern thipEventDestroy *hipEventDestroy; +extern thipEventElapsedTime *hipEventElapsedTime; +extern thipFuncGetAttribute *hipFuncGetAttribute; +extern 
thipFuncSetCacheConfig *hipFuncSetCacheConfig; +extern thipModuleLaunchKernel *hipModuleLaunchKernel; +extern thipDrvOccupancyMaxActiveBlocksPerMultiprocessor *hipDrvOccupancyMaxActiveBlocksPerMultiprocessor; +extern thipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags *hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags; +extern thipModuleOccupancyMaxPotentialBlockSize *hipModuleOccupancyMaxPotentialBlockSize; +extern thipTexRefSetArray *hipTexRefSetArray; +extern thipTexRefSetAddress *hipTexRefSetAddress; +extern thipTexRefSetAddress2D *hipTexRefSetAddress2D; +extern thipTexRefSetFormat *hipTexRefSetFormat; +extern thipTexRefSetAddressMode *hipTexRefSetAddressMode; +extern thipTexRefSetFilterMode *hipTexRefSetFilterMode; +extern thipTexRefSetFlags *hipTexRefSetFlags; +extern thipTexRefGetAddress *hipTexRefGetAddress; +extern thipTexRefGetArray *hipTexRefGetArray; +extern thipTexRefGetAddressMode *hipTexRefGetAddressMode; +extern thipTexObjectCreate *hipTexObjectCreate; +extern thipTexObjectDestroy *hipTexObjectDestroy; +extern thipDeviceCanAccessPeer *hipDeviceCanAccessPeer; +extern thipCtxEnablePeerAccess *hipCtxEnablePeerAccess; +extern thipCtxDisablePeerAccess *hipCtxDisablePeerAccess; +extern thipDeviceGetP2PAttribute *hipDeviceGetP2PAttribute; +extern thipGraphicsUnregisterResource *hipGraphicsUnregisterResource; +extern thipGraphicsResourceGetMappedMipmappedArray *hipGraphicsResourceGetMappedMipmappedArray; +extern thipGraphicsResourceGetMappedPointer *hipGraphicsResourceGetMappedPointer; +extern thipGraphicsMapResources *hipGraphicsMapResources; +extern thipGraphicsUnmapResources *hipGraphicsUnmapResources; + +extern thipGraphicsGLRegisterBuffer *hipGraphicsGLRegisterBuffer; +extern thipGLGetDevices *hipGLGetDevices; + + +enum { + HIPEW_SUCCESS = 0, + HIPEW_ERROR_OPEN_FAILED = -1, + HIPEW_ERROR_ATEXIT_FAILED = -2, +}; + +enum { + HIPEW_INIT_HIP = 1, +}; + +int hipewInit(hipuint32_t flags); +const char *hipewErrorString(hipError_t result); +const 
char *hipewCompilerPath(void); +int hipewCompilerVersion(void); +int hipewNvrtcVersion(void); + +#ifdef __cplusplus +} +#endif + +#endif /* __HIPEW_H__ */ diff --git a/extern/hipew/src/hipew.c b/extern/hipew/src/hipew.c new file mode 100644 index 00000000000..9d5a63f869a --- /dev/null +++ b/extern/hipew/src/hipew.c @@ -0,0 +1,533 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ +#ifdef _MSC_VER +# if _MSC_VER < 1900 +# define snprintf _snprintf +# endif +# define popen _popen +# define pclose _pclose +# define _CRT_SECURE_NO_WARNINGS +#endif + +#include <hipew.h> +#include <assert.h> +#include <stdio.h> +#include <string.h> +#include <sys/stat.h> + +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# define VC_EXTRALEAN +# include <windows.h> + +/* Utility macros. 
*/ + +typedef HMODULE DynamicLibrary; + +# define dynamic_library_open(path) LoadLibraryA(path) +# define dynamic_library_close(lib) FreeLibrary(lib) +# define dynamic_library_find(lib, symbol) GetProcAddress(lib, symbol) +#else +# include <dlfcn.h> + +typedef void* DynamicLibrary; + +# define dynamic_library_open(path) dlopen(path, RTLD_NOW) +# define dynamic_library_close(lib) dlclose(lib) +# define dynamic_library_find(lib, symbol) dlsym(lib, symbol) +#endif + +#define _LIBRARY_FIND_CHECKED(lib, name) \ + name = (t##name *)dynamic_library_find(lib, #name); \ + assert(name); + +#define _LIBRARY_FIND(lib, name) \ + name = (t##name *)dynamic_library_find(lib, #name); + +#define HIP_LIBRARY_FIND_CHECKED(name) \ + _LIBRARY_FIND_CHECKED(hip_lib, name) +#define HIP_LIBRARY_FIND(name) _LIBRARY_FIND(hip_lib, name) + + +static DynamicLibrary hip_lib; + +/* Function definitions. */ +thipGetErrorName *hipGetErrorName; +thipInit *hipInit; +thipDriverGetVersion *hipDriverGetVersion; +thipGetDevice *hipGetDevice; +thipGetDeviceCount *hipGetDeviceCount; +thipDeviceGetName *hipDeviceGetName; +thipDeviceGetAttribute *hipDeviceGetAttribute; +thipDeviceComputeCapability *hipDeviceComputeCapability; +thipDevicePrimaryCtxRetain *hipDevicePrimaryCtxRetain; +thipDevicePrimaryCtxRelease *hipDevicePrimaryCtxRelease; +thipDevicePrimaryCtxSetFlags *hipDevicePrimaryCtxSetFlags; +thipDevicePrimaryCtxGetState *hipDevicePrimaryCtxGetState; +thipDevicePrimaryCtxReset *hipDevicePrimaryCtxReset; +thipCtxCreate *hipCtxCreate; +thipCtxDestroy *hipCtxDestroy; +thipCtxPushCurrent *hipCtxPushCurrent; +thipCtxPopCurrent *hipCtxPopCurrent; +thipCtxSetCurrent *hipCtxSetCurrent; +thipCtxGetCurrent *hipCtxGetCurrent; +thipCtxGetDevice *hipCtxGetDevice; +thipCtxGetFlags *hipCtxGetFlags; +thipCtxSynchronize *hipCtxSynchronize; +thipDeviceSynchronize *hipDeviceSynchronize; +thipCtxGetCacheConfig *hipCtxGetCacheConfig; +thipCtxSetCacheConfig *hipCtxSetCacheConfig; +thipCtxGetSharedMemConfig 
*hipCtxGetSharedMemConfig; +thipCtxSetSharedMemConfig *hipCtxSetSharedMemConfig; +thipCtxGetApiVersion *hipCtxGetApiVersion; +thipModuleLoad *hipModuleLoad; +thipModuleLoadData *hipModuleLoadData; +thipModuleLoadDataEx *hipModuleLoadDataEx; +thipModuleUnload *hipModuleUnload; +thipModuleGetFunction *hipModuleGetFunction; +thipModuleGetGlobal *hipModuleGetGlobal; +thipModuleGetTexRef *hipModuleGetTexRef; +thipMemGetInfo *hipMemGetInfo; +thipMalloc *hipMalloc; +thipMemAllocPitch *hipMemAllocPitch; +thipFree *hipFree; +thipMemGetAddressRange *hipMemGetAddressRange; +thipHostMalloc *hipHostMalloc; +thipHostFree *hipHostFree; +thipHostGetDevicePointer *hipHostGetDevicePointer; +thipHostGetFlags *hipHostGetFlags; +thipMallocManaged *hipMallocManaged; +thipDeviceGetByPCIBusId *hipDeviceGetByPCIBusId; +thipDeviceGetPCIBusId *hipDeviceGetPCIBusId; +thipMemcpyPeer *hipMemcpyPeer; +thipMemcpyHtoD *hipMemcpyHtoD; +thipMemcpyDtoH *hipMemcpyDtoH; +thipMemcpyDtoD *hipMemcpyDtoD; +thipDrvMemcpy2DUnaligned *hipDrvMemcpy2DUnaligned; +thipMemcpyParam2D *hipMemcpyParam2D; +thipDrvMemcpy3D *hipDrvMemcpy3D; +thipMemcpyHtoDAsync *hipMemcpyHtoDAsync; +thipMemcpyDtoHAsync *hipMemcpyDtoHAsync; +thipMemcpyParam2DAsync *hipMemcpyParam2DAsync; +thipDrvMemcpy3DAsync *hipDrvMemcpy3DAsync; +thipMemsetD8 *hipMemsetD8; +thipMemsetD16 *hipMemsetD16; +thipMemsetD32 *hipMemsetD32; +thipMemsetD8Async *hipMemsetD8Async; +thipMemsetD16Async *hipMemsetD16Async; +thipMemsetD32Async *hipMemsetD32Async; +thipArrayCreate *hipArrayCreate; +thipArrayDestroy *hipArrayDestroy; +thipArray3DCreate *hipArray3DCreate; +thipStreamCreateWithFlags *hipStreamCreateWithFlags; +thipStreamCreateWithPriority *hipStreamCreateWithPriority; +thipStreamGetPriority *hipStreamGetPriority; +thipStreamGetFlags *hipStreamGetFlags; +thipStreamWaitEvent *hipStreamWaitEvent; +thipStreamAddCallback *hipStreamAddCallback; +thipStreamQuery *hipStreamQuery; +thipStreamSynchronize *hipStreamSynchronize; +thipStreamDestroy *hipStreamDestroy; 
+thipEventCreateWithFlags *hipEventCreateWithFlags; +thipEventRecord *hipEventRecord; +thipEventQuery *hipEventQuery; +thipEventSynchronize *hipEventSynchronize; +thipEventDestroy *hipEventDestroy; +thipEventElapsedTime *hipEventElapsedTime; +thipFuncGetAttribute *hipFuncGetAttribute; +thipFuncSetCacheConfig *hipFuncSetCacheConfig; +thipModuleLaunchKernel *hipModuleLaunchKernel; +thipDrvOccupancyMaxActiveBlocksPerMultiprocessor *hipDrvOccupancyMaxActiveBlocksPerMultiprocessor; +thipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags *hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags; +thipModuleOccupancyMaxPotentialBlockSize *hipModuleOccupancyMaxPotentialBlockSize; +thipTexRefSetArray *hipTexRefSetArray; +thipTexRefSetAddress *hipTexRefSetAddress; +thipTexRefSetAddress2D *hipTexRefSetAddress2D; +thipTexRefSetFormat *hipTexRefSetFormat; +thipTexRefSetAddressMode *hipTexRefSetAddressMode; +thipTexRefSetFilterMode *hipTexRefSetFilterMode; +thipTexRefSetFlags *hipTexRefSetFlags; +thipTexRefGetAddress *hipTexRefGetAddress; +thipTexRefGetArray *hipTexRefGetArray; +thipTexRefGetAddressMode *hipTexRefGetAddressMode; +thipTexObjectCreate *hipTexObjectCreate; +thipTexObjectDestroy *hipTexObjectDestroy; +thipDeviceCanAccessPeer *hipDeviceCanAccessPeer; + +thipCtxEnablePeerAccess *hipCtxEnablePeerAccess; +thipCtxDisablePeerAccess *hipCtxDisablePeerAccess; +thipDeviceGetP2PAttribute *hipDeviceGetP2PAttribute; +thipGraphicsUnregisterResource *hipGraphicsUnregisterResource; +thipGraphicsMapResources *hipGraphicsMapResources; +thipGraphicsUnmapResources *hipGraphicsUnmapResources; +thipGraphicsResourceGetMappedPointer *hipGraphicsResourceGetMappedPointer; + +thipGraphicsGLRegisterBuffer *hipGraphicsGLRegisterBuffer; +thipGLGetDevices *hipGLGetDevices; + + + +static DynamicLibrary dynamic_library_open_find(const char **paths) { + int i = 0; + while (paths[i] != NULL) { + DynamicLibrary lib = dynamic_library_open(paths[i]); + if (lib != NULL) { + return lib; + } + ++i; + } 
+ return NULL; +} + +/* Implementation function. */ +static void hipewHipExit(void) { + if (hip_lib != NULL) { + /* Ignore errors. */ + dynamic_library_close(hip_lib); + hip_lib = NULL; + } +} + +static int hipewHipInit(void) { + /* Library paths. */ +#ifdef _WIN32 + /* Expected in c:/windows/system or similar, no path needed. */ + const char *hip_paths[] = {"amdhip64.dll", NULL}; +#elif defined(__APPLE__) + /* Default installation path. */ + const char *hip_paths[] = {"", NULL}; +#else + const char *hip_paths[] = {"/opt/rocm/hip/lib/libamdhip64.so", NULL}; +#endif + static int initialized = 0; + static int result = 0; + int error, driver_version; + + if (initialized) { + return result; + } + + initialized = 1; + + error = atexit(hipewHipExit); + if (error) { + result = HIPEW_ERROR_ATEXIT_FAILED; + return result; + } + + /* Load library. */ + hip_lib = dynamic_library_open_find(hip_paths); + + if (hip_lib == NULL) { + result = HIPEW_ERROR_OPEN_FAILED; + return result; + } + + /* Fetch all function pointers. 
*/ + HIP_LIBRARY_FIND_CHECKED(hipGetErrorName); + HIP_LIBRARY_FIND_CHECKED(hipInit); + HIP_LIBRARY_FIND_CHECKED(hipDriverGetVersion); + HIP_LIBRARY_FIND_CHECKED(hipGetDevice); + HIP_LIBRARY_FIND_CHECKED(hipGetDeviceCount); + HIP_LIBRARY_FIND_CHECKED(hipDeviceGetName); + HIP_LIBRARY_FIND_CHECKED(hipDeviceGetAttribute); + HIP_LIBRARY_FIND_CHECKED(hipDeviceComputeCapability); + HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxRetain); + HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxRelease); + HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxSetFlags); + HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxGetState); + HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxReset); + HIP_LIBRARY_FIND_CHECKED(hipCtxCreate); + HIP_LIBRARY_FIND_CHECKED(hipCtxDestroy); + HIP_LIBRARY_FIND_CHECKED(hipCtxPushCurrent); + HIP_LIBRARY_FIND_CHECKED(hipCtxPopCurrent); + HIP_LIBRARY_FIND_CHECKED(hipCtxSetCurrent); + HIP_LIBRARY_FIND_CHECKED(hipCtxGetCurrent); + HIP_LIBRARY_FIND_CHECKED(hipCtxGetDevice); + HIP_LIBRARY_FIND_CHECKED(hipCtxGetFlags); + HIP_LIBRARY_FIND_CHECKED(hipCtxSynchronize); + HIP_LIBRARY_FIND_CHECKED(hipDeviceSynchronize); + HIP_LIBRARY_FIND_CHECKED(hipCtxGetCacheConfig); + HIP_LIBRARY_FIND_CHECKED(hipCtxSetCacheConfig); + HIP_LIBRARY_FIND_CHECKED(hipCtxGetSharedMemConfig); + HIP_LIBRARY_FIND_CHECKED(hipCtxSetSharedMemConfig); + HIP_LIBRARY_FIND_CHECKED(hipCtxGetApiVersion); + HIP_LIBRARY_FIND_CHECKED(hipModuleLoad); + HIP_LIBRARY_FIND_CHECKED(hipModuleLoadData); + HIP_LIBRARY_FIND_CHECKED(hipModuleLoadDataEx); + HIP_LIBRARY_FIND_CHECKED(hipModuleUnload); + HIP_LIBRARY_FIND_CHECKED(hipModuleGetFunction); + HIP_LIBRARY_FIND_CHECKED(hipModuleGetGlobal); + HIP_LIBRARY_FIND_CHECKED(hipModuleGetTexRef); + HIP_LIBRARY_FIND_CHECKED(hipMemGetInfo); + HIP_LIBRARY_FIND_CHECKED(hipMalloc); + HIP_LIBRARY_FIND_CHECKED(hipMemAllocPitch); + HIP_LIBRARY_FIND_CHECKED(hipFree); + HIP_LIBRARY_FIND_CHECKED(hipMemGetAddressRange); + HIP_LIBRARY_FIND_CHECKED(hipHostMalloc); + HIP_LIBRARY_FIND_CHECKED(hipHostFree); + 
HIP_LIBRARY_FIND_CHECKED(hipHostGetDevicePointer); + HIP_LIBRARY_FIND_CHECKED(hipHostGetFlags); + HIP_LIBRARY_FIND_CHECKED(hipMallocManaged); + HIP_LIBRARY_FIND_CHECKED(hipDeviceGetByPCIBusId); + HIP_LIBRARY_FIND_CHECKED(hipDeviceGetPCIBusId); + HIP_LIBRARY_FIND_CHECKED(hipMemcpyPeer); + HIP_LIBRARY_FIND_CHECKED(hipMemcpyHtoD); + HIP_LIBRARY_FIND_CHECKED(hipMemcpyDtoH); + HIP_LIBRARY_FIND_CHECKED(hipMemcpyDtoD); + HIP_LIBRARY_FIND_CHECKED(hipMemcpyParam2D); + HIP_LIBRARY_FIND_CHECKED(hipDrvMemcpy3D); + HIP_LIBRARY_FIND_CHECKED(hipMemcpyHtoDAsync); + HIP_LIBRARY_FIND_CHECKED(hipMemcpyDtoHAsync); + HIP_LIBRARY_FIND_CHECKED(hipDrvMemcpy2DUnaligned); + HIP_LIBRARY_FIND_CHECKED(hipMemcpyParam2DAsync); + HIP_LIBRARY_FIND_CHECKED(hipDrvMemcpy3DAsync); + HIP_LIBRARY_FIND_CHECKED(hipMemsetD8); + HIP_LIBRARY_FIND_CHECKED(hipMemsetD16); + HIP_LIBRARY_FIND_CHECKED(hipMemsetD32); + HIP_LIBRARY_FIND_CHECKED(hipMemsetD8Async); + HIP_LIBRARY_FIND_CHECKED(hipMemsetD16Async); + HIP_LIBRARY_FIND_CHECKED(hipMemsetD32Async); + HIP_LIBRARY_FIND_CHECKED(hipArrayCreate); + HIP_LIBRARY_FIND_CHECKED(hipArrayDestroy); + HIP_LIBRARY_FIND_CHECKED(hipArray3DCreate); + HIP_LIBRARY_FIND_CHECKED(hipStreamCreateWithFlags); + HIP_LIBRARY_FIND_CHECKED(hipStreamCreateWithPriority); + HIP_LIBRARY_FIND_CHECKED(hipStreamGetPriority); + HIP_LIBRARY_FIND_CHECKED(hipStreamGetFlags); + HIP_LIBRARY_FIND_CHECKED(hipStreamWaitEvent); + HIP_LIBRARY_FIND_CHECKED(hipStreamAddCallback); + HIP_LIBRARY_FIND_CHECKED(hipStreamQuery); + HIP_LIBRARY_FIND_CHECKED(hipStreamSynchronize); + HIP_LIBRARY_FIND_CHECKED(hipStreamDestroy); + HIP_LIBRARY_FIND_CHECKED(hipEventCreateWithFlags); + HIP_LIBRARY_FIND_CHECKED(hipEventRecord); + HIP_LIBRARY_FIND_CHECKED(hipEventQuery); + HIP_LIBRARY_FIND_CHECKED(hipEventSynchronize); + HIP_LIBRARY_FIND_CHECKED(hipEventDestroy); + HIP_LIBRARY_FIND_CHECKED(hipEventElapsedTime); + HIP_LIBRARY_FIND_CHECKED(hipFuncGetAttribute); + HIP_LIBRARY_FIND_CHECKED(hipFuncSetCacheConfig); + 
HIP_LIBRARY_FIND_CHECKED(hipModuleLaunchKernel); + HIP_LIBRARY_FIND_CHECKED(hipModuleOccupancyMaxPotentialBlockSize); + HIP_LIBRARY_FIND_CHECKED(hipTexRefSetArray); + HIP_LIBRARY_FIND_CHECKED(hipTexRefSetAddress); + HIP_LIBRARY_FIND_CHECKED(hipTexRefSetAddress2D); + HIP_LIBRARY_FIND_CHECKED(hipTexRefSetFormat); + HIP_LIBRARY_FIND_CHECKED(hipTexRefSetAddressMode); + HIP_LIBRARY_FIND_CHECKED(hipTexRefSetFilterMode); + HIP_LIBRARY_FIND_CHECKED(hipTexRefSetFlags); + HIP_LIBRARY_FIND_CHECKED(hipTexRefGetAddress); + HIP_LIBRARY_FIND_CHECKED(hipTexRefGetAddressMode); + HIP_LIBRARY_FIND_CHECKED(hipTexObjectCreate); + HIP_LIBRARY_FIND_CHECKED(hipTexObjectDestroy); + HIP_LIBRARY_FIND_CHECKED(hipDeviceCanAccessPeer); + HIP_LIBRARY_FIND_CHECKED(hipCtxEnablePeerAccess); + HIP_LIBRARY_FIND_CHECKED(hipCtxDisablePeerAccess); + HIP_LIBRARY_FIND_CHECKED(hipDeviceGetP2PAttribute); +#ifdef _WIN32 + HIP_LIBRARY_FIND_CHECKED(hipGraphicsUnregisterResource); + HIP_LIBRARY_FIND_CHECKED(hipGraphicsMapResources); + HIP_LIBRARY_FIND_CHECKED(hipGraphicsUnmapResources); + HIP_LIBRARY_FIND_CHECKED(hipGraphicsResourceGetMappedPointer); + HIP_LIBRARY_FIND_CHECKED(hipGraphicsGLRegisterBuffer); + HIP_LIBRARY_FIND_CHECKED(hipGLGetDevices); +#endif + result = HIPEW_SUCCESS; + return result; +} + + + +int hipewInit(hipuint32_t flags) { + int result = HIPEW_SUCCESS; + + if (flags & HIPEW_INIT_HIP) { + result = hipewHipInit(); + if (result != HIPEW_SUCCESS) { + return result; + } + } + + return result; +} + + +const char *hipewErrorString(hipError_t result) { + switch (result) { + case hipSuccess: return "No errors"; + case hipErrorInvalidValue: return "Invalid value"; + case hipErrorOutOfMemory: return "Out of memory"; + case hipErrorNotInitialized: return "Driver not initialized"; + case hipErrorDeinitialized: return "Driver deinitialized"; + case hipErrorProfilerDisabled: return "Profiler disabled"; + case hipErrorProfilerNotInitialized: return "Profiler not initialized"; + case 
hipErrorProfilerAlreadyStarted: return "Profiler already started"; + case hipErrorProfilerAlreadyStopped: return "Profiler already stopped"; + case hipErrorNoDevice: return "No HIP-capable device available"; + case hipErrorInvalidDevice: return "Invalid device"; + case hipErrorInvalidImage: return "Invalid kernel image"; + case hipErrorInvalidContext: return "Invalid context"; + case hipErrorContextAlreadyCurrent: return "Context already current"; + case hipErrorMapFailed: return "Map failed"; + case hipErrorUnmapFailed: return "Unmap failed"; + case hipErrorArrayIsMapped: return "Array is mapped"; + case hipErrorAlreadyMapped: return "Already mapped"; + case hipErrorNoBinaryForGpu: return "No binary for GPU"; + case hipErrorAlreadyAcquired: return "Already acquired"; + case hipErrorNotMapped: return "Not mapped"; + case hipErrorNotMappedAsArray: return "Mapped resource not available for access as an array"; + case hipErrorNotMappedAsPointer: return "Mapped resource not available for access as a pointer"; + case hipErrorECCNotCorrectable: return "Uncorrectable ECC error detected"; + case hipErrorUnsupportedLimit: return "hipLimit_t not supported by device"; + case hipErrorContextAlreadyInUse: return "Context already in use"; + case hipErrorPeerAccessUnsupported: return "Peer access unsupported"; + case hipErrorInvalidKernelFile: return "Invalid ptx"; + case hipErrorInvalidGraphicsContext: return "Invalid graphics context"; + case hipErrorInvalidSource: return "Invalid source"; + case hipErrorFileNotFound: return "File not found"; + case hipErrorSharedObjectSymbolNotFound: return "Link to a shared object failed to resolve"; + case hipErrorSharedObjectInitFailed: return "Shared object initialization failed"; + case hipErrorOperatingSystem: return "Operating system"; + case hipErrorInvalidHandle: return "Invalid handle"; + case hipErrorNotFound: return "Not found"; + case hipErrorNotReady: return "HIP not ready"; + case hipErrorIllegalAddress: return "Illegal 
address"; + case hipErrorLaunchOutOfResources: return "Launch exceeded resources"; + case hipErrorLaunchTimeOut: return "Launch exceeded timeout"; + case hipErrorPeerAccessAlreadyEnabled: return "Peer access already enabled"; + case hipErrorPeerAccessNotEnabled: return "Peer access not enabled"; + case hipErrorSetOnActiveProcess: return "Primary context active"; + case hipErrorAssert: return "Assert"; + case hipErrorHostMemoryAlreadyRegistered: return "Host memory already registered"; + case hipErrorHostMemoryNotRegistered: return "Host memory not registered"; + case hipErrorLaunchFailure: return "Launch failed"; + case hipErrorCooperativeLaunchTooLarge: return "Cooperative launch too large"; + case hipErrorNotSupported: return "Not supported"; + case hipErrorUnknown: return "Unknown error"; + default: return "Unknown HIP error value"; + } +} + +static void path_join(const char *path1, + const char *path2, + int maxlen, + char *result) { +#if defined(WIN32) || defined(_WIN32) + const char separator = '\\'; +#else + const char separator = '/'; +#endif + int n = snprintf(result, maxlen, "%s%c%s", path1, separator, path2); + if (n != -1 && n < maxlen) { + result[n] = '\0'; + } + else { + result[maxlen - 1] = '\0'; + } +} + +static int path_exists(const char *path) { + struct stat st; + if (stat(path, &st)) { + return 0; + } + return 1; +} + +const char *hipewCompilerPath(void) { + #ifdef _WIN32 + const char *hipPath = getenv("HIP_ROCCLR_HOME"); + const char *windowsCommand = "perl "; + const char *executable = "bin/hipcc"; + + static char hipcc[65536]; + static char finalCommand[65536]; + if(hipPath) { + path_join(hipPath, executable, sizeof(hipcc), hipcc); + if(path_exists(hipcc)) { + snprintf(finalCommand, sizeof(hipcc), "%s %s", windowsCommand, hipcc); + return finalCommand; + } else { + printf("Could not find hipcc. 
Make sure HIP_ROCCLR_HOME points to the directory holding /bin/hipcc"); + } + } + #else + const char *hipPath = "opt/rocm/hip/bin"; + const char *executable = "hipcc"; + + static char hipcc[65536]; + if(hipPath) { + path_join(hipPath, executable, sizeof(hipcc), hipcc); + if(path_exists(hipcc)){ + return hipcc; + } + } + #endif + + { +#ifdef _WIN32 + FILE *handle = popen("where hipcc", "r"); +#else + FILE *handle = popen("which hipcc", "r"); +#endif + if (handle) { + char buffer[4096] = {0}; + int len = fread(buffer, 1, sizeof(buffer) - 1, handle); + buffer[len] = '\0'; + pclose(handle); + if (buffer[0]) { + return "hipcc"; + } + } + } + + return NULL; +} + +int hipewCompilerVersion(void) { + const char *path = hipewCompilerPath(); + const char *marker = "Hip compilation tools, release "; + FILE *pipe; + int major, minor; + char *versionstr; + char buf[128]; + char output[65536] = "\0"; + char command[65536] = "\0"; + + if (path == NULL) { + return 0; + } + + /* get --version output */ + strcat(command, "\""); + strncat(command, path, sizeof(command) - 1); + strncat(command, "\" --version", sizeof(command) - strlen(path) - 1); + pipe = popen(command, "r"); + if (!pipe) { + fprintf(stderr, "HIP: failed to run compiler to retrieve version"); + return 0; + } + + while (!feof(pipe)) { + if (fgets(buf, sizeof(buf), pipe) != NULL) { + strncat(output, buf, sizeof(output) - strlen(output) - 1); + } + } + + pclose(pipe); + return 40; +} diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index 17096d441f0..2018c1d9648 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -297,6 +297,7 @@ endif() if(WITH_CYCLES_STANDALONE) set(WITH_CYCLES_DEVICE_CUDA TRUE) + set(WITH_CYCLES_DEVICE_HIP TRUE) endif() # TODO(sergey): Consider removing it, only causes confusion in interface. 
set(WITH_CYCLES_DEVICE_MULTI TRUE) diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt index 5bdcfd56a4d..64d226cb9ec 100644 --- a/intern/cycles/blender/CMakeLists.txt +++ b/intern/cycles/blender/CMakeLists.txt @@ -95,6 +95,9 @@ set(ADDON_FILES add_definitions(${GL_DEFINITIONS}) +if(WITH_CYCLES_DEVICE_HIP) + add_definitions(-DWITH_HIP) +endif() if(WITH_MOD_FLUID) add_definitions(-DWITH_FLUID) endif() diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index e0e8ca10bef..d729cb1ee69 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -28,7 +28,7 @@ def _configure_argument_parser(): action='store_true') parser.add_argument("--cycles-device", help="Set the device to use for Cycles, overriding user preferences and the scene setting." - "Valid options are 'CPU', 'CUDA' or 'OPTIX'." + "Valid options are 'CPU', 'CUDA', 'OPTIX', or 'HIP'" "Additionally, you can append '+CPU' to any GPU type for hybrid rendering.", default=None) return parser diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index e2b671848d0..67207874431 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -111,6 +111,7 @@ enum_device_type = ( ('CPU', "CPU", "CPU", 0), ('CUDA', "CUDA", "CUDA", 1), ('OPTIX', "OptiX", "OptiX", 3), + ("HIP", "HIP", "HIP", 4) ) enum_texture_limit = ( @@ -1266,12 +1267,16 @@ class CyclesPreferences(bpy.types.AddonPreferences): def get_device_types(self, context): import _cycles - has_cuda, has_optix = _cycles.get_device_types() + has_cuda, has_optix, has_hip = _cycles.get_device_types() + list = [('NONE', "None", "Don't use compute device", 0)] if has_cuda: list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1)) if has_optix: list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3)) + if has_hip: + list.append(('HIP', "HIP", "Use HIP for 
GPU acceleration", 4)) + return list compute_device_type: EnumProperty( @@ -1296,7 +1301,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): def update_device_entries(self, device_list): for device in device_list: - if not device[1] in {'CUDA', 'OPTIX', 'CPU'}: + if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP'}: continue # Try to find existing Device entry entry = self.find_existing_device_entry(device) @@ -1330,7 +1335,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): elif entry.type == 'CPU': cpu_devices.append(entry) # Extend all GPU devices with CPU. - if compute_device_type != 'CPU': + if compute_device_type != 'CPU' and compute_device_type != 'HIP': devices.extend(cpu_devices) return devices @@ -1340,7 +1345,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): import _cycles # Ensure `self.devices` is not re-allocated when the second call to # get_devices_for_type is made, freeing items from the first list. - for device_type in ('CUDA', 'OPTIX', 'OPENCL'): + for device_type in ('CUDA', 'OPTIX', 'HIP'): self.update_device_entries(_cycles.available_devices(device_type)) # Deprecated: use refresh_devices instead. 
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index ac96ddf5e10..c4a1844480c 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -99,6 +99,11 @@ def use_cuda(context): return (get_device_type(context) == 'CUDA' and cscene.device == 'GPU') +def use_hip(context): + cscene = context.scene.cycles + + return (get_device_type(context) == 'HIP' and cscene.device == 'GPU') + def use_optix(context): cscene = context.scene.cycles diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp index ce1770f18a3..7bed33855c2 100644 --- a/intern/cycles/blender/blender_device.cpp +++ b/intern/cycles/blender/blender_device.cpp @@ -26,6 +26,7 @@ enum ComputeDevice { COMPUTE_DEVICE_CPU = 0, COMPUTE_DEVICE_CUDA = 1, COMPUTE_DEVICE_OPTIX = 3, + COMPUTE_DEVICE_HIP = 4, COMPUTE_DEVICE_NUM }; @@ -81,6 +82,9 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen else if (compute_device == COMPUTE_DEVICE_OPTIX) { mask |= DEVICE_MASK_OPTIX; } + else if (compute_device == COMPUTE_DEVICE_HIP) { + mask |= DEVICE_MASK_HIP; + } vector<DeviceInfo> devices = Device::available_devices(mask); /* Match device preferences and available devices. 
*/ diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index 694d8454422..d681517c9e1 100644 --- a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -911,14 +911,16 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args* static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/) { vector<DeviceType> device_types = Device::available_types(); - bool has_cuda = false, has_optix = false; + bool has_cuda = false, has_optix = false, has_hip = false; foreach (DeviceType device_type, device_types) { has_cuda |= (device_type == DEVICE_CUDA); has_optix |= (device_type == DEVICE_OPTIX); + has_hip |= (device_type == DEVICE_HIP); } - PyObject *list = PyTuple_New(2); + PyObject *list = PyTuple_New(3); PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda)); PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix)); + PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_hip)); return list; } @@ -944,6 +946,9 @@ static PyObject *set_device_override_func(PyObject * /*self*/, PyObject *arg) else if (override == "OPTIX") { BlenderSession::device_override = DEVICE_MASK_OPTIX; } + else if (override == "HIP") { + BlenderSession::device_override = DEVICE_MASK_HIP; + } else { printf("\nError: %s is not a valid Cycles device.\n", override.c_str()); Py_RETURN_FALSE; diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake index da259171844..5653f51ec05 100644 --- a/intern/cycles/cmake/external_libs.cmake +++ b/intern/cycles/cmake/external_libs.cmake @@ -531,5 +531,9 @@ if(WITH_CYCLES_CUDA_BINARIES OR NOT WITH_CUDA_DYNLOAD) endif() endif() endif() +if(NOT WITH_HIP_DYNLOAD) + message(STATUS "Setting up HIP Dynamic Load") + set(WITH_HIP_DYNLOAD ON) +endif() unset(_cycles_lib_dir) diff --git a/intern/cycles/cmake/macros.cmake b/intern/cycles/cmake/macros.cmake index 47196dfd1ce..172ae280cea 100644 --- a/intern/cycles/cmake/macros.cmake 
+++ b/intern/cycles/cmake/macros.cmake @@ -162,6 +162,10 @@ macro(cycles_target_link_libraries target) target_link_libraries(${target} ${CUDA_CUDA_LIBRARY}) endif() + if(WITH_HIP_DYNLOAD) + target_link_libraries(${target} extern_hipew) + endif() + if(CYCLES_STANDALONE_REPOSITORY) target_link_libraries(${target} extern_numaapi) else() diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index d18f4360aef..9af68550d17 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -34,6 +34,13 @@ else() add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}") endif() +if(WITH_HIP_DYNLOAD) + list(APPEND INC + ../../../extern/hipew/include + ) + add_definitions(-DWITH_HIP_DYNLOAD) +endif() + set(SRC device.cpp device_denoise.cpp @@ -70,6 +77,21 @@ set(SRC_CUDA cuda/util.h ) +set(SRC_HIP + hip/device.cpp + hip/device.h + hip/device_impl.cpp + hip/device_impl.h + hip/graphics_interop.cpp + hip/graphics_interop.h + hip/kernel.cpp + hip/kernel.h + hip/queue.cpp + hip/queue.h + hip/util.cpp + hip/util.h +) + set(SRC_DUMMY dummy/device.cpp dummy/device.h @@ -115,11 +137,20 @@ else() ) endif() +if(WITH_HIP_DYNLOAD) + list(APPEND LIB + extern_hipew + ) +endif() + add_definitions(${GL_DEFINITIONS}) if(WITH_CYCLES_DEVICE_CUDA) add_definitions(-DWITH_CUDA) endif() +if(WITH_CYCLES_DEVICE_HIP) + add_definitions(-DWITH_HIP) +endif() if(WITH_CYCLES_DEVICE_OPTIX) add_definitions(-DWITH_OPTIX) endif() @@ -140,6 +171,7 @@ cycles_add_library(cycles_device "${LIB}" ${SRC} ${SRC_CPU} ${SRC_CUDA} + ${SRC_HIP} ${SRC_DUMMY} ${SRC_MULTI} ${SRC_OPTIX} diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 70935598437..81574e8b184 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -25,6 +25,7 @@ #include "device/cpu/device.h" #include "device/cuda/device.h" #include "device/dummy/device.h" +#include "device/hip/device.h" #include "device/multi/device.h" 
#include "device/optix/device.h" @@ -46,6 +47,7 @@ thread_mutex Device::device_mutex; vector<DeviceInfo> Device::cuda_devices; vector<DeviceInfo> Device::optix_devices; vector<DeviceInfo> Device::cpu_devices; +vector<DeviceInfo> Device::hip_devices; uint Device::devices_initialized_mask = 0; /* Device */ @@ -96,6 +98,14 @@ Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler) device = device_optix_create(info, stats, profiler); break; #endif + +#ifdef WITH_HIP + case DEVICE_HIP: + if (device_hip_init()) + device = device_hip_create(info, stats, profiler); + break; +#endif + default: break; } @@ -117,6 +127,8 @@ DeviceType Device::type_from_string(const char *name) return DEVICE_OPTIX; else if (strcmp(name, "MULTI") == 0) return DEVICE_MULTI; + else if (strcmp(name, "HIP") == 0) + return DEVICE_HIP; return DEVICE_NONE; } @@ -131,6 +143,8 @@ string Device::string_from_type(DeviceType type) return "OPTIX"; else if (type == DEVICE_MULTI) return "MULTI"; + else if (type == DEVICE_HIP) + return "HIP"; return ""; } @@ -145,6 +159,10 @@ vector<DeviceType> Device::available_types() #ifdef WITH_OPTIX types.push_back(DEVICE_OPTIX); #endif +#ifdef WITH_HIP + types.push_back(DEVICE_HIP); +#endif + return types; } @@ -186,6 +204,20 @@ vector<DeviceInfo> Device::available_devices(uint mask) } #endif +#ifdef WITH_HIP + if (mask & DEVICE_MASK_HIP) { + if (!(devices_initialized_mask & DEVICE_MASK_HIP)) { + if (device_hip_init()) { + device_hip_info(hip_devices); + } + devices_initialized_mask |= DEVICE_MASK_HIP; + } + foreach (DeviceInfo &info, hip_devices) { + devices.push_back(info); + } + } +#endif + if (mask & DEVICE_MASK_CPU) { if (!(devices_initialized_mask & DEVICE_MASK_CPU)) { device_cpu_info(cpu_devices); @@ -226,6 +258,15 @@ string Device::device_capabilities(uint mask) } #endif +#ifdef WITH_HIP + if (mask & DEVICE_MASK_HIP) { + if (device_hip_init()) { + capabilities += "\nHIP device capabilities:\n"; + capabilities += device_hip_capabilities(); 
+ } + } +#endif + return capabilities; } @@ -314,6 +355,7 @@ void Device::free_memory() devices_initialized_mask = 0; cuda_devices.free_memory(); optix_devices.free_memory(); + hip_devices.free_memory(); cpu_devices.free_memory(); } diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 3bbad179f52..c73d74cdccc 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -51,6 +51,7 @@ enum DeviceType { DEVICE_CUDA, DEVICE_MULTI, DEVICE_OPTIX, + DEVICE_HIP, DEVICE_DUMMY, }; @@ -58,6 +59,7 @@ enum DeviceTypeMask { DEVICE_MASK_CPU = (1 << DEVICE_CPU), DEVICE_MASK_CUDA = (1 << DEVICE_CUDA), DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX), + DEVICE_MASK_HIP = (1 << DEVICE_HIP), DEVICE_MASK_ALL = ~0 }; @@ -284,6 +286,7 @@ class Device { static vector<DeviceInfo> cuda_devices; static vector<DeviceInfo> optix_devices; static vector<DeviceInfo> cpu_devices; + static vector<DeviceInfo> hip_devices; static uint devices_initialized_mask; }; diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index a854cc9b693..be6123e09b2 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -277,6 +277,7 @@ class device_memory { protected: friend class CUDADevice; friend class OptiXDevice; + friend class HIPDevice; /* Only create through subclasses. */ device_memory(Device *device, const char *name, MemoryType type); diff --git a/intern/cycles/device/hip/device.cpp b/intern/cycles/device/hip/device.cpp new file mode 100644 index 00000000000..90028ac7f10 --- /dev/null +++ b/intern/cycles/device/hip/device.cpp @@ -0,0 +1,276 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/hip/device.h" + +#include "util/util_logging.h" + +#ifdef WITH_HIP +# include "device/device.h" +# include "device/hip/device_impl.h" + +# include "util/util_string.h" +# include "util/util_windows.h" +#endif /* WITH_HIP */ + +CCL_NAMESPACE_BEGIN + +bool device_hip_init() +{ +#if !defined(WITH_HIP) + return false; +#elif defined(WITH_HIP_DYNLOAD) + static bool initialized = false; + static bool result = false; + + if (initialized) + return result; + + initialized = true; + int hipew_result = hipewInit(HIPEW_INIT_HIP); + if (hipew_result == HIPEW_SUCCESS) { + VLOG(1) << "HIPEW initialization succeeded"; + if (HIPDevice::have_precompiled_kernels()) { + VLOG(1) << "Found precompiled kernels"; + result = true; + } + else if (hipewCompilerPath() != NULL) { + VLOG(1) << "Found HIPCC " << hipewCompilerPath(); + result = true; + } + else { + VLOG(1) << "Neither precompiled kernels nor HIPCC was found," + << " unable to use HIP"; + } + } + else { + VLOG(1) << "HIPEW initialization failed: " + << ((hipew_result == HIPEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" : + "Error opening the library"); + } + + return result; +#else /* WITH_HIP_DYNLOAD */ + return true; +#endif /* WITH_HIP_DYNLOAD */ +} + +Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ +#ifdef WITH_HIP + return new HIPDevice(info, stats, profiler); +#else + (void)info; + (void)stats; + (void)profiler; + + LOG(FATAL) << "Request to create HIP device without compiled-in support. 
Should never happen."; + + return nullptr; +#endif +} + +#ifdef WITH_HIP +static hipError_t device_hip_safe_init() +{ +# ifdef _WIN32 + __try { + return hipInit(0); + } + __except (EXCEPTION_EXECUTE_HANDLER) { + /* Ignore crashes inside the HIP driver and hope we can + * survive even with corrupted HIP installs. */ + fprintf(stderr, "Cycles HIP: driver crashed, continuing without HIP.\n"); + } + + return hipErrorNoDevice; +# else + return hipInit(0); +# endif +} +#endif /* WITH_HIP */ + +void device_hip_info(vector<DeviceInfo> &devices) +{ +#ifdef WITH_HIP + hipError_t result = device_hip_safe_init(); + if (result != hipSuccess) { + if (result != hipErrorNoDevice) + fprintf(stderr, "HIP hipInit: %s\n", hipewErrorString(result)); + return; + } + + int count = 0; + result = hipGetDeviceCount(&count); + if (result != hipSuccess) { + fprintf(stderr, "HIP hipGetDeviceCount: %s\n", hipewErrorString(result)); + return; + } + + vector<DeviceInfo> display_devices; + + for (int num = 0; num < count; num++) { + char name[256]; + + result = hipDeviceGetName(name, 256, num); + if (result != hipSuccess) { + fprintf(stderr, "HIP :hipDeviceGetName: %s\n", hipewErrorString(result)); + continue; + } + + int major; + hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, num); + // TODO : (Arya) What is the last major version we are supporting? + + DeviceInfo info; + + info.type = DEVICE_HIP; + info.description = string(name); + info.num = num; + + info.has_half_images = (major >= 3); + info.has_nanovdb = true; + info.denoisers = 0; + + info.has_gpu_queue = true; + /* Check if the device has P2P access to any other device in the system. 
*/ + for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) { + if (num != peer_num) { + int can_access = 0; + hipDeviceCanAccessPeer(&can_access, num, peer_num); + info.has_peer_memory = (can_access != 0); + } + } + + int pci_location[3] = {0, 0, 0}; + hipDeviceGetAttribute(&pci_location[0], hipDeviceAttributePciDomainID, num); + hipDeviceGetAttribute(&pci_location[1], hipDeviceAttributePciBusId, num); + hipDeviceGetAttribute(&pci_location[2], hipDeviceAttributePciDeviceId, num); + info.id = string_printf("HIP_%s_%04x:%02x:%02x", + name, + (unsigned int)pci_location[0], + (unsigned int)pci_location[1], + (unsigned int)pci_location[2]); + + /* If device has a kernel timeout and no compute preemption, we assume + * it is connected to a display and will freeze the display while doing + * computations. */ + int timeout_attr = 0, preempt_attr = 0; + hipDeviceGetAttribute(&timeout_attr, hipDeviceAttributeKernelExecTimeout, num); + + if (timeout_attr && !preempt_attr) { + VLOG(1) << "Device is recognized as display."; + info.description += " (Display)"; + info.display_device = true; + display_devices.push_back(info); + } + else { + VLOG(1) << "Device has compute preemption or is not used for display."; + devices.push_back(info); + } + VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\"."; + } + + if (!display_devices.empty()) + devices.insert(devices.end(), display_devices.begin(), display_devices.end()); +#else /* WITH_HIP */ + (void)devices; +#endif /* WITH_HIP */ +} + +string device_hip_capabilities() +{ +#ifdef WITH_HIP + hipError_t result = device_hip_safe_init(); + if (result != hipSuccess) { + if (result != hipErrorNoDevice) { + return string("Error initializing HIP: ") + hipewErrorString(result); + } + return "No HIP device found\n"; + } + + int count; + result = hipGetDeviceCount(&count); + if (result != hipSuccess) { + return string("Error getting devices: ") + hipewErrorString(result); + } + + string capabilities = 
""; + for (int num = 0; num < count; num++) { + char name[256]; + if (hipDeviceGetName(name, 256, num) != hipSuccess) { + continue; + } + capabilities += string("\t") + name + "\n"; + int value; +# define GET_ATTR(attr) \ + { \ + if (hipDeviceGetAttribute(&value, hipDeviceAttribute##attr, num) == hipSuccess) { \ + capabilities += string_printf("\t\thipDeviceAttribute" #attr "\t\t\t%d\n", value); \ + } \ + } \ + (void)0 + /* TODO(sergey): Strip all attributes which are not useful for us + * or does not depend on the driver. + */ + GET_ATTR(MaxThreadsPerBlock); + GET_ATTR(MaxBlockDimX); + GET_ATTR(MaxBlockDimY); + GET_ATTR(MaxBlockDimZ); + GET_ATTR(MaxGridDimX); + GET_ATTR(MaxGridDimY); + GET_ATTR(MaxGridDimZ); + GET_ATTR(MaxSharedMemoryPerBlock); + GET_ATTR(TotalConstantMemory); + GET_ATTR(WarpSize); + GET_ATTR(MaxPitch); + GET_ATTR(MaxRegistersPerBlock); + GET_ATTR(ClockRate); + GET_ATTR(TextureAlignment); + GET_ATTR(MultiprocessorCount); + GET_ATTR(KernelExecTimeout); + GET_ATTR(Integrated); + GET_ATTR(CanMapHostMemory); + GET_ATTR(ComputeMode); + GET_ATTR(MaxTexture1DWidth); + GET_ATTR(MaxTexture2DWidth); + GET_ATTR(MaxTexture2DHeight); + GET_ATTR(MaxTexture3DWidth); + GET_ATTR(MaxTexture3DHeight); + GET_ATTR(MaxTexture3DDepth); + GET_ATTR(ConcurrentKernels); + GET_ATTR(EccEnabled); + GET_ATTR(MemoryClockRate); + GET_ATTR(MemoryBusWidth); + GET_ATTR(L2CacheSize); + GET_ATTR(MaxThreadsPerMultiProcessor); + GET_ATTR(ComputeCapabilityMajor); + GET_ATTR(ComputeCapabilityMinor); + GET_ATTR(MaxSharedMemoryPerMultiprocessor); + GET_ATTR(ManagedMemory); + GET_ATTR(IsMultiGpuBoard); +# undef GET_ATTR + capabilities += "\n"; + } + + return capabilities; + +#else /* WITH_HIP */ + return ""; +#endif /* WITH_HIP */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/hip/device.h b/intern/cycles/device/hip/device.h new file mode 100644 index 00000000000..76fa8995bed --- /dev/null +++ b/intern/cycles/device/hip/device.h @@ -0,0 +1,37 @@ +/* + * Copyright 2011-2021 Blender 
Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_string.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +bool device_hip_init(); + +Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +void device_hip_info(vector<DeviceInfo> &devices); + +string device_hip_capabilities(); + +CCL_NAMESPACE_END
\ No newline at end of file diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp new file mode 100644 index 00000000000..0e5ac6ce401 --- /dev/null +++ b/intern/cycles/device/hip/device_impl.cpp @@ -0,0 +1,1343 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_HIP + +# include <climits> +# include <limits.h> +# include <stdio.h> +# include <stdlib.h> +# include <string.h> + +# include "device/hip/device_impl.h" + +# include "render/buffers.h" + +# include "util/util_debug.h" +# include "util/util_foreach.h" +# include "util/util_logging.h" +# include "util/util_map.h" +# include "util/util_md5.h" +# include "util/util_opengl.h" +# include "util/util_path.h" +# include "util/util_string.h" +# include "util/util_system.h" +# include "util/util_time.h" +# include "util/util_types.h" +# include "util/util_windows.h" + +CCL_NAMESPACE_BEGIN + +class HIPDevice; + +bool HIPDevice::have_precompiled_kernels() +{ + string fatbins_path = path_get("lib"); + return path_exists(fatbins_path); +} + +bool HIPDevice::show_samples() const +{ + /* The HIPDevice only processes one tile at a time, so showing samples is fine. 
*/ + return true; +} + +/* Only the software BVH2 layout is supported by this backend (no hardware ray traversal). */ BVHLayoutMask HIPDevice::get_bvh_layout_mask() const +{ + return BVH_LAYOUT_BVH2; +} + +/* Record the error on the base class and, for the first error only, point the user at the GPU rendering documentation. */ void HIPDevice::set_error(const string &error) +{ + Device::set_error(error); + + if (first_error) { + fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); + fprintf(stderr, + "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n"); + first_error = false; + } +} + +/* Zero-initialize all per-device state, then initialize the driver, resolve the device ordinal and create its context. On any driver failure set_error() is called and the object is left without a context (load_kernels() checks for that). */ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) + : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL) +{ + first_error = true; + + hipDevId = info.num; + hipDevice = 0; + hipContext = 0; + + hipModule = 0; + + need_texture_info = false; + + device_texture_headroom = 0; + device_working_headroom = 0; + move_texture_to_host = false; + map_host_limit = 0; + map_host_used = 0; + can_map_host = 0; + pitch_alignment = 0; + + /* Initialize HIP. */ + hipError_t result = hipInit(0); + if (result != hipSuccess) { + set_error(string_printf("Failed to initialize HIP runtime (%s)", hipewErrorString(result))); + return; + } + + /* Setup device and context. */ + result = hipGetDevice(&hipDevice, hipDevId); + if (result != hipSuccess) { + set_error(string_printf("Failed to get HIP device handle from ordinal (%s)", + hipewErrorString(result))); + return; + } + + hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice)); + + hip_assert( + hipDeviceGetAttribute(&pitch_alignment, hipDeviceAttributeTexturePitchAlignment, hipDevice)); + + /* Host-mapped (pinned) memory support enables spilling textures to system RAM when VRAM runs out; see generic_alloc()/move_textures_to_host(). */ unsigned int ctx_flags = hipDeviceLmemResizeToMax; + if (can_map_host) { + ctx_flags |= hipDeviceMapHost; + init_host_memory(); + } + + /* Create context.
*/ + result = hipCtxCreate(&hipContext, ctx_flags, hipDevice); + + if (result != hipSuccess) { + set_error(string_printf("Failed to create HIP context (%s)", hipewErrorString(result))); + return; + } + + int major, minor; + hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId); + hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId); + hipDevArchitecture = major * 100 + minor * 10; + + /* Pop context set by hipCtxCreate. */ + hipCtxPopCurrent(NULL); +} + +HIPDevice::~HIPDevice() +{ + texture_info.free(); + + hip_assert(hipCtxDestroy(hipContext)); +} + +bool HIPDevice::support_device(const uint /*kernel_features*/) +{ + int major, minor; + hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId); + hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId); + + // TODO : (Arya) What versions do we plan to support? + return true; +} + +bool HIPDevice::check_peer_access(Device *peer_device) +{ + if (peer_device == this) { + return false; + } + if (peer_device->info.type != DEVICE_HIP && peer_device->info.type != DEVICE_OPTIX) { + return false; + } + + HIPDevice *const peer_device_hip = static_cast<HIPDevice *>(peer_device); + + int can_access = 0; + hip_assert(hipDeviceCanAccessPeer(&can_access, hipDevice, peer_device_hip->hipDevice)); + if (can_access == 0) { + return false; + } + + // Ensure array access over the link is possible as well (for 3D textures) + hip_assert(hipDeviceGetP2PAttribute( + &can_access, hipDevP2PAttrHipArrayAccessSupported, hipDevice, peer_device_hip->hipDevice)); + if (can_access == 0) { + return false; + } + + // Enable peer access in both directions + { + const HIPContextScope scope(this); + hipError_t result = hipCtxEnablePeerAccess(peer_device_hip->hipContext, 0); + if (result != hipSuccess) { + set_error(string_printf("Failed to enable peer access on HIP context (%s)", + hipewErrorString(result))); + return false; + } + } + { + const 
HIPContextScope scope(peer_device_hip); + hipError_t result = hipCtxEnablePeerAccess(hipContext, 0); + if (result != hipSuccess) { + set_error(string_printf("Failed to enable peer access on HIP context (%s)", + hipewErrorString(result))); + return false; + } + } + + return true; +} + +bool HIPDevice::use_adaptive_compilation() +{ + return DebugFlags().hip.adaptive_compile; +} + +/* Common NVCC flags which stays the same regardless of shading model, + * kernel sources md5 and only depends on compiler or compilation settings. + */ +string HIPDevice::compile_kernel_get_common_cflags(const uint kernel_features) +{ + const int machine = system_cpu_bits(); + const string source_path = path_get("source"); + const string include_path = source_path; + string cflags = string_printf( + "-m%d " + "--ptxas-options=\"-v\" " + "--use_fast_math " + "-DHIPCC " + "-I\"%s\"", + machine, + include_path.c_str()); + if (use_adaptive_compilation()) { + cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features); + } + return cflags; +} + +string HIPDevice::compile_kernel(const uint kernel_features, + const char *name, + const char *base, + bool force_ptx) +{ + /* Compute kernel name. */ + int major, minor; + hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId); + hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId); + + /* Attempt to use kernel provided with Blender. */ + if (!use_adaptive_compilation()) { + if (!force_ptx) { + const string fatbin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor)); + VLOG(1) << "Testing for pre-compiled kernel " << fatbin << "."; + if (path_exists(fatbin)) { + VLOG(1) << "Using precompiled kernel."; + return fatbin; + } + } + + /* The driver can JIT-compile PTX generated for older generations, so find the closest one. 
*/ + int ptx_major = major, ptx_minor = minor; + while (ptx_major >= 3) { + const string ptx = path_get( + string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor)); + VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; + if (path_exists(ptx)) { + VLOG(1) << "Using precompiled kernel."; + return ptx; + } + + if (ptx_minor > 0) { + ptx_minor--; + } + else { + ptx_major--; + ptx_minor = 9; + } + } + } + + /* Try to use locally compiled kernel. */ + string source_path = path_get("source"); + const string source_md5 = path_files_md5_hash(source_path); + + /* We include cflags into md5 so changing hip toolkit or changing other + * compiler command line arguments makes sure fatbin gets re-built. + */ + string common_cflags = compile_kernel_get_common_cflags(kernel_features); + const string kernel_md5 = util_md5_string(source_md5 + common_cflags); + + const char *const kernel_ext = "genco"; +# ifdef _WIN32 + const char *const options = + "save-temps -Wno-parentheses-equality -Wno-unused-value --hipcc-func-supp"; +# else + const char *const options = + "save-temps -Wno-parentheses-equality -Wno-unused-value --hipcc-func-supp -O3 -ggdb"; +# endif + const string include_path = source_path; + const char *const kernel_arch = force_ptx ? "compute" : "sm"; + const string fatbin_file = string_printf( + "cycles_%s_%s_%d%d_%s", name, kernel_arch, major, minor, kernel_md5.c_str()); + const string fatbin = path_cache_get(path_join("kernels", fatbin_file)); + VLOG(1) << "Testing for locally compiled kernel " << fatbin << "."; + if (path_exists(fatbin)) { + VLOG(1) << "Using locally compiled kernel."; + return fatbin; + } + +# ifdef _WIN32 + if (!use_adaptive_compilation() && have_precompiled_kernels()) { + if (major < 3) { + set_error( + string_printf("HIP backend requires compute capability 3.0 or up, but found %d.%d. 
" + "Your GPU is not supported.", + major, + minor)); + } + else { + set_error( + string_printf("HIP binary kernel for this graphics card compute " + "capability (%d.%d) not found.", + major, + minor)); + } + return string(); + } +# endif + + /* Compile. */ + const char *const hipcc = hipewCompilerPath(); + if (hipcc == NULL) { + set_error( + "HIP hipcc compiler not found. " + "Install HIP toolkit in default location."); + return string(); + } + + const int hipcc_hip_version = hipewCompilerVersion(); + VLOG(1) << "Found hipcc " << hipcc << ", HIP version " << hipcc_hip_version << "."; + if (hipcc_hip_version < 40) { + printf( + "Unsupported HIP version %d.%d detected, " + "you need HIP 4.0 or newer.\n", + hipcc_hip_version / 10, + hipcc_hip_version % 10); + return string(); + } + + double starttime = time_dt(); + + path_create_directories(fatbin); + + source_path = path_join(path_join(source_path, "kernel"), + path_join("device", path_join(base, string_printf("%s.cpp", name)))); + + string command = string_printf("%s -%s -I %s --%s %s -o \"%s\"", + hipcc, + options, + include_path.c_str(), + kernel_ext, + source_path.c_str(), + fatbin.c_str()); + + printf("Compiling HIP kernel ...\n%s\n", command.c_str()); + +# ifdef _WIN32 + command = "call " + command; +# endif + if (system(command.c_str()) != 0) { + set_error( + "Failed to execute compilation command, " + "see console for details."); + return string(); + } + + /* Verify if compilation succeeded */ + if (!path_exists(fatbin)) { + set_error( + "HIP kernel compilation failed, " + "see console for details."); + return string(); + } + + printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); + + return fatbin; +} + +bool HIPDevice::load_kernels(const uint kernel_features) +{ + /* TODO(sergey): Support kernels re-load for HIP devices. + * + * Currently re-loading kernel will invalidate memory pointers, + * causing problems in hipCtxSynchronize. 
+ */ + if (hipModule) { + VLOG(1) << "Skipping kernel reload, not currently supported."; + return true; + } + + /* check if hip init succeeded */ + if (hipContext == 0) + return false; + + /* check if GPU is supported */ + if (!support_device(kernel_features)) + return false; + + /* get kernel */ + const char *kernel_name = "kernel"; + string fatbin = compile_kernel(kernel_features, kernel_name); + if (fatbin.empty()) + return false; + + /* open module */ + HIPContextScope scope(this); + + string fatbin_data; + hipError_t result; + + if (path_read_text(fatbin, fatbin_data)) + result = hipModuleLoadData(&hipModule, fatbin_data.c_str()); + else + result = hipErrorFileNotFound; + + if (result != hipSuccess) + set_error(string_printf( + "Failed to load HIP kernel from '%s' (%s)", fatbin.c_str(), hipewErrorString(result))); + + if (result == hipSuccess) { + kernels.load(this); + reserve_local_memory(kernel_features); + } + + return (result == hipSuccess); +} + +void HIPDevice::reserve_local_memory(const uint) +{ + /* Together with hipDeviceLmemResizeToMax, this reserves local memory + * needed for kernel launches, so that we can reliably figure out when + * to allocate scene data in mapped host memory. */ + size_t total = 0, free_before = 0, free_after = 0; + + { + HIPContextScope scope(this); + hipMemGetInfo(&free_before, &total); + } + + { + /* Use the biggest kernel for estimation. */ + const DeviceKernel test_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE; + + /* Launch kernel, using just 1 block appears sufficient to reserve memory for all + * multiprocessors. It would be good to do this in parallel for the multi GPU case + * still to make it faster. 
*/ + /* Dummy one-block launch with null arguments: only the launch itself matters, since it forces the driver to commit the local-memory allocation. */ HIPDeviceQueue queue(this); + + void *d_path_index = nullptr; + void *d_render_buffer = nullptr; + int d_work_size = 0; + void *args[] = {&d_path_index, &d_render_buffer, &d_work_size}; + + queue.init_execution(); + queue.enqueue(test_kernel, 1, args); + queue.synchronize(); + } + + { + HIPContextScope scope(this); + hipMemGetInfo(&free_after, &total); + } + + VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after) + << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; + +# if 0 + /* For testing mapped host memory, fill up device memory. */ + const size_t keep_mb = 1024; + + while (free_after > keep_mb * 1024 * 1024LL) { + hipDeviceptr_t tmp; + hip_assert(hipMalloc(&tmp, 10 * 1024 * 1024LL)); + hipMemGetInfo(&free_after, &total); + } +# endif +} + +/* Compute the pinned host-memory budget (map_host_limit) and the VRAM headroom constants used by generic_alloc(). */ void HIPDevice::init_host_memory() +{ + /* Limit amount of host mapped memory, because allocating too much can + * cause system instability. Leave at least half or 4 GB of system + * memory free, whichever is smaller. */ + /* 4 GiB; the LL suffix keeps the whole product 64-bit. */ size_t default_limit = 4 * 1024 * 1024 * 1024LL; + size_t system_ram = system_physical_ram(); + + if (system_ram > 0) { + if (system_ram / 2 > default_limit) { + map_host_limit = system_ram - default_limit; + } + else { + map_host_limit = system_ram / 2; + } + } + else { + VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; + map_host_limit = 0; + } + + /* Amount of device memory to keep is free after texture memory + * and working memory allocations respectively. We set the working + * memory limit headroom lower so that some space is left after all + * texture memory allocations. */ + device_working_headroom = 32 * 1024 * 1024LL; // 32MB + device_texture_headroom = 128 * 1024 * 1024LL; // 128MB + + VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) + << " bytes.
(" << string_human_readable_size(map_host_limit) << ")"; +} + +void HIPDevice::load_texture_info() +{ + if (need_texture_info) { + /* Unset flag before copying, so this does not loop indefinitely if the copy below calls + * into 'move_textures_to_host' (which calls 'load_texture_info' again). */ + need_texture_info = false; + texture_info.copy_to_device(); + } +} + +void HIPDevice::move_textures_to_host(size_t size, bool for_texture) +{ + /* Break out of recursive call, which can happen when moving memory on a multi device. */ + static bool any_device_moving_textures_to_host = false; + if (any_device_moving_textures_to_host) { + return; + } + + /* Signal to reallocate textures in host memory only. */ + move_texture_to_host = true; + + while (size > 0) { + /* Find suitable memory allocation to move. */ + device_memory *max_mem = NULL; + size_t max_size = 0; + bool max_is_image = false; + + thread_scoped_lock lock(hip_mem_map_mutex); + foreach (HIPMemMap::value_type &pair, hip_mem_map) { + device_memory &mem = *pair.first; + HIPMem *cmem = &pair.second; + + /* Can only move textures allocated on this device (and not those from peer devices). + * And need to ignore memory that is already on the host. */ + if (!mem.is_resident(this) || cmem->use_mapped_host) { + continue; + } + + bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && + (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + /* Can't move this type of memory. */ + if (!is_texture || cmem->array) { + continue; + } + + /* For other textures, only move image textures. */ + if (for_texture && !is_image) { + continue; + } + + /* Try to move largest allocation, prefer moving images. */ + if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { + max_is_image = is_image; + max_size = mem.device_size; + max_mem = &mem; + } + } + lock.unlock(); + + /* Move to host memory. 
This part is mutex protected since + * multiple HIP devices could be moving the memory. The + * first one will do it, and the rest will adopt the pointer. */ + if (max_mem) { + VLOG(1) << "Move memory from device to host: " << max_mem->name; + + static thread_mutex move_mutex; + thread_scoped_lock lock(move_mutex); + + any_device_moving_textures_to_host = true; + + /* Potentially need to call back into multi device, so pointer mapping + * and peer devices are updated. This is also necessary since the device + * pointer may just be a key here, so cannot be accessed and freed directly. + * Unfortunately it does mean that memory is reallocated on all other + * devices as well, which is potentially dangerous when still in use (since + * a thread rendering on another devices would only be caught in this mutex + * if it so happens to do an allocation at the same time as well. */ + max_mem->device_copy_to(); + size = (max_size >= size) ? 0 : size - max_size; + + any_device_moving_textures_to_host = false; + } + else { + break; + } + } + + /* Unset flag before texture info is reloaded, since it should stay in device memory. */ + move_texture_to_host = false; + + /* Update texture info array with new pointers. */ + load_texture_info(); +} + +HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_padding) +{ + HIPContextScope scope(this); + + hipDeviceptr_t device_pointer = 0; + size_t size = mem.memory_size() + pitch_padding; + + hipError_t mem_alloc_result = hipErrorOutOfMemory; + const char *status = ""; + + /* First try allocating in device memory, respecting headroom. We make + * an exception for texture info. It is small and frequently accessed, + * so treat it as working memory. + * + * If there is not enough room for working memory, we will try to move + * textures to host memory, assuming the performance impact would have + * been worse for working memory. 
*/ + bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; + + size_t total = 0, free = 0; + hipMemGetInfo(&free, &total); + + /* Move textures to host memory if needed. */ + if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) { + move_textures_to_host(size + headroom - free, is_texture); + hipMemGetInfo(&free, &total); + } + + /* Allocate in device memory. */ + if (!move_texture_to_host && (size + headroom) < free) { + mem_alloc_result = hipMalloc(&device_pointer, size); + if (mem_alloc_result == hipSuccess) { + status = " in device memory"; + } + } + + /* Fall back to mapped host memory if needed and possible. */ + + void *shared_pointer = 0; + + if (mem_alloc_result != hipSuccess && can_map_host) { + if (mem.shared_pointer) { + /* Another device already allocated host memory. */ + mem_alloc_result = hipSuccess; + shared_pointer = mem.shared_pointer; + } + else if (map_host_used + size < map_host_limit) { + /* Allocate host memory ourselves. */ + mem_alloc_result = hipHostMalloc(&shared_pointer, size); + + assert((mem_alloc_result == hipSuccess && shared_pointer != 0) || + (mem_alloc_result != hipSuccess && shared_pointer == 0)); + } + + if (mem_alloc_result == hipSuccess) { + hip_assert(hipHostGetDevicePointer(&device_pointer, shared_pointer, 0)); + map_host_used += size; + status = " in host memory"; + } + } + + if (mem_alloc_result != hipSuccess) { + status = " failed, out of device and host memory"; + set_error("System is out of GPU and shared host memory"); + } + + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")" << status; + } + + mem.device_pointer = (device_ptr)device_pointer; + mem.device_size = size; + stats.mem_alloc(size); + + if (!mem.device_pointer) { + return NULL; + } + + /* Insert into map of allocations. */ + thread_scoped_lock lock(hip_mem_map_mutex); + HIPMem *cmem = &hip_mem_map[&mem]; + if (shared_pointer != 0) { + /* Replace host pointer with our host allocation. Only works if + * HIP memory layout is the same and has no pitch padding. Also + * does not work if we move textures to host during a render, + * since other devices might be using the memory. */ + + if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && + mem.host_pointer != shared_pointer) { + memcpy(shared_pointer, mem.host_pointer, size); + + /* A Call to device_memory::host_free() should be preceded by + * a call to device_memory::device_free() for host memory + * allocated by a device to be handled properly. Two exceptions + * are here and a call in OptiXDevice::generic_alloc(), where + * the current host memory can be assumed to be allocated by + * device_memory::host_alloc(), not by a device */ + + mem.host_free(); + mem.host_pointer = shared_pointer; + } + mem.shared_pointer = shared_pointer; + mem.shared_counter++; + cmem->use_mapped_host = true; + } + else { + cmem->use_mapped_host = false; + } + + return cmem; +} + +void HIPDevice::generic_copy_to(device_memory &mem) +{ + if (!mem.host_pointer || !mem.device_pointer) { + return; + } + + /* If use_mapped_host of mem is false, the current device only uses device memory allocated by + * hipMalloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from + * mem.host_pointer. 
*/ + thread_scoped_lock lock(hip_mem_map_mutex); + if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { + const HIPContextScope scope(this); + hip_assert( + hipMemcpyHtoD((hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size())); + } +} + +/* Release the device (or ref-counted mapped-host) backing of an allocation and drop its entry from hip_mem_map. */ void HIPDevice::generic_free(device_memory &mem) +{ + if (mem.device_pointer) { + HIPContextScope scope(this); + thread_scoped_lock lock(hip_mem_map_mutex); + const HIPMem &cmem = hip_mem_map[&mem]; + + /* If cmem.use_mapped_host is true, reference counting is used + * to safely free a mapped host memory. */ + + if (cmem.use_mapped_host) { + assert(mem.shared_pointer); + if (mem.shared_pointer) { + assert(mem.shared_counter > 0); + /* Last device referencing the shared host allocation frees it. */ if (--mem.shared_counter == 0) { + if (mem.host_pointer == mem.shared_pointer) { + mem.host_pointer = 0; + } + hipHostFree(mem.shared_pointer); + mem.shared_pointer = 0; + } + } + map_host_used -= mem.device_size; + } + else { + /* Free device memory. */ + hip_assert(hipFree(mem.device_pointer)); + } + + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + hip_mem_map.erase(hip_mem_map.find(&mem)); + } +} + +/* Plain buffer allocation; textures and globals must go through tex_alloc()/global_alloc() instead. */ void HIPDevice::mem_alloc(device_memory &mem) +{ + if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else if (mem.type == MEM_GLOBAL) { + assert(!"mem_alloc not supported for global memory."); + } + else { + generic_alloc(mem); + } +} + +/* Textures and globals are re-created wholesale on update; plain buffers are lazily allocated and then copied host-to-device. */ void HIPDevice::mem_copy_to(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + global_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + tex_alloc((device_texture &)mem); + } + else { + if (!mem.device_pointer) { + generic_alloc(mem); + } + generic_copy_to(mem); + } +} + +/* Read back 'h' rows of 'w' elements starting at row 'y'; zero-fills the host range when no device allocation exists. */ void HIPDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) +{ + if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) { + assert(!"mem_copy_from not supported for textures."); + }
+ else if (mem.host_pointer) { + const size_t size = elem * w * h; + const size_t offset = elem * y * w; + + if (mem.device_pointer) { + const HIPContextScope scope(this); + hip_assert(hipMemcpyDtoH( + (char *)mem.host_pointer + offset, (hipDeviceptr_t)mem.device_pointer + offset, size)); + } + else { + memset((char *)mem.host_pointer + offset, 0, size); + } + } +} + +/* Zero the allocation, allocating it first if needed; mapped-host memory is cleared on the host side instead of via the device. */ void HIPDevice::mem_zero(device_memory &mem) +{ + if (!mem.device_pointer) { + mem_alloc(mem); + } + /* Allocation may have failed (out of memory); nothing to clear then. */ if (!mem.device_pointer) { + return; + } + + /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory + * regardless of mem.host_pointer and mem.shared_pointer. */ + thread_scoped_lock lock(hip_mem_map_mutex); + if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { + const HIPContextScope scope(this); + hip_assert(hipMemsetD8((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size())); + } + else if (mem.host_pointer) { + memset(mem.host_pointer, 0, mem.memory_size()); + } +} + +/* Dispatch to the type-specific free routine. */ void HIPDevice::mem_free(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + } + else { + generic_free(mem); + } +} + +/* A sub-pointer is just an offset into the parent buffer; it owns nothing and is never freed separately. */ device_ptr HIPDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) +{ + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); +} + +/* Copy host data into the module's named global/constant symbol; asserts the symbol size matches. */ void HIPDevice::const_copy_to(const char *name, void *host, size_t size) +{ + HIPContextScope scope(this); + hipDeviceptr_t mem; + size_t bytes; + + hip_assert(hipModuleGetGlobal(&mem, &bytes, hipModule, name)); + assert(bytes == size); + hip_assert(hipMemcpyHtoD(mem, host, size)); +} + +/* Allocate + upload resident global memory, then publish its device pointer under the symbol named mem.name; non-resident memory only publishes the (peer) pointer. */ void HIPDevice::global_alloc(device_memory &mem) +{ + if (mem.is_resident(this)) { + generic_alloc(mem); + generic_copy_to(mem); + } + + const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer)); +} + +void HIPDevice::global_free(device_memory &mem) +{ + if
(mem.is_resident(this) && mem.device_pointer) { + generic_free(mem); + } +} + +void HIPDevice::tex_alloc(device_texture &mem) +{ + HIPContextScope scope(this); + + /* General variables for both architectures */ + string bind_name = mem.name; + size_t dsize = datatype_size(mem.data_type); + size_t size = mem.memory_size(); + + hipTextureAddressMode address_mode = hipAddressModeWrap; + switch (mem.info.extension) { + case EXTENSION_REPEAT: + address_mode = hipAddressModeWrap; + break; + case EXTENSION_EXTEND: + address_mode = hipAddressModeClamp; + break; + case EXTENSION_CLIP: + // TODO : (Arya) setting this to Mode Clamp instead of Mode Border because it's unsupported + // in hip + address_mode = hipAddressModeClamp; + break; + default: + assert(0); + break; + } + + hipTextureFilterMode filter_mode; + if (mem.info.interpolation == INTERPOLATION_CLOSEST) { + filter_mode = hipFilterModePoint; + } + else { + filter_mode = hipFilterModeLinear; + } + + /* Image Texture Storage */ + hipArray_Format format; + switch (mem.data_type) { + case TYPE_UCHAR: + format = HIP_AD_FORMAT_UNSIGNED_INT8; + break; + case TYPE_UINT16: + format = HIP_AD_FORMAT_UNSIGNED_INT16; + break; + case TYPE_UINT: + format = HIP_AD_FORMAT_UNSIGNED_INT32; + break; + case TYPE_INT: + format = HIP_AD_FORMAT_SIGNED_INT32; + break; + case TYPE_FLOAT: + format = HIP_AD_FORMAT_FLOAT; + break; + case TYPE_HALF: + format = HIP_AD_FORMAT_HALF; + break; + default: + assert(0); + return; + } + + HIPMem *cmem = NULL; + hArray array_3d = NULL; + size_t src_pitch = mem.data_width * dsize * mem.data_elements; + size_t dst_pitch = src_pitch; + + if (!mem.is_resident(this)) { + thread_scoped_lock lock(hip_mem_map_mutex); + cmem = &hip_mem_map[&mem]; + cmem->texobject = 0; + + if (mem.data_depth > 1) { + array_3d = (hArray)mem.device_pointer; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + dst_pitch = align_up(src_pitch, pitch_alignment); + } + } + else if (mem.data_depth > 1) { + /* 3D texture using 
array, there is no API for linear memory. */ + HIP_ARRAY3D_DESCRIPTOR desc; + + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Depth = mem.data_depth; + desc.Format = format; + desc.NumChannels = mem.data_elements; + desc.Flags = 0; + + VLOG(1) << "Array 3D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + hip_assert(hipArray3DCreate(&array_3d, &desc)); + + if (!array_3d) { + return; + } + + HIP_MEMCPY3D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = hipMemoryTypeArray; + param.dstArray = &array_3d; + param.srcMemoryType = hipMemoryTypeHost; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + param.Depth = mem.data_depth; + + hip_assert(hipDrvMemcpy3D(¶m)); + + mem.device_pointer = (device_ptr)array_3d; + mem.device_size = size; + stats.mem_alloc(size); + + thread_scoped_lock lock(hip_mem_map_mutex); + cmem = &hip_mem_map[&mem]; + cmem->texobject = 0; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + /* 2D texture, using pitch aligned linear memory. */ + dst_pitch = align_up(src_pitch, pitch_alignment); + size_t dst_size = dst_pitch * mem.data_height; + + cmem = generic_alloc(mem, dst_size - mem.memory_size()); + if (!cmem) { + return; + } + + hip_Memcpy2D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = hipMemoryTypeDevice; + param.dstDevice = mem.device_pointer; + param.dstPitch = dst_pitch; + param.srcMemoryType = hipMemoryTypeHost; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + + hip_assert(hipDrvMemcpy2DUnaligned(¶m)); + } + else { + /* 1D texture, using linear memory. 
*/ + cmem = generic_alloc(mem); + if (!cmem) { + return; + } + + hip_assert(hipMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); + } + + /* Resize once */ + const uint slot = mem.slot; + if (slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. */ + texture_info.resize(slot + 128); + } + + /* Set Mapping and tag that we need to (re-)upload to device */ + texture_info[slot] = mem.info; + need_texture_info = true; + + if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && + mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { + /* Kepler+, bindless textures. */ + hipResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + + if (array_3d) { + resDesc.resType = hipResourceTypeArray; + resDesc.res.array.h_Array = &array_3d; + resDesc.flags = 0; + } + else if (mem.data_height > 0) { + resDesc.resType = hipResourceTypePitch2D; + resDesc.res.pitch2D.devPtr = mem.device_pointer; + resDesc.res.pitch2D.format = format; + resDesc.res.pitch2D.numChannels = mem.data_elements; + resDesc.res.pitch2D.height = mem.data_height; + resDesc.res.pitch2D.width = mem.data_width; + resDesc.res.pitch2D.pitchInBytes = dst_pitch; + } + else { + resDesc.resType = hipResourceTypeLinear; + resDesc.res.linear.devPtr = mem.device_pointer; + resDesc.res.linear.format = format; + resDesc.res.linear.numChannels = mem.data_elements; + resDesc.res.linear.sizeInBytes = mem.device_size; + } + + hipTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.addressMode[0] = address_mode; + texDesc.addressMode[1] = address_mode; + texDesc.addressMode[2] = address_mode; + texDesc.filterMode = filter_mode; + texDesc.flags = HIP_TRSF_NORMALIZED_COORDINATES; + + thread_scoped_lock lock(hip_mem_map_mutex); + cmem = &hip_mem_map[&mem]; + + hip_assert(hipTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); + + texture_info[slot].data = (uint64_t)cmem->texobject; + } + else { + texture_info[slot].data = 
(uint64_t)mem.device_pointer; + } +} + +void HIPDevice::tex_free(device_texture &mem) +{ + if (mem.device_pointer) { + HIPContextScope scope(this); + thread_scoped_lock lock(hip_mem_map_mutex); + const HIPMem &cmem = hip_mem_map[&mem]; + + if (cmem.texobject) { + /* Free bindless texture. */ + hipTexObjectDestroy(cmem.texobject); + } + + if (!mem.is_resident(this)) { + /* Do not free memory here, since it was allocated on a different device. */ + hip_mem_map.erase(hip_mem_map.find(&mem)); + } + else if (cmem.array) { + /* Free array. */ + hipArrayDestroy(cmem.array); + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + hip_mem_map.erase(hip_mem_map.find(&mem)); + } + else { + lock.unlock(); + generic_free(mem); + } + } +} + +# if 0 +void HIPDevice::render(DeviceTask &task, + RenderTile &rtile, + device_vector<KernelWorkTile> &work_tiles) +{ + scoped_timer timer(&rtile.buffers->render_time); + + if (have_error()) + return; + + HIPContextScope scope(this); + hipFunction_t hipRender; + + /* Get kernel function. */ + if (rtile.task == RenderTile::BAKE) { + hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_bake")); + } + else { + hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_path_trace")); + } + + if (have_error()) { + return; + } + + hip_assert(hipFuncSetCacheConfig(hipRender, hipFuncCachePreferL1)); + + /* Allocate work tile. */ + work_tiles.alloc(1); + + KernelWorkTile *wtile = work_tiles.data(); + wtile->x = rtile.x; + wtile->y = rtile.y; + wtile->w = rtile.w; + wtile->h = rtile.h; + wtile->offset = rtile.offset; + wtile->stride = rtile.stride; + wtile->buffer = (float *)(hipDeviceptr_t)rtile.buffer; + + /* Prepare work size. More step samples render faster, but for now we + * remain conservative for GPUs connected to a display to avoid driver + * timeouts and display freezing. 
*/ + int min_blocks, num_threads_per_block; + hip_assert( + hipModuleOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, hipRender, NULL, 0, 0)); + if (!info.display_device) { + min_blocks *= 8; + } + + uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); + + /* Render all samples. */ + uint start_sample = rtile.start_sample; + uint end_sample = rtile.start_sample + rtile.num_samples; + + for (int sample = start_sample; sample < end_sample;) { + /* Setup and copy work tile to device. */ + wtile->start_sample = sample; + wtile->num_samples = step_samples; + if (task.adaptive_sampling.use) { + wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples); + } + wtile->num_samples = min(wtile->num_samples, end_sample - sample); + work_tiles.copy_to_device(); + + hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer; + uint total_work_size = wtile->w * wtile->h * wtile->num_samples; + uint num_blocks = divide_up(total_work_size, num_threads_per_block); + + /* Launch kernel. */ + void *args[] = {&d_work_tiles, &total_work_size}; + + hip_assert( + hipModuleLaunchKernel(hipRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); + + /* Run the adaptive sampling kernels at selected samples aligned to step samples. */ + uint filter_sample = sample + wtile->num_samples - 1; + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { + adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); + } + + hip_assert(hipDeviceSynchronize()); + + /* Update progress. */ + sample += wtile->num_samples; + rtile.sample = sample; + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + + if (task.get_cancel()) { + if (task.need_finish_queue == false) + break; + } + } + + /* Finalize adaptive sampling. 
*/ + if (task.adaptive_sampling.use) { + hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer; + adaptive_sampling_post(rtile, wtile, d_work_tiles); + hip_assert(hipDeviceSynchronize()); + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + } +} + +void HIPDevice::thread_run(DeviceTask &task) +{ + HIPContextScope scope(this); + + if (task.type == DeviceTask::RENDER) { + device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); + + /* keep rendering tiles until done */ + RenderTile tile; + DenoisingTask denoising(this, task); + + while (task.acquire_tile(this, tile, task.tile_types)) { + if (tile.task == RenderTile::PATH_TRACE) { + render(task, tile, work_tiles); + } + else if (tile.task == RenderTile::BAKE) { + render(task, tile, work_tiles); + } + + task.release_tile(tile); + + if (task.get_cancel()) { + if (task.need_finish_queue == false) + break; + } + } + + work_tiles.free(); + } +} +# endif + +unique_ptr<DeviceQueue> HIPDevice::gpu_queue_create() +{ + return make_unique<HIPDeviceQueue>(this); +} + +bool HIPDevice::should_use_graphics_interop() +{ + /* Check whether this device is part of OpenGL context. + * + * Using HIP device for graphics interoperability which is not part of the OpenGL context is + * possible, but from the empiric measurements it can be considerably slower than using naive + * pixels copy. 
*/ + + HIPContextScope scope(this); + + int num_all_devices = 0; + hip_assert(hipGetDeviceCount(&num_all_devices)); + + if (num_all_devices == 0) { + return false; + } + + vector<hipDevice_t> gl_devices(num_all_devices); + uint num_gl_devices = 0; + hipGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, hipGLDeviceListAll); + + for (hipDevice_t gl_device : gl_devices) { + if (gl_device == hipDevice) { + return true; + } + } + + return false; +} + +int HIPDevice::get_num_multiprocessors() +{ + return get_device_default_attribute(hipDeviceAttributeMultiprocessorCount, 0); +} + +int HIPDevice::get_max_num_threads_per_multiprocessor() +{ + return get_device_default_attribute(hipDeviceAttributeMaxThreadsPerMultiProcessor, 0); +} + +bool HIPDevice::get_device_attribute(hipDeviceAttribute_t attribute, int *value) +{ + HIPContextScope scope(this); + + return hipDeviceGetAttribute(value, attribute, hipDevice) == hipSuccess; +} + +int HIPDevice::get_device_default_attribute(hipDeviceAttribute_t attribute, int default_value) +{ + int value = 0; + if (!get_device_attribute(attribute, &value)) { + return default_value; + } + return value; +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/hip/device_impl.h b/intern/cycles/device/hip/device_impl.h new file mode 100644 index 00000000000..1d138ee9856 --- /dev/null +++ b/intern/cycles/device/hip/device_impl.h @@ -0,0 +1,153 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_HIP + +# include "device/device.h" +# include "device/hip/kernel.h" +# include "device/hip/queue.h" +# include "device/hip/util.h" + +# include "util/util_map.h" + +# ifdef WITH_HIP_DYNLOAD +# include "hipew.h" +# else +# include "util/util_opengl.h" +# endif + +CCL_NAMESPACE_BEGIN + +class DeviceQueue; + +class HIPDevice : public Device { + + friend class HIPContextScope; + + public: + hipDevice_t hipDevice; + hipCtx_t hipContext; + hipModule_t hipModule; + size_t device_texture_headroom; + size_t device_working_headroom; + bool move_texture_to_host; + size_t map_host_used; + size_t map_host_limit; + int can_map_host; + int pitch_alignment; + int hipDevId; + int hipDevArchitecture; + bool first_error; + + struct HIPMem { + HIPMem() : texobject(0), array(0), use_mapped_host(false) + { + } + + hipTextureObject_t texobject; + hArray array; + + /* If true, a mapped host memory in shared_pointer is being used. 
*/ + bool use_mapped_host; + }; + typedef map<device_memory *, HIPMem> HIPMemMap; + HIPMemMap hip_mem_map; + thread_mutex hip_mem_map_mutex; + + /* Bindless Textures */ + device_vector<TextureInfo> texture_info; + bool need_texture_info; + + HIPDeviceKernels kernels; + + static bool have_precompiled_kernels(); + + virtual bool show_samples() const override; + + virtual BVHLayoutMask get_bvh_layout_mask() const override; + + void set_error(const string &error) override; + + HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler); + + virtual ~HIPDevice(); + + bool support_device(const uint /*kernel_features*/); + + bool check_peer_access(Device *peer_device) override; + + bool use_adaptive_compilation(); + + virtual string compile_kernel_get_common_cflags(const uint kernel_features); + + string compile_kernel(const uint kernel_features, + const char *name, + const char *base = "hip", + bool force_ptx = false); + + virtual bool load_kernels(const uint kernel_features) override; + void reserve_local_memory(const uint kernel_features); + + void init_host_memory(); + + void load_texture_info(); + + void move_textures_to_host(size_t size, bool for_texture); + + HIPMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0); + + void generic_copy_to(device_memory &mem); + + void generic_free(device_memory &mem); + + void mem_alloc(device_memory &mem) override; + + void mem_copy_to(device_memory &mem) override; + + void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override; + + void mem_zero(device_memory &mem) override; + + void mem_free(device_memory &mem) override; + + device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override; + + virtual void const_copy_to(const char *name, void *host, size_t size) override; + + void global_alloc(device_memory &mem); + + void global_free(device_memory &mem); + + void tex_alloc(device_texture &mem); + + void tex_free(device_texture &mem); + + /* Graphics 
resources interoperability. */ + virtual bool should_use_graphics_interop() override; + + virtual unique_ptr<DeviceQueue> gpu_queue_create() override; + + int get_num_multiprocessors(); + int get_max_num_threads_per_multiprocessor(); + + protected: + bool get_device_attribute(hipDeviceAttribute_t attribute, int *value); + int get_device_default_attribute(hipDeviceAttribute_t attribute, int default_value); +}; + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/hip/graphics_interop.cpp b/intern/cycles/device/hip/graphics_interop.cpp new file mode 100644 index 00000000000..add6dbed5e1 --- /dev/null +++ b/intern/cycles/device/hip/graphics_interop.cpp @@ -0,0 +1,93 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_HIP + +# include "device/hip/graphics_interop.h" + +# include "device/hip/device_impl.h" +# include "device/hip/util.h" + +CCL_NAMESPACE_BEGIN + +HIPDeviceGraphicsInterop::HIPDeviceGraphicsInterop(HIPDeviceQueue *queue) + : queue_(queue), device_(static_cast<HIPDevice *>(queue->device)) +{ +} + +HIPDeviceGraphicsInterop::~HIPDeviceGraphicsInterop() +{ + HIPContextScope scope(device_); + + if (hip_graphics_resource_) { + hip_device_assert(device_, hipGraphicsUnregisterResource(hip_graphics_resource_)); + } +} + +void HIPDeviceGraphicsInterop::set_destination(const DeviceGraphicsInteropDestination &destination) +{ + const int64_t new_buffer_area = int64_t(destination.buffer_width) * destination.buffer_height; + + if (opengl_pbo_id_ == destination.opengl_pbo_id && buffer_area_ == new_buffer_area) { + return; + } + + HIPContextScope scope(device_); + + if (hip_graphics_resource_) { + hip_device_assert(device_, hipGraphicsUnregisterResource(hip_graphics_resource_)); + } + + const hipError_t result = hipGraphicsGLRegisterBuffer( + &hip_graphics_resource_, destination.opengl_pbo_id, hipGraphicsRegisterFlagsNone); + if (result != hipSuccess) { + LOG(ERROR) << "Error registering OpenGL buffer: " << hipewErrorString(result); + } + + opengl_pbo_id_ = destination.opengl_pbo_id; + buffer_area_ = new_buffer_area; +} + +device_ptr HIPDeviceGraphicsInterop::map() +{ + if (!hip_graphics_resource_) { + return 0; + } + + HIPContextScope scope(device_); + + hipDeviceptr_t hip_buffer; + size_t bytes; + + hip_device_assert(device_, + hipGraphicsMapResources(1, &hip_graphics_resource_, queue_->stream())); + hip_device_assert( + device_, hipGraphicsResourceGetMappedPointer(&hip_buffer, &bytes, hip_graphics_resource_)); + + return static_cast<device_ptr>(hip_buffer); +} + +void HIPDeviceGraphicsInterop::unmap() +{ + HIPContextScope scope(device_); + + hip_device_assert(device_, + hipGraphicsUnmapResources(1, &hip_graphics_resource_, queue_->stream())); +} + 
+CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/hip/graphics_interop.h b/intern/cycles/device/hip/graphics_interop.h new file mode 100644 index 00000000000..adcaa13a2d7 --- /dev/null +++ b/intern/cycles/device/hip/graphics_interop.h @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_HIP + +# include "device/device_graphics_interop.h" + +# ifdef WITH_HIP_DYNLOAD +# include "hipew.h" +# endif + +CCL_NAMESPACE_BEGIN + +class HIPDevice; +class HIPDeviceQueue; + +class HIPDeviceGraphicsInterop : public DeviceGraphicsInterop { + public: + explicit HIPDeviceGraphicsInterop(HIPDeviceQueue *queue); + + HIPDeviceGraphicsInterop(const HIPDeviceGraphicsInterop &other) = delete; + HIPDeviceGraphicsInterop(HIPDeviceGraphicsInterop &&other) noexcept = delete; + + ~HIPDeviceGraphicsInterop(); + + HIPDeviceGraphicsInterop &operator=(const HIPDeviceGraphicsInterop &other) = delete; + HIPDeviceGraphicsInterop &operator=(HIPDeviceGraphicsInterop &&other) = delete; + + virtual void set_destination(const DeviceGraphicsInteropDestination &destination) override; + + virtual device_ptr map() override; + virtual void unmap() override; + + protected: + HIPDeviceQueue *queue_ = nullptr; + HIPDevice *device_ = nullptr; + + /* OpenGL PBO which is currently registered as the destination for the HIP buffer. */ + uint opengl_pbo_id_ = 0; + /* Buffer area in pixels of the corresponding PBO. 
*/ + int64_t buffer_area_ = 0; + + hipGraphicsResource hip_graphics_resource_ = nullptr; +}; + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/hip/kernel.cpp b/intern/cycles/device/hip/kernel.cpp new file mode 100644 index 00000000000..e0acd6f17c6 --- /dev/null +++ b/intern/cycles/device/hip/kernel.cpp @@ -0,0 +1,69 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_HIP + +# include "device/hip/kernel.h" +# include "device/hip/device_impl.h" + +CCL_NAMESPACE_BEGIN + +void HIPDeviceKernels::load(HIPDevice *device) +{ + hipModule_t hipModule = device->hipModule; + + for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) { + HIPDeviceKernel &kernel = kernels_[i]; + + /* No megakernel used for GPU. 
*/ + if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { + continue; + } + + const std::string function_name = std::string("kernel_gpu_") + + device_kernel_as_string((DeviceKernel)i); + hip_device_assert(device, + hipModuleGetFunction(&kernel.function, hipModule, function_name.c_str())); + + if (kernel.function) { + hip_device_assert(device, hipFuncSetCacheConfig(kernel.function, hipFuncCachePreferL1)); + + hip_device_assert( + device, + hipModuleOccupancyMaxPotentialBlockSize( + &kernel.min_blocks, &kernel.num_threads_per_block, kernel.function, 0, 0)); + } + else { + LOG(ERROR) << "Unable to load kernel " << function_name; + } + } + + loaded = true; +} + +const HIPDeviceKernel &HIPDeviceKernels::get(DeviceKernel kernel) const +{ + return kernels_[(int)kernel]; +} + +bool HIPDeviceKernels::available(DeviceKernel kernel) const +{ + return kernels_[(int)kernel].function != nullptr; +} + +CCL_NAMESPACE_END + +#endif /* WITH_HIP*/ diff --git a/intern/cycles/device/hip/kernel.h b/intern/cycles/device/hip/kernel.h new file mode 100644 index 00000000000..3301731f56e --- /dev/null +++ b/intern/cycles/device/hip/kernel.h @@ -0,0 +1,54 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_HIP + +# include "device/device_kernel.h" + +# ifdef WITH_HIP_DYNLOAD +# include "hipew.h" +# endif + +CCL_NAMESPACE_BEGIN + +class HIPDevice; + +/* HIP kernel and associate occupancy information. 
*/ +class HIPDeviceKernel { + public: + hipFunction_t function = nullptr; + + int num_threads_per_block = 0; + int min_blocks = 0; +}; + +/* Cache of HIP kernels for each DeviceKernel. */ +class HIPDeviceKernels { + public: + void load(HIPDevice *device); + const HIPDeviceKernel &get(DeviceKernel kernel) const; + bool available(DeviceKernel kernel) const; + + protected: + HIPDeviceKernel kernels_[DEVICE_KERNEL_NUM]; + bool loaded = false; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_HIP */ diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp new file mode 100644 index 00000000000..78c77e5fdae --- /dev/null +++ b/intern/cycles/device/hip/queue.cpp @@ -0,0 +1,209 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_HIP + +# include "device/hip/queue.h" + +# include "device/hip/device_impl.h" +# include "device/hip/graphics_interop.h" +# include "device/hip/kernel.h" + +CCL_NAMESPACE_BEGIN + +/* HIPDeviceQueue */ + +HIPDeviceQueue::HIPDeviceQueue(HIPDevice *device) + : DeviceQueue(device), hip_device_(device), hip_stream_(nullptr) +{ + const HIPContextScope scope(hip_device_); + hip_device_assert(hip_device_, hipStreamCreateWithFlags(&hip_stream_, hipStreamNonBlocking)); +} + +HIPDeviceQueue::~HIPDeviceQueue() +{ + const HIPContextScope scope(hip_device_); + hipStreamDestroy(hip_stream_); +} + +int HIPDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const +{ + /* TODO: compute automatically. */ + /* TODO: must have at least num_threads_per_block. */ + return 14416128; +} + +int HIPDeviceQueue::num_concurrent_busy_states() const +{ + const int max_num_threads = hip_device_->get_num_multiprocessors() * + hip_device_->get_max_num_threads_per_multiprocessor(); + + if (max_num_threads == 0) { + return 65536; + } + + return 4 * max_num_threads; +} + +void HIPDeviceQueue::init_execution() +{ + /* Synchronize all textures and memory copies before executing task. */ + HIPContextScope scope(hip_device_); + hip_device_->load_texture_info(); + hip_device_assert(hip_device_, hipDeviceSynchronize()); + + debug_init_execution(); +} + +bool HIPDeviceQueue::kernel_available(DeviceKernel kernel) const +{ + return hip_device_->kernels.available(kernel); +} + +bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[]) +{ + if (hip_device_->have_error()) { + return false; + } + + debug_enqueue(kernel, work_size); + + const HIPContextScope scope(hip_device_); + const HIPDeviceKernel &hip_kernel = hip_device_->kernels.get(kernel); + + /* Compute kernel launch parameters. 
*/ + const int num_threads_per_block = hip_kernel.num_threads_per_block; + const int num_blocks = divide_up(work_size, num_threads_per_block); + + int shared_mem_bytes = 0; + + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: + /* See parallel_active_index.h for why this amount of shared memory is needed. */ + shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int); + break; + default: + break; + } + + /* Launch kernel. */ + hip_device_assert(hip_device_, + hipModuleLaunchKernel(hip_kernel.function, + num_blocks, + 1, + 1, + num_threads_per_block, + 1, + 1, + shared_mem_bytes, + hip_stream_, + args, + 0)); + return !(hip_device_->have_error()); +} + +bool HIPDeviceQueue::synchronize() +{ + if (hip_device_->have_error()) { + return false; + } + + const HIPContextScope scope(hip_device_); + hip_device_assert(hip_device_, hipStreamSynchronize(hip_stream_)); + debug_synchronize(); + + return !(hip_device_->have_error()); +} + +void HIPDeviceQueue::zero_to_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + /* Allocate on demand. */ + if (mem.device_pointer == 0) { + hip_device_->mem_alloc(mem); + } + + /* Zero memory on device. */ + assert(mem.device_pointer != 0); + + const HIPContextScope scope(hip_device_); + hip_device_assert( + hip_device_, + hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_)); +} + +void HIPDeviceQueue::copy_to_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + /* Allocate on demand. 
*/ + if (mem.device_pointer == 0) { + hip_device_->mem_alloc(mem); + } + + assert(mem.device_pointer != 0); + assert(mem.host_pointer != nullptr); + + /* Copy memory to device. */ + const HIPContextScope scope(hip_device_); + hip_device_assert( + hip_device_, + hipMemcpyHtoDAsync( + (hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_)); +} + +void HIPDeviceQueue::copy_from_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + assert(mem.device_pointer != 0); + assert(mem.host_pointer != nullptr); + + /* Copy memory from device. */ + const HIPContextScope scope(hip_device_); + hip_device_assert( + hip_device_, + hipMemcpyDtoHAsync( + mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_)); +} + +// TODO : (Arya) Enable this after stabilizing dev branch +unique_ptr<DeviceGraphicsInterop> HIPDeviceQueue::graphics_interop_create() +{ + return make_unique<HIPDeviceGraphicsInterop>(this); +} + +CCL_NAMESPACE_END + +#endif /* WITH_HIP */ diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h new file mode 100644 index 00000000000..04c8a5982ce --- /dev/null +++ b/intern/cycles/device/hip/queue.h @@ -0,0 +1,68 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#ifdef WITH_HIP + +# include "device/device_kernel.h" +# include "device/device_memory.h" +# include "device/device_queue.h" + +# include "device/hip/util.h" + +CCL_NAMESPACE_BEGIN + +class HIPDevice; +class device_memory; + +/* Base class for HIP queues. */ +class HIPDeviceQueue : public DeviceQueue { + public: + HIPDeviceQueue(HIPDevice *device); + ~HIPDeviceQueue(); + + virtual int num_concurrent_states(const size_t state_size) const override; + virtual int num_concurrent_busy_states() const override; + + virtual void init_execution() override; + + virtual bool kernel_available(DeviceKernel kernel) const override; + + virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override; + + virtual bool synchronize() override; + + virtual void zero_to_device(device_memory &mem) override; + virtual void copy_to_device(device_memory &mem) override; + virtual void copy_from_device(device_memory &mem) override; + + virtual hipStream_t stream() + { + return hip_stream_; + } + + // TODO : (Arya) Enable this after stabilizing the dev branch + virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override; + + protected: + HIPDevice *hip_device_; + hipStream_t hip_stream_; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_HIP */ diff --git a/intern/cycles/device/hip/util.cpp b/intern/cycles/device/hip/util.cpp new file mode 100644 index 00000000000..44f52c4e17b --- /dev/null +++ b/intern/cycles/device/hip/util.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_HIP + +# include "device/hip/util.h" +# include "device/hip/device_impl.h" + +CCL_NAMESPACE_BEGIN + +HIPContextScope::HIPContextScope(HIPDevice *device) : device(device) +{ + hip_device_assert(device, hipCtxPushCurrent(device->hipContext)); +} + +HIPContextScope::~HIPContextScope() +{ + hip_device_assert(device, hipCtxPopCurrent(NULL)); +} + +# ifndef WITH_HIP_DYNLOAD +const char *hipewErrorString(hipError_t result) +{ + /* We can only give error code here without major code duplication, that + * should be enough since dynamic loading is only being disabled by folks + * who knows what they're doing anyway. + * + * NOTE: Avoid call from several threads. + */ + static string error; + error = string_printf("%d", result); + return error.c_str(); +} + +const char *hipewCompilerPath() +{ + return CYCLES_HIP_HIPCC_EXECUTABLE; +} + +int hipewCompilerVersion() +{ + return (HIP_VERSION / 100) + (HIP_VERSION % 100 / 10); +} +# endif + +CCL_NAMESPACE_END + +#endif /* WITH_HIP */ diff --git a/intern/cycles/device/hip/util.h b/intern/cycles/device/hip/util.h new file mode 100644 index 00000000000..f8468e0dc5c --- /dev/null +++ b/intern/cycles/device/hip/util.h @@ -0,0 +1,63 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_HIP + +# ifdef WITH_HIP_DYNLOAD +# include "hipew.h" +# endif + +CCL_NAMESPACE_BEGIN + +class HIPDevice; + +/* Utility to push/pop HIP context. */ +class HIPContextScope { + public: + HIPContextScope(HIPDevice *device); + ~HIPContextScope(); + + private: + HIPDevice *device; +}; + +/* Utility for checking return values of HIP function calls. */ +# define hip_device_assert(hip_device, stmt) \ + { \ + hipError_t result = stmt; \ + if (result != hipSuccess) { \ + const char *name = hipewErrorString(result); \ + hip_device->set_error( \ + string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \ + } \ + } \ + (void)0 + +# define hip_assert(stmt) hip_device_assert(this, stmt) + +# ifndef WITH_HIP_DYNLOAD +/* Transparently implement some functions, so majority of the file does not need + * to worry about difference between dynamically loaded and linked HIP at all. */ +const char *hipewErrorString(hipError_t result); +const char *hipewCompilerPath(); +int hipewCompilerVersion(); +# endif /* WITH_HIP_DYNLOAD */ + +CCL_NAMESPACE_END + +#endif /* WITH_HIP */
\ No newline at end of file diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp index 10d0c5e7b4c..9633d3b87d3 100644 --- a/intern/cycles/integrator/path_trace.cpp +++ b/intern/cycles/integrator/path_trace.cpp @@ -1035,6 +1035,8 @@ static const char *device_type_for_description(const DeviceType type) return "CUDA"; case DEVICE_OPTIX: return "OptiX"; + case DEVICE_HIP: + return "HIP"; case DEVICE_DUMMY: return "Dummy"; case DEVICE_MULTI: diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 4196539a9b1..8c2cb2c68de 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -35,6 +35,10 @@ set(SRC_DEVICE_CUDA device/cuda/kernel.cu ) +set(SRC_DEVICE_HIP + device/hip/kernel.cpp +) + set(SRC_DEVICE_OPTIX device/optix/kernel.cu device/optix/kernel_shader_raytrace.cu @@ -106,6 +110,12 @@ set(SRC_DEVICE_CUDA_HEADERS device/cuda/globals.h ) +set(SRC_DEVICE_HIP_HEADERS + device/hip/compat.h + device/hip/config.h + device/hip/globals.h +) + set(SRC_DEVICE_OPTIX_HEADERS device/optix/compat.h device/optix/globals.h @@ -458,6 +468,104 @@ if(WITH_CYCLES_CUDA_BINARIES) cycles_set_solution_folder(cycles_kernel_cuda) endif() +####################################################### START + +# HIP module + +if(WITH_CYCLES_HIP_BINARIES) + # 64 bit only + set(HIP_BITS 64) + + # HIP version + execute_process(COMMAND ${HIP_HIPCC_EXECUTABLE} "--version" OUTPUT_VARIABLE HIPCC_OUT) + string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1" HIP_VERSION_MAJOR "${HIPCC_OUT}") + string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2" HIP_VERSION_MINOR "${HIPCC_OUT}") + set(HIP_VERSION "${HIP_VERSION_MAJOR}${HIP_VERSION_MINOR}") + + + message(WARNING + "HIP version ${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR} detected") + + # build for each arch + set(hip_sources device/hip/kernel.cpp + ${SRC_HEADERS} + ${SRC_DEVICE_HIP_HEADERS} + ${SRC_BVH_HEADERS} + ${SRC_SVM_HEADERS} + 
${SRC_GEOM_HEADERS} + ${SRC_INTEGRATOR_HEADERS} + ${SRC_CLOSURE_HEADERS} + ${SRC_UTIL_HEADERS} + ) + set(hip_fatbins) + + macro(CYCLES_HIP_KERNEL_ADD arch prev_arch name flags sources experimental) + if(${arch} MATCHES "compute_.*") + set(format "ptx") + else() + set(format "fatbin") + endif() + set(hip_file ${name}_${arch}.${format}) + + set(kernel_sources ${sources}) + if(NOT ${prev_arch} STREQUAL "none") + if(${prev_arch} MATCHES "compute_.*") + set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.ptx) + else() + set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.fatbin) + endif() + endif() + + set(hip_kernel_src "/device/hip/${name}.cpp") + + set(hip_flags ${flags} + -D CCL_NAMESPACE_BEGIN= + -D CCL_NAMESPACE_END= + -D HIPCC + -m ${HIP_BITS} + -I ${CMAKE_CURRENT_SOURCE_DIR}/.. + -I ${CMAKE_CURRENT_SOURCE_DIR}/device/hip + --use_fast_math + -o ${CMAKE_CURRENT_BINARY_DIR}/${hip_file}) + + if(${experimental}) + set(hip_flags ${hip_flags} -D __KERNEL_EXPERIMENTAL__) + set(name ${name}_experimental) + endif() + + if(WITH_CYCLES_DEBUG) + set(hip_flags ${hip_flags} -D __KERNEL_DEBUG__) + endif() + + if(WITH_NANOVDB) + set(hip_flags ${hip_flags} + -D WITH_NANOVDB + -I "${NANOVDB_INCLUDE_DIR}") + endif() + + + set(prev_arch "none") + foreach(arch ${CYCLES_HIP_BINARIES_ARCH}) + set(hip_hipcc_executable ${HIP_HIPCC_EXECUTABLE}) + set(hip_toolkit_root_dir ${HIP_TOOLKIT_ROOT_DIR}) + if(DEFINED hip_hipcc_executable AND DEFINED hip_toolkit_root_dir) + # Compile regular kernel + CYCLES_HIP_KERNEL_ADD(${arch} ${prev_arch} kernel "" "${hip_sources}" FALSE) + + if(WITH_CYCLES_HIP_BUILD_SERIAL) + set(prev_arch ${arch}) + endif() + + unset(hip_hipcc_executable) + unset(hip_toolkit_root_dir) + endif() + endforeach() + + add_custom_target(cycles_kernel_hip ALL DEPENDS ${hip_fatbins}) + cycles_set_solution_folder(cycles_kernel_hip) +endif() + +####################################################### END # OptiX PTX modules if(WITH_CYCLES_DEVICE_OPTIX AND 
WITH_CYCLES_CUDA_BINARIES) @@ -602,11 +710,13 @@ endif() cycles_add_library(cycles_kernel "${LIB}" ${SRC_DEVICE_CPU} ${SRC_DEVICE_CUDA} + ${SRC_DEVICE_HIP} ${SRC_DEVICE_OPTIX} ${SRC_HEADERS} ${SRC_DEVICE_CPU_HEADERS} ${SRC_DEVICE_GPU_HEADERS} ${SRC_DEVICE_CUDA_HEADERS} + ${SRC_DEVICE_HIP_HEADERS} ${SRC_DEVICE_OPTIX_HEADERS} ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} @@ -621,6 +731,7 @@ source_group("geom" FILES ${SRC_GEOM_HEADERS}) source_group("integrator" FILES ${SRC_INTEGRATOR_HEADERS}) source_group("kernel" FILES ${SRC_HEADERS}) source_group("device\\cpu" FILES ${SRC_DEVICE_CPU} ${SRC_DEVICE_CPU_HEADERS}) +source_group("device\\hip" FILES ${SRC_DEVICE_HIP} ${SRC_DEVICE_HIP_HEADERS}) source_group("device\\gpu" FILES ${SRC_DEVICE_GPU_HEADERS}) source_group("device\\cuda" FILES ${SRC_DEVICE_CUDA} ${SRC_DEVICE_CUDA_HEADERS}) source_group("device\\optix" FILES ${SRC_DEVICE_OPTIX} ${SRC_DEVICE_OPTIX_HEADERS}) @@ -632,14 +743,19 @@ endif() if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) add_dependencies(cycles_kernel cycles_kernel_optix) endif() +if(WITH_CYCLES_HIP) + add_dependencies(cycles_kernel cycles_kernel_hip) +endif() # Install kernel source for runtime compilation delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_HIP}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_GPU_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/gpu) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_HIP_HEADERS}" 
${CYCLES_INSTALL_PATH}/source/kernel/device/hip) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure) diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index a68d1d80c7d..db4a4bf71e0 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -25,7 +25,11 @@ CCL_NAMESPACE_BEGIN #include "util/util_atomic.h" -#define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 +#ifdef __HIP__ +# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024 +#else +# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 +#endif template<uint blocksize, typename IsActiveOp> __device__ void gpu_parallel_active_index_array(const uint num_states, diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h index f609520b8b4..a1349e82efb 100644 --- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h +++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h @@ -27,7 +27,11 @@ CCL_NAMESPACE_BEGIN #include "util/util_atomic.h" -#define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 +#ifdef __HIP__ +# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024 +#else +# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 +#endif template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values) { diff --git a/intern/cycles/kernel/device/gpu/parallel_reduce.h b/intern/cycles/kernel/device/gpu/parallel_reduce.h index 65b1990dbb8..b60dceb2ed0 100644 --- a/intern/cycles/kernel/device/gpu/parallel_reduce.h +++ b/intern/cycles/kernel/device/gpu/parallel_reduce.h 
@@ -26,7 +26,11 @@ CCL_NAMESPACE_BEGIN * the overall cost of the algorithm while keeping the work complexity O(n) and * the step complexity O(log n). (Brent's Theorem optimization) */ -#define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512 +#ifdef __HIP__ +# define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 1024 +#else +# define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512 +#endif template<uint blocksize, typename InputT, typename OutputT, typename ConvertOp> __device__ void gpu_parallel_sum( diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h index 99b35468517..9bca1fad22f 100644 --- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h @@ -26,7 +26,11 @@ CCL_NAMESPACE_BEGIN #include "util/util_atomic.h" -#define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512 +#ifdef __HIP__ +# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024 +#else +# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512 +#endif #define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0) template<uint blocksize, typename GetKeyOp> diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h new file mode 100644 index 00000000000..3644925d5be --- /dev/null +++ b/intern/cycles/kernel/device/hip/compat.h @@ -0,0 +1,121 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#define __KERNEL_GPU__ +#define __KERNEL_HIP__ +#define CCL_NAMESPACE_BEGIN +#define CCL_NAMESPACE_END + +#ifndef ATTR_FALLTHROUGH +# define ATTR_FALLTHROUGH +#endif + +#ifdef __HIPCC_RTC__ +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; +#else +# include <stdint.h> +#endif + +#ifdef CYCLES_HIPBIN_CC +# define FLT_MIN 1.175494350822287507969e-38f +# define FLT_MAX 340282346638528859811704183484516925440.0f +# define FLT_EPSILON 1.192092896e-07F +#endif + +/* Qualifiers */ + +#define ccl_device __device__ __inline__ +#define ccl_device_inline __device__ __inline__ +#define ccl_device_forceinline __device__ __forceinline__ +#define ccl_device_noinline __device__ __noinline__ +#define ccl_device_noinline_cpu ccl_device +#define ccl_global +#define ccl_static_constant __constant__ +#define ccl_device_constant __constant__ __device__ +#define ccl_constant const +#define ccl_gpu_shared __shared__ +#define ccl_private +#define ccl_may_alias +#define ccl_addr_space +#define ccl_restrict __restrict__ +#define ccl_loop_no_unroll +#define ccl_align(n) __align__(n) +#define ccl_optional_struct_init + +#define kernel_assert(cond) + +/* Types */ +#ifdef __HIP__ +# include "hip/hip_fp16.h" +# include "hip/hip_runtime.h" +#endif + +#ifdef _MSC_VER +# include <immintrin.h> +#endif + +#define ccl_gpu_thread_idx_x (threadIdx.x) +#define ccl_gpu_block_dim_x (blockDim.x) +#define ccl_gpu_block_idx_x (blockIdx.x) +#define ccl_gpu_grid_dim_x (gridDim.x) +#define ccl_gpu_warp_size (warpSize) + +#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) +#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) + +/* GPU warp synchronizaton */ + +#define ccl_gpu_syncthreads() __syncthreads() +#define ccl_gpu_ballot(predicate) __ballot(predicate) +#define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down(var, detla) +#define ccl_gpu_popc(x) __popc(x) + +/* GPU texture objects */ +typedef 
hipTextureObject_t ccl_gpu_tex_object; + +template<typename T> +ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj, + const float x, + const float y) +{ + return tex2D<T>(texobj, x, y); +} + +template<typename T> +ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj, + const float x, + const float y, + const float z) +{ + return tex3D<T>(texobj, x, y, z); +} + +/* Use fast math functions */ + +#define cosf(x) __cosf(((float)(x))) +#define sinf(x) __sinf(((float)(x))) +#define powf(x, y) __powf(((float)(x)), ((float)(y))) +#define tanf(x) __tanf(((float)(x))) +#define logf(x) __logf(((float)(x))) +#define expf(x) __expf(((float)(x))) + +/* Types */ + +#include "util/util_half.h" +#include "util/util_types.h" diff --git a/intern/cycles/kernel/device/hip/config.h b/intern/cycles/kernel/device/hip/config.h new file mode 100644 index 00000000000..2fde0d46015 --- /dev/null +++ b/intern/cycles/kernel/device/hip/config.h @@ -0,0 +1,57 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Device data taken from HIP occupancy calculator. 
+ * + * Terminology + * - HIP GPUs have multiple streaming multiprocessors + * - Each multiprocessor executes multiple thread blocks + * - Each thread block contains a number of threads, also known as the block size + * - Multiprocessors have a fixed number of registers, and the amount of registers + * used by each threads limits the number of threads per block. + */ + +/* Launch Bound Definitions */ +#define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 +#define GPU_MULTIPROCESSOR_MAX_BLOCKS 64 +#define GPU_BLOCK_MAX_THREADS 1024 +#define GPU_THREAD_MAX_REGISTERS 255 + +#define GPU_KERNEL_BLOCK_NUM_THREADS 1024 +#define GPU_KERNEL_MAX_REGISTERS 64 + +/* Compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread. */ + +#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \ + extern "C" __global__ void __launch_bounds__(block_num_threads, \ + GPU_MULTIPRESSOR_MAX_REGISTERS / \ + (block_num_threads * thread_num_registers)) + +/* sanity checks */ + +#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS +# error "Maximum number of threads per block exceeded" +#endif + +#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \ + GPU_MULTIPROCESSOR_MAX_BLOCKS +# error "Maximum number of blocks per multiprocessor exceeded" +#endif + +#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif diff --git a/intern/cycles/kernel/device/hip/globals.h b/intern/cycles/kernel/device/hip/globals.h new file mode 100644 index 00000000000..39978ae7899 --- /dev/null +++ b/intern/cycles/kernel/device/hip/globals.h @@ -0,0 +1,49 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Constant Globals */ + +#pragma once + +#include "kernel/kernel_profiling.h" +#include "kernel/kernel_types.h" + +#include "kernel/integrator/integrator_state.h" + +CCL_NAMESPACE_BEGIN + +/* Not actually used, just a NULL pointer that gets passed everywhere, which we + * hope gets optimized out by the compiler. */ +struct KernelGlobals { + /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */ + int unused[1]; +}; + +/* Global scene data and textures */ +__constant__ KernelData __data; +#define KERNEL_TEX(type, name) __attribute__((used)) const __constant__ __device__ type *name; +#include "kernel/kernel_textures.h" + +/* Integrator state */ +__constant__ IntegratorStateGPU __integrator_state; + +/* Abstraction macros */ +#define kernel_data __data +#define kernel_tex_fetch(t, index) t[(index)] +#define kernel_tex_array(t) (t) +#define kernel_integrator_state __integrator_state + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/hip/kernel.cpp b/intern/cycles/kernel/device/hip/kernel.cpp new file mode 100644 index 00000000000..c801320a2e1 --- /dev/null +++ b/intern/cycles/kernel/device/hip/kernel.cpp @@ -0,0 +1,28 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* HIP kernel entry points */ + +#ifdef __HIP_DEVICE_COMPILE__ + +# include "kernel/device/hip/compat.h" +# include "kernel/device/hip/config.h" +# include "kernel/device/hip/globals.h" + +# include "kernel/device/gpu/image.h" +# include "kernel/device/gpu/kernel.h" + +#endif diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h index de17efafcf2..faba411c769 100644 --- a/intern/cycles/util/util_atomic.h +++ b/intern/cycles/util/util_atomic.h @@ -34,7 +34,7 @@ #else /* __KERNEL_GPU__ */ -# ifdef __KERNEL_CUDA__ +# if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) # define atomic_add_and_fetch_float(p, x) (atomicAdd((float *)(p), (float)(x)) + (float)(x)) diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index 1d598725c84..2245668d02f 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -59,12 +59,23 @@ DebugFlags::CUDA::CUDA() : adaptive_compile(false) reset(); } +DebugFlags::HIP::HIP() : adaptive_compile(false) +{ + reset(); +} + void DebugFlags::CUDA::reset() { if (getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL) adaptive_compile = true; } +void DebugFlags::HIP::reset() +{ + if (getenv("CYCLES_HIP_ADAPTIVE_COMPILE") != NULL) + adaptive_compile = true; +} + DebugFlags::OptiX::OptiX() { reset(); @@ -103,6 +114,10 @@ std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags) os << "OptiX flags:\n" << " Debug : " << string_from_bool(debug_flags.optix.use_debug) << "\n"; + + os << "HIP flags:\n" + << " HIP streams : " << 
string_from_bool(debug_flags.hip.adaptive_compile) << "\n"; + return os; } diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h index a2acaea5675..81677201790 100644 --- a/intern/cycles/util/util_debug.h +++ b/intern/cycles/util/util_debug.h @@ -93,6 +93,17 @@ class DebugFlags { bool adaptive_compile; }; + /* Descriptor of HIP feature-set to be used. */ + struct HIP { + HIP(); + + /* Reset flags to their defaults. */ + void reset(); + + /* Whether adaptive feature based runtime compile is enabled or not.*/ + bool adaptive_compile; + }; + /* Descriptor of OptiX feature-set to be used. */ struct OptiX { OptiX(); @@ -124,6 +135,9 @@ class DebugFlags { /* Requested OptiX flags. */ OptiX optix; + /* Requested HIP flags. */ + HIP hip; + private: DebugFlags(); diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h index d9edfec5da3..f36a492a1b0 100644 --- a/intern/cycles/util/util_half.h +++ b/intern/cycles/util/util_half.h @@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN /* Half Floats */ /* CUDA has its own half data type, no need to define then */ -#ifndef __KERNEL_CUDA__ +#if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__) /* Implementing this as a class rather than a typedef so that the compiler can tell it apart from * unsigned shorts. 
*/ class half { @@ -59,7 +59,7 @@ struct half4 { half x, y, z, w; }; -#ifdef __KERNEL_CUDA__ +#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) ccl_device_inline void float4_store_half(half *h, float4 f) { @@ -73,6 +73,7 @@ ccl_device_inline void float4_store_half(half *h, float4 f) ccl_device_inline void float4_store_half(half *h, float4 f) { + # ifndef __KERNEL_SSE2__ for (int i = 0; i < 4; i++) { /* optimized float to half for pixels: @@ -109,6 +110,8 @@ ccl_device_inline void float4_store_half(half *h, float4 f) # endif } +# ifndef __KERNEL_HIP__ + ccl_device_inline float half_to_float(half h) { float f; @@ -117,6 +120,23 @@ ccl_device_inline float half_to_float(half h) return f; } +# else + +ccl_device_inline float half_to_float(std::uint32_t a) noexcept +{ + + std::uint32_t u = ((a << 13) + 0x70000000U) & 0x8fffe000U; + + std::uint32_t v = __float_as_uint(__uint_as_float(u) * + __uint_as_float(0x77800000U) /*0x1.0p+112f*/) + + 0x38000000U; + + u = (a & 0x7fff) != 0 ? v : u; + + return __uint_as_float(u) * __uint_as_float(0x07800000U) /*0x1.0p-112f*/; +} + +# endif /* __KERNEL_HIP__ */ ccl_device_inline float4 half4_to_float4(half4 h) { diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index 6d728dde679..cb1e94c838c 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -26,6 +26,10 @@ # include <cmath> #endif +#ifdef __HIP__ +# include <hip/hip_vector_types.h> +#endif + #include <float.h> #include <math.h> #include <stdio.h> @@ -83,7 +87,8 @@ CCL_NAMESPACE_BEGIN /* Scalar */ -#ifdef _WIN32 +#ifndef __HIP__ +# ifdef _WIN32 ccl_device_inline float fmaxf(float a, float b) { return (a > b) ? a : b; @@ -93,7 +98,9 @@ ccl_device_inline float fminf(float a, float b) { return (a < b) ? 
a : b; } -#endif /* _WIN32 */ + +# endif /* _WIN32 */ +#endif /* __HIP__ */ #ifndef __KERNEL_GPU__ using std::isfinite; @@ -199,6 +206,7 @@ ccl_device_inline uint as_uint(float f) return u.i; } +#ifndef __HIP__ ccl_device_inline int __float_as_int(float f) { union { @@ -238,6 +246,7 @@ ccl_device_inline float __uint_as_float(uint i) u.i = i; return u.f; } +#endif ccl_device_inline int4 __float4_as_int4(float4 f) { @@ -669,7 +678,7 @@ ccl_device float bits_to_01(uint bits) ccl_device_inline uint count_leading_zeros(uint x) { -#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) +#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__) return __clz(x); #else assert(x != 0); @@ -685,7 +694,7 @@ ccl_device_inline uint count_leading_zeros(uint x) ccl_device_inline uint count_trailing_zeros(uint x) { -#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) +#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__) return (__ffs(x) - 1); #else assert(x != 0); @@ -701,7 +710,7 @@ ccl_device_inline uint count_trailing_zeros(uint x) ccl_device_inline uint find_first_set(uint x) { -#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) +#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__) return __ffs(x); #else # ifdef _MSC_VER |