diff options
Diffstat (limited to 'intern/cycles')
40 files changed, 3062 insertions, 19 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index 17096d441f0..2018c1d9648 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -297,6 +297,7 @@ endif() if(WITH_CYCLES_STANDALONE) set(WITH_CYCLES_DEVICE_CUDA TRUE) + set(WITH_CYCLES_DEVICE_HIP TRUE) endif() # TODO(sergey): Consider removing it, only causes confusion in interface. set(WITH_CYCLES_DEVICE_MULTI TRUE) diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt index 5bdcfd56a4d..64d226cb9ec 100644 --- a/intern/cycles/blender/CMakeLists.txt +++ b/intern/cycles/blender/CMakeLists.txt @@ -95,6 +95,9 @@ set(ADDON_FILES add_definitions(${GL_DEFINITIONS}) +if(WITH_CYCLES_DEVICE_HIP) + add_definitions(-DWITH_HIP) +endif() if(WITH_MOD_FLUID) add_definitions(-DWITH_FLUID) endif() diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index e0e8ca10bef..d729cb1ee69 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -28,7 +28,7 @@ def _configure_argument_parser(): action='store_true') parser.add_argument("--cycles-device", help="Set the device to use for Cycles, overriding user preferences and the scene setting." - "Valid options are 'CPU', 'CUDA' or 'OPTIX'." + "Valid options are 'CPU', 'CUDA', 'OPTIX', or 'HIP'" "Additionally, you can append '+CPU' to any GPU type for hybrid rendering.", default=None) return parser diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index e2b671848d0..67207874431 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -111,6 +111,7 @@ enum_device_type = ( ('CPU', "CPU", "CPU", 0), ('CUDA', "CUDA", "CUDA", 1), ('OPTIX', "OptiX", "OptiX", 3), + ("HIP", "HIP", "HIP", 4) ) enum_texture_limit = ( @@ -1266,12 +1267,16 @@ class CyclesPreferences(bpy.types.AddonPreferences): def get_device_types(self, context): import _cycles - has_cuda, has_optix = _cycles.get_device_types() + has_cuda, has_optix, has_hip = _cycles.get_device_types() + list = [('NONE', "None", "Don't use compute device", 0)] if has_cuda: list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1)) if has_optix: list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3)) + if has_hip: + list.append(('HIP', "HIP", "Use HIP for GPU acceleration", 4)) + return list compute_device_type: EnumProperty( @@ -1296,7 +1301,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): def update_device_entries(self, device_list): for device in device_list: - if not device[1] in {'CUDA', 'OPTIX', 'CPU'}: + if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP'}: continue # Try to find existing Device entry entry = self.find_existing_device_entry(device) @@ -1330,7 +1335,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): elif entry.type == 'CPU': cpu_devices.append(entry) # Extend all GPU devices with CPU. - if compute_device_type != 'CPU': + if compute_device_type != 'CPU' and compute_device_type != 'HIP': devices.extend(cpu_devices) return devices @@ -1340,7 +1345,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): import _cycles # Ensure `self.devices` is not re-allocated when the second call to # get_devices_for_type is made, freeing items from the first list. - for device_type in ('CUDA', 'OPTIX', 'OPENCL'): + for device_type in ('CUDA', 'OPTIX', 'HIP'): self.update_device_entries(_cycles.available_devices(device_type)) # Deprecated: use refresh_devices instead. diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index ac96ddf5e10..c4a1844480c 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -99,6 +99,11 @@ def use_cuda(context): return (get_device_type(context) == 'CUDA' and cscene.device == 'GPU') +def use_hip(context): + cscene = context.scene.cycles + + return (get_device_type(context) == 'HIP' and cscene.device == 'GPU') + def use_optix(context): cscene = context.scene.cycles diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp index ce1770f18a3..7bed33855c2 100644 --- a/intern/cycles/blender/blender_device.cpp +++ b/intern/cycles/blender/blender_device.cpp @@ -26,6 +26,7 @@ enum ComputeDevice { COMPUTE_DEVICE_CPU = 0, COMPUTE_DEVICE_CUDA = 1, COMPUTE_DEVICE_OPTIX = 3, + COMPUTE_DEVICE_HIP = 4, COMPUTE_DEVICE_NUM }; @@ -81,6 +82,9 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen else if (compute_device == COMPUTE_DEVICE_OPTIX) { mask |= DEVICE_MASK_OPTIX; } + else if (compute_device == COMPUTE_DEVICE_HIP) { + mask |= DEVICE_MASK_HIP; + } vector<DeviceInfo> devices = Device::available_devices(mask); /* Match device preferences and available devices. */ diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index 694d8454422..d681517c9e1 100644 --- a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -911,14 +911,16 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args* static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/) { vector<DeviceType> device_types = Device::available_types(); - bool has_cuda = false, has_optix = false; + bool has_cuda = false, has_optix = false, has_hip = false; foreach (DeviceType device_type, device_types) { has_cuda |= (device_type == DEVICE_CUDA); has_optix |= (device_type == DEVICE_OPTIX); + has_hip |= (device_type == DEVICE_HIP); } - PyObject *list = PyTuple_New(2); + PyObject *list = PyTuple_New(3); PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda)); PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix)); + PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_hip)); return list; } @@ -944,6 +946,9 @@ static PyObject *set_device_override_func(PyObject * /*self*/, PyObject *arg) else if (override == "OPTIX") { BlenderSession::device_override = DEVICE_MASK_OPTIX; } + else if (override == "HIP") { + BlenderSession::device_override = DEVICE_MASK_HIP; + } else { printf("\nError: %s is not a valid Cycles device.\n", override.c_str()); Py_RETURN_FALSE; diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake index da259171844..5653f51ec05 100644 --- a/intern/cycles/cmake/external_libs.cmake +++ b/intern/cycles/cmake/external_libs.cmake @@ -531,5 +531,9 @@ if(WITH_CYCLES_CUDA_BINARIES OR NOT WITH_CUDA_DYNLOAD) endif() endif() endif() +if(NOT WITH_HIP_DYNLOAD) + message(STATUS "Setting up HIP Dynamic Load") + set(WITH_HIP_DYNLOAD ON) +endif() unset(_cycles_lib_dir) diff --git a/intern/cycles/cmake/macros.cmake b/intern/cycles/cmake/macros.cmake index 47196dfd1ce..172ae280cea 100644 --- a/intern/cycles/cmake/macros.cmake +++ b/intern/cycles/cmake/macros.cmake @@ -162,6 +162,10 @@ macro(cycles_target_link_libraries target) target_link_libraries(${target} ${CUDA_CUDA_LIBRARY}) endif() + if(WITH_HIP_DYNLOAD) + target_link_libraries(${target} extern_hipew) + endif() + if(CYCLES_STANDALONE_REPOSITORY) target_link_libraries(${target} extern_numaapi) else() diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index d18f4360aef..9af68550d17 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -34,6 +34,13 @@ else() add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}") endif() +if(WITH_HIP_DYNLOAD) + list(APPEND INC + ../../../extern/hipew/include + ) + add_definitions(-DWITH_HIP_DYNLOAD) +endif() + set(SRC device.cpp device_denoise.cpp @@ -70,6 +77,21 @@ set(SRC_CUDA cuda/util.h ) +set(SRC_HIP + hip/device.cpp + hip/device.h + hip/device_impl.cpp + hip/device_impl.h + hip/graphics_interop.cpp + hip/graphics_interop.h + hip/kernel.cpp + hip/kernel.h + hip/queue.cpp + hip/queue.h + hip/util.cpp + hip/util.h +) + set(SRC_DUMMY dummy/device.cpp dummy/device.h @@ -115,11 +137,20 @@ else() ) endif() +if(WITH_HIP_DYNLOAD) + list(APPEND LIB + extern_hipew + ) +endif() + add_definitions(${GL_DEFINITIONS}) if(WITH_CYCLES_DEVICE_CUDA) add_definitions(-DWITH_CUDA) endif() +if(WITH_CYCLES_DEVICE_HIP) + add_definitions(-DWITH_HIP) +endif() if(WITH_CYCLES_DEVICE_OPTIX) add_definitions(-DWITH_OPTIX) endif() @@ -140,6 +171,7 @@ cycles_add_library(cycles_device "${LIB}" ${SRC} ${SRC_CPU} ${SRC_CUDA} + ${SRC_HIP} ${SRC_DUMMY} ${SRC_MULTI} ${SRC_OPTIX} diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 70935598437..81574e8b184 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -25,6 +25,7 @@ #include "device/cpu/device.h" #include "device/cuda/device.h" #include "device/dummy/device.h" +#include "device/hip/device.h" #include "device/multi/device.h" #include "device/optix/device.h" @@ -46,6 +47,7 @@ thread_mutex Device::device_mutex; vector<DeviceInfo> Device::cuda_devices; vector<DeviceInfo> Device::optix_devices; vector<DeviceInfo> Device::cpu_devices; +vector<DeviceInfo> Device::hip_devices; uint Device::devices_initialized_mask = 0; /* Device */ @@ -96,6 +98,14 @@ Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler) device = device_optix_create(info, stats, profiler); break; #endif + +#ifdef WITH_HIP + case DEVICE_HIP: + if (device_hip_init()) + device = device_hip_create(info, stats, profiler); + break; +#endif + default: break; } @@ -117,6 +127,8 @@ DeviceType Device::type_from_string(const char *name) return DEVICE_OPTIX; else if (strcmp(name, "MULTI") == 0) return DEVICE_MULTI; + else if (strcmp(name, "HIP") == 0) + return DEVICE_HIP; return DEVICE_NONE; } @@ -131,6 +143,8 @@ string Device::string_from_type(DeviceType type) return "OPTIX"; else if (type == DEVICE_MULTI) return "MULTI"; + else if (type == DEVICE_HIP) + return "HIP"; return ""; } @@ -145,6 +159,10 @@ vector<DeviceType> Device::available_types() #ifdef WITH_OPTIX types.push_back(DEVICE_OPTIX); #endif +#ifdef WITH_HIP + types.push_back(DEVICE_HIP); +#endif + return types; } @@ -186,6 +204,20 @@ vector<DeviceInfo> Device::available_devices(uint mask) } #endif +#ifdef WITH_HIP + if (mask & DEVICE_MASK_HIP) { + if (!(devices_initialized_mask & DEVICE_MASK_HIP)) { + if (device_hip_init()) { + device_hip_info(hip_devices); + } + devices_initialized_mask |= DEVICE_MASK_HIP; + } + foreach (DeviceInfo &info, hip_devices) { + devices.push_back(info); + } + } +#endif + if (mask & DEVICE_MASK_CPU) { if (!(devices_initialized_mask & DEVICE_MASK_CPU)) { device_cpu_info(cpu_devices); @@ -226,6 +258,15 @@ string Device::device_capabilities(uint mask) } #endif +#ifdef WITH_HIP + if (mask & DEVICE_MASK_HIP) { + if (device_hip_init()) { + capabilities += "\nHIP device capabilities:\n"; + capabilities += device_hip_capabilities(); + } + } +#endif + return capabilities; } @@ -314,6 +355,7 @@ void Device::free_memory() devices_initialized_mask = 0; cuda_devices.free_memory(); optix_devices.free_memory(); + hip_devices.free_memory(); cpu_devices.free_memory(); } diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 3bbad179f52..c73d74cdccc 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -51,6 +51,7 @@ enum DeviceType { DEVICE_CUDA, DEVICE_MULTI, DEVICE_OPTIX, + DEVICE_HIP, DEVICE_DUMMY, }; @@ -58,6 +59,7 @@ enum DeviceTypeMask { DEVICE_MASK_CPU = (1 << DEVICE_CPU), DEVICE_MASK_CUDA = (1 << DEVICE_CUDA), DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX), + DEVICE_MASK_HIP = (1 << DEVICE_HIP), DEVICE_MASK_ALL = ~0 }; @@ -284,6 +286,7 @@ class Device { static vector<DeviceInfo> cuda_devices; static vector<DeviceInfo> optix_devices; static vector<DeviceInfo> cpu_devices; + static vector<DeviceInfo> hip_devices; static uint devices_initialized_mask; }; diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index a854cc9b693..be6123e09b2 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -277,6 +277,7 @@ class device_memory { protected: friend class CUDADevice; friend class OptiXDevice; + friend class HIPDevice; /* Only create through subclasses. */ device_memory(Device *device, const char *name, MemoryType type); diff --git a/intern/cycles/device/hip/device.cpp b/intern/cycles/device/hip/device.cpp new file mode 100644 index 00000000000..90028ac7f10 --- /dev/null +++ b/intern/cycles/device/hip/device.cpp @@ -0,0 +1,276 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/hip/device.h" + +#include "util/util_logging.h" + +#ifdef WITH_HIP +# include "device/device.h" +# include "device/hip/device_impl.h" + +# include "util/util_string.h" +# include "util/util_windows.h" +#endif /* WITH_HIP */ + +CCL_NAMESPACE_BEGIN + +bool device_hip_init() +{ +#if !defined(WITH_HIP) + return false; +#elif defined(WITH_HIP_DYNLOAD) + static bool initialized = false; + static bool result = false; + + if (initialized) + return result; + + initialized = true; + int hipew_result = hipewInit(HIPEW_INIT_HIP); + if (hipew_result == HIPEW_SUCCESS) { + VLOG(1) << "HIPEW initialization succeeded"; + if (HIPDevice::have_precompiled_kernels()) { + VLOG(1) << "Found precompiled kernels"; + result = true; + } + else if (hipewCompilerPath() != NULL) { + VLOG(1) << "Found HIPCC " << hipewCompilerPath(); + result = true; + } + else { + VLOG(1) << "Neither precompiled kernels nor HIPCC was found," + << " unable to use HIP"; + } + } + else { + VLOG(1) << "HIPEW initialization failed: " + << ((hipew_result == HIPEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" : + "Error opening the library"); + } + + return result; +#else /* WITH_HIP_DYNLOAD */ + return true; +#endif /* WITH_HIP_DYNLOAD */ +} + +Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ +#ifdef WITH_HIP + return new HIPDevice(info, stats, profiler); +#else + (void)info; + (void)stats; + (void)profiler; + + LOG(FATAL) << "Request to create HIP device without compiled-in support. Should never happen."; + + return nullptr; +#endif +} + +#ifdef WITH_HIP +static hipError_t device_hip_safe_init() +{ +# ifdef _WIN32 + __try { + return hipInit(0); + } + __except (EXCEPTION_EXECUTE_HANDLER) { + /* Ignore crashes inside the HIP driver and hope we can + * survive even with corrupted HIP installs. */ + fprintf(stderr, "Cycles HIP: driver crashed, continuing without HIP.\n"); + } + + return hipErrorNoDevice; +# else + return hipInit(0); +# endif +} +#endif /* WITH_HIP */ + +void device_hip_info(vector<DeviceInfo> &devices) +{ +#ifdef WITH_HIP + hipError_t result = device_hip_safe_init(); + if (result != hipSuccess) { + if (result != hipErrorNoDevice) + fprintf(stderr, "HIP hipInit: %s\n", hipewErrorString(result)); + return; + } + + int count = 0; + result = hipGetDeviceCount(&count); + if (result != hipSuccess) { + fprintf(stderr, "HIP hipGetDeviceCount: %s\n", hipewErrorString(result)); + return; + } + + vector<DeviceInfo> display_devices; + + for (int num = 0; num < count; num++) { + char name[256]; + + result = hipDeviceGetName(name, 256, num); + if (result != hipSuccess) { + fprintf(stderr, "HIP :hipDeviceGetName: %s\n", hipewErrorString(result)); + continue; + } + + int major; + hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, num); + // TODO : (Arya) What is the last major version we are supporting? + + DeviceInfo info; + + info.type = DEVICE_HIP; + info.description = string(name); + info.num = num; + + info.has_half_images = (major >= 3); + info.has_nanovdb = true; + info.denoisers = 0; + + info.has_gpu_queue = true; + /* Check if the device has P2P access to any other device in the system. */ + for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) { + if (num != peer_num) { + int can_access = 0; + hipDeviceCanAccessPeer(&can_access, num, peer_num); + info.has_peer_memory = (can_access != 0); + } + } + + int pci_location[3] = {0, 0, 0}; + hipDeviceGetAttribute(&pci_location[0], hipDeviceAttributePciDomainID, num); + hipDeviceGetAttribute(&pci_location[1], hipDeviceAttributePciBusId, num); + hipDeviceGetAttribute(&pci_location[2], hipDeviceAttributePciDeviceId, num); + info.id = string_printf("HIP_%s_%04x:%02x:%02x", + name, + (unsigned int)pci_location[0], + (unsigned int)pci_location[1], + (unsigned int)pci_location[2]); + + /* If device has a kernel timeout and no compute preemption, we assume + * it is connected to a display and will freeze the display while doing + * computations. */ + int timeout_attr = 0, preempt_attr = 0; + hipDeviceGetAttribute(&timeout_attr, hipDeviceAttributeKernelExecTimeout, num); + + if (timeout_attr && !preempt_attr) { + VLOG(1) << "Device is recognized as display."; + info.description += " (Display)"; + info.display_device = true; + display_devices.push_back(info); + } + else { + VLOG(1) << "Device has compute preemption or is not used for display."; + devices.push_back(info); + } + VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\"."; + } + + if (!display_devices.empty()) + devices.insert(devices.end(), display_devices.begin(), display_devices.end()); +#else /* WITH_HIP */ + (void)devices; +#endif /* WITH_HIP */ +} + +string device_hip_capabilities() +{ +#ifdef WITH_HIP + hipError_t result = device_hip_safe_init(); + if (result != hipSuccess) { + if (result != hipErrorNoDevice) { + return string("Error initializing HIP: ") + hipewErrorString(result); + } + return "No HIP device found\n"; + } + + int count; + result = hipGetDeviceCount(&count); + if (result != hipSuccess) { + return string("Error getting devices: ") + hipewErrorString(result); + } + + string capabilities = ""; + for (int num = 0; num < count; num++) { + char name[256]; + if (hipDeviceGetName(name, 256, num) != hipSuccess) { + continue; + } + capabilities += string("\t") + name + "\n"; + int value; +# define GET_ATTR(attr) \ + { \ + if (hipDeviceGetAttribute(&value, hipDeviceAttribute##attr, num) == hipSuccess) { \ + capabilities += string_printf("\t\thipDeviceAttribute" #attr "\t\t\t%d\n", value); \ + } \ + } \ + (void)0 + /* TODO(sergey): Strip all attributes which are not useful for us + * or does not depend on the driver. + */ + GET_ATTR(MaxThreadsPerBlock); + GET_ATTR(MaxBlockDimX); + GET_ATTR(MaxBlockDimY); + GET_ATTR(MaxBlockDimZ); + GET_ATTR(MaxGridDimX); + GET_ATTR(MaxGridDimY); + GET_ATTR(MaxGridDimZ); + GET_ATTR(MaxSharedMemoryPerBlock); + GET_ATTR(TotalConstantMemory); + GET_ATTR(WarpSize); + GET_ATTR(MaxPitch); + GET_ATTR(MaxRegistersPerBlock); + GET_ATTR(ClockRate); + GET_ATTR(TextureAlignment); + GET_ATTR(MultiprocessorCount); + GET_ATTR(KernelExecTimeout); + GET_ATTR(Integrated); + GET_ATTR(CanMapHostMemory); + GET_ATTR(ComputeMode); + GET_ATTR(MaxTexture1DWidth); + GET_ATTR(MaxTexture2DWidth); + GET_ATTR(MaxTexture2DHeight); + GET_ATTR(MaxTexture3DWidth); + GET_ATTR(MaxTexture3DHeight); + GET_ATTR(MaxTexture3DDepth); + GET_ATTR(ConcurrentKernels); + GET_ATTR(EccEnabled); + GET_ATTR(MemoryClockRate); + GET_ATTR(MemoryBusWidth); + GET_ATTR(L2CacheSize); + GET_ATTR(MaxThreadsPerMultiProcessor); + GET_ATTR(ComputeCapabilityMajor); + GET_ATTR(ComputeCapabilityMinor); + GET_ATTR(MaxSharedMemoryPerMultiprocessor); + GET_ATTR(ManagedMemory); + GET_ATTR(IsMultiGpuBoard); +# undef GET_ATTR + capabilities += "\n"; + } + + return capabilities; + +#else /* WITH_HIP */ + return ""; +#endif /* WITH_HIP */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/hip/device.h b/intern/cycles/device/hip/device.h new file mode 100644 index 00000000000..76fa8995bed --- /dev/null +++ b/intern/cycles/device/hip/device.h @@ -0,0 +1,37 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_string.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +bool device_hip_init(); + +Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +void device_hip_info(vector<DeviceInfo> &devices); + +string device_hip_capabilities(); + +CCL_NAMESPACE_END
\ No newline at end of file diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp new file mode 100644 index 00000000000..0e5ac6ce401 --- /dev/null +++ b/intern/cycles/device/hip/device_impl.cpp @@ -0,0 +1,1343 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_HIP + +# include <climits> +# include <limits.h> +# include <stdio.h> +# include <stdlib.h> +# include <string.h> + +# include "device/hip/device_impl.h" + +# include "render/buffers.h" + +# include "util/util_debug.h" +# include "util/util_foreach.h" +# include "util/util_logging.h" +# include "util/util_map.h" +# include "util/util_md5.h" +# include "util/util_opengl.h" +# include "util/util_path.h" +# include "util/util_string.h" +# include "util/util_system.h" +# include "util/util_time.h" +# include "util/util_types.h" +# include "util/util_windows.h" + +CCL_NAMESPACE_BEGIN + +class HIPDevice; + +bool HIPDevice::have_precompiled_kernels() +{ + string fatbins_path = path_get("lib"); + return path_exists(fatbins_path); +} + +bool HIPDevice::show_samples() const +{ + /* The HIPDevice only processes one tile at a time, so showing samples is fine. */ + return true; +} + +BVHLayoutMask HIPDevice::get_bvh_layout_mask() const +{ + return BVH_LAYOUT_BVH2; +} + +void HIPDevice::set_error(const string &error) +{ + Device::set_error(error); + + if (first_error) { + fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); + fprintf(stderr, + "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n"); + first_error = false; + } +} + +HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) + : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL) +{ + first_error = true; + + hipDevId = info.num; + hipDevice = 0; + hipContext = 0; + + hipModule = 0; + + need_texture_info = false; + + device_texture_headroom = 0; + device_working_headroom = 0; + move_texture_to_host = false; + map_host_limit = 0; + map_host_used = 0; + can_map_host = 0; + pitch_alignment = 0; + + /* Initialize HIP. */ + hipError_t result = hipInit(0); + if (result != hipSuccess) { + set_error(string_printf("Failed to initialize HIP runtime (%s)", hipewErrorString(result))); + return; + } + + /* Setup device and context. */ + result = hipGetDevice(&hipDevice, hipDevId); + if (result != hipSuccess) { + set_error(string_printf("Failed to get HIP device handle from ordinal (%s)", + hipewErrorString(result))); + return; + } + + hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice)); + + hip_assert( + hipDeviceGetAttribute(&pitch_alignment, hipDeviceAttributeTexturePitchAlignment, hipDevice)); + + unsigned int ctx_flags = hipDeviceLmemResizeToMax; + if (can_map_host) { + ctx_flags |= hipDeviceMapHost; + init_host_memory(); + } + + /* Create context. */ + result = hipCtxCreate(&hipContext, ctx_flags, hipDevice); + + if (result != hipSuccess) { + set_error(string_printf("Failed to create HIP context (%s)", hipewErrorString(result))); + return; + } + + int major, minor; + hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId); + hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId); + hipDevArchitecture = major * 100 + minor * 10; + + /* Pop context set by hipCtxCreate. */ + hipCtxPopCurrent(NULL); +} + +HIPDevice::~HIPDevice() +{ + texture_info.free(); + + hip_assert(hipCtxDestroy(hipContext)); +} + +bool HIPDevice::support_device(const uint /*kernel_features*/) +{ + int major, minor; + hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId); + hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId); + + // TODO : (Arya) What versions do we plan to support? + return true; +} + +bool HIPDevice::check_peer_access(Device *peer_device) +{ + if (peer_device == this) { + return false; + } + if (peer_device->info.type != DEVICE_HIP && peer_device->info.type != DEVICE_OPTIX) { + return false; + } + + HIPDevice *const peer_device_hip = static_cast<HIPDevice *>(peer_device); + + int can_access = 0; + hip_assert(hipDeviceCanAccessPeer(&can_access, hipDevice, peer_device_hip->hipDevice)); + if (can_access == 0) { + return false; + } + + // Ensure array access over the link is possible as well (for 3D textures) + hip_assert(hipDeviceGetP2PAttribute( + &can_access, hipDevP2PAttrHipArrayAccessSupported, hipDevice, peer_device_hip->hipDevice)); + if (can_access == 0) { + return false; + } + + // Enable peer access in both directions + { + const HIPContextScope scope(this); + hipError_t result = hipCtxEnablePeerAccess(peer_device_hip->hipContext, 0); + if (result != hipSuccess) { + set_error(string_printf("Failed to enable peer access on HIP context (%s)", + hipewErrorString(result))); + return false; + } + } + { + const HIPContextScope scope(peer_device_hip); + hipError_t result = hipCtxEnablePeerAccess(hipContext, 0); + if (result != hipSuccess) { + set_error(string_printf("Failed to enable peer access on HIP context (%s)", + hipewErrorString(result))); + return false; + } + } + + return true; +} + +bool HIPDevice::use_adaptive_compilation() +{ + return DebugFlags().hip.adaptive_compile; +} + +/* Common NVCC flags which stays the same regardless of shading model, + * kernel sources md5 and only depends on compiler or compilation settings. + */ +string HIPDevice::compile_kernel_get_common_cflags(const uint kernel_features) +{ + const int machine = system_cpu_bits(); + const string source_path = path_get("source"); + const string include_path = source_path; + string cflags = string_printf( + "-m%d " + "--ptxas-options=\"-v\" " + "--use_fast_math " + "-DHIPCC " + "-I\"%s\"", + machine, + include_path.c_str()); + if (use_adaptive_compilation()) { + cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features); + } + return cflags; +} + +string HIPDevice::compile_kernel(const uint kernel_features, + const char *name, + const char *base, + bool force_ptx) +{ + /* Compute kernel name. */ + int major, minor; + hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId); + hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId); + + /* Attempt to use kernel provided with Blender. */ + if (!use_adaptive_compilation()) { + if (!force_ptx) { + const string fatbin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor)); + VLOG(1) << "Testing for pre-compiled kernel " << fatbin << "."; + if (path_exists(fatbin)) { + VLOG(1) << "Using precompiled kernel."; + return fatbin; + } + } + + /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */ + int ptx_major = major, ptx_minor = minor; + while (ptx_major >= 3) { + const string ptx = path_get( + string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor)); + VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; + if (path_exists(ptx)) { + VLOG(1) << "Using precompiled kernel."; + return ptx; + } + + if (ptx_minor > 0) { + ptx_minor--; + } + else { + ptx_major--; + ptx_minor = 9; + } + } + } + + /* Try to use locally compiled kernel. */ + string source_path = path_get("source"); + const string source_md5 = path_files_md5_hash(source_path); + + /* We include cflags into md5 so changing hip toolkit or changing other + * compiler command line arguments makes sure fatbin gets re-built. + */ + string common_cflags = compile_kernel_get_common_cflags(kernel_features); + const string kernel_md5 = util_md5_string(source_md5 + common_cflags); + + const char *const kernel_ext = "genco"; +# ifdef _WIN32 + const char *const options = + "save-temps -Wno-parentheses-equality -Wno-unused-value --hipcc-func-supp"; +# else + const char *const options = + "save-temps -Wno-parentheses-equality -Wno-unused-value --hipcc-func-supp -O3 -ggdb"; +# endif + const string include_path = source_path; + const char *const kernel_arch = force_ptx ? "compute" : "sm"; + const string fatbin_file = string_printf( + "cycles_%s_%s_%d%d_%s", name, kernel_arch, major, minor, kernel_md5.c_str()); + const string fatbin = path_cache_get(path_join("kernels", fatbin_file)); + VLOG(1) << "Testing for locally compiled kernel " << fatbin << "."; + if (path_exists(fatbin)) { + VLOG(1) << "Using locally compiled kernel."; + return fatbin; + } + +# ifdef _WIN32 + if (!use_adaptive_compilation() && have_precompiled_kernels()) { + if (major < 3) { + set_error( + string_printf("HIP backend requires compute capability 3.0 or up, but found %d.%d. " + "Your GPU is not supported.", + major, + minor)); + } + else { + set_error( + string_printf("HIP binary kernel for this graphics card compute " + "capability (%d.%d) not found.", + major, + minor)); + } + return string(); + } +# endif + + /* Compile. */ + const char *const hipcc = hipewCompilerPath(); + if (hipcc == NULL) { + set_error( + "HIP hipcc compiler not found. " + "Install HIP toolkit in default location."); + return string(); + } + + const int hipcc_hip_version = hipewCompilerVersion(); + VLOG(1) << "Found hipcc " << hipcc << ", HIP version " << hipcc_hip_version << "."; + if (hipcc_hip_version < 40) { + printf( + "Unsupported HIP version %d.%d detected, " + "you need HIP 4.0 or newer.\n", + hipcc_hip_version / 10, + hipcc_hip_version % 10); + return string(); + } + + double starttime = time_dt(); + + path_create_directories(fatbin); + + source_path = path_join(path_join(source_path, "kernel"), + path_join("device", path_join(base, string_printf("%s.cpp", name)))); + + string command = string_printf("%s -%s -I %s --%s %s -o \"%s\"", + hipcc, + options, + include_path.c_str(), + kernel_ext, + source_path.c_str(), + fatbin.c_str()); + + printf("Compiling HIP kernel ...\n%s\n", command.c_str()); + +# ifdef _WIN32 + command = "call " + command; +# endif + if (system(command.c_str()) != 0) { + set_error( + "Failed to execute compilation command, " + "see console for details."); + return string(); + } + + /* Verify if compilation succeeded */ + if (!path_exists(fatbin)) { + set_error( + "HIP kernel compilation failed, " + "see console for details."); + return string(); + } + + printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); + + return fatbin; +} + +bool HIPDevice::load_kernels(const uint kernel_features) +{ + /* TODO(sergey): Support kernels re-load for HIP devices. + * + * Currently re-loading kernel will invalidate memory pointers, + * causing problems in hipCtxSynchronize. + */ + if (hipModule) { + VLOG(1) << "Skipping kernel reload, not currently supported."; + return true; + } + + /* check if hip init succeeded */ + if (hipContext == 0) + return false; + + /* check if GPU is supported */ + if (!support_device(kernel_features)) + return false; + + /* get kernel */ + const char *kernel_name = "kernel"; + string fatbin = compile_kernel(kernel_features, kernel_name); + if (fatbin.empty()) + return false; + + /* open module */ + HIPContextScope scope(this); + + string fatbin_data; + hipError_t result; + + if (path_read_text(fatbin, fatbin_data)) + result = hipModuleLoadData(&hipModule, fatbin_data.c_str()); + else + result = hipErrorFileNotFound; + + if (result != hipSuccess) + set_error(string_printf( + "Failed to load HIP kernel from '%s' (%s)", fatbin.c_str(), hipewErrorString(result))); + + if (result == hipSuccess) { + kernels.load(this); + reserve_local_memory(kernel_features); + } + + return (result == hipSuccess); +} + +void HIPDevice::reserve_local_memory(const uint) +{ + /* Together with hipDeviceLmemResizeToMax, this reserves local memory + * needed for kernel launches, so that we can reliably figure out when + * to allocate scene data in mapped host memory. */ + size_t total = 0, free_before = 0, free_after = 0; + + { + HIPContextScope scope(this); + hipMemGetInfo(&free_before, &total); + } + + { + /* Use the biggest kernel for estimation. */ + const DeviceKernel test_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE; + + /* Launch kernel, using just 1 block appears sufficient to reserve memory for all + * multiprocessors. It would be good to do this in parallel for the multi GPU case + * still to make it faster. */ + HIPDeviceQueue queue(this); + + void *d_path_index = nullptr; + void *d_render_buffer = nullptr; + int d_work_size = 0; + void *args[] = {&d_path_index, &d_render_buffer, &d_work_size}; + + queue.init_execution(); + queue.enqueue(test_kernel, 1, args); + queue.synchronize(); + } + + { + HIPContextScope scope(this); + hipMemGetInfo(&free_after, &total); + } + + VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after) + << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; + +# if 0 + /* For testing mapped host memory, fill up device memory. */ + const size_t keep_mb = 1024; + + while (free_after > keep_mb * 1024 * 1024LL) { + hipDeviceptr_t tmp; + hip_assert(hipMalloc(&tmp, 10 * 1024 * 1024LL)); + hipMemGetInfo(&free_after, &total); + } +# endif +} + +void HIPDevice::init_host_memory() +{ + /* Limit amount of host mapped memory, because allocating too much can + * cause system instability. Leave at least half or 4 GB of system + * memory free, whichever is smaller. */ + size_t default_limit = 4 * 1024 * 1024 * 1024LL; + size_t system_ram = system_physical_ram(); + + if (system_ram > 0) { + if (system_ram / 2 > default_limit) { + map_host_limit = system_ram - default_limit; + } + else { + map_host_limit = system_ram / 2; + } + } + else { + VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; + map_host_limit = 0; + } + + /* Amount of device memory to keep is free after texture memory + * and working memory allocations respectively. We set the working + * memory limit headroom lower so that some space is left after all + * texture memory allocations. */ + device_working_headroom = 32 * 1024 * 1024LL; // 32MB + device_texture_headroom = 128 * 1024 * 1024LL; // 128MB + + VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) + << " bytes. (" << string_human_readable_size(map_host_limit) << ")"; +} + +void HIPDevice::load_texture_info() +{ + if (need_texture_info) { + /* Unset flag before copying, so this does not loop indefinitely if the copy below calls + * into 'move_textures_to_host' (which calls 'load_texture_info' again). */ + need_texture_info = false; + texture_info.copy_to_device(); + } +} + +void HIPDevice::move_textures_to_host(size_t size, bool for_texture) +{ + /* Break out of recursive call, which can happen when moving memory on a multi device. */ + static bool any_device_moving_textures_to_host = false; + if (any_device_moving_textures_to_host) { + return; + } + + /* Signal to reallocate textures in host memory only. */ + move_texture_to_host = true; + + while (size > 0) { + /* Find suitable memory allocation to move. */ + device_memory *max_mem = NULL; + size_t max_size = 0; + bool max_is_image = false; + + thread_scoped_lock lock(hip_mem_map_mutex); + foreach (HIPMemMap::value_type &pair, hip_mem_map) { + device_memory &mem = *pair.first; + HIPMem *cmem = &pair.second; + + /* Can only move textures allocated on this device (and not those from peer devices). + * And need to ignore memory that is already on the host. */ + if (!mem.is_resident(this) || cmem->use_mapped_host) { + continue; + } + + bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && + (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + /* Can't move this type of memory. */ + if (!is_texture || cmem->array) { + continue; + } + + /* For other textures, only move image textures. */ + if (for_texture && !is_image) { + continue; + } + + /* Try to move largest allocation, prefer moving images. */ + if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { + max_is_image = is_image; + max_size = mem.device_size; + max_mem = &mem; + } + } + lock.unlock(); + + /* Move to host memory. This part is mutex protected since + * multiple HIP devices could be moving the memory. The + * first one will do it, and the rest will adopt the pointer. */ + if (max_mem) { + VLOG(1) << "Move memory from device to host: " << max_mem->name; + + static thread_mutex move_mutex; + thread_scoped_lock lock(move_mutex); + + any_device_moving_textures_to_host = true; + + /* Potentially need to call back into multi device, so pointer mapping + * and peer devices are updated. This is also necessary since the device + * pointer may just be a key here, so cannot be accessed and freed directly. + * Unfortunately it does mean that memory is reallocated on all other + * devices as well, which is potentially dangerous when still in use (since + * a thread rendering on another devices would only be caught in this mutex + * if it so happens to do an allocation at the same time as well. */ + max_mem->device_copy_to(); + size = (max_size >= size) ? 0 : size - max_size; + + any_device_moving_textures_to_host = false; + } + else { + break; + } + } + + /* Unset flag before texture info is reloaded, since it should stay in device memory. */ + move_texture_to_host = false; + + /* Update texture info array with new pointers. */ + load_texture_info(); +} + +HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_padding) +{ + HIPContextScope scope(this); + + hipDeviceptr_t device_pointer = 0; + size_t size = mem.memory_size() + pitch_padding; + + hipError_t mem_alloc_result = hipErrorOutOfMemory; + const char *status = ""; + + /* First try allocating in device memory, respecting headroom. We make + * an exception for texture info. It is small and frequently accessed, + * so treat it as working memory. + * + * If there is not enough room for working memory, we will try to move + * textures to host memory, assuming the performance impact would have + * been worse for working memory. */ + bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; + + size_t total = 0, free = 0; + hipMemGetInfo(&free, &total); + + /* Move textures to host memory if needed. */ + if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) { + move_textures_to_host(size + headroom - free, is_texture); + hipMemGetInfo(&free, &total); + } + + /* Allocate in device memory. */ + if (!move_texture_to_host && (size + headroom) < free) { + mem_alloc_result = hipMalloc(&device_pointer, size); + if (mem_alloc_result == hipSuccess) { + status = " in device memory"; + } + } + + /* Fall back to mapped host memory if needed and possible. */ + + void *shared_pointer = 0; + + if (mem_alloc_result != hipSuccess && can_map_host) { + if (mem.shared_pointer) { + /* Another device already allocated host memory. */ + mem_alloc_result = hipSuccess; + shared_pointer = mem.shared_pointer; + } + else if (map_host_used + size < map_host_limit) { + /* Allocate host memory ourselves. */ + mem_alloc_result = hipHostMalloc(&shared_pointer, size); + + assert((mem_alloc_result == hipSuccess && shared_pointer != 0) || + (mem_alloc_result != hipSuccess && shared_pointer == 0)); + } + + if (mem_alloc_result == hipSuccess) { + hip_assert(hipHostGetDevicePointer(&device_pointer, shared_pointer, 0)); + map_host_used += size; + status = " in host memory"; + } + } + + if (mem_alloc_result != hipSuccess) { + status = " failed, out of device and host memory"; + set_error("System is out of GPU and shared host memory"); + } + + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")" << status; + } + + mem.device_pointer = (device_ptr)device_pointer; + mem.device_size = size; + stats.mem_alloc(size); + + if (!mem.device_pointer) { + return NULL; + } + + /* Insert into map of allocations. */ + thread_scoped_lock lock(hip_mem_map_mutex); + HIPMem *cmem = &hip_mem_map[&mem]; + if (shared_pointer != 0) { + /* Replace host pointer with our host allocation. Only works if + * HIP memory layout is the same and has no pitch padding. Also + * does not work if we move textures to host during a render, + * since other devices might be using the memory. */ + + if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && + mem.host_pointer != shared_pointer) { + memcpy(shared_pointer, mem.host_pointer, size); + + /* A Call to device_memory::host_free() should be preceded by + * a call to device_memory::device_free() for host memory + * allocated by a device to be handled properly. Two exceptions + * are here and a call in OptiXDevice::generic_alloc(), where + * the current host memory can be assumed to be allocated by + * device_memory::host_alloc(), not by a device */ + + mem.host_free(); + mem.host_pointer = shared_pointer; + } + mem.shared_pointer = shared_pointer; + mem.shared_counter++; + cmem->use_mapped_host = true; + } + else { + cmem->use_mapped_host = false; + } + + return cmem; +} + +void HIPDevice::generic_copy_to(device_memory &mem) +{ + if (!mem.host_pointer || !mem.device_pointer) { + return; + } + + /* If use_mapped_host of mem is false, the current device only uses device memory allocated by + * hipMalloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from + * mem.host_pointer. */ + thread_scoped_lock lock(hip_mem_map_mutex); + if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { + const HIPContextScope scope(this); + hip_assert( + hipMemcpyHtoD((hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size())); + } +} + +void HIPDevice::generic_free(device_memory &mem) +{ + if (mem.device_pointer) { + HIPContextScope scope(this); + thread_scoped_lock lock(hip_mem_map_mutex); + const HIPMem &cmem = hip_mem_map[&mem]; + + /* If cmem.use_mapped_host is true, reference counting is used + * to safely free a mapped host memory. */ + + if (cmem.use_mapped_host) { + assert(mem.shared_pointer); + if (mem.shared_pointer) { + assert(mem.shared_counter > 0); + if (--mem.shared_counter == 0) { + if (mem.host_pointer == mem.shared_pointer) { + mem.host_pointer = 0; + } + hipHostFree(mem.shared_pointer); + mem.shared_pointer = 0; + } + } + map_host_used -= mem.device_size; + } + else { + /* Free device memory. */ + hip_assert(hipFree(mem.device_pointer)); + } + + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + hip_mem_map.erase(hip_mem_map.find(&mem)); + } +} + +void HIPDevice::mem_alloc(device_memory &mem) +{ + if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else if (mem.type == MEM_GLOBAL) { + assert(!"mem_alloc not supported for global memory."); + } + else { + generic_alloc(mem); + } +} + +void HIPDevice::mem_copy_to(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + global_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + tex_alloc((device_texture &)mem); + } + else { + if (!mem.device_pointer) { + generic_alloc(mem); + } + generic_copy_to(mem); + } +} + +void HIPDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) +{ + if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) { + assert(!"mem_copy_from not supported for textures."); + } + else if (mem.host_pointer) { + const size_t size = elem * w * h; + const size_t offset = elem * y * w; + + if (mem.device_pointer) { + const HIPContextScope scope(this); + hip_assert(hipMemcpyDtoH( + (char *)mem.host_pointer + offset, (hipDeviceptr_t)mem.device_pointer + offset, size)); + } + else { + memset((char *)mem.host_pointer + offset, 0, size); + } + } +} + +void HIPDevice::mem_zero(device_memory &mem) +{ + if (!mem.device_pointer) { + mem_alloc(mem); + } + if (!mem.device_pointer) { + return; + } + + /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory + * regardless of mem.host_pointer and mem.shared_pointer. */ + thread_scoped_lock lock(hip_mem_map_mutex); + if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { + const HIPContextScope scope(this); + hip_assert(hipMemsetD8((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size())); + } + else if (mem.host_pointer) { + memset(mem.host_pointer, 0, mem.memory_size()); + } +} + +void HIPDevice::mem_free(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + } + else { + generic_free(mem); + } +} + +device_ptr HIPDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) +{ + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); +} + +void HIPDevice::const_copy_to(const char *name, void *host, size_t size) +{ + HIPContextScope scope(this); + hipDeviceptr_t mem; + size_t bytes; + + hip_assert(hipModuleGetGlobal(&mem, &bytes, hipModule, name)); + assert(bytes == size); + hip_assert(hipMemcpyHtoD(mem, host, size)); +} + +void HIPDevice::global_alloc(device_memory &mem) +{ + if (mem.is_resident(this)) { + generic_alloc(mem); + generic_copy_to(mem); + } + + const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer)); +} + +void HIPDevice::global_free(device_memory &mem) +{ + if (mem.is_resident(this) && mem.device_pointer) { + generic_free(mem); + } +} + +void HIPDevice::tex_alloc(device_texture &mem) +{ + HIPContextScope scope(this); + + /* General variables for both architectures */ + string bind_name = mem.name; + size_t dsize = datatype_size(mem.data_type); + size_t size = mem.memory_size(); + + hipTextureAddressMode address_mode = hipAddressModeWrap; + switch (mem.info.extension) { + case EXTENSION_REPEAT: + address_mode = hipAddressModeWrap; + break; + case EXTENSION_EXTEND: + address_mode = hipAddressModeClamp; + break; + case EXTENSION_CLIP: + // TODO : (Arya) setting this to Mode Clamp instead of Mode Border because it's unsupported + // in hip + address_mode = hipAddressModeClamp; + break; + default: + assert(0); + break; + } + + hipTextureFilterMode filter_mode; + if (mem.info.interpolation == INTERPOLATION_CLOSEST) { + filter_mode = hipFilterModePoint; + } + else { + filter_mode = hipFilterModeLinear; + } + + /* Image Texture Storage */ + hipArray_Format format; + switch (mem.data_type) { + case TYPE_UCHAR: + format = HIP_AD_FORMAT_UNSIGNED_INT8; + break; + case TYPE_UINT16: + format = HIP_AD_FORMAT_UNSIGNED_INT16; + break; + case TYPE_UINT: + format = HIP_AD_FORMAT_UNSIGNED_INT32; + break; + case TYPE_INT: + format = HIP_AD_FORMAT_SIGNED_INT32; + break; + case TYPE_FLOAT: + format = HIP_AD_FORMAT_FLOAT; + break; + case TYPE_HALF: + format = HIP_AD_FORMAT_HALF; + break; + default: + assert(0); + return; + } + + HIPMem *cmem = NULL; + hArray array_3d = NULL; + size_t src_pitch = mem.data_width * dsize * mem.data_elements; + size_t dst_pitch = src_pitch; + + if (!mem.is_resident(this)) { + thread_scoped_lock lock(hip_mem_map_mutex); + cmem = &hip_mem_map[&mem]; + cmem->texobject = 0; + + if (mem.data_depth > 1) { + array_3d = (hArray)mem.device_pointer; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + dst_pitch = align_up(src_pitch, pitch_alignment); + } + } + else if (mem.data_depth > 1) { + /* 3D texture using array, there is no API for linear memory. */ + HIP_ARRAY3D_DESCRIPTOR desc; + + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Depth = mem.data_depth; + desc.Format = format; + desc.NumChannels = mem.data_elements; + desc.Flags = 0; + + VLOG(1) << "Array 3D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + hip_assert(hipArray3DCreate(&array_3d, &desc)); + + if (!array_3d) { + return; + } + + HIP_MEMCPY3D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = hipMemoryTypeArray; + param.dstArray = &array_3d; + param.srcMemoryType = hipMemoryTypeHost; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + param.Depth = mem.data_depth; + + hip_assert(hipDrvMemcpy3D(¶m)); + + mem.device_pointer = (device_ptr)array_3d; + mem.device_size = size; + stats.mem_alloc(size); + + thread_scoped_lock lock(hip_mem_map_mutex); + cmem = &hip_mem_map[&mem]; + cmem->texobject = 0; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + /* 2D texture, using pitch aligned linear memory. */ + dst_pitch = align_up(src_pitch, pitch_alignment); + size_t dst_size = dst_pitch * mem.data_height; + + cmem = generic_alloc(mem, dst_size - mem.memory_size()); + if (!cmem) { + return; + } + + hip_Memcpy2D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = hipMemoryTypeDevice; + param.dstDevice = mem.device_pointer; + param.dstPitch = dst_pitch; + param.srcMemoryType = hipMemoryTypeHost; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + + hip_assert(hipDrvMemcpy2DUnaligned(¶m)); + } + else { + /* 1D texture, using linear memory. */ + cmem = generic_alloc(mem); + if (!cmem) { + return; + } + + hip_assert(hipMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); + } + + /* Resize once */ + const uint slot = mem.slot; + if (slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. */ + texture_info.resize(slot + 128); + } + + /* Set Mapping and tag that we need to (re-)upload to device */ + texture_info[slot] = mem.info; + need_texture_info = true; + + if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && + mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { + /* Kepler+, bindless textures. */ + hipResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + + if (array_3d) { + resDesc.resType = hipResourceTypeArray; + resDesc.res.array.h_Array = &array_3d; + resDesc.flags = 0; + } + else if (mem.data_height > 0) { + resDesc.resType = hipResourceTypePitch2D; + resDesc.res.pitch2D.devPtr = mem.device_pointer; + resDesc.res.pitch2D.format = format; + resDesc.res.pitch2D.numChannels = mem.data_elements; + resDesc.res.pitch2D.height = mem.data_height; + resDesc.res.pitch2D.width = mem.data_width; + resDesc.res.pitch2D.pitchInBytes = dst_pitch; + } + else { + resDesc.resType = hipResourceTypeLinear; + resDesc.res.linear.devPtr = mem.device_pointer; + resDesc.res.linear.format = format; + resDesc.res.linear.numChannels = mem.data_elements; + resDesc.res.linear.sizeInBytes = mem.device_size; + } + + hipTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.addressMode[0] = address_mode; + texDesc.addressMode[1] = address_mode; + texDesc.addressMode[2] = address_mode; + texDesc.filterMode = filter_mode; + texDesc.flags = HIP_TRSF_NORMALIZED_COORDINATES; + + thread_scoped_lock lock(hip_mem_map_mutex); + cmem = &hip_mem_map[&mem]; + + hip_assert(hipTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); + + texture_info[slot].data = (uint64_t)cmem->texobject; + } + else { + texture_info[slot].data = (uint64_t)mem.device_pointer; + } +} + +void HIPDevice::tex_free(device_texture &mem) +{ + if (mem.device_pointer) { + HIPContextScope scope(this); + thread_scoped_lock lock(hip_mem_map_mutex); + const HIPMem &cmem = hip_mem_map[&mem]; + + if (cmem.texobject) { + /* Free bindless texture. */ + hipTexObjectDestroy(cmem.texobject); + } + + if (!mem.is_resident(this)) { + /* Do not free memory here, since it was allocated on a different device. */ + hip_mem_map.erase(hip_mem_map.find(&mem)); + } + else if (cmem.array) { + /* Free array. */ + hipArrayDestroy(cmem.array); + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + hip_mem_map.erase(hip_mem_map.find(&mem)); + } + else { + lock.unlock(); + generic_free(mem); + } + } +} + +# if 0 +void HIPDevice::render(DeviceTask &task, + RenderTile &rtile, + device_vector<KernelWorkTile> &work_tiles) +{ + scoped_timer timer(&rtile.buffers->render_time); + + if (have_error()) + return; + + HIPContextScope scope(this); + hipFunction_t hipRender; + + /* Get kernel function. */ + if (rtile.task == RenderTile::BAKE) { + hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_bake")); + } + else { + hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_path_trace")); + } + + if (have_error()) { + return; + } + + hip_assert(hipFuncSetCacheConfig(hipRender, hipFuncCachePreferL1)); + + /* Allocate work tile. */ + work_tiles.alloc(1); + + KernelWorkTile *wtile = work_tiles.data(); + wtile->x = rtile.x; + wtile->y = rtile.y; + wtile->w = rtile.w; + wtile->h = rtile.h; + wtile->offset = rtile.offset; + wtile->stride = rtile.stride; + wtile->buffer = (float *)(hipDeviceptr_t)rtile.buffer; + + /* Prepare work size. More step samples render faster, but for now we + * remain conservative for GPUs connected to a display to avoid driver + * timeouts and display freezing. */ + int min_blocks, num_threads_per_block; + hip_assert( + hipModuleOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, hipRender, NULL, 0, 0)); + if (!info.display_device) { + min_blocks *= 8; + } + + uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); + + /* Render all samples. */ + uint start_sample = rtile.start_sample; + uint end_sample = rtile.start_sample + rtile.num_samples; + + for (int sample = start_sample; sample < end_sample;) { + /* Setup and copy work tile to device. */ + wtile->start_sample = sample; + wtile->num_samples = step_samples; + if (task.adaptive_sampling.use) { + wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples); + } + wtile->num_samples = min(wtile->num_samples, end_sample - sample); + work_tiles.copy_to_device(); + + hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer; + uint total_work_size = wtile->w * wtile->h * wtile->num_samples; + uint num_blocks = divide_up(total_work_size, num_threads_per_block); + + /* Launch kernel. */ + void *args[] = {&d_work_tiles, &total_work_size}; + + hip_assert( + hipModuleLaunchKernel(hipRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); + + /* Run the adaptive sampling kernels at selected samples aligned to step samples. */ + uint filter_sample = sample + wtile->num_samples - 1; + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { + adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); + } + + hip_assert(hipDeviceSynchronize()); + + /* Update progress. */ + sample += wtile->num_samples; + rtile.sample = sample; + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + + if (task.get_cancel()) { + if (task.need_finish_queue == false) + break; + } + } + + /* Finalize adaptive sampling. */ + if (task.adaptive_sampling.use) { + hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer; + adaptive_sampling_post(rtile, wtile, d_work_tiles); + hip_assert(hipDeviceSynchronize()); + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + } +} + +void HIPDevice::thread_run(DeviceTask &task) +{ + HIPContextScope scope(this); + + if (task.type == DeviceTask::RENDER) { + device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); + + /* keep rendering tiles until done */ + RenderTile tile; + DenoisingTask denoising(this, task); + + while (task.acquire_tile(this, tile, task.tile_types)) { + if (tile.task == RenderTile::PATH_TRACE) { + render(task, tile, work_tiles); + } + else if (tile.task == RenderTile::BAKE) { + render(task, tile, work_tiles); + } + + task.release_tile(tile); + + if (task.get_cancel()) { + if (task.need_finish_queue == false) + break; + } + } + + work_tiles.free(); + } +} +# endif + +unique_ptr<DeviceQueue> HIPDevice::gpu_queue_create() +{ + return make_unique<HIPDeviceQueue>(this); +} + +bool HIPDevice::should_use_graphics_interop() +{ + /* Check whether this device is part of OpenGL context. + * + * Using HIP device for graphics interoperability which is not part of the OpenGL context is + * possible, but from the empiric measurements it can be considerably slower than using naive + * pixels copy. */ + + HIPContextScope scope(this); + + int num_all_devices = 0; + hip_assert(hipGetDeviceCount(&num_all_devices)); + + if (num_all_devices == 0) { + return false; + } + + vector<hipDevice_t> gl_devices(num_all_devices); + uint num_gl_devices = 0; + hipGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, hipGLDeviceListAll); + + for (hipDevice_t gl_device : gl_devices) { + if (gl_device == hipDevice) { + return true; + } + } + + return false; +} + +int HIPDevice::get_num_multiprocessors() +{ + return get_device_default_attribute(hipDeviceAttributeMultiprocessorCount, 0); +} + +int HIPDevice::get_max_num_threads_per_multiprocessor() +{ + return get_device_default_attribute(hipDeviceAttributeMaxThreadsPerMultiProcessor, 0); +} + +bool HIPDevice::get_device_attribute(hipDeviceAttribute_t attribute, int *value) +{ + HIPContextScope scope(this); + + return hipDeviceGetAttribute(value, attribute, hipDevice) == hipSuccess; +} + +int HIPDevice::get_device_default_attribute(hipDeviceAttribute_t attribute, int default_value) +{ + int value = 0; + if (!get_device_attribute(attribute, &value)) { + return default_value; + } + return value; +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/hip/device_impl.h b/intern/cycles/device/hip/device_impl.h new file mode 100644 index 00000000000..1d138ee9856 --- /dev/null +++ b/intern/cycles/device/hip/device_impl.h @@ -0,0 +1,153 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_HIP + +# include "device/device.h" +# include "device/hip/kernel.h" +# include "device/hip/queue.h" +# include "device/hip/util.h" + +# include "util/util_map.h" + +# ifdef WITH_HIP_DYNLOAD +# include "hipew.h" +# else +# include "util/util_opengl.h" +# endif + +CCL_NAMESPACE_BEGIN + +class DeviceQueue; + +class HIPDevice : public Device { + + friend class HIPContextScope; + + public: + hipDevice_t hipDevice; + hipCtx_t hipContext; + hipModule_t hipModule; + size_t device_texture_headroom; + size_t device_working_headroom; + bool move_texture_to_host; + size_t map_host_used; + size_t map_host_limit; + int can_map_host; + int pitch_alignment; + int hipDevId; + int hipDevArchitecture; + bool first_error; + + struct HIPMem { + HIPMem() : texobject(0), array(0), use_mapped_host(false) + { + } + + hipTextureObject_t texobject; + hArray array; + + /* If true, a mapped host memory in shared_pointer is being used. */ + bool use_mapped_host; + }; + typedef map<device_memory *, HIPMem> HIPMemMap; + HIPMemMap hip_mem_map; + thread_mutex hip_mem_map_mutex; + + /* Bindless Textures */ + device_vector<TextureInfo> texture_info; + bool need_texture_info; + + HIPDeviceKernels kernels; + + static bool have_precompiled_kernels(); + + virtual bool show_samples() const override; + + virtual BVHLayoutMask get_bvh_layout_mask() const override; + + void set_error(const string &error) override; + + HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler); + + virtual ~HIPDevice(); + + bool support_device(const uint /*kernel_features*/); + + bool check_peer_access(Device *peer_device) override; + + bool use_adaptive_compilation(); + + virtual string compile_kernel_get_common_cflags(const uint kernel_features); + + string compile_kernel(const uint kernel_features, + const char *name, + const char *base = "hip", + bool force_ptx = false); + + virtual bool load_kernels(const uint kernel_features) override; + void reserve_local_memory(const uint kernel_features); + + void init_host_memory(); + + void load_texture_info(); + + void move_textures_to_host(size_t size, bool for_texture); + + HIPMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0); + + void generic_copy_to(device_memory &mem); + + void generic_free(device_memory &mem); + + void mem_alloc(device_memory &mem) override; + + void mem_copy_to(device_memory &mem) override; + + void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override; + + void mem_zero(device_memory &mem) override; + + void mem_free(device_memory &mem) override; + + device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override; + + virtual void const_copy_to(const char *name, void *host, size_t size) override; + + void global_alloc(device_memory &mem); + + void global_free(device_memory &mem); + + void tex_alloc(device_texture &mem); + + void tex_free(device_texture &mem); + + /* Graphics resources interoperability. */ + virtual bool should_use_graphics_interop() override; + + virtual unique_ptr<DeviceQueue> gpu_queue_create() override; + + int get_num_multiprocessors(); + int get_max_num_threads_per_multiprocessor(); + + protected: + bool get_device_attribute(hipDeviceAttribute_t attribute, int *value); + int get_device_default_attribute(hipDeviceAttribute_t attribute, int default_value); +}; + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/hip/graphics_interop.cpp b/intern/cycles/device/hip/graphics_interop.cpp new file mode 100644 index 00000000000..add6dbed5e1 --- /dev/null +++ b/intern/cycles/device/hip/graphics_interop.cpp @@ -0,0 +1,93 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_HIP + +# include "device/hip/graphics_interop.h" + +# include "device/hip/device_impl.h" +# include "device/hip/util.h" + +CCL_NAMESPACE_BEGIN + +HIPDeviceGraphicsInterop::HIPDeviceGraphicsInterop(HIPDeviceQueue *queue) + : queue_(queue), device_(static_cast<HIPDevice *>(queue->device)) +{ +} + +HIPDeviceGraphicsInterop::~HIPDeviceGraphicsInterop() +{ + HIPContextScope scope(device_); + + if (hip_graphics_resource_) { + hip_device_assert(device_, hipGraphicsUnregisterResource(hip_graphics_resource_)); + } +} + +void HIPDeviceGraphicsInterop::set_destination(const DeviceGraphicsInteropDestination &destination) +{ + const int64_t new_buffer_area = int64_t(destination.buffer_width) * destination.buffer_height; + + if (opengl_pbo_id_ == destination.opengl_pbo_id && buffer_area_ == new_buffer_area) { + return; + } + + HIPContextScope scope(device_); + + if (hip_graphics_resource_) { + hip_device_assert(device_, hipGraphicsUnregisterResource(hip_graphics_resource_)); + } + + const hipError_t result = hipGraphicsGLRegisterBuffer( + &hip_graphics_resource_, destination.opengl_pbo_id, hipGraphicsRegisterFlagsNone); + if (result != hipSuccess) { + LOG(ERROR) << "Error registering OpenGL buffer: " << hipewErrorString(result); + } + + opengl_pbo_id_ = destination.opengl_pbo_id; + buffer_area_ = new_buffer_area; +} + +device_ptr HIPDeviceGraphicsInterop::map() +{ + if (!hip_graphics_resource_) { + return 0; + } + + HIPContextScope scope(device_); + + hipDeviceptr_t hip_buffer; + size_t bytes; + + hip_device_assert(device_, + hipGraphicsMapResources(1, &hip_graphics_resource_, queue_->stream())); + hip_device_assert( + device_, hipGraphicsResourceGetMappedPointer(&hip_buffer, &bytes, hip_graphics_resource_)); + + return static_cast<device_ptr>(hip_buffer); +} + +void HIPDeviceGraphicsInterop::unmap() +{ + HIPContextScope scope(device_); + + hip_device_assert(device_, + hipGraphicsUnmapResources(1, &hip_graphics_resource_, queue_->stream())); +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/hip/graphics_interop.h b/intern/cycles/device/hip/graphics_interop.h new file mode 100644 index 00000000000..adcaa13a2d7 --- /dev/null +++ b/intern/cycles/device/hip/graphics_interop.h @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_HIP + +# include "device/device_graphics_interop.h" + +# ifdef WITH_HIP_DYNLOAD +# include "hipew.h" +# endif + +CCL_NAMESPACE_BEGIN + +class HIPDevice; +class HIPDeviceQueue; + +class HIPDeviceGraphicsInterop : public DeviceGraphicsInterop { + public: + explicit HIPDeviceGraphicsInterop(HIPDeviceQueue *queue); + + HIPDeviceGraphicsInterop(const HIPDeviceGraphicsInterop &other) = delete; + HIPDeviceGraphicsInterop(HIPDeviceGraphicsInterop &&other) noexcept = delete; + + ~HIPDeviceGraphicsInterop(); + + HIPDeviceGraphicsInterop &operator=(const HIPDeviceGraphicsInterop &other) = delete; + HIPDeviceGraphicsInterop &operator=(HIPDeviceGraphicsInterop &&other) = delete; + + virtual void set_destination(const DeviceGraphicsInteropDestination &destination) override; + + virtual device_ptr map() override; + virtual void unmap() override; + + protected: + HIPDeviceQueue *queue_ = nullptr; + HIPDevice *device_ = nullptr; + + /* OpenGL PBO which is currently registered as the destination for the CUDA buffer. */ + uint opengl_pbo_id_ = 0; + /* Buffer area in pixels of the corresponding PBO. */ + int64_t buffer_area_ = 0; + + hipGraphicsResource hip_graphics_resource_ = nullptr; +}; + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/hip/kernel.cpp b/intern/cycles/device/hip/kernel.cpp new file mode 100644 index 00000000000..e0acd6f17c6 --- /dev/null +++ b/intern/cycles/device/hip/kernel.cpp @@ -0,0 +1,69 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_HIP + +# include "device/hip/kernel.h" +# include "device/hip/device_impl.h" + +CCL_NAMESPACE_BEGIN + +void HIPDeviceKernels::load(HIPDevice *device) +{ + hipModule_t hipModule = device->hipModule; + + for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) { + HIPDeviceKernel &kernel = kernels_[i]; + + /* No megakernel used for GPU. */ + if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { + continue; + } + + const std::string function_name = std::string("kernel_gpu_") + + device_kernel_as_string((DeviceKernel)i); + hip_device_assert(device, + hipModuleGetFunction(&kernel.function, hipModule, function_name.c_str())); + + if (kernel.function) { + hip_device_assert(device, hipFuncSetCacheConfig(kernel.function, hipFuncCachePreferL1)); + + hip_device_assert( + device, + hipModuleOccupancyMaxPotentialBlockSize( + &kernel.min_blocks, &kernel.num_threads_per_block, kernel.function, 0, 0)); + } + else { + LOG(ERROR) << "Unable to load kernel " << function_name; + } + } + + loaded = true; +} + +const HIPDeviceKernel &HIPDeviceKernels::get(DeviceKernel kernel) const +{ + return kernels_[(int)kernel]; +} + +bool HIPDeviceKernels::available(DeviceKernel kernel) const +{ + return kernels_[(int)kernel].function != nullptr; +} + +CCL_NAMESPACE_END + +#endif /* WITH_HIP*/ diff --git a/intern/cycles/device/hip/kernel.h b/intern/cycles/device/hip/kernel.h new file mode 100644 index 00000000000..3301731f56e --- /dev/null +++ b/intern/cycles/device/hip/kernel.h @@ -0,0 +1,54 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_HIP + +# include "device/device_kernel.h" + +# ifdef WITH_HIP_DYNLOAD +# include "hipew.h" +# endif + +CCL_NAMESPACE_BEGIN + +class HIPDevice; + +/* HIP kernel and associate occupancy information. */ +class HIPDeviceKernel { + public: + hipFunction_t function = nullptr; + + int num_threads_per_block = 0; + int min_blocks = 0; +}; + +/* Cache of HIP kernels for each DeviceKernel. */ +class HIPDeviceKernels { + public: + void load(HIPDevice *device); + const HIPDeviceKernel &get(DeviceKernel kernel) const; + bool available(DeviceKernel kernel) const; + + protected: + HIPDeviceKernel kernels_[DEVICE_KERNEL_NUM]; + bool loaded = false; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_HIP */ diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp new file mode 100644 index 00000000000..78c77e5fdae --- /dev/null +++ b/intern/cycles/device/hip/queue.cpp @@ -0,0 +1,209 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_HIP + +# include "device/hip/queue.h" + +# include "device/hip/device_impl.h" +# include "device/hip/graphics_interop.h" +# include "device/hip/kernel.h" + +CCL_NAMESPACE_BEGIN + +/* HIPDeviceQueue */ + +HIPDeviceQueue::HIPDeviceQueue(HIPDevice *device) + : DeviceQueue(device), hip_device_(device), hip_stream_(nullptr) +{ + const HIPContextScope scope(hip_device_); + hip_device_assert(hip_device_, hipStreamCreateWithFlags(&hip_stream_, hipStreamNonBlocking)); +} + +HIPDeviceQueue::~HIPDeviceQueue() +{ + const HIPContextScope scope(hip_device_); + hipStreamDestroy(hip_stream_); +} + +int HIPDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const +{ + /* TODO: compute automatically. */ + /* TODO: must have at least num_threads_per_block. */ + return 14416128; +} + +int HIPDeviceQueue::num_concurrent_busy_states() const +{ + const int max_num_threads = hip_device_->get_num_multiprocessors() * + hip_device_->get_max_num_threads_per_multiprocessor(); + + if (max_num_threads == 0) { + return 65536; + } + + return 4 * max_num_threads; +} + +void HIPDeviceQueue::init_execution() +{ + /* Synchronize all textures and memory copies before executing task. */ + HIPContextScope scope(hip_device_); + hip_device_->load_texture_info(); + hip_device_assert(hip_device_, hipDeviceSynchronize()); + + debug_init_execution(); +} + +bool HIPDeviceQueue::kernel_available(DeviceKernel kernel) const +{ + return hip_device_->kernels.available(kernel); +} + +bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[]) +{ + if (hip_device_->have_error()) { + return false; + } + + debug_enqueue(kernel, work_size); + + const HIPContextScope scope(hip_device_); + const HIPDeviceKernel &hip_kernel = hip_device_->kernels.get(kernel); + + /* Compute kernel launch parameters. */ + const int num_threads_per_block = hip_kernel.num_threads_per_block; + const int num_blocks = divide_up(work_size, num_threads_per_block); + + int shared_mem_bytes = 0; + + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: + /* See parall_active_index.h for why this amount of shared memory is needed. */ + shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int); + break; + default: + break; + } + + /* Launch kernel. */ + hip_device_assert(hip_device_, + hipModuleLaunchKernel(hip_kernel.function, + num_blocks, + 1, + 1, + num_threads_per_block, + 1, + 1, + shared_mem_bytes, + hip_stream_, + args, + 0)); + return !(hip_device_->have_error()); +} + +bool HIPDeviceQueue::synchronize() +{ + if (hip_device_->have_error()) { + return false; + } + + const HIPContextScope scope(hip_device_); + hip_device_assert(hip_device_, hipStreamSynchronize(hip_stream_)); + debug_synchronize(); + + return !(hip_device_->have_error()); +} + +void HIPDeviceQueue::zero_to_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + /* Allocate on demand. */ + if (mem.device_pointer == 0) { + hip_device_->mem_alloc(mem); + } + + /* Zero memory on device. */ + assert(mem.device_pointer != 0); + + const HIPContextScope scope(hip_device_); + hip_device_assert( + hip_device_, + hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_)); +} + +void HIPDeviceQueue::copy_to_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + /* Allocate on demand. */ + if (mem.device_pointer == 0) { + hip_device_->mem_alloc(mem); + } + + assert(mem.device_pointer != 0); + assert(mem.host_pointer != nullptr); + + /* Copy memory to device. */ + const HIPContextScope scope(hip_device_); + hip_device_assert( + hip_device_, + hipMemcpyHtoDAsync( + (hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_)); +} + +void HIPDeviceQueue::copy_from_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + assert(mem.device_pointer != 0); + assert(mem.host_pointer != nullptr); + + /* Copy memory from device. */ + const HIPContextScope scope(hip_device_); + hip_device_assert( + hip_device_, + hipMemcpyDtoHAsync( + mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_)); +} + +// TODO : (Arya) Enable this after stabilizing dev branch +unique_ptr<DeviceGraphicsInterop> HIPDeviceQueue::graphics_interop_create() +{ + return make_unique<HIPDeviceGraphicsInterop>(this); +} + +CCL_NAMESPACE_END + +#endif /* WITH_HIP */ diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h new file mode 100644 index 00000000000..04c8a5982ce --- /dev/null +++ b/intern/cycles/device/hip/queue.h @@ -0,0 +1,68 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_HIP + +# include "device/device_kernel.h" +# include "device/device_memory.h" +# include "device/device_queue.h" + +# include "device/hip/util.h" + +CCL_NAMESPACE_BEGIN + +class HIPDevice; +class device_memory; + +/* Base class for HIP queues. */ +class HIPDeviceQueue : public DeviceQueue { + public: + HIPDeviceQueue(HIPDevice *device); + ~HIPDeviceQueue(); + + virtual int num_concurrent_states(const size_t state_size) const override; + virtual int num_concurrent_busy_states() const override; + + virtual void init_execution() override; + + virtual bool kernel_available(DeviceKernel kernel) const override; + + virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override; + + virtual bool synchronize() override; + + virtual void zero_to_device(device_memory &mem) override; + virtual void copy_to_device(device_memory &mem) override; + virtual void copy_from_device(device_memory &mem) override; + + virtual hipStream_t stream() + { + return hip_stream_; + } + + // TODO : (Arya) Enable this after stabilizing the dev branch + virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override; + + protected: + HIPDevice *hip_device_; + hipStream_t hip_stream_; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_HIP */ diff --git a/intern/cycles/device/hip/util.cpp b/intern/cycles/device/hip/util.cpp new file mode 100644 index 00000000000..44f52c4e17b --- /dev/null +++ b/intern/cycles/device/hip/util.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_HIP + +# include "device/hip/util.h" +# include "device/hip/device_impl.h" + +CCL_NAMESPACE_BEGIN + +HIPContextScope::HIPContextScope(HIPDevice *device) : device(device) +{ + hip_device_assert(device, hipCtxPushCurrent(device->hipContext)); +} + +HIPContextScope::~HIPContextScope() +{ + hip_device_assert(device, hipCtxPopCurrent(NULL)); +} + +# ifndef WITH_HIP_DYNLOAD +const char *hipewErrorString(hipError_t result) +{ + /* We can only give error code here without major code duplication, that + * should be enough since dynamic loading is only being disabled by folks + * who knows what they're doing anyway. + * + * NOTE: Avoid call from several threads. + */ + static string error; + error = string_printf("%d", result); + return error.c_str(); +} + +const char *hipewCompilerPath() +{ + return CYCLES_HIP_HIPCC_EXECUTABLE; +} + +int hipewCompilerVersion() +{ + return (HIP_VERSION / 100) + (HIP_VERSION % 100 / 10); +} +# endif + +CCL_NAMESPACE_END + +#endif /* WITH_HIP */ diff --git a/intern/cycles/device/hip/util.h b/intern/cycles/device/hip/util.h new file mode 100644 index 00000000000..f8468e0dc5c --- /dev/null +++ b/intern/cycles/device/hip/util.h @@ -0,0 +1,63 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_HIP + +# ifdef WITH_HIP_DYNLOAD +# include "hipew.h" +# endif + +CCL_NAMESPACE_BEGIN + +class HIPDevice; + +/* Utility to push/pop HIP context. */ +class HIPContextScope { + public: + HIPContextScope(HIPDevice *device); + ~HIPContextScope(); + + private: + HIPDevice *device; +}; + +/* Utility for checking return values of HIP function calls. */ +# define hip_device_assert(hip_device, stmt) \ + { \ + hipError_t result = stmt; \ + if (result != hipSuccess) { \ + const char *name = hipewErrorString(result); \ + hip_device->set_error( \ + string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \ + } \ + } \ + (void)0 + +# define hip_assert(stmt) hip_device_assert(this, stmt) + +# ifndef WITH_HIP_DYNLOAD +/* Transparently implement some functions, so majority of the file does not need + * to worry about difference between dynamically loaded and linked HIP at all. */ +const char *hipewErrorString(hipError_t result); +const char *hipewCompilerPath(); +int hipewCompilerVersion(); +# endif /* WITH_HIP_DYNLOAD */ + +CCL_NAMESPACE_END + +#endif /* WITH_HIP */
\ No newline at end of file diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp index 10d0c5e7b4c..9633d3b87d3 100644 --- a/intern/cycles/integrator/path_trace.cpp +++ b/intern/cycles/integrator/path_trace.cpp @@ -1035,6 +1035,8 @@ static const char *device_type_for_description(const DeviceType type) return "CUDA"; case DEVICE_OPTIX: return "OptiX"; + case DEVICE_HIP: + return "HIP"; case DEVICE_DUMMY: return "Dummy"; case DEVICE_MULTI: diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 4196539a9b1..8c2cb2c68de 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -35,6 +35,10 @@ set(SRC_DEVICE_CUDA device/cuda/kernel.cu ) +set(SRC_DEVICE_HIP + device/hip/kernel.cpp +) + set(SRC_DEVICE_OPTIX device/optix/kernel.cu device/optix/kernel_shader_raytrace.cu @@ -106,6 +110,12 @@ set(SRC_DEVICE_CUDA_HEADERS device/cuda/globals.h ) +set(SRC_DEVICE_HIP_HEADERS + device/hip/compat.h + device/hip/config.h + device/hip/globals.h +) + set(SRC_DEVICE_OPTIX_HEADERS device/optix/compat.h device/optix/globals.h @@ -458,6 +468,104 @@ if(WITH_CYCLES_CUDA_BINARIES) cycles_set_solution_folder(cycles_kernel_cuda) endif() +####################################################### START + +# HIP module + +if(WITH_CYCLES_HIP_BINARIES) + # 64 bit only + set(HIP_BITS 64) + + # HIP version + execute_process(COMMAND ${HIP_HIPCC_EXECUTABLE} "--version" OUTPUT_VARIABLE HIPCC_OUT) + string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1" HIP_VERSION_MAJOR "${HIPCC_OUT}") + string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2" HIP_VERSION_MINOR "${HIPCC_OUT}") + set(HIP_VERSION "${HIP_VERSION_MAJOR}${HIP_VERSION_MINOR}") + + + message(WARNING + "HIP version ${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR} detected") + + # build for each arch + set(hip_sources device/hip/kernel.cpp + ${SRC_HEADERS} + ${SRC_DEVICE_HIP_HEADERS} + ${SRC_BVH_HEADERS} + ${SRC_SVM_HEADERS} + ${SRC_GEOM_HEADERS} + ${SRC_INTEGRATOR_HEADERS} + ${SRC_CLOSURE_HEADERS} + ${SRC_UTIL_HEADERS} + ) + set(hip_fatbins) + + macro(CYCLES_HIP_KERNEL_ADD arch prev_arch name flags sources experimental) + if(${arch} MATCHES "compute_.*") + set(format "ptx") + else() + set(format "fatbin") + endif() + set(hip_file ${name}_${arch}.${format}) + + set(kernel_sources ${sources}) + if(NOT ${prev_arch} STREQUAL "none") + if(${prev_arch} MATCHES "compute_.*") + set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.ptx) + else() + set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.fatbin) + endif() + endif() + + set(hip_kernel_src "/device/hip/${name}.cpp") + + set(hip_flags ${flags} + -D CCL_NAMESPACE_BEGIN= + -D CCL_NAMESPACE_END= + -D HIPCC + -m ${HIP_BITS} + -I ${CMAKE_CURRENT_SOURCE_DIR}/.. + -I ${CMAKE_CURRENT_SOURCE_DIR}/device/hip + --use_fast_math + -o ${CMAKE_CURRENT_BINARY_DIR}/${hip_file}) + + if(${experimental}) + set(hip_flags ${hip_flags} -D __KERNEL_EXPERIMENTAL__) + set(name ${name}_experimental) + endif() + + if(WITH_CYCLES_DEBUG) + set(hip_flags ${hip_flags} -D __KERNEL_DEBUG__) + endif() + + if(WITH_NANOVDB) + set(hip_flags ${hip_flags} + -D WITH_NANOVDB + -I "${NANOVDB_INCLUDE_DIR}") + endif() + + + set(prev_arch "none") + foreach(arch ${CYCLES_HIP_BINARIES_ARCH}) + set(hip_hipcc_executable ${HIP_HIPCC_EXECUTABLE}) + set(hip_toolkit_root_dir ${HIP_TOOLKIT_ROOT_DIR}) + if(DEFINED hip_hipcc_executable AND DEFINED hip_toolkit_root_dir) + # Compile regular kernel + CYCLES_HIP_KERNEL_ADD(${arch} ${prev_arch} kernel "" "${hip_sources}" FALSE) + + if(WITH_CYCLES_HIP_BUILD_SERIAL) + set(prev_arch ${arch}) + endif() + + unset(hip_hipcc_executable) + unset(hip_toolkit_root_dir) + endif() + endforeach() + + add_custom_target(cycles_kernel_hip ALL DEPENDS ${hip_fatbins}) + cycles_set_solution_folder(cycles_kernel_hip) +endif() + +####################################################### END # OptiX PTX modules if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) @@ -602,11 +710,13 @@ endif() cycles_add_library(cycles_kernel "${LIB}" ${SRC_DEVICE_CPU} ${SRC_DEVICE_CUDA} + ${SRC_DEVICE_HIP} ${SRC_DEVICE_OPTIX} ${SRC_HEADERS} ${SRC_DEVICE_CPU_HEADERS} ${SRC_DEVICE_GPU_HEADERS} ${SRC_DEVICE_CUDA_HEADERS} + ${SRC_DEVICE_HIP_HEADERS} ${SRC_DEVICE_OPTIX_HEADERS} ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} @@ -621,6 +731,7 @@ source_group("geom" FILES ${SRC_GEOM_HEADERS}) source_group("integrator" FILES ${SRC_INTEGRATOR_HEADERS}) source_group("kernel" FILES ${SRC_HEADERS}) source_group("device\\cpu" FILES ${SRC_DEVICE_CPU} ${SRC_DEVICE_CPU_HEADERS}) +source_group("device\\hip" FILES ${SRC_DEVICE_HIP} ${SRC_DEVICE_HIP_HEADERS}) source_group("device\\gpu" FILES ${SRC_DEVICE_GPU_HEADERS}) source_group("device\\cuda" FILES ${SRC_DEVICE_CUDA} ${SRC_DEVICE_CUDA_HEADERS}) source_group("device\\optix" FILES ${SRC_DEVICE_OPTIX} ${SRC_DEVICE_OPTIX_HEADERS}) @@ -632,14 +743,19 @@ endif() if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) add_dependencies(cycles_kernel cycles_kernel_optix) endif() +if(WITH_CYCLES_HIP) + add_dependencies(cycles_kernel cycles_kernel_hip) +endif() # Install kernel source for runtime compilation delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_HIP}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_GPU_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/gpu) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_HIP_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure) diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index a68d1d80c7d..db4a4bf71e0 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -25,7 +25,11 @@ CCL_NAMESPACE_BEGIN #include "util/util_atomic.h" -#define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 +#ifdef __HIP__ +# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024 +#else +# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 +#endif template<uint blocksize, typename IsActiveOp> __device__ void gpu_parallel_active_index_array(const uint num_states, diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h index f609520b8b4..a1349e82efb 100644 --- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h +++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h @@ -27,7 +27,11 @@ CCL_NAMESPACE_BEGIN #include "util/util_atomic.h" -#define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 +#ifdef __HIP__ +# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024 +#else +# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 +#endif template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values) { diff --git a/intern/cycles/kernel/device/gpu/parallel_reduce.h b/intern/cycles/kernel/device/gpu/parallel_reduce.h index 65b1990dbb8..b60dceb2ed0 100644 --- a/intern/cycles/kernel/device/gpu/parallel_reduce.h +++ b/intern/cycles/kernel/device/gpu/parallel_reduce.h @@ -26,7 +26,11 @@ CCL_NAMESPACE_BEGIN * the overall cost of the algorithm while keeping the work complexity O(n) and * the step complexity O(log n). (Brent's Theorem optimization) */ -#define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512 +#ifdef __HIP__ +# define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 1024 +#else +# define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512 +#endif template<uint blocksize, typename InputT, typename OutputT, typename ConvertOp> __device__ void gpu_parallel_sum( diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h index 99b35468517..9bca1fad22f 100644 --- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h @@ -26,7 +26,11 @@ CCL_NAMESPACE_BEGIN #include "util/util_atomic.h" -#define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512 +#ifdef __HIP__ +# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024 +#else +# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512 +#endif #define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0) template<uint blocksize, typename GetKeyOp> diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h new file mode 100644 index 00000000000..3644925d5be --- /dev/null +++ b/intern/cycles/kernel/device/hip/compat.h @@ -0,0 +1,121 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#define __KERNEL_GPU__ +#define __KERNEL_HIP__ +#define CCL_NAMESPACE_BEGIN +#define CCL_NAMESPACE_END + +#ifndef ATTR_FALLTHROUGH +# define ATTR_FALLTHROUGH +#endif + +#ifdef __HIPCC_RTC__ +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; +#else +# include <stdint.h> +#endif + +#ifdef CYCLES_HIPBIN_CC +# define FLT_MIN 1.175494350822287507969e-38f +# define FLT_MAX 340282346638528859811704183484516925440.0f +# define FLT_EPSILON 1.192092896e-07F +#endif + +/* Qualifiers */ + +#define ccl_device __device__ __inline__ +#define ccl_device_inline __device__ __inline__ +#define ccl_device_forceinline __device__ __forceinline__ +#define ccl_device_noinline __device__ __noinline__ +#define ccl_device_noinline_cpu ccl_device +#define ccl_global +#define ccl_static_constant __constant__ +#define ccl_device_constant __constant__ __device__ +#define ccl_constant const +#define ccl_gpu_shared __shared__ +#define ccl_private +#define ccl_may_alias +#define ccl_addr_space +#define ccl_restrict __restrict__ +#define ccl_loop_no_unroll +#define ccl_align(n) __align__(n) +#define ccl_optional_struct_init + +#define kernel_assert(cond) + +/* Types */ +#ifdef __HIP__ +# include "hip/hip_fp16.h" +# include "hip/hip_runtime.h" +#endif + +#ifdef _MSC_VER +# include <immintrin.h> +#endif + +#define ccl_gpu_thread_idx_x (threadIdx.x) +#define ccl_gpu_block_dim_x (blockDim.x) +#define ccl_gpu_block_idx_x (blockIdx.x) +#define ccl_gpu_grid_dim_x (gridDim.x) +#define ccl_gpu_warp_size (warpSize) + +#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) +#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) + +/* GPU warp synchronizaton */ + +#define ccl_gpu_syncthreads() __syncthreads() +#define ccl_gpu_ballot(predicate) __ballot(predicate) +#define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down(var, detla) +#define ccl_gpu_popc(x) __popc(x) + +/* GPU texture objects */ +typedef hipTextureObject_t ccl_gpu_tex_object; + +template<typename T> +ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj, + const float x, + const float y) +{ + return tex2D<T>(texobj, x, y); +} + +template<typename T> +ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj, + const float x, + const float y, + const float z) +{ + return tex3D<T>(texobj, x, y, z); +} + +/* Use fast math functions */ + +#define cosf(x) __cosf(((float)(x))) +#define sinf(x) __sinf(((float)(x))) +#define powf(x, y) __powf(((float)(x)), ((float)(y))) +#define tanf(x) __tanf(((float)(x))) +#define logf(x) __logf(((float)(x))) +#define expf(x) __expf(((float)(x))) + +/* Types */ + +#include "util/util_half.h" +#include "util/util_types.h" diff --git a/intern/cycles/kernel/device/hip/config.h b/intern/cycles/kernel/device/hip/config.h new file mode 100644 index 00000000000..2fde0d46015 --- /dev/null +++ b/intern/cycles/kernel/device/hip/config.h @@ -0,0 +1,57 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Device data taken from HIP occupancy calculator. + * + * Terminology + * - HIP GPUs have multiple streaming multiprocessors + * - Each multiprocessor executes multiple thread blocks + * - Each thread block contains a number of threads, also known as the block size + * - Multiprocessors have a fixed number of registers, and the amount of registers + * used by each threads limits the number of threads per block. + */ + +/* Launch Bound Definitions */ +#define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 +#define GPU_MULTIPROCESSOR_MAX_BLOCKS 64 +#define GPU_BLOCK_MAX_THREADS 1024 +#define GPU_THREAD_MAX_REGISTERS 255 + +#define GPU_KERNEL_BLOCK_NUM_THREADS 1024 +#define GPU_KERNEL_MAX_REGISTERS 64 + +/* Compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread. */ + +#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \ + extern "C" __global__ void __launch_bounds__(block_num_threads, \ + GPU_MULTIPRESSOR_MAX_REGISTERS / \ + (block_num_threads * thread_num_registers)) + +/* sanity checks */ + +#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS +# error "Maximum number of threads per block exceeded" +#endif + +#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \ + GPU_MULTIPROCESSOR_MAX_BLOCKS +# error "Maximum number of blocks per multiprocessor exceeded" +#endif + +#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif diff --git a/intern/cycles/kernel/device/hip/globals.h b/intern/cycles/kernel/device/hip/globals.h new file mode 100644 index 00000000000..39978ae7899 --- /dev/null +++ b/intern/cycles/kernel/device/hip/globals.h @@ -0,0 +1,49 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Constant Globals */ + +#pragma once + +#include "kernel/kernel_profiling.h" +#include "kernel/kernel_types.h" + +#include "kernel/integrator/integrator_state.h" + +CCL_NAMESPACE_BEGIN + +/* Not actually used, just a NULL pointer that gets passed everywhere, which we + * hope gets optimized out by the compiler. */ +struct KernelGlobals { + /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */ + int unused[1]; +}; + +/* Global scene data and textures */ +__constant__ KernelData __data; +#define KERNEL_TEX(type, name) __attribute__((used)) const __constant__ __device__ type *name; +#include "kernel/kernel_textures.h" + +/* Integrator state */ +__constant__ IntegratorStateGPU __integrator_state; + +/* Abstraction macros */ +#define kernel_data __data +#define kernel_tex_fetch(t, index) t[(index)] +#define kernel_tex_array(t) (t) +#define kernel_integrator_state __integrator_state + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/hip/kernel.cpp b/intern/cycles/kernel/device/hip/kernel.cpp new file mode 100644 index 00000000000..c801320a2e1 --- /dev/null +++ b/intern/cycles/kernel/device/hip/kernel.cpp @@ -0,0 +1,28 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* HIP kernel entry points */ + +#ifdef __HIP_DEVICE_COMPILE__ + +# include "kernel/device/hip/compat.h" +# include "kernel/device/hip/config.h" +# include "kernel/device/hip/globals.h" + +# include "kernel/device/gpu/image.h" +# include "kernel/device/gpu/kernel.h" + +#endif diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h index de17efafcf2..faba411c769 100644 --- a/intern/cycles/util/util_atomic.h +++ b/intern/cycles/util/util_atomic.h @@ -34,7 +34,7 @@ #else /* __KERNEL_GPU__ */ -# ifdef __KERNEL_CUDA__ +# if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) # define atomic_add_and_fetch_float(p, x) (atomicAdd((float *)(p), (float)(x)) + (float)(x)) diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index 1d598725c84..2245668d02f 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -59,12 +59,23 @@ DebugFlags::CUDA::CUDA() : adaptive_compile(false) reset(); } +DebugFlags::HIP::HIP() : adaptive_compile(false) +{ + reset(); +} + void DebugFlags::CUDA::reset() { if (getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL) adaptive_compile = true; } +void DebugFlags::HIP::reset() +{ + if (getenv("CYCLES_HIP_ADAPTIVE_COMPILE") != NULL) + adaptive_compile = true; +} + DebugFlags::OptiX::OptiX() { reset(); @@ -103,6 +114,10 @@ std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags) os << "OptiX flags:\n" << " Debug : " << string_from_bool(debug_flags.optix.use_debug) << "\n"; + + os << "HIP flags:\n" + << " HIP streams : " << string_from_bool(debug_flags.hip.adaptive_compile) << "\n"; + return os; } diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h index a2acaea5675..81677201790 100644 --- a/intern/cycles/util/util_debug.h +++ b/intern/cycles/util/util_debug.h @@ -93,6 +93,17 @@ class DebugFlags { bool adaptive_compile; }; + /* Descriptor of HIP feature-set to be used. */ + struct HIP { + HIP(); + + /* Reset flags to their defaults. */ + void reset(); + + /* Whether adaptive feature based runtime compile is enabled or not.*/ + bool adaptive_compile; + }; + /* Descriptor of OptiX feature-set to be used. */ struct OptiX { OptiX(); @@ -124,6 +135,9 @@ class DebugFlags { /* Requested OptiX flags. */ OptiX optix; + /* Requested HIP flags. */ + HIP hip; + private: DebugFlags(); diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h index d9edfec5da3..f36a492a1b0 100644 --- a/intern/cycles/util/util_half.h +++ b/intern/cycles/util/util_half.h @@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN /* Half Floats */ /* CUDA has its own half data type, no need to define then */ -#ifndef __KERNEL_CUDA__ +#if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__) /* Implementing this as a class rather than a typedef so that the compiler can tell it apart from * unsigned shorts. */ class half { @@ -59,7 +59,7 @@ struct half4 { half x, y, z, w; }; -#ifdef __KERNEL_CUDA__ +#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) ccl_device_inline void float4_store_half(half *h, float4 f) { @@ -73,6 +73,7 @@ ccl_device_inline void float4_store_half(half *h, float4 f) ccl_device_inline void float4_store_half(half *h, float4 f) { + # ifndef __KERNEL_SSE2__ for (int i = 0; i < 4; i++) { /* optimized float to half for pixels: @@ -109,6 +110,8 @@ ccl_device_inline void float4_store_half(half *h, float4 f) # endif } +# ifndef __KERNEL_HIP__ + ccl_device_inline float half_to_float(half h) { float f; @@ -117,6 +120,23 @@ ccl_device_inline float half_to_float(half h) return f; } +# else + +ccl_device_inline float half_to_float(std::uint32_t a) noexcept +{ + + std::uint32_t u = ((a << 13) + 0x70000000U) & 0x8fffe000U; + + std::uint32_t v = __float_as_uint(__uint_as_float(u) * + __uint_as_float(0x77800000U) /*0x1.0p+112f*/) + + 0x38000000U; + + u = (a & 0x7fff) != 0 ? v : u; + + return __uint_as_float(u) * __uint_as_float(0x07800000U) /*0x1.0p-112f*/; +} + +# endif /* __KERNEL_HIP__ */ ccl_device_inline float4 half4_to_float4(half4 h) { diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index 6d728dde679..cb1e94c838c 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -26,6 +26,10 @@ # include <cmath> #endif +#ifdef __HIP__ +# include <hip/hip_vector_types.h> +#endif + #include <float.h> #include <math.h> #include <stdio.h> @@ -83,7 +87,8 @@ CCL_NAMESPACE_BEGIN /* Scalar */ -#ifdef _WIN32 +#ifndef __HIP__ +# ifdef _WIN32 ccl_device_inline float fmaxf(float a, float b) { return (a > b) ? a : b; @@ -93,7 +98,9 @@ ccl_device_inline float fminf(float a, float b) { return (a < b) ? a : b; } -#endif /* _WIN32 */ + +# endif /* _WIN32 */ +#endif /* __HIP__ */ #ifndef __KERNEL_GPU__ using std::isfinite; @@ -199,6 +206,7 @@ ccl_device_inline uint as_uint(float f) return u.i; } +#ifndef __HIP__ ccl_device_inline int __float_as_int(float f) { union { @@ -238,6 +246,7 @@ ccl_device_inline float __uint_as_float(uint i) u.i = i; return u.f; } +#endif ccl_device_inline int4 __float4_as_int4(float4 f) { @@ -669,7 +678,7 @@ ccl_device float bits_to_01(uint bits) ccl_device_inline uint count_leading_zeros(uint x) { -#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) +#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__) return __clz(x); #else assert(x != 0); @@ -685,7 +694,7 @@ ccl_device_inline uint count_leading_zeros(uint x) ccl_device_inline uint count_trailing_zeros(uint x) { -#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) +#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__) return (__ffs(x) - 1); #else assert(x != 0); @@ -701,7 +710,7 @@ ccl_device_inline uint count_trailing_zeros(uint x) ccl_device_inline uint find_first_set(uint x) { -#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) +#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__) return __ffs(x); #else # ifdef _MSC_VER |