diff options
Diffstat (limited to 'intern/cycles/device/hip')
-rw-r--r-- | intern/cycles/device/hip/device.cpp | 276 | ||||
-rw-r--r-- | intern/cycles/device/hip/device.h | 37 | ||||
-rw-r--r-- | intern/cycles/device/hip/device_impl.cpp | 1343 | ||||
-rw-r--r-- | intern/cycles/device/hip/device_impl.h | 153 | ||||
-rw-r--r-- | intern/cycles/device/hip/graphics_interop.cpp | 93 | ||||
-rw-r--r-- | intern/cycles/device/hip/graphics_interop.h | 61 | ||||
-rw-r--r-- | intern/cycles/device/hip/kernel.cpp | 69 | ||||
-rw-r--r-- | intern/cycles/device/hip/kernel.h | 54 | ||||
-rw-r--r-- | intern/cycles/device/hip/queue.cpp | 209 | ||||
-rw-r--r-- | intern/cycles/device/hip/queue.h | 68 | ||||
-rw-r--r-- | intern/cycles/device/hip/util.cpp | 61 | ||||
-rw-r--r-- | intern/cycles/device/hip/util.h | 63 |
12 files changed, 2487 insertions, 0 deletions
diff --git a/intern/cycles/device/hip/device.cpp b/intern/cycles/device/hip/device.cpp new file mode 100644 index 00000000000..90028ac7f10 --- /dev/null +++ b/intern/cycles/device/hip/device.cpp @@ -0,0 +1,276 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/hip/device.h" + +#include "util/util_logging.h" + +#ifdef WITH_HIP +# include "device/device.h" +# include "device/hip/device_impl.h" + +# include "util/util_string.h" +# include "util/util_windows.h" +#endif /* WITH_HIP */ + +CCL_NAMESPACE_BEGIN + +bool device_hip_init() +{ +#if !defined(WITH_HIP) + return false; +#elif defined(WITH_HIP_DYNLOAD) + static bool initialized = false; + static bool result = false; + + if (initialized) + return result; + + initialized = true; + int hipew_result = hipewInit(HIPEW_INIT_HIP); + if (hipew_result == HIPEW_SUCCESS) { + VLOG(1) << "HIPEW initialization succeeded"; + if (HIPDevice::have_precompiled_kernels()) { + VLOG(1) << "Found precompiled kernels"; + result = true; + } + else if (hipewCompilerPath() != NULL) { + VLOG(1) << "Found HIPCC " << hipewCompilerPath(); + result = true; + } + else { + VLOG(1) << "Neither precompiled kernels nor HIPCC was found," + << " unable to use HIP"; + } + } + else { + VLOG(1) << "HIPEW initialization failed: " + << ((hipew_result == HIPEW_ERROR_ATEXIT_FAILED) ? 
"Error setting up atexit() handler" : + "Error opening the library"); + } + + return result; +#else /* WITH_HIP_DYNLOAD */ + return true; +#endif /* WITH_HIP_DYNLOAD */ +} + +Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ +#ifdef WITH_HIP + return new HIPDevice(info, stats, profiler); +#else + (void)info; + (void)stats; + (void)profiler; + + LOG(FATAL) << "Request to create HIP device without compiled-in support. Should never happen."; + + return nullptr; +#endif +} + +#ifdef WITH_HIP +static hipError_t device_hip_safe_init() +{ +# ifdef _WIN32 + __try { + return hipInit(0); + } + __except (EXCEPTION_EXECUTE_HANDLER) { + /* Ignore crashes inside the HIP driver and hope we can + * survive even with corrupted HIP installs. */ + fprintf(stderr, "Cycles HIP: driver crashed, continuing without HIP.\n"); + } + + return hipErrorNoDevice; +# else + return hipInit(0); +# endif +} +#endif /* WITH_HIP */ + +void device_hip_info(vector<DeviceInfo> &devices) +{ +#ifdef WITH_HIP + hipError_t result = device_hip_safe_init(); + if (result != hipSuccess) { + if (result != hipErrorNoDevice) + fprintf(stderr, "HIP hipInit: %s\n", hipewErrorString(result)); + return; + } + + int count = 0; + result = hipGetDeviceCount(&count); + if (result != hipSuccess) { + fprintf(stderr, "HIP hipGetDeviceCount: %s\n", hipewErrorString(result)); + return; + } + + vector<DeviceInfo> display_devices; + + for (int num = 0; num < count; num++) { + char name[256]; + + result = hipDeviceGetName(name, 256, num); + if (result != hipSuccess) { + fprintf(stderr, "HIP :hipDeviceGetName: %s\n", hipewErrorString(result)); + continue; + } + + int major; + hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, num); + // TODO : (Arya) What is the last major version we are supporting? 
+ + DeviceInfo info; + + info.type = DEVICE_HIP; + info.description = string(name); + info.num = num; + + info.has_half_images = (major >= 3); + info.has_nanovdb = true; + info.denoisers = 0; + + info.has_gpu_queue = true; + /* Check if the device has P2P access to any other device in the system. */ + for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) { + if (num != peer_num) { + int can_access = 0; + hipDeviceCanAccessPeer(&can_access, num, peer_num); + info.has_peer_memory = (can_access != 0); + } + } + + int pci_location[3] = {0, 0, 0}; + hipDeviceGetAttribute(&pci_location[0], hipDeviceAttributePciDomainID, num); + hipDeviceGetAttribute(&pci_location[1], hipDeviceAttributePciBusId, num); + hipDeviceGetAttribute(&pci_location[2], hipDeviceAttributePciDeviceId, num); + info.id = string_printf("HIP_%s_%04x:%02x:%02x", + name, + (unsigned int)pci_location[0], + (unsigned int)pci_location[1], + (unsigned int)pci_location[2]); + + /* If device has a kernel timeout and no compute preemption, we assume + * it is connected to a display and will freeze the display while doing + * computations. 
*/ + int timeout_attr = 0, preempt_attr = 0; + hipDeviceGetAttribute(&timeout_attr, hipDeviceAttributeKernelExecTimeout, num); + + if (timeout_attr && !preempt_attr) { + VLOG(1) << "Device is recognized as display."; + info.description += " (Display)"; + info.display_device = true; + display_devices.push_back(info); + } + else { + VLOG(1) << "Device has compute preemption or is not used for display."; + devices.push_back(info); + } + VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\"."; + } + + if (!display_devices.empty()) + devices.insert(devices.end(), display_devices.begin(), display_devices.end()); +#else /* WITH_HIP */ + (void)devices; +#endif /* WITH_HIP */ +} + +string device_hip_capabilities() +{ +#ifdef WITH_HIP + hipError_t result = device_hip_safe_init(); + if (result != hipSuccess) { + if (result != hipErrorNoDevice) { + return string("Error initializing HIP: ") + hipewErrorString(result); + } + return "No HIP device found\n"; + } + + int count; + result = hipGetDeviceCount(&count); + if (result != hipSuccess) { + return string("Error getting devices: ") + hipewErrorString(result); + } + + string capabilities = ""; + for (int num = 0; num < count; num++) { + char name[256]; + if (hipDeviceGetName(name, 256, num) != hipSuccess) { + continue; + } + capabilities += string("\t") + name + "\n"; + int value; +# define GET_ATTR(attr) \ + { \ + if (hipDeviceGetAttribute(&value, hipDeviceAttribute##attr, num) == hipSuccess) { \ + capabilities += string_printf("\t\thipDeviceAttribute" #attr "\t\t\t%d\n", value); \ + } \ + } \ + (void)0 + /* TODO(sergey): Strip all attributes which are not useful for us + * or does not depend on the driver. 
+ */ + GET_ATTR(MaxThreadsPerBlock); + GET_ATTR(MaxBlockDimX); + GET_ATTR(MaxBlockDimY); + GET_ATTR(MaxBlockDimZ); + GET_ATTR(MaxGridDimX); + GET_ATTR(MaxGridDimY); + GET_ATTR(MaxGridDimZ); + GET_ATTR(MaxSharedMemoryPerBlock); + GET_ATTR(TotalConstantMemory); + GET_ATTR(WarpSize); + GET_ATTR(MaxPitch); + GET_ATTR(MaxRegistersPerBlock); + GET_ATTR(ClockRate); + GET_ATTR(TextureAlignment); + GET_ATTR(MultiprocessorCount); + GET_ATTR(KernelExecTimeout); + GET_ATTR(Integrated); + GET_ATTR(CanMapHostMemory); + GET_ATTR(ComputeMode); + GET_ATTR(MaxTexture1DWidth); + GET_ATTR(MaxTexture2DWidth); + GET_ATTR(MaxTexture2DHeight); + GET_ATTR(MaxTexture3DWidth); + GET_ATTR(MaxTexture3DHeight); + GET_ATTR(MaxTexture3DDepth); + GET_ATTR(ConcurrentKernels); + GET_ATTR(EccEnabled); + GET_ATTR(MemoryClockRate); + GET_ATTR(MemoryBusWidth); + GET_ATTR(L2CacheSize); + GET_ATTR(MaxThreadsPerMultiProcessor); + GET_ATTR(ComputeCapabilityMajor); + GET_ATTR(ComputeCapabilityMinor); + GET_ATTR(MaxSharedMemoryPerMultiprocessor); + GET_ATTR(ManagedMemory); + GET_ATTR(IsMultiGpuBoard); +# undef GET_ATTR + capabilities += "\n"; + } + + return capabilities; + +#else /* WITH_HIP */ + return ""; +#endif /* WITH_HIP */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/hip/device.h b/intern/cycles/device/hip/device.h new file mode 100644 index 00000000000..76fa8995bed --- /dev/null +++ b/intern/cycles/device/hip/device.h @@ -0,0 +1,37 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_string.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +bool device_hip_init(); + +Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +void device_hip_info(vector<DeviceInfo> &devices); + +string device_hip_capabilities(); + +CCL_NAMESPACE_END
\ No newline at end of file diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp new file mode 100644 index 00000000000..0e5ac6ce401 --- /dev/null +++ b/intern/cycles/device/hip/device_impl.cpp @@ -0,0 +1,1343 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_HIP + +# include <climits> +# include <limits.h> +# include <stdio.h> +# include <stdlib.h> +# include <string.h> + +# include "device/hip/device_impl.h" + +# include "render/buffers.h" + +# include "util/util_debug.h" +# include "util/util_foreach.h" +# include "util/util_logging.h" +# include "util/util_map.h" +# include "util/util_md5.h" +# include "util/util_opengl.h" +# include "util/util_path.h" +# include "util/util_string.h" +# include "util/util_system.h" +# include "util/util_time.h" +# include "util/util_types.h" +# include "util/util_windows.h" + +CCL_NAMESPACE_BEGIN + +class HIPDevice; + +bool HIPDevice::have_precompiled_kernels() +{ + string fatbins_path = path_get("lib"); + return path_exists(fatbins_path); +} + +bool HIPDevice::show_samples() const +{ + /* The HIPDevice only processes one tile at a time, so showing samples is fine. 
*/ + return true; +} + +BVHLayoutMask HIPDevice::get_bvh_layout_mask() const +{ + return BVH_LAYOUT_BVH2; +} + +void HIPDevice::set_error(const string &error) +{ + Device::set_error(error); + + if (first_error) { + fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); + fprintf(stderr, + "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n"); + first_error = false; + } +} + +HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) + : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL) +{ + first_error = true; + + hipDevId = info.num; + hipDevice = 0; + hipContext = 0; + + hipModule = 0; + + need_texture_info = false; + + device_texture_headroom = 0; + device_working_headroom = 0; + move_texture_to_host = false; + map_host_limit = 0; + map_host_used = 0; + can_map_host = 0; + pitch_alignment = 0; + + /* Initialize HIP. */ + hipError_t result = hipInit(0); + if (result != hipSuccess) { + set_error(string_printf("Failed to initialize HIP runtime (%s)", hipewErrorString(result))); + return; + } + + /* Setup device and context. */ + result = hipGetDevice(&hipDevice, hipDevId); + if (result != hipSuccess) { + set_error(string_printf("Failed to get HIP device handle from ordinal (%s)", + hipewErrorString(result))); + return; + } + + hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice)); + + hip_assert( + hipDeviceGetAttribute(&pitch_alignment, hipDeviceAttributeTexturePitchAlignment, hipDevice)); + + unsigned int ctx_flags = hipDeviceLmemResizeToMax; + if (can_map_host) { + ctx_flags |= hipDeviceMapHost; + init_host_memory(); + } + + /* Create context. 
*/ + result = hipCtxCreate(&hipContext, ctx_flags, hipDevice); + + if (result != hipSuccess) { + set_error(string_printf("Failed to create HIP context (%s)", hipewErrorString(result))); + return; + } + + int major, minor; + hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId); + hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId); + hipDevArchitecture = major * 100 + minor * 10; + + /* Pop context set by hipCtxCreate. */ + hipCtxPopCurrent(NULL); +} + +HIPDevice::~HIPDevice() +{ + texture_info.free(); + + hip_assert(hipCtxDestroy(hipContext)); +} + +bool HIPDevice::support_device(const uint /*kernel_features*/) +{ + int major, minor; + hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId); + hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId); + + // TODO : (Arya) What versions do we plan to support? + return true; +} + +bool HIPDevice::check_peer_access(Device *peer_device) +{ + if (peer_device == this) { + return false; + } + if (peer_device->info.type != DEVICE_HIP && peer_device->info.type != DEVICE_OPTIX) { + return false; + } + + HIPDevice *const peer_device_hip = static_cast<HIPDevice *>(peer_device); + + int can_access = 0; + hip_assert(hipDeviceCanAccessPeer(&can_access, hipDevice, peer_device_hip->hipDevice)); + if (can_access == 0) { + return false; + } + + // Ensure array access over the link is possible as well (for 3D textures) + hip_assert(hipDeviceGetP2PAttribute( + &can_access, hipDevP2PAttrHipArrayAccessSupported, hipDevice, peer_device_hip->hipDevice)); + if (can_access == 0) { + return false; + } + + // Enable peer access in both directions + { + const HIPContextScope scope(this); + hipError_t result = hipCtxEnablePeerAccess(peer_device_hip->hipContext, 0); + if (result != hipSuccess) { + set_error(string_printf("Failed to enable peer access on HIP context (%s)", + hipewErrorString(result))); + return false; + } + } + { + const 
HIPContextScope scope(peer_device_hip); + hipError_t result = hipCtxEnablePeerAccess(hipContext, 0); + if (result != hipSuccess) { + set_error(string_printf("Failed to enable peer access on HIP context (%s)", + hipewErrorString(result))); + return false; + } + } + + return true; +} + +bool HIPDevice::use_adaptive_compilation() +{ + return DebugFlags().hip.adaptive_compile; +} + +/* Common NVCC flags which stays the same regardless of shading model, + * kernel sources md5 and only depends on compiler or compilation settings. + */ +string HIPDevice::compile_kernel_get_common_cflags(const uint kernel_features) +{ + const int machine = system_cpu_bits(); + const string source_path = path_get("source"); + const string include_path = source_path; + string cflags = string_printf( + "-m%d " + "--ptxas-options=\"-v\" " + "--use_fast_math " + "-DHIPCC " + "-I\"%s\"", + machine, + include_path.c_str()); + if (use_adaptive_compilation()) { + cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features); + } + return cflags; +} + +string HIPDevice::compile_kernel(const uint kernel_features, + const char *name, + const char *base, + bool force_ptx) +{ + /* Compute kernel name. */ + int major, minor; + hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId); + hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId); + + /* Attempt to use kernel provided with Blender. */ + if (!use_adaptive_compilation()) { + if (!force_ptx) { + const string fatbin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor)); + VLOG(1) << "Testing for pre-compiled kernel " << fatbin << "."; + if (path_exists(fatbin)) { + VLOG(1) << "Using precompiled kernel."; + return fatbin; + } + } + + /* The driver can JIT-compile PTX generated for older generations, so find the closest one. 
*/ + int ptx_major = major, ptx_minor = minor; + while (ptx_major >= 3) { + const string ptx = path_get( + string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor)); + VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; + if (path_exists(ptx)) { + VLOG(1) << "Using precompiled kernel."; + return ptx; + } + + if (ptx_minor > 0) { + ptx_minor--; + } + else { + ptx_major--; + ptx_minor = 9; + } + } + } + + /* Try to use locally compiled kernel. */ + string source_path = path_get("source"); + const string source_md5 = path_files_md5_hash(source_path); + + /* We include cflags into md5 so changing hip toolkit or changing other + * compiler command line arguments makes sure fatbin gets re-built. + */ + string common_cflags = compile_kernel_get_common_cflags(kernel_features); + const string kernel_md5 = util_md5_string(source_md5 + common_cflags); + + const char *const kernel_ext = "genco"; +# ifdef _WIN32 + const char *const options = + "save-temps -Wno-parentheses-equality -Wno-unused-value --hipcc-func-supp"; +# else + const char *const options = + "save-temps -Wno-parentheses-equality -Wno-unused-value --hipcc-func-supp -O3 -ggdb"; +# endif + const string include_path = source_path; + const char *const kernel_arch = force_ptx ? "compute" : "sm"; + const string fatbin_file = string_printf( + "cycles_%s_%s_%d%d_%s", name, kernel_arch, major, minor, kernel_md5.c_str()); + const string fatbin = path_cache_get(path_join("kernels", fatbin_file)); + VLOG(1) << "Testing for locally compiled kernel " << fatbin << "."; + if (path_exists(fatbin)) { + VLOG(1) << "Using locally compiled kernel."; + return fatbin; + } + +# ifdef _WIN32 + if (!use_adaptive_compilation() && have_precompiled_kernels()) { + if (major < 3) { + set_error( + string_printf("HIP backend requires compute capability 3.0 or up, but found %d.%d. 
" + "Your GPU is not supported.", + major, + minor)); + } + else { + set_error( + string_printf("HIP binary kernel for this graphics card compute " + "capability (%d.%d) not found.", + major, + minor)); + } + return string(); + } +# endif + + /* Compile. */ + const char *const hipcc = hipewCompilerPath(); + if (hipcc == NULL) { + set_error( + "HIP hipcc compiler not found. " + "Install HIP toolkit in default location."); + return string(); + } + + const int hipcc_hip_version = hipewCompilerVersion(); + VLOG(1) << "Found hipcc " << hipcc << ", HIP version " << hipcc_hip_version << "."; + if (hipcc_hip_version < 40) { + printf( + "Unsupported HIP version %d.%d detected, " + "you need HIP 4.0 or newer.\n", + hipcc_hip_version / 10, + hipcc_hip_version % 10); + return string(); + } + + double starttime = time_dt(); + + path_create_directories(fatbin); + + source_path = path_join(path_join(source_path, "kernel"), + path_join("device", path_join(base, string_printf("%s.cpp", name)))); + + string command = string_printf("%s -%s -I %s --%s %s -o \"%s\"", + hipcc, + options, + include_path.c_str(), + kernel_ext, + source_path.c_str(), + fatbin.c_str()); + + printf("Compiling HIP kernel ...\n%s\n", command.c_str()); + +# ifdef _WIN32 + command = "call " + command; +# endif + if (system(command.c_str()) != 0) { + set_error( + "Failed to execute compilation command, " + "see console for details."); + return string(); + } + + /* Verify if compilation succeeded */ + if (!path_exists(fatbin)) { + set_error( + "HIP kernel compilation failed, " + "see console for details."); + return string(); + } + + printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); + + return fatbin; +} + +bool HIPDevice::load_kernels(const uint kernel_features) +{ + /* TODO(sergey): Support kernels re-load for HIP devices. + * + * Currently re-loading kernel will invalidate memory pointers, + * causing problems in hipCtxSynchronize. 
+ */ + if (hipModule) { + VLOG(1) << "Skipping kernel reload, not currently supported."; + return true; + } + + /* check if hip init succeeded */ + if (hipContext == 0) + return false; + + /* check if GPU is supported */ + if (!support_device(kernel_features)) + return false; + + /* get kernel */ + const char *kernel_name = "kernel"; + string fatbin = compile_kernel(kernel_features, kernel_name); + if (fatbin.empty()) + return false; + + /* open module */ + HIPContextScope scope(this); + + string fatbin_data; + hipError_t result; + + if (path_read_text(fatbin, fatbin_data)) + result = hipModuleLoadData(&hipModule, fatbin_data.c_str()); + else + result = hipErrorFileNotFound; + + if (result != hipSuccess) + set_error(string_printf( + "Failed to load HIP kernel from '%s' (%s)", fatbin.c_str(), hipewErrorString(result))); + + if (result == hipSuccess) { + kernels.load(this); + reserve_local_memory(kernel_features); + } + + return (result == hipSuccess); +} + +void HIPDevice::reserve_local_memory(const uint) +{ + /* Together with hipDeviceLmemResizeToMax, this reserves local memory + * needed for kernel launches, so that we can reliably figure out when + * to allocate scene data in mapped host memory. */ + size_t total = 0, free_before = 0, free_after = 0; + + { + HIPContextScope scope(this); + hipMemGetInfo(&free_before, &total); + } + + { + /* Use the biggest kernel for estimation. */ + const DeviceKernel test_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE; + + /* Launch kernel, using just 1 block appears sufficient to reserve memory for all + * multiprocessors. It would be good to do this in parallel for the multi GPU case + * still to make it faster. 
*/ + HIPDeviceQueue queue(this); + + void *d_path_index = nullptr; + void *d_render_buffer = nullptr; + int d_work_size = 0; + void *args[] = {&d_path_index, &d_render_buffer, &d_work_size}; + + queue.init_execution(); + queue.enqueue(test_kernel, 1, args); + queue.synchronize(); + } + + { + HIPContextScope scope(this); + hipMemGetInfo(&free_after, &total); + } + + VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after) + << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; + +# if 0 + /* For testing mapped host memory, fill up device memory. */ + const size_t keep_mb = 1024; + + while (free_after > keep_mb * 1024 * 1024LL) { + hipDeviceptr_t tmp; + hip_assert(hipMalloc(&tmp, 10 * 1024 * 1024LL)); + hipMemGetInfo(&free_after, &total); + } +# endif +} + +void HIPDevice::init_host_memory() +{ + /* Limit amount of host mapped memory, because allocating too much can + * cause system instability. Leave at least half or 4 GB of system + * memory free, whichever is smaller. */ + size_t default_limit = 4 * 1024 * 1024 * 1024LL; + size_t system_ram = system_physical_ram(); + + if (system_ram > 0) { + if (system_ram / 2 > default_limit) { + map_host_limit = system_ram - default_limit; + } + else { + map_host_limit = system_ram / 2; + } + } + else { + VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; + map_host_limit = 0; + } + + /* Amount of device memory to keep is free after texture memory + * and working memory allocations respectively. We set the working + * memory limit headroom lower so that some space is left after all + * texture memory allocations. */ + device_working_headroom = 32 * 1024 * 1024LL; // 32MB + device_texture_headroom = 128 * 1024 * 1024LL; // 128MB + + VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) + << " bytes. 
(" << string_human_readable_size(map_host_limit) << ")"; +} + +void HIPDevice::load_texture_info() +{ + if (need_texture_info) { + /* Unset flag before copying, so this does not loop indefinitely if the copy below calls + * into 'move_textures_to_host' (which calls 'load_texture_info' again). */ + need_texture_info = false; + texture_info.copy_to_device(); + } +} + +void HIPDevice::move_textures_to_host(size_t size, bool for_texture) +{ + /* Break out of recursive call, which can happen when moving memory on a multi device. */ + static bool any_device_moving_textures_to_host = false; + if (any_device_moving_textures_to_host) { + return; + } + + /* Signal to reallocate textures in host memory only. */ + move_texture_to_host = true; + + while (size > 0) { + /* Find suitable memory allocation to move. */ + device_memory *max_mem = NULL; + size_t max_size = 0; + bool max_is_image = false; + + thread_scoped_lock lock(hip_mem_map_mutex); + foreach (HIPMemMap::value_type &pair, hip_mem_map) { + device_memory &mem = *pair.first; + HIPMem *cmem = &pair.second; + + /* Can only move textures allocated on this device (and not those from peer devices). + * And need to ignore memory that is already on the host. */ + if (!mem.is_resident(this) || cmem->use_mapped_host) { + continue; + } + + bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && + (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + /* Can't move this type of memory. */ + if (!is_texture || cmem->array) { + continue; + } + + /* For other textures, only move image textures. */ + if (for_texture && !is_image) { + continue; + } + + /* Try to move largest allocation, prefer moving images. */ + if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { + max_is_image = is_image; + max_size = mem.device_size; + max_mem = &mem; + } + } + lock.unlock(); + + /* Move to host memory. 
This part is mutex protected since + * multiple HIP devices could be moving the memory. The + * first one will do it, and the rest will adopt the pointer. */ + if (max_mem) { + VLOG(1) << "Move memory from device to host: " << max_mem->name; + + static thread_mutex move_mutex; + thread_scoped_lock lock(move_mutex); + + any_device_moving_textures_to_host = true; + + /* Potentially need to call back into multi device, so pointer mapping + * and peer devices are updated. This is also necessary since the device + * pointer may just be a key here, so cannot be accessed and freed directly. + * Unfortunately it does mean that memory is reallocated on all other + * devices as well, which is potentially dangerous when still in use (since + * a thread rendering on another devices would only be caught in this mutex + * if it so happens to do an allocation at the same time as well. */ + max_mem->device_copy_to(); + size = (max_size >= size) ? 0 : size - max_size; + + any_device_moving_textures_to_host = false; + } + else { + break; + } + } + + /* Unset flag before texture info is reloaded, since it should stay in device memory. */ + move_texture_to_host = false; + + /* Update texture info array with new pointers. */ + load_texture_info(); +} + +HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_padding) +{ + HIPContextScope scope(this); + + hipDeviceptr_t device_pointer = 0; + size_t size = mem.memory_size() + pitch_padding; + + hipError_t mem_alloc_result = hipErrorOutOfMemory; + const char *status = ""; + + /* First try allocating in device memory, respecting headroom. We make + * an exception for texture info. It is small and frequently accessed, + * so treat it as working memory. + * + * If there is not enough room for working memory, we will try to move + * textures to host memory, assuming the performance impact would have + * been worse for working memory. 
*/ + bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; + + size_t total = 0, free = 0; + hipMemGetInfo(&free, &total); + + /* Move textures to host memory if needed. */ + if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) { + move_textures_to_host(size + headroom - free, is_texture); + hipMemGetInfo(&free, &total); + } + + /* Allocate in device memory. */ + if (!move_texture_to_host && (size + headroom) < free) { + mem_alloc_result = hipMalloc(&device_pointer, size); + if (mem_alloc_result == hipSuccess) { + status = " in device memory"; + } + } + + /* Fall back to mapped host memory if needed and possible. */ + + void *shared_pointer = 0; + + if (mem_alloc_result != hipSuccess && can_map_host) { + if (mem.shared_pointer) { + /* Another device already allocated host memory. */ + mem_alloc_result = hipSuccess; + shared_pointer = mem.shared_pointer; + } + else if (map_host_used + size < map_host_limit) { + /* Allocate host memory ourselves. */ + mem_alloc_result = hipHostMalloc(&shared_pointer, size); + + assert((mem_alloc_result == hipSuccess && shared_pointer != 0) || + (mem_alloc_result != hipSuccess && shared_pointer == 0)); + } + + if (mem_alloc_result == hipSuccess) { + hip_assert(hipHostGetDevicePointer(&device_pointer, shared_pointer, 0)); + map_host_used += size; + status = " in host memory"; + } + } + + if (mem_alloc_result != hipSuccess) { + status = " failed, out of device and host memory"; + set_error("System is out of GPU and shared host memory"); + } + + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")" << status; + } + + mem.device_pointer = (device_ptr)device_pointer; + mem.device_size = size; + stats.mem_alloc(size); + + if (!mem.device_pointer) { + return NULL; + } + + /* Insert into map of allocations. */ + thread_scoped_lock lock(hip_mem_map_mutex); + HIPMem *cmem = &hip_mem_map[&mem]; + if (shared_pointer != 0) { + /* Replace host pointer with our host allocation. Only works if + * HIP memory layout is the same and has no pitch padding. Also + * does not work if we move textures to host during a render, + * since other devices might be using the memory. */ + + if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && + mem.host_pointer != shared_pointer) { + memcpy(shared_pointer, mem.host_pointer, size); + + /* A Call to device_memory::host_free() should be preceded by + * a call to device_memory::device_free() for host memory + * allocated by a device to be handled properly. Two exceptions + * are here and a call in OptiXDevice::generic_alloc(), where + * the current host memory can be assumed to be allocated by + * device_memory::host_alloc(), not by a device */ + + mem.host_free(); + mem.host_pointer = shared_pointer; + } + mem.shared_pointer = shared_pointer; + mem.shared_counter++; + cmem->use_mapped_host = true; + } + else { + cmem->use_mapped_host = false; + } + + return cmem; +} + +void HIPDevice::generic_copy_to(device_memory &mem) +{ + if (!mem.host_pointer || !mem.device_pointer) { + return; + } + + /* If use_mapped_host of mem is false, the current device only uses device memory allocated by + * hipMalloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from + * mem.host_pointer. 
*/ + thread_scoped_lock lock(hip_mem_map_mutex); + if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { + const HIPContextScope scope(this); + hip_assert( + hipMemcpyHtoD((hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size())); + } +} + +void HIPDevice::generic_free(device_memory &mem) +{ + if (mem.device_pointer) { + HIPContextScope scope(this); + thread_scoped_lock lock(hip_mem_map_mutex); + const HIPMem &cmem = hip_mem_map[&mem]; + + /* If cmem.use_mapped_host is true, reference counting is used + * to safely free a mapped host memory. */ + + if (cmem.use_mapped_host) { + assert(mem.shared_pointer); + if (mem.shared_pointer) { + assert(mem.shared_counter > 0); + if (--mem.shared_counter == 0) { + if (mem.host_pointer == mem.shared_pointer) { + mem.host_pointer = 0; + } + hipHostFree(mem.shared_pointer); + mem.shared_pointer = 0; + } + } + map_host_used -= mem.device_size; + } + else { + /* Free device memory. */ + hip_assert(hipFree(mem.device_pointer)); + } + + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + hip_mem_map.erase(hip_mem_map.find(&mem)); + } +} + +void HIPDevice::mem_alloc(device_memory &mem) +{ + if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else if (mem.type == MEM_GLOBAL) { + assert(!"mem_alloc not supported for global memory."); + } + else { + generic_alloc(mem); + } +} + +void HIPDevice::mem_copy_to(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + global_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + tex_alloc((device_texture &)mem); + } + else { + if (!mem.device_pointer) { + generic_alloc(mem); + } + generic_copy_to(mem); + } +} + +void HIPDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) +{ + if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) { + assert(!"mem_copy_from not supported for textures."); + } 
+ else if (mem.host_pointer) { + const size_t size = elem * w * h; + const size_t offset = elem * y * w; + + if (mem.device_pointer) { + const HIPContextScope scope(this); + hip_assert(hipMemcpyDtoH( + (char *)mem.host_pointer + offset, (hipDeviceptr_t)mem.device_pointer + offset, size)); + } + else { + memset((char *)mem.host_pointer + offset, 0, size); + } + } +} + +void HIPDevice::mem_zero(device_memory &mem) +{ + if (!mem.device_pointer) { + mem_alloc(mem); + } + if (!mem.device_pointer) { + return; + } + + /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory + * regardless of mem.host_pointer and mem.shared_pointer. */ + thread_scoped_lock lock(hip_mem_map_mutex); + if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { + const HIPContextScope scope(this); + hip_assert(hipMemsetD8((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size())); + } + else if (mem.host_pointer) { + memset(mem.host_pointer, 0, mem.memory_size()); + } +} + +void HIPDevice::mem_free(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + } + else { + generic_free(mem); + } +} + +device_ptr HIPDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) +{ + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); +} + +void HIPDevice::const_copy_to(const char *name, void *host, size_t size) +{ + HIPContextScope scope(this); + hipDeviceptr_t mem; + size_t bytes; + + hip_assert(hipModuleGetGlobal(&mem, &bytes, hipModule, name)); + assert(bytes == size); + hip_assert(hipMemcpyHtoD(mem, host, size)); +} + +void HIPDevice::global_alloc(device_memory &mem) +{ + if (mem.is_resident(this)) { + generic_alloc(mem); + generic_copy_to(mem); + } + + const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer)); +} + +void HIPDevice::global_free(device_memory &mem) +{ + if 
(mem.is_resident(this) && mem.device_pointer) { + generic_free(mem); + } +} + +void HIPDevice::tex_alloc(device_texture &mem) +{ + HIPContextScope scope(this); + + /* General variables for both architectures */ + string bind_name = mem.name; + size_t dsize = datatype_size(mem.data_type); + size_t size = mem.memory_size(); + + hipTextureAddressMode address_mode = hipAddressModeWrap; + switch (mem.info.extension) { + case EXTENSION_REPEAT: + address_mode = hipAddressModeWrap; + break; + case EXTENSION_EXTEND: + address_mode = hipAddressModeClamp; + break; + case EXTENSION_CLIP: + // TODO : (Arya) setting this to Mode Clamp instead of Mode Border because it's unsupported + // in hip + address_mode = hipAddressModeClamp; + break; + default: + assert(0); + break; + } + + hipTextureFilterMode filter_mode; + if (mem.info.interpolation == INTERPOLATION_CLOSEST) { + filter_mode = hipFilterModePoint; + } + else { + filter_mode = hipFilterModeLinear; + } + + /* Image Texture Storage */ + hipArray_Format format; + switch (mem.data_type) { + case TYPE_UCHAR: + format = HIP_AD_FORMAT_UNSIGNED_INT8; + break; + case TYPE_UINT16: + format = HIP_AD_FORMAT_UNSIGNED_INT16; + break; + case TYPE_UINT: + format = HIP_AD_FORMAT_UNSIGNED_INT32; + break; + case TYPE_INT: + format = HIP_AD_FORMAT_SIGNED_INT32; + break; + case TYPE_FLOAT: + format = HIP_AD_FORMAT_FLOAT; + break; + case TYPE_HALF: + format = HIP_AD_FORMAT_HALF; + break; + default: + assert(0); + return; + } + + HIPMem *cmem = NULL; + hArray array_3d = NULL; + size_t src_pitch = mem.data_width * dsize * mem.data_elements; + size_t dst_pitch = src_pitch; + + if (!mem.is_resident(this)) { + thread_scoped_lock lock(hip_mem_map_mutex); + cmem = &hip_mem_map[&mem]; + cmem->texobject = 0; + + if (mem.data_depth > 1) { + array_3d = (hArray)mem.device_pointer; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + dst_pitch = align_up(src_pitch, pitch_alignment); + } + } + else if (mem.data_depth > 1) { + /* 3D texture using 
array, there is no API for linear memory. */ + HIP_ARRAY3D_DESCRIPTOR desc; + + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Depth = mem.data_depth; + desc.Format = format; + desc.NumChannels = mem.data_elements; + desc.Flags = 0; + + VLOG(1) << "Array 3D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + hip_assert(hipArray3DCreate(&array_3d, &desc)); + + if (!array_3d) { + return; + } + + HIP_MEMCPY3D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = hipMemoryTypeArray; + param.dstArray = &array_3d; + param.srcMemoryType = hipMemoryTypeHost; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + param.Depth = mem.data_depth; + + hip_assert(hipDrvMemcpy3D(¶m)); + + mem.device_pointer = (device_ptr)array_3d; + mem.device_size = size; + stats.mem_alloc(size); + + thread_scoped_lock lock(hip_mem_map_mutex); + cmem = &hip_mem_map[&mem]; + cmem->texobject = 0; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + /* 2D texture, using pitch aligned linear memory. */ + dst_pitch = align_up(src_pitch, pitch_alignment); + size_t dst_size = dst_pitch * mem.data_height; + + cmem = generic_alloc(mem, dst_size - mem.memory_size()); + if (!cmem) { + return; + } + + hip_Memcpy2D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = hipMemoryTypeDevice; + param.dstDevice = mem.device_pointer; + param.dstPitch = dst_pitch; + param.srcMemoryType = hipMemoryTypeHost; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + + hip_assert(hipDrvMemcpy2DUnaligned(¶m)); + } + else { + /* 1D texture, using linear memory. 
*/ + cmem = generic_alloc(mem); + if (!cmem) { + return; + } + + hip_assert(hipMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); + } + + /* Resize once */ + const uint slot = mem.slot; + if (slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. */ + texture_info.resize(slot + 128); + } + + /* Set Mapping and tag that we need to (re-)upload to device */ + texture_info[slot] = mem.info; + need_texture_info = true; + + if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && + mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { + /* Kepler+, bindless textures. */ + hipResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + + if (array_3d) { + resDesc.resType = hipResourceTypeArray; + resDesc.res.array.h_Array = &array_3d; + resDesc.flags = 0; + } + else if (mem.data_height > 0) { + resDesc.resType = hipResourceTypePitch2D; + resDesc.res.pitch2D.devPtr = mem.device_pointer; + resDesc.res.pitch2D.format = format; + resDesc.res.pitch2D.numChannels = mem.data_elements; + resDesc.res.pitch2D.height = mem.data_height; + resDesc.res.pitch2D.width = mem.data_width; + resDesc.res.pitch2D.pitchInBytes = dst_pitch; + } + else { + resDesc.resType = hipResourceTypeLinear; + resDesc.res.linear.devPtr = mem.device_pointer; + resDesc.res.linear.format = format; + resDesc.res.linear.numChannels = mem.data_elements; + resDesc.res.linear.sizeInBytes = mem.device_size; + } + + hipTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.addressMode[0] = address_mode; + texDesc.addressMode[1] = address_mode; + texDesc.addressMode[2] = address_mode; + texDesc.filterMode = filter_mode; + texDesc.flags = HIP_TRSF_NORMALIZED_COORDINATES; + + thread_scoped_lock lock(hip_mem_map_mutex); + cmem = &hip_mem_map[&mem]; + + hip_assert(hipTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); + + texture_info[slot].data = (uint64_t)cmem->texobject; + } + else { + texture_info[slot].data = 
(uint64_t)mem.device_pointer; + } +} + +void HIPDevice::tex_free(device_texture &mem) +{ + if (mem.device_pointer) { + HIPContextScope scope(this); + thread_scoped_lock lock(hip_mem_map_mutex); + const HIPMem &cmem = hip_mem_map[&mem]; + + if (cmem.texobject) { + /* Free bindless texture. */ + hipTexObjectDestroy(cmem.texobject); + } + + if (!mem.is_resident(this)) { + /* Do not free memory here, since it was allocated on a different device. */ + hip_mem_map.erase(hip_mem_map.find(&mem)); + } + else if (cmem.array) { + /* Free array. */ + hipArrayDestroy(cmem.array); + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + hip_mem_map.erase(hip_mem_map.find(&mem)); + } + else { + lock.unlock(); + generic_free(mem); + } + } +} + +# if 0 +void HIPDevice::render(DeviceTask &task, + RenderTile &rtile, + device_vector<KernelWorkTile> &work_tiles) +{ + scoped_timer timer(&rtile.buffers->render_time); + + if (have_error()) + return; + + HIPContextScope scope(this); + hipFunction_t hipRender; + + /* Get kernel function. */ + if (rtile.task == RenderTile::BAKE) { + hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_bake")); + } + else { + hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_path_trace")); + } + + if (have_error()) { + return; + } + + hip_assert(hipFuncSetCacheConfig(hipRender, hipFuncCachePreferL1)); + + /* Allocate work tile. */ + work_tiles.alloc(1); + + KernelWorkTile *wtile = work_tiles.data(); + wtile->x = rtile.x; + wtile->y = rtile.y; + wtile->w = rtile.w; + wtile->h = rtile.h; + wtile->offset = rtile.offset; + wtile->stride = rtile.stride; + wtile->buffer = (float *)(hipDeviceptr_t)rtile.buffer; + + /* Prepare work size. More step samples render faster, but for now we + * remain conservative for GPUs connected to a display to avoid driver + * timeouts and display freezing. 
*/ + int min_blocks, num_threads_per_block; + hip_assert( + hipModuleOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, hipRender, NULL, 0, 0)); + if (!info.display_device) { + min_blocks *= 8; + } + + uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); + + /* Render all samples. */ + uint start_sample = rtile.start_sample; + uint end_sample = rtile.start_sample + rtile.num_samples; + + for (int sample = start_sample; sample < end_sample;) { + /* Setup and copy work tile to device. */ + wtile->start_sample = sample; + wtile->num_samples = step_samples; + if (task.adaptive_sampling.use) { + wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples); + } + wtile->num_samples = min(wtile->num_samples, end_sample - sample); + work_tiles.copy_to_device(); + + hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer; + uint total_work_size = wtile->w * wtile->h * wtile->num_samples; + uint num_blocks = divide_up(total_work_size, num_threads_per_block); + + /* Launch kernel. */ + void *args[] = {&d_work_tiles, &total_work_size}; + + hip_assert( + hipModuleLaunchKernel(hipRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); + + /* Run the adaptive sampling kernels at selected samples aligned to step samples. */ + uint filter_sample = sample + wtile->num_samples - 1; + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { + adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); + } + + hip_assert(hipDeviceSynchronize()); + + /* Update progress. */ + sample += wtile->num_samples; + rtile.sample = sample; + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + + if (task.get_cancel()) { + if (task.need_finish_queue == false) + break; + } + } + + /* Finalize adaptive sampling. 
*/ + if (task.adaptive_sampling.use) { + hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer; + adaptive_sampling_post(rtile, wtile, d_work_tiles); + hip_assert(hipDeviceSynchronize()); + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + } +} + +void HIPDevice::thread_run(DeviceTask &task) +{ + HIPContextScope scope(this); + + if (task.type == DeviceTask::RENDER) { + device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); + + /* keep rendering tiles until done */ + RenderTile tile; + DenoisingTask denoising(this, task); + + while (task.acquire_tile(this, tile, task.tile_types)) { + if (tile.task == RenderTile::PATH_TRACE) { + render(task, tile, work_tiles); + } + else if (tile.task == RenderTile::BAKE) { + render(task, tile, work_tiles); + } + + task.release_tile(tile); + + if (task.get_cancel()) { + if (task.need_finish_queue == false) + break; + } + } + + work_tiles.free(); + } +} +# endif + +unique_ptr<DeviceQueue> HIPDevice::gpu_queue_create() +{ + return make_unique<HIPDeviceQueue>(this); +} + +bool HIPDevice::should_use_graphics_interop() +{ + /* Check whether this device is part of OpenGL context. + * + * Using HIP device for graphics interoperability which is not part of the OpenGL context is + * possible, but from the empiric measurements it can be considerably slower than using naive + * pixels copy. 
*/ + + HIPContextScope scope(this); + + int num_all_devices = 0; + hip_assert(hipGetDeviceCount(&num_all_devices)); + + if (num_all_devices == 0) { + return false; + } + + vector<hipDevice_t> gl_devices(num_all_devices); + uint num_gl_devices = 0; + hipGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, hipGLDeviceListAll); + + for (hipDevice_t gl_device : gl_devices) { + if (gl_device == hipDevice) { + return true; + } + } + + return false; +} + +int HIPDevice::get_num_multiprocessors() +{ + return get_device_default_attribute(hipDeviceAttributeMultiprocessorCount, 0); +} + +int HIPDevice::get_max_num_threads_per_multiprocessor() +{ + return get_device_default_attribute(hipDeviceAttributeMaxThreadsPerMultiProcessor, 0); +} + +bool HIPDevice::get_device_attribute(hipDeviceAttribute_t attribute, int *value) +{ + HIPContextScope scope(this); + + return hipDeviceGetAttribute(value, attribute, hipDevice) == hipSuccess; +} + +int HIPDevice::get_device_default_attribute(hipDeviceAttribute_t attribute, int default_value) +{ + int value = 0; + if (!get_device_attribute(attribute, &value)) { + return default_value; + } + return value; +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/hip/device_impl.h b/intern/cycles/device/hip/device_impl.h new file mode 100644 index 00000000000..1d138ee9856 --- /dev/null +++ b/intern/cycles/device/hip/device_impl.h @@ -0,0 +1,153 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifdef WITH_HIP

#  include "device/device.h"
#  include "device/hip/kernel.h"
#  include "device/hip/queue.h"
#  include "device/hip/util.h"

#  include "util/util_map.h"

#  ifdef WITH_HIP_DYNLOAD
#    include "hipew.h"
#  else
#    include "util/util_opengl.h"
#  endif

CCL_NAMESPACE_BEGIN

class DeviceQueue;

/* HIP implementation of the Cycles Device interface.
 *
 * Owns the HIP context and kernel module for a single GPU and implements
 * device memory management (with an optional mapped-host-memory fallback),
 * global/texture uploads and kernel loading. */
class HIPDevice : public Device {

  friend class HIPContextScope;

 public:
  /* HIP driver API handles for this device. */
  hipDevice_t hipDevice;
  hipCtx_t hipContext;
  hipModule_t hipModule;

  /* Memory headroom kept free on the device. NOTE(review): the policy using
   * these lives in init_host_memory()/move_textures_to_host(), outside this
   * header -- confirm semantics there. */
  size_t device_texture_headroom;
  size_t device_working_headroom;

  /* Mapped-host-memory fallback state: when device memory runs out,
   * allocations may be placed in host memory that is mapped into the device
   * address space; map_host_used/map_host_limit track that budget. */
  bool move_texture_to_host;
  size_t map_host_used;
  size_t map_host_limit;
  /* Presumably whether the device can map host memory (attribute queried at
   * init) -- confirm in the constructor, which is outside this view. */
  int can_map_host;
  /* Row pitch alignment used for 2D texture allocations (see tex_alloc). */
  int pitch_alignment;
  int hipDevId;
  int hipDevArchitecture;
  /* Presumably guards duplicate error reporting -- confirm in set_error(). */
  bool first_error;

  /* Per-allocation state tracked in hip_mem_map. */
  struct HIPMem {
    HIPMem() : texobject(0), array(0), use_mapped_host(false)
    {
    }

    /* Bindless texture object; 0 when the allocation is not a texture. */
    hipTextureObject_t texobject;
    /* HIP array backing 3D textures; 0 otherwise (1D/2D use linear memory). */
    hArray array;

    /* If true, a mapped host memory in shared_pointer is being used. */
    bool use_mapped_host;
  };
  typedef map<device_memory *, HIPMem> HIPMemMap;
  /* Map from device_memory to its HIP state; guarded by hip_mem_map_mutex. */
  HIPMemMap hip_mem_map;
  thread_mutex hip_mem_map_mutex;

  /* Bindless Textures: per-slot info re-uploaded to the device when
   * need_texture_info is set (see load_texture_info). */
  device_vector<TextureInfo> texture_info;
  bool need_texture_info;

  /* Kernels resolved from hipModule. */
  HIPDeviceKernels kernels;

  static bool have_precompiled_kernels();

  virtual bool show_samples() const override;

  virtual BVHLayoutMask get_bvh_layout_mask() const override;

  void set_error(const string &error) override;

  HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);

  virtual ~HIPDevice();

  bool support_device(const uint /*kernel_features*/);

  bool check_peer_access(Device *peer_device) override;

  /* True when kernels should be recompiled for the requested feature set. */
  bool use_adaptive_compilation();

  virtual string compile_kernel_get_common_cflags(const uint kernel_features);

  string compile_kernel(const uint kernel_features,
                        const char *name,
                        const char *base = "hip",
                        bool force_ptx = false);

  virtual bool load_kernels(const uint kernel_features) override;
  void reserve_local_memory(const uint kernel_features);

  void init_host_memory();

  void load_texture_info();

  void move_textures_to_host(size_t size, bool for_texture);

  /* Generic allocation/copy/free for non-texture, non-global memory.
   * generic_alloc returns NULL on failure (see tex_alloc callers). */
  HIPMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);

  void generic_copy_to(device_memory &mem);

  void generic_free(device_memory &mem);

  void mem_alloc(device_memory &mem) override;

  void mem_copy_to(device_memory &mem) override;

  void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override;

  void mem_zero(device_memory &mem) override;

  void mem_free(device_memory &mem) override;

  device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override;

  virtual void const_copy_to(const char *name, void *host, size_t size) override;

  /* Global memory: allocated like generic memory, with the device pointer
   * stored into a module-level constant named after the allocation. */
  void global_alloc(device_memory &mem);

  void global_free(device_memory &mem);

  /* Texture memory: linear memory for 1D, pitched linear memory for 2D and
   * HIP arrays for 3D; also fills the texture_info slot. */
  void tex_alloc(device_texture &mem);

  void tex_free(device_texture &mem);

  /* Graphics resources interoperability. */
  virtual bool should_use_graphics_interop() override;

  virtual unique_ptr<DeviceQueue> gpu_queue_create() override;

  int get_num_multiprocessors();
  int get_max_num_threads_per_multiprocessor();

 protected:
  bool get_device_attribute(hipDeviceAttribute_t attribute, int *value);
  int get_device_default_attribute(hipDeviceAttribute_t attribute, int default_value);
};

CCL_NAMESPACE_END

#endif
diff --git a/intern/cycles/device/hip/graphics_interop.cpp b/intern/cycles/device/hip/graphics_interop.cpp
new file mode 100644
index 00000000000..add6dbed5e1
--- /dev/null
+++ b/intern/cycles/device/hip/graphics_interop.cpp
@@ -0,0 +1,93 @@
/*
 * Copyright 2011-2021 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifdef WITH_HIP

#  include "device/hip/graphics_interop.h"

#  include "device/hip/device_impl.h"
#  include "device/hip/util.h"

CCL_NAMESPACE_BEGIN

HIPDeviceGraphicsInterop::HIPDeviceGraphicsInterop(HIPDeviceQueue *queue)
    : queue_(queue), device_(static_cast<HIPDevice *>(queue->device))
{
}

HIPDeviceGraphicsInterop::~HIPDeviceGraphicsInterop()
{
  HIPContextScope scope(device_);

  /* Release the registered graphics resource, if any. */
  if (hip_graphics_resource_) {
    hip_device_assert(device_, hipGraphicsUnregisterResource(hip_graphics_resource_));
  }
}

/* (Re-)register the OpenGL PBO from `destination` as a HIP graphics
 * resource. Skipped when both the PBO id and its area in pixels are
 * unchanged since the previous call. */
void HIPDeviceGraphicsInterop::set_destination(const DeviceGraphicsInteropDestination &destination)
{
  const int64_t new_buffer_area = int64_t(destination.buffer_width) * destination.buffer_height;

  if (opengl_pbo_id_ == destination.opengl_pbo_id && buffer_area_ == new_buffer_area) {
    return;
  }

  HIPContextScope scope(device_);

  /* Unregister the previously registered PBO first. */
  if (hip_graphics_resource_) {
    hip_device_assert(device_, hipGraphicsUnregisterResource(hip_graphics_resource_));
  }

  const hipError_t result = hipGraphicsGLRegisterBuffer(
      &hip_graphics_resource_, destination.opengl_pbo_id, hipGraphicsRegisterFlagsNone);
  if (result != hipSuccess) {
    LOG(ERROR) << "Error registering OpenGL buffer: " << hipewErrorString(result);
  }

  opengl_pbo_id_ = destination.opengl_pbo_id;
  buffer_area_ = new_buffer_area;
}

/* Map the registered PBO on this queue's stream and return its device
 * pointer, or 0 when no resource is registered. Must be paired with
 * unmap(). */
device_ptr HIPDeviceGraphicsInterop::map()
{
  if (!hip_graphics_resource_) {
    return 0;
  }

  HIPContextScope scope(device_);

  hipDeviceptr_t hip_buffer;
  size_t bytes;

  hip_device_assert(device_,
                    hipGraphicsMapResources(1, &hip_graphics_resource_, queue_->stream()));
  hip_device_assert(
      device_, hipGraphicsResourceGetMappedPointer(&hip_buffer, &bytes, hip_graphics_resource_));

  return static_cast<device_ptr>(hip_buffer);
}

void HIPDeviceGraphicsInterop::unmap()
{
  HIPContextScope scope(device_);

  hip_device_assert(device_,
                    hipGraphicsUnmapResources(1, &hip_graphics_resource_, queue_->stream()));
}
CCL_NAMESPACE_END

#endif
diff --git a/intern/cycles/device/hip/graphics_interop.h b/intern/cycles/device/hip/graphics_interop.h
new file mode 100644
index 00000000000..adcaa13a2d7
--- /dev/null
+++ b/intern/cycles/device/hip/graphics_interop.h
@@ -0,0 +1,61 @@
/*
 * Copyright 2011-2021 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifdef WITH_HIP

#  include "device/device_graphics_interop.h"

#  ifdef WITH_HIP_DYNLOAD
#    include "hipew.h"
#  endif

CCL_NAMESPACE_BEGIN

class HIPDevice;
class HIPDeviceQueue;

/* Exposes an OpenGL pixel buffer object as HIP device memory so render
 * results can be written without a host round-trip. Non-copyable and
 * non-movable: owns the registered graphics resource. */
class HIPDeviceGraphicsInterop : public DeviceGraphicsInterop {
 public:
  explicit HIPDeviceGraphicsInterop(HIPDeviceQueue *queue);

  HIPDeviceGraphicsInterop(const HIPDeviceGraphicsInterop &other) = delete;
  HIPDeviceGraphicsInterop(HIPDeviceGraphicsInterop &&other) noexcept = delete;

  ~HIPDeviceGraphicsInterop();

  HIPDeviceGraphicsInterop &operator=(const HIPDeviceGraphicsInterop &other) = delete;
  HIPDeviceGraphicsInterop &operator=(HIPDeviceGraphicsInterop &&other) = delete;

  virtual void set_destination(const DeviceGraphicsInteropDestination &destination) override;

  virtual device_ptr map() override;
  virtual void unmap() override;

 protected:
  HIPDeviceQueue *queue_ = nullptr;
  HIPDevice *device_ = nullptr;

  /* OpenGL PBO which is currently registered as the destination for the HIP buffer. */
  uint opengl_pbo_id_ = 0;
  /* Buffer area in pixels of the corresponding PBO. */
  int64_t buffer_area_ = 0;

  hipGraphicsResource hip_graphics_resource_ = nullptr;
};

CCL_NAMESPACE_END

#endif
diff --git a/intern/cycles/device/hip/kernel.cpp b/intern/cycles/device/hip/kernel.cpp
new file mode 100644
index 00000000000..e0acd6f17c6
--- /dev/null
+++ b/intern/cycles/device/hip/kernel.cpp
@@ -0,0 +1,69 @@
/*
 * Copyright 2011-2021 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifdef WITH_HIP

#  include "device/hip/kernel.h"
#  include "device/hip/device_impl.h"

CCL_NAMESPACE_BEGIN

/* Resolve every DeviceKernel to its hipFunction_t in the device's module and
 * pre-compute an occupancy-based launch configuration for each. Kernels that
 * cannot be resolved are logged and left with a null function. */
void HIPDeviceKernels::load(HIPDevice *device)
{
  hipModule_t hipModule = device->hipModule;

  for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) {
    HIPDeviceKernel &kernel = kernels_[i];

    /* No megakernel used for GPU. */
    if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
      continue;
    }

    const std::string function_name = std::string("kernel_gpu_") +
                                      device_kernel_as_string((DeviceKernel)i);
    hip_device_assert(device,
                      hipModuleGetFunction(&kernel.function, hipModule, function_name.c_str()));

    if (kernel.function) {
      /* Favor L1 cache over shared memory for these kernels. */
      hip_device_assert(device, hipFuncSetCacheConfig(kernel.function, hipFuncCachePreferL1));

      hip_device_assert(
          device,
          hipModuleOccupancyMaxPotentialBlockSize(
              &kernel.min_blocks, &kernel.num_threads_per_block, kernel.function, 0, 0));
    }
    else {
      LOG(ERROR) << "Unable to load kernel " << function_name;
    }
  }

  loaded = true;
}

const HIPDeviceKernel &HIPDeviceKernels::get(DeviceKernel kernel) const
{
  return kernels_[(int)kernel];
}

bool HIPDeviceKernels::available(DeviceKernel kernel) const
{
  return kernels_[(int)kernel].function != nullptr;
}

CCL_NAMESPACE_END

#endif /* WITH_HIP */
diff --git a/intern/cycles/device/hip/kernel.h b/intern/cycles/device/hip/kernel.h
new file mode 100644
index 00000000000..3301731f56e
--- /dev/null
+++ b/intern/cycles/device/hip/kernel.h
@@ -0,0 +1,54 @@
/*
 * Copyright 2011-2021 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#ifdef WITH_HIP

#  include "device/device_kernel.h"

#  ifdef WITH_HIP_DYNLOAD
#    include "hipew.h"
#  endif

CCL_NAMESPACE_BEGIN

class HIPDevice;

/* HIP kernel and associated occupancy information. */
class HIPDeviceKernel {
 public:
  hipFunction_t function = nullptr;

  /* Launch configuration from hipModuleOccupancyMaxPotentialBlockSize,
   * filled in by HIPDeviceKernels::load(). */
  int num_threads_per_block = 0;
  int min_blocks = 0;
};

/* Cache of HIP kernels for each DeviceKernel. */
class HIPDeviceKernels {
 public:
  void load(HIPDevice *device);
  const HIPDeviceKernel &get(DeviceKernel kernel) const;
  bool available(DeviceKernel kernel) const;

 protected:
  HIPDeviceKernel kernels_[DEVICE_KERNEL_NUM];
  bool loaded = false;
};

CCL_NAMESPACE_END

#endif /* WITH_HIP */
diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp
new file mode 100644
index 00000000000..78c77e5fdae
--- /dev/null
+++ b/intern/cycles/device/hip/queue.cpp
@@ -0,0 +1,209 @@
/*
 * Copyright 2011-2021 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifdef WITH_HIP

#  include "device/hip/queue.h"

#  include "device/hip/device_impl.h"
#  include "device/hip/graphics_interop.h"
#  include "device/hip/kernel.h"

CCL_NAMESPACE_BEGIN

/* HIPDeviceQueue */

HIPDeviceQueue::HIPDeviceQueue(HIPDevice *device)
    : DeviceQueue(device), hip_device_(device), hip_stream_(nullptr)
{
  const HIPContextScope scope(hip_device_);
  hip_device_assert(hip_device_, hipStreamCreateWithFlags(&hip_stream_, hipStreamNonBlocking));
}

HIPDeviceQueue::~HIPDeviceQueue()
{
  const HIPContextScope scope(hip_device_);
  hipStreamDestroy(hip_stream_);
}

int HIPDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
{
  /* TODO: compute automatically. */
  /* TODO: must have at least num_threads_per_block. */
  return 14416128;
}

int HIPDeviceQueue::num_concurrent_busy_states() const
{
  const int max_num_threads = hip_device_->get_num_multiprocessors() *
                              hip_device_->get_max_num_threads_per_multiprocessor();

  /* Fall back to a fixed count when the device attributes are unavailable. */
  if (max_num_threads == 0) {
    return 65536;
  }

  return 4 * max_num_threads;
}

void HIPDeviceQueue::init_execution()
{
  /* Synchronize all textures and memory copies before executing task. */
  HIPContextScope scope(hip_device_);
  hip_device_->load_texture_info();
  hip_device_assert(hip_device_, hipDeviceSynchronize());

  debug_init_execution();
}

bool HIPDeviceQueue::kernel_available(DeviceKernel kernel) const
{
  return hip_device_->kernels.available(kernel);
}

/* Launch `kernel` over `work_size` threads on this queue's stream. Returns
 * false when the device is already in an error state or the launch fails. */
bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
{
  if (hip_device_->have_error()) {
    return false;
  }

  debug_enqueue(kernel, work_size);

  const HIPContextScope scope(hip_device_);
  const HIPDeviceKernel &hip_kernel = hip_device_->kernels.get(kernel);

  /* Compute kernel launch parameters. */
  const int num_threads_per_block = hip_kernel.num_threads_per_block;
  const int num_blocks = divide_up(work_size, num_threads_per_block);

  int shared_mem_bytes = 0;

  switch (kernel) {
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
      /* See parallel_active_index.h for why this amount of shared memory is needed. */
      shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
      break;
    default:
      break;
  }

  /* Launch kernel. */
  hip_device_assert(hip_device_,
                    hipModuleLaunchKernel(hip_kernel.function,
                                          num_blocks,
                                          1,
                                          1,
                                          num_threads_per_block,
                                          1,
                                          1,
                                          shared_mem_bytes,
                                          hip_stream_,
                                          args,
                                          0));
  return !(hip_device_->have_error());
}

bool HIPDeviceQueue::synchronize()
{
  if (hip_device_->have_error()) {
    return false;
  }

  const HIPContextScope scope(hip_device_);
  hip_device_assert(hip_device_, hipStreamSynchronize(hip_stream_));
  debug_synchronize();

  return !(hip_device_->have_error());
}

void HIPDeviceQueue::zero_to_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  /* Allocate on demand. */
  if (mem.device_pointer == 0) {
    hip_device_->mem_alloc(mem);
  }

  /* Zero memory on device. */
  assert(mem.device_pointer != 0);

  const HIPContextScope scope(hip_device_);
  hip_device_assert(
      hip_device_,
      hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_));
}

void HIPDeviceQueue::copy_to_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  /* Allocate on demand. */
  if (mem.device_pointer == 0) {
    hip_device_->mem_alloc(mem);
  }

  assert(mem.device_pointer != 0);
  assert(mem.host_pointer != nullptr);

  /* Copy memory to device. */
  const HIPContextScope scope(hip_device_);
  hip_device_assert(
      hip_device_,
      hipMemcpyHtoDAsync(
          (hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_));
}

void HIPDeviceQueue::copy_from_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  assert(mem.device_pointer != 0);
  assert(mem.host_pointer != nullptr);

  /* Copy memory from device. */
  const HIPContextScope scope(hip_device_);
  hip_device_assert(
      hip_device_,
      hipMemcpyDtoHAsync(
          mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_));
}

// TODO : (Arya) Enable this after stabilizing dev branch
unique_ptr<DeviceGraphicsInterop> HIPDeviceQueue::graphics_interop_create()
{
  return make_unique<HIPDeviceGraphicsInterop>(this);
}

CCL_NAMESPACE_END

#endif /* WITH_HIP */
diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h
new file mode 100644
index 00000000000..04c8a5982ce
--- /dev/null
+++ b/intern/cycles/device/hip/queue.h
@@ -0,0 +1,68 @@
/*
 * Copyright 2011-2021 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ */ + +#pragma once + +#ifdef WITH_HIP + +# include "device/device_kernel.h" +# include "device/device_memory.h" +# include "device/device_queue.h" + +# include "device/hip/util.h" + +CCL_NAMESPACE_BEGIN + +class HIPDevice; +class device_memory; + +/* Base class for HIP queues. */ +class HIPDeviceQueue : public DeviceQueue { + public: + HIPDeviceQueue(HIPDevice *device); + ~HIPDeviceQueue(); + + virtual int num_concurrent_states(const size_t state_size) const override; + virtual int num_concurrent_busy_states() const override; + + virtual void init_execution() override; + + virtual bool kernel_available(DeviceKernel kernel) const override; + + virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override; + + virtual bool synchronize() override; + + virtual void zero_to_device(device_memory &mem) override; + virtual void copy_to_device(device_memory &mem) override; + virtual void copy_from_device(device_memory &mem) override; + + virtual hipStream_t stream() + { + return hip_stream_; + } + + // TODO : (Arya) Enable this after stabilizing the dev branch + virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override; + + protected: + HIPDevice *hip_device_; + hipStream_t hip_stream_; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_HIP */ diff --git a/intern/cycles/device/hip/util.cpp b/intern/cycles/device/hip/util.cpp new file mode 100644 index 00000000000..44f52c4e17b --- /dev/null +++ b/intern/cycles/device/hip/util.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_HIP + +# include "device/hip/util.h" +# include "device/hip/device_impl.h" + +CCL_NAMESPACE_BEGIN + +HIPContextScope::HIPContextScope(HIPDevice *device) : device(device) +{ + hip_device_assert(device, hipCtxPushCurrent(device->hipContext)); +} + +HIPContextScope::~HIPContextScope() +{ + hip_device_assert(device, hipCtxPopCurrent(NULL)); +} + +# ifndef WITH_HIP_DYNLOAD +const char *hipewErrorString(hipError_t result) +{ + /* We can only give error code here without major code duplication, that + * should be enough since dynamic loading is only being disabled by folks + * who knows what they're doing anyway. + * + * NOTE: Avoid call from several threads. + */ + static string error; + error = string_printf("%d", result); + return error.c_str(); +} + +const char *hipewCompilerPath() +{ + return CYCLES_HIP_HIPCC_EXECUTABLE; +} + +int hipewCompilerVersion() +{ + return (HIP_VERSION / 100) + (HIP_VERSION % 100 / 10); +} +# endif + +CCL_NAMESPACE_END + +#endif /* WITH_HIP */ diff --git a/intern/cycles/device/hip/util.h b/intern/cycles/device/hip/util.h new file mode 100644 index 00000000000..f8468e0dc5c --- /dev/null +++ b/intern/cycles/device/hip/util.h @@ -0,0 +1,63 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_HIP + +# ifdef WITH_HIP_DYNLOAD +# include "hipew.h" +# endif + +CCL_NAMESPACE_BEGIN + +class HIPDevice; + +/* Utility to push/pop HIP context. */ +class HIPContextScope { + public: + HIPContextScope(HIPDevice *device); + ~HIPContextScope(); + + private: + HIPDevice *device; +}; + +/* Utility for checking return values of HIP function calls. */ +# define hip_device_assert(hip_device, stmt) \ + { \ + hipError_t result = stmt; \ + if (result != hipSuccess) { \ + const char *name = hipewErrorString(result); \ + hip_device->set_error( \ + string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \ + } \ + } \ + (void)0 + +# define hip_assert(stmt) hip_device_assert(this, stmt) + +# ifndef WITH_HIP_DYNLOAD +/* Transparently implement some functions, so majority of the file does not need + * to worry about difference between dynamically loaded and linked HIP at all. */ +const char *hipewErrorString(hipError_t result); +const char *hipewCompilerPath(); +int hipewCompilerVersion(); +# endif /* WITH_HIP_DYNLOAD */ + +CCL_NAMESPACE_END + +#endif /* WITH_HIP */
\ No newline at end of file |