diff options
author | Sayak Biswas <sayakAMD> | 2021-10-20 14:37:39 +0300 |
---|---|---|
committer | William Leeson <william@blender.org> | 2021-10-20 15:04:28 +0300 |
commit | ba4e227def13b4b953775ff4a2fd9c154bb07751 (patch) | |
tree | 2ebf929123c0e1cec40fda60c4e8b86f1c38a28c /intern/cycles/device | |
parent | d28aaf6139c8cfa8555542f4f228f390485dd7ed (diff) |
HIP device code cleanup and fix for high VRAM usage
This patch cleans up code for HIP device and makes it more consistent with the CUDA code.
It also fixes the issue with high VRAM usage on AMD cards using HIP allowing better performance and usage on cards like 6600XT.
Added a check in intern/cycles/kernel/bvh/bvh_util.h to prevent compiler error with hipcc
Reviewed By: brecht, leesonw
Maniphest Tasks: T92124
Differential Revision: https://developer.blender.org/D12834
Diffstat (limited to 'intern/cycles/device')
-rw-r--r-- | intern/cycles/device/hip/device_impl.cpp | 142 | ||||
-rw-r--r-- | intern/cycles/device/hip/queue.cpp | 82 | ||||
-rw-r--r-- | intern/cycles/device/hip/queue.h | 3 |
3 files changed, 62 insertions, 165 deletions
diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp index 0e5ac6ce401..964783a08bf 100644 --- a/intern/cycles/device/hip/device_impl.cpp +++ b/intern/cycles/device/hip/device_impl.cpp @@ -108,6 +108,9 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) return; } + /* hipDeviceMapHost for mapping host memory when out of device memory. + * hipDeviceLmemResizeToMax for reserving local memory ahead of render, + * so we can predict which memory to map to host. */ hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice)); hip_assert( @@ -657,7 +660,8 @@ HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_pad } else if (map_host_used + size < map_host_limit) { /* Allocate host memory ourselves. */ - mem_alloc_result = hipHostMalloc(&shared_pointer, size); + mem_alloc_result = hipHostMalloc( + &shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined); assert((mem_alloc_result == hipSuccess && shared_pointer != 0) || (mem_alloc_result != hipSuccess && shared_pointer == 0)); @@ -874,7 +878,6 @@ void HIPDevice::const_copy_to(const char *name, void *host, size_t size) size_t bytes; hip_assert(hipModuleGetGlobal(&mem, &bytes, hipModule, name)); - assert(bytes == size); hip_assert(hipMemcpyHtoD(mem, host, size)); } @@ -1142,141 +1145,6 @@ void HIPDevice::tex_free(device_texture &mem) } } -# if 0 -void HIPDevice::render(DeviceTask &task, - RenderTile &rtile, - device_vector<KernelWorkTile> &work_tiles) -{ - scoped_timer timer(&rtile.buffers->render_time); - - if (have_error()) - return; - - HIPContextScope scope(this); - hipFunction_t hipRender; - - /* Get kernel function. */ - if (rtile.task == RenderTile::BAKE) { - hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_bake")); - } - else { - hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_path_trace")); - } - - if (have_error()) { - return; - } - - hip_assert(hipFuncSetCacheConfig(hipRender, hipFuncCachePreferL1)); - - /* Allocate work tile. */ - work_tiles.alloc(1); - - KernelWorkTile *wtile = work_tiles.data(); - wtile->x = rtile.x; - wtile->y = rtile.y; - wtile->w = rtile.w; - wtile->h = rtile.h; - wtile->offset = rtile.offset; - wtile->stride = rtile.stride; - wtile->buffer = (float *)(hipDeviceptr_t)rtile.buffer; - - /* Prepare work size. More step samples render faster, but for now we - * remain conservative for GPUs connected to a display to avoid driver - * timeouts and display freezing. */ - int min_blocks, num_threads_per_block; - hip_assert( - hipModuleOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, hipRender, NULL, 0, 0)); - if (!info.display_device) { - min_blocks *= 8; - } - - uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); - - /* Render all samples. */ - uint start_sample = rtile.start_sample; - uint end_sample = rtile.start_sample + rtile.num_samples; - - for (int sample = start_sample; sample < end_sample;) { - /* Setup and copy work tile to device. */ - wtile->start_sample = sample; - wtile->num_samples = step_samples; - if (task.adaptive_sampling.use) { - wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples); - } - wtile->num_samples = min(wtile->num_samples, end_sample - sample); - work_tiles.copy_to_device(); - - hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer; - uint total_work_size = wtile->w * wtile->h * wtile->num_samples; - uint num_blocks = divide_up(total_work_size, num_threads_per_block); - - /* Launch kernel. */ - void *args[] = {&d_work_tiles, &total_work_size}; - - hip_assert( - hipModuleLaunchKernel(hipRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); - - /* Run the adaptive sampling kernels at selected samples aligned to step samples. */ - uint filter_sample = sample + wtile->num_samples - 1; - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { - adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); - } - - hip_assert(hipDeviceSynchronize()); - - /* Update progress. */ - sample += wtile->num_samples; - rtile.sample = sample; - task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); - - if (task.get_cancel()) { - if (task.need_finish_queue == false) - break; - } - } - - /* Finalize adaptive sampling. */ - if (task.adaptive_sampling.use) { - hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer; - adaptive_sampling_post(rtile, wtile, d_work_tiles); - hip_assert(hipDeviceSynchronize()); - task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); - } -} - -void HIPDevice::thread_run(DeviceTask &task) -{ - HIPContextScope scope(this); - - if (task.type == DeviceTask::RENDER) { - device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); - - /* keep rendering tiles until done */ - RenderTile tile; - DenoisingTask denoising(this, task); - - while (task.acquire_tile(this, tile, task.tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) { - render(task, tile, work_tiles); - } - else if (tile.task == RenderTile::BAKE) { - render(task, tile, work_tiles); - } - - task.release_tile(tile); - - if (task.get_cancel()) { - if (task.need_finish_queue == false) - break; - } - } - - work_tiles.free(); - } -} -# endif - unique_ptr<DeviceQueue> HIPDevice::gpu_queue_create() { return make_unique<HIPDeviceQueue>(this); diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp index 78c77e5fdae..0d9f5916d30 100644 --- a/intern/cycles/device/hip/queue.cpp +++ b/intern/cycles/device/hip/queue.cpp @@ -39,11 +39,30 @@ HIPDeviceQueue::~HIPDeviceQueue() hipStreamDestroy(hip_stream_); } -int HIPDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const +int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const { - /* TODO: compute automatically. */ - /* TODO: must have at least num_threads_per_block. */ - return 14416128; + int num_states = 0; + const int max_num_threads = hip_device_->get_num_multiprocessors() * + hip_device_->get_max_num_threads_per_multiprocessor(); + if (max_num_threads == 0) { + num_states = 1048576; // 65536 * 16 + } + else { + num_states = max_num_threads * 16; + } + + const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR"); + if (factor_str) { + float factor = atof(factor_str); + if (!factor) + VLOG(3) << "CYCLES_CONCURRENT_STATES_FACTOR evaluated to 0"; + num_states = max((int)(num_states * factor), 1024); + } + + VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to " + << string_human_readable_size(num_states * state_size); + + return num_states; } int HIPDeviceQueue::num_concurrent_busy_states() const @@ -105,18 +124,19 @@ bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *arg } /* Launch kernel. */ - hip_device_assert(hip_device_, - hipModuleLaunchKernel(hip_kernel.function, - num_blocks, - 1, - 1, - num_threads_per_block, - 1, - 1, - shared_mem_bytes, - hip_stream_, - args, - 0)); + assert_success(hipModuleLaunchKernel(hip_kernel.function, + num_blocks, + 1, + 1, + num_threads_per_block, + 1, + 1, + shared_mem_bytes, + hip_stream_, + args, + 0), + "enqueue"); + return !(hip_device_->have_error()); } @@ -127,7 +147,7 @@ bool HIPDeviceQueue::synchronize() } const HIPContextScope scope(hip_device_); - hip_device_assert(hip_device_, hipStreamSynchronize(hip_stream_)); + assert_success(hipStreamSynchronize(hip_stream_), "synchronize"); debug_synchronize(); return !(hip_device_->have_error()); @@ -150,9 +170,9 @@ void HIPDeviceQueue::zero_to_device(device_memory &mem) assert(mem.device_pointer != 0); const HIPContextScope scope(hip_device_); - hip_device_assert( - hip_device_, - hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_)); + assert_success( + hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_), + "zero_to_device"); } void HIPDeviceQueue::copy_to_device(device_memory &mem) @@ -173,10 +193,10 @@ void HIPDeviceQueue::copy_to_device(device_memory &mem) /* Copy memory to device. */ const HIPContextScope scope(hip_device_); - hip_device_assert( - hip_device_, + assert_success( hipMemcpyHtoDAsync( - (hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_)); + (hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_), + "copy_to_device"); } void HIPDeviceQueue::copy_from_device(device_memory &mem) @@ -192,13 +212,21 @@ void HIPDeviceQueue::copy_from_device(device_memory &mem) /* Copy memory from device. */ const HIPContextScope scope(hip_device_); - hip_device_assert( - hip_device_, + assert_success( hipMemcpyDtoHAsync( - mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_)); + mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_), + "copy_from_device"); +} + +void HIPDeviceQueue::assert_success(hipError_t result, const char *operation) +{ + if (result != hipSuccess) { + const char *name = hipewErrorString(result); + hip_device_->set_error( + string_printf("%s in HIP queue %s (%s)", name, operation, debug_active_kernels().c_str())); + } } -// TODO : (Arya) Enable this after stabilizing dev branch unique_ptr<DeviceGraphicsInterop> HIPDeviceQueue::graphics_interop_create() { return make_unique<HIPDeviceGraphicsInterop>(this); diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h index 04c8a5982ce..b92f7de7e4b 100644 --- a/intern/cycles/device/hip/queue.h +++ b/intern/cycles/device/hip/queue.h @@ -55,12 +55,13 @@ class HIPDeviceQueue : public DeviceQueue { return hip_stream_; } - // TODO : (Arya) Enable this after stabilizing the dev branch virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override; protected: HIPDevice *hip_device_; hipStream_t hip_stream_; + + void assert_success(hipError_t result, const char *operation); }; CCL_NAMESPACE_END |