diff options
-rw-r--r-- | CMakeLists.txt | 2 | ||||
-rw-r--r-- | extern/hipew/include/hipew.h | 6 | ||||
-rw-r--r-- | intern/cycles/device/hip/device_impl.cpp | 142 | ||||
-rw-r--r-- | intern/cycles/device/hip/queue.cpp | 82 | ||||
-rw-r--r-- | intern/cycles/device/hip/queue.h | 3 | ||||
-rw-r--r-- | intern/cycles/kernel/bvh/bvh_util.h | 2 |
6 files changed, 69 insertions, 168 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 544e22f342b..ac11b495948 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -430,7 +430,7 @@ mark_as_advanced(WITH_CYCLES_NATIVE_ONLY) option(WITH_CYCLES_DEVICE_CUDA "Enable Cycles CUDA compute support" ON) option(WITH_CYCLES_DEVICE_OPTIX "Enable Cycles OptiX support" ON) -option(WITH_CYCLES_DEVICE_HIP "Enable Cycles HIP support" OFF) +option(WITH_CYCLES_DEVICE_HIP "Enable Cycles HIP support" ON) mark_as_advanced(WITH_CYCLES_DEVICE_HIP) mark_as_advanced(WITH_CYCLES_DEVICE_CUDA) diff --git a/extern/hipew/include/hipew.h b/extern/hipew/include/hipew.h index 02fffc331bf..aa42fdf8ecd 100644 --- a/extern/hipew/include/hipew.h +++ b/extern/hipew/include/hipew.h @@ -24,9 +24,13 @@ extern "C" { #include <stdlib.h> #define HIP_IPC_HANDLE_SIZE 64 +#define hipHostMallocDefault 0x00 #define hipHostMallocPortable 0x01 #define hipHostMallocMapped 0x02 #define hipHostMallocWriteCombined 0x04 +#define hipHostMallocNumaUser 0x20000000 +#define hipHostMallocCoherent 0x40000000 +#define hipHostMallocNonCoherent 0x80000000 #define hipHostRegisterPortable 0x01 #define hipHostRegisterMapped 0x02 #define hipHostRegisterIoMemory 0x04 @@ -989,7 +993,7 @@ typedef hipError_t HIPAPI thipMalloc(hipDeviceptr_t* dptr, size_t bytesize); typedef hipError_t HIPAPI thipMemAllocPitch(hipDeviceptr_t* dptr, size_t* pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes); typedef hipError_t HIPAPI thipFree(hipDeviceptr_t dptr); typedef hipError_t HIPAPI thipMemGetAddressRange(hipDeviceptr_t* pbase, size_t* psize, hipDeviceptr_t dptr); -typedef hipError_t HIPAPI thipHostMalloc(void** pp, size_t bytesize); +typedef hipError_t HIPAPI thipHostMalloc(void** pp, size_t bytesize, unsigned int flags); typedef hipError_t HIPAPI thipHostFree(void* p); typedef hipError_t HIPAPI thipMemHostAlloc(void** pp, size_t bytesize, unsigned int Flags); typedef hipError_t HIPAPI thipHostGetDevicePointer(hipDeviceptr_t* pdptr, void* p, unsigned int Flags); diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp index 0e5ac6ce401..964783a08bf 100644 --- a/intern/cycles/device/hip/device_impl.cpp +++ b/intern/cycles/device/hip/device_impl.cpp @@ -108,6 +108,9 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) return; } + /* hipDeviceMapHost for mapping host memory when out of device memory. + * hipDeviceLmemResizeToMax for reserving local memory ahead of render, + * so we can predict which memory to map to host. */ hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice)); hip_assert( @@ -657,7 +660,8 @@ HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_pad } else if (map_host_used + size < map_host_limit) { /* Allocate host memory ourselves. */ - mem_alloc_result = hipHostMalloc(&shared_pointer, size); + mem_alloc_result = hipHostMalloc( + &shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined); assert((mem_alloc_result == hipSuccess && shared_pointer != 0) || (mem_alloc_result != hipSuccess && shared_pointer == 0)); @@ -874,7 +878,6 @@ void HIPDevice::const_copy_to(const char *name, void *host, size_t size) size_t bytes; hip_assert(hipModuleGetGlobal(&mem, &bytes, hipModule, name)); - assert(bytes == size); hip_assert(hipMemcpyHtoD(mem, host, size)); } @@ -1142,141 +1145,6 @@ void HIPDevice::tex_free(device_texture &mem) } } -# if 0 -void HIPDevice::render(DeviceTask &task, - RenderTile &rtile, - device_vector<KernelWorkTile> &work_tiles) -{ - scoped_timer timer(&rtile.buffers->render_time); - - if (have_error()) - return; - - HIPContextScope scope(this); - hipFunction_t hipRender; - - /* Get kernel function. */ - if (rtile.task == RenderTile::BAKE) { - hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_bake")); - } - else { - hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_path_trace")); - } - - if (have_error()) { - return; - } - - hip_assert(hipFuncSetCacheConfig(hipRender, hipFuncCachePreferL1)); - - /* Allocate work tile. */ - work_tiles.alloc(1); - - KernelWorkTile *wtile = work_tiles.data(); - wtile->x = rtile.x; - wtile->y = rtile.y; - wtile->w = rtile.w; - wtile->h = rtile.h; - wtile->offset = rtile.offset; - wtile->stride = rtile.stride; - wtile->buffer = (float *)(hipDeviceptr_t)rtile.buffer; - - /* Prepare work size. More step samples render faster, but for now we - * remain conservative for GPUs connected to a display to avoid driver - * timeouts and display freezing. */ - int min_blocks, num_threads_per_block; - hip_assert( - hipModuleOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, hipRender, NULL, 0, 0)); - if (!info.display_device) { - min_blocks *= 8; - } - - uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); - - /* Render all samples. */ - uint start_sample = rtile.start_sample; - uint end_sample = rtile.start_sample + rtile.num_samples; - - for (int sample = start_sample; sample < end_sample;) { - /* Setup and copy work tile to device. */ - wtile->start_sample = sample; - wtile->num_samples = step_samples; - if (task.adaptive_sampling.use) { - wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples); - } - wtile->num_samples = min(wtile->num_samples, end_sample - sample); - work_tiles.copy_to_device(); - - hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer; - uint total_work_size = wtile->w * wtile->h * wtile->num_samples; - uint num_blocks = divide_up(total_work_size, num_threads_per_block); - - /* Launch kernel. */ - void *args[] = {&d_work_tiles, &total_work_size}; - - hip_assert( - hipModuleLaunchKernel(hipRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); - - /* Run the adaptive sampling kernels at selected samples aligned to step samples. */ - uint filter_sample = sample + wtile->num_samples - 1; - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { - adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); - } - - hip_assert(hipDeviceSynchronize()); - - /* Update progress. */ - sample += wtile->num_samples; - rtile.sample = sample; - task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); - - if (task.get_cancel()) { - if (task.need_finish_queue == false) - break; - } - } - - /* Finalize adaptive sampling. */ - if (task.adaptive_sampling.use) { - hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer; - adaptive_sampling_post(rtile, wtile, d_work_tiles); - hip_assert(hipDeviceSynchronize()); - task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); - } -} - -void HIPDevice::thread_run(DeviceTask &task) -{ - HIPContextScope scope(this); - - if (task.type == DeviceTask::RENDER) { - device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); - - /* keep rendering tiles until done */ - RenderTile tile; - DenoisingTask denoising(this, task); - - while (task.acquire_tile(this, tile, task.tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) { - render(task, tile, work_tiles); - } - else if (tile.task == RenderTile::BAKE) { - render(task, tile, work_tiles); - } - - task.release_tile(tile); - - if (task.get_cancel()) { - if (task.need_finish_queue == false) - break; - } - } - - work_tiles.free(); - } -} -# endif - unique_ptr<DeviceQueue> HIPDevice::gpu_queue_create() { return make_unique<HIPDeviceQueue>(this); diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp index 78c77e5fdae..0d9f5916d30 100644 --- a/intern/cycles/device/hip/queue.cpp +++ b/intern/cycles/device/hip/queue.cpp @@ -39,11 +39,30 @@ HIPDeviceQueue::~HIPDeviceQueue() hipStreamDestroy(hip_stream_); } -int HIPDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const +int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const { - /* TODO: compute automatically. */ - /* TODO: must have at least num_threads_per_block. */ - return 14416128; + int num_states = 0; + const int max_num_threads = hip_device_->get_num_multiprocessors() * + hip_device_->get_max_num_threads_per_multiprocessor(); + if (max_num_threads == 0) { + num_states = 1048576; // 65536 * 16 + } + else { + num_states = max_num_threads * 16; + } + + const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR"); + if (factor_str) { + float factor = atof(factor_str); + if (!factor) + VLOG(3) << "CYCLES_CONCURRENT_STATES_FACTOR evaluated to 0"; + num_states = max((int)(num_states * factor), 1024); + } + + VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to " + << string_human_readable_size(num_states * state_size); + + return num_states; } int HIPDeviceQueue::num_concurrent_busy_states() const @@ -105,18 +124,19 @@ bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *arg } /* Launch kernel. */ - hip_device_assert(hip_device_, - hipModuleLaunchKernel(hip_kernel.function, - num_blocks, - 1, - 1, - num_threads_per_block, - 1, - 1, - shared_mem_bytes, - hip_stream_, - args, - 0)); + assert_success(hipModuleLaunchKernel(hip_kernel.function, + num_blocks, + 1, + 1, + num_threads_per_block, + 1, + 1, + shared_mem_bytes, + hip_stream_, + args, + 0), + "enqueue"); + return !(hip_device_->have_error()); } @@ -127,7 +147,7 @@ bool HIPDeviceQueue::synchronize() } const HIPContextScope scope(hip_device_); - hip_device_assert(hip_device_, hipStreamSynchronize(hip_stream_)); + assert_success(hipStreamSynchronize(hip_stream_), "synchronize"); debug_synchronize(); return !(hip_device_->have_error()); @@ -150,9 +170,9 @@ void HIPDeviceQueue::zero_to_device(device_memory &mem) assert(mem.device_pointer != 0); const HIPContextScope scope(hip_device_); - hip_device_assert( - hip_device_, - hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_)); + assert_success( + hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_), + "zero_to_device"); } void HIPDeviceQueue::copy_to_device(device_memory &mem) @@ -173,10 +193,10 @@ void HIPDeviceQueue::copy_to_device(device_memory &mem) /* Copy memory to device. */ const HIPContextScope scope(hip_device_); - hip_device_assert( - hip_device_, + assert_success( hipMemcpyHtoDAsync( - (hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_)); + (hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_), + "copy_to_device"); } void HIPDeviceQueue::copy_from_device(device_memory &mem) @@ -192,13 +212,21 @@ void HIPDeviceQueue::copy_from_device(device_memory &mem) /* Copy memory from device. */ const HIPContextScope scope(hip_device_); - hip_device_assert( - hip_device_, + assert_success( hipMemcpyDtoHAsync( - mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_)); + mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_), + "copy_from_device"); +} + +void HIPDeviceQueue::assert_success(hipError_t result, const char *operation) +{ + if (result != hipSuccess) { + const char *name = hipewErrorString(result); + hip_device_->set_error( + string_printf("%s in HIP queue %s (%s)", name, operation, debug_active_kernels().c_str())); + } } -// TODO : (Arya) Enable this after stabilizing dev branch unique_ptr<DeviceGraphicsInterop> HIPDeviceQueue::graphics_interop_create() { return make_unique<HIPDeviceGraphicsInterop>(this); diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h index 04c8a5982ce..b92f7de7e4b 100644 --- a/intern/cycles/device/hip/queue.h +++ b/intern/cycles/device/hip/queue.h @@ -55,12 +55,13 @@ class HIPDeviceQueue : public DeviceQueue { return hip_stream_; } - // TODO : (Arya) Enable this after stabilizing the dev branch virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override; protected: HIPDevice *hip_device_; hipStream_t hip_stream_; + + void assert_success(hipError_t result, const char *operation); }; CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/bvh/bvh_util.h b/intern/cycles/kernel/bvh/bvh_util.h index 309f0eeb1e2..8686f887021 100644 --- a/intern/cycles/kernel/bvh/bvh_util.h +++ b/intern/cycles/kernel/bvh/bvh_util.h @@ -98,7 +98,7 @@ ccl_device_inline void sort_intersections_and_normals(ccl_private Intersection * for (int j = 0; j < num_hits - 1; ++j) { if (hits[j].t > hits[j + 1].t) { struct Intersection tmp_hit = hits[j]; - struct float3 tmp_Ng = Ng[j]; + float3 tmp_Ng = Ng[j]; hits[j] = hits[j + 1]; Ng[j] = Ng[j + 1]; hits[j + 1] = tmp_hit; |