git.blender.org/blender.git

-rw-r--r--  CMakeLists.txt                              2
-rw-r--r--  extern/hipew/include/hipew.h                6
-rw-r--r--  intern/cycles/device/hip/device_impl.cpp  142
-rw-r--r--  intern/cycles/device/hip/queue.cpp         82
-rw-r--r--  intern/cycles/device/hip/queue.h            3
-rw-r--r--  intern/cycles/kernel/bvh/bvh_util.h         2
6 files changed, 69 insertions(+), 168 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 544e22f342b..ac11b495948 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -430,7 +430,7 @@ mark_as_advanced(WITH_CYCLES_NATIVE_ONLY)
option(WITH_CYCLES_DEVICE_CUDA "Enable Cycles CUDA compute support" ON)
option(WITH_CYCLES_DEVICE_OPTIX "Enable Cycles OptiX support" ON)
-option(WITH_CYCLES_DEVICE_HIP "Enable Cycles HIP support" OFF)
+option(WITH_CYCLES_DEVICE_HIP "Enable Cycles HIP support" ON)
mark_as_advanced(WITH_CYCLES_DEVICE_HIP)
mark_as_advanced(WITH_CYCLES_DEVICE_CUDA)
diff --git a/extern/hipew/include/hipew.h b/extern/hipew/include/hipew.h
index 02fffc331bf..aa42fdf8ecd 100644
--- a/extern/hipew/include/hipew.h
+++ b/extern/hipew/include/hipew.h
@@ -24,9 +24,13 @@ extern "C" {
#include <stdlib.h>
#define HIP_IPC_HANDLE_SIZE 64
+#define hipHostMallocDefault 0x00
#define hipHostMallocPortable 0x01
#define hipHostMallocMapped 0x02
#define hipHostMallocWriteCombined 0x04
+#define hipHostMallocNumaUser 0x20000000
+#define hipHostMallocCoherent 0x40000000
+#define hipHostMallocNonCoherent 0x80000000
#define hipHostRegisterPortable 0x01
#define hipHostRegisterMapped 0x02
#define hipHostRegisterIoMemory 0x04
@@ -989,7 +993,7 @@ typedef hipError_t HIPAPI thipMalloc(hipDeviceptr_t* dptr, size_t bytesize);
typedef hipError_t HIPAPI thipMemAllocPitch(hipDeviceptr_t* dptr, size_t* pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
typedef hipError_t HIPAPI thipFree(hipDeviceptr_t dptr);
typedef hipError_t HIPAPI thipMemGetAddressRange(hipDeviceptr_t* pbase, size_t* psize, hipDeviceptr_t dptr);
-typedef hipError_t HIPAPI thipHostMalloc(void** pp, size_t bytesize);
+typedef hipError_t HIPAPI thipHostMalloc(void** pp, size_t bytesize, unsigned int flags);
typedef hipError_t HIPAPI thipHostFree(void* p);
typedef hipError_t HIPAPI thipMemHostAlloc(void** pp, size_t bytesize, unsigned int Flags);
typedef hipError_t HIPAPI thipHostGetDevicePointer(hipDeviceptr_t* pdptr, void* p, unsigned int Flags);
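The hipew.h hunk above adds the host-allocation flag constants and widens thipHostMalloc to the three-argument form that takes an allocation flags mask. Below is a minimal sketch, not part of the patch, of how the updated entry point is typically used; it assumes the hipew function pointers have already been loaded, and error handling is reduced to early returns.

static bool allocate_mapped_host(size_t size, void **host_ptr, hipDeviceptr_t *device_ptr)
{
  /* Pinned host memory that the device can address directly; write-combined
   * to speed up host-to-device streaming, matching the flags passed by
   * generic_alloc() further down in this commit. */
  if (hipHostMalloc(host_ptr, size, hipHostMallocMapped | hipHostMallocWriteCombined) !=
      hipSuccess) {
    return false;
  }
  /* For a mapped allocation, look up the address to pass to kernels. */
  if (hipHostGetDevicePointer(device_ptr, *host_ptr, 0) != hipSuccess) {
    hipHostFree(*host_ptr);
    *host_ptr = nullptr;
    return false;
  }
  return true;
}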
diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp
index 0e5ac6ce401..964783a08bf 100644
--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -108,6 +108,9 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
return;
}
+ /* hipDeviceMapHost for mapping host memory when out of device memory.
+ * hipDeviceLmemResizeToMax for reserving local memory ahead of render,
+ * so we can predict which memory to map to host. */
hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice));
hip_assert(
@@ -657,7 +660,8 @@ HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_pad
}
else if (map_host_used + size < map_host_limit) {
/* Allocate host memory ourselves. */
- mem_alloc_result = hipHostMalloc(&shared_pointer, size);
+ mem_alloc_result = hipHostMalloc(
+ &shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);
assert((mem_alloc_result == hipSuccess && shared_pointer != 0) ||
(mem_alloc_result != hipSuccess && shared_pointer == 0));
@@ -874,7 +878,6 @@ void HIPDevice::const_copy_to(const char *name, void *host, size_t size)
size_t bytes;
hip_assert(hipModuleGetGlobal(&mem, &bytes, hipModule, name));
- assert(bytes == size);
hip_assert(hipMemcpyHtoD(mem, host, size));
}
@@ -1142,141 +1145,6 @@ void HIPDevice::tex_free(device_texture &mem)
}
}
-# if 0
-void HIPDevice::render(DeviceTask &task,
- RenderTile &rtile,
- device_vector<KernelWorkTile> &work_tiles)
-{
- scoped_timer timer(&rtile.buffers->render_time);
-
- if (have_error())
- return;
-
- HIPContextScope scope(this);
- hipFunction_t hipRender;
-
- /* Get kernel function. */
- if (rtile.task == RenderTile::BAKE) {
- hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_bake"));
- }
- else {
- hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_path_trace"));
- }
-
- if (have_error()) {
- return;
- }
-
- hip_assert(hipFuncSetCacheConfig(hipRender, hipFuncCachePreferL1));
-
- /* Allocate work tile. */
- work_tiles.alloc(1);
-
- KernelWorkTile *wtile = work_tiles.data();
- wtile->x = rtile.x;
- wtile->y = rtile.y;
- wtile->w = rtile.w;
- wtile->h = rtile.h;
- wtile->offset = rtile.offset;
- wtile->stride = rtile.stride;
- wtile->buffer = (float *)(hipDeviceptr_t)rtile.buffer;
-
- /* Prepare work size. More step samples render faster, but for now we
- * remain conservative for GPUs connected to a display to avoid driver
- * timeouts and display freezing. */
- int min_blocks, num_threads_per_block;
- hip_assert(
- hipModuleOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, hipRender, NULL, 0, 0));
- if (!info.display_device) {
- min_blocks *= 8;
- }
-
- uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
-
- /* Render all samples. */
- uint start_sample = rtile.start_sample;
- uint end_sample = rtile.start_sample + rtile.num_samples;
-
- for (int sample = start_sample; sample < end_sample;) {
- /* Setup and copy work tile to device. */
- wtile->start_sample = sample;
- wtile->num_samples = step_samples;
- if (task.adaptive_sampling.use) {
- wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
- }
- wtile->num_samples = min(wtile->num_samples, end_sample - sample);
- work_tiles.copy_to_device();
-
- hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer;
- uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
- uint num_blocks = divide_up(total_work_size, num_threads_per_block);
-
- /* Launch kernel. */
- void *args[] = {&d_work_tiles, &total_work_size};
-
- hip_assert(
- hipModuleLaunchKernel(hipRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
- /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
- uint filter_sample = sample + wtile->num_samples - 1;
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
- adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
- }
-
- hip_assert(hipDeviceSynchronize());
-
- /* Update progress. */
- sample += wtile->num_samples;
- rtile.sample = sample;
- task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
-
- /* Finalize adaptive sampling. */
- if (task.adaptive_sampling.use) {
- hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer;
- adaptive_sampling_post(rtile, wtile, d_work_tiles);
- hip_assert(hipDeviceSynchronize());
- task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
- }
-}
-
-void HIPDevice::thread_run(DeviceTask &task)
-{
- HIPContextScope scope(this);
-
- if (task.type == DeviceTask::RENDER) {
- device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
-
- /* keep rendering tiles until done */
- RenderTile tile;
- DenoisingTask denoising(this, task);
-
- while (task.acquire_tile(this, tile, task.tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE) {
- render(task, tile, work_tiles);
- }
- else if (tile.task == RenderTile::BAKE) {
- render(task, tile, work_tiles);
- }
-
- task.release_tile(tile);
-
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
-
- work_tiles.free();
- }
-}
-# endif
-
unique_ptr<DeviceQueue> HIPDevice::gpu_queue_create()
{
return make_unique<HIPDeviceQueue>(this);
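The constructor hunk above adds a comment naming the two context flags this device relies on: hipDeviceMapHost, so host allocations can be mapped into the device address space when device memory runs out, and hipDeviceLmemResizeToMax, so local memory is reserved up front and the remaining budget is predictable. A minimal sketch of that setup follows; it is illustrative rather than a copy of the constructor, which wraps each call in hip_assert() and keeps the queried attribute on the class.

static hipError_t create_render_context(hipDevice_t hipDevice, hipCtx_t *hipContext)
{
  /* Ask whether the device can map host memory at all. */
  int can_map_host = 0;
  hipError_t err = hipDeviceGetAttribute(
      &can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice);
  if (err != hipSuccess) {
    return err;
  }
  /* Reserve maximum local memory ahead of rendering; add host mapping only
   * when the attribute query says it is supported. */
  unsigned int ctx_flags = hipDeviceLmemResizeToMax;
  if (can_map_host) {
    ctx_flags |= hipDeviceMapHost;
  }
  return hipCtxCreate(hipContext, ctx_flags, hipDevice);
}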
diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp
index 78c77e5fdae..0d9f5916d30 100644
--- a/intern/cycles/device/hip/queue.cpp
+++ b/intern/cycles/device/hip/queue.cpp
@@ -39,11 +39,30 @@ HIPDeviceQueue::~HIPDeviceQueue()
hipStreamDestroy(hip_stream_);
}
-int HIPDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
+int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const
{
- /* TODO: compute automatically. */
- /* TODO: must have at least num_threads_per_block. */
- return 14416128;
+ int num_states = 0;
+ const int max_num_threads = hip_device_->get_num_multiprocessors() *
+ hip_device_->get_max_num_threads_per_multiprocessor();
+ if (max_num_threads == 0) {
+ num_states = 1048576; // 65536 * 16
+ }
+ else {
+ num_states = max_num_threads * 16;
+ }
+
+ const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR");
+ if (factor_str) {
+ float factor = atof(factor_str);
+ if (!factor)
+ VLOG(3) << "CYCLES_CONCURRENT_STATES_FACTOR evaluated to 0";
+ num_states = max((int)(num_states * factor), 1024);
+ }
+
+ VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to "
+ << string_human_readable_size(num_states * state_size);
+
+ return num_states;
}
int HIPDeviceQueue::num_concurrent_busy_states() const
@@ -105,18 +124,19 @@ bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *arg
}
/* Launch kernel. */
- hip_device_assert(hip_device_,
- hipModuleLaunchKernel(hip_kernel.function,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- shared_mem_bytes,
- hip_stream_,
- args,
- 0));
+ assert_success(hipModuleLaunchKernel(hip_kernel.function,
+ num_blocks,
+ 1,
+ 1,
+ num_threads_per_block,
+ 1,
+ 1,
+ shared_mem_bytes,
+ hip_stream_,
+ args,
+ 0),
+ "enqueue");
+
return !(hip_device_->have_error());
}
@@ -127,7 +147,7 @@ bool HIPDeviceQueue::synchronize()
}
const HIPContextScope scope(hip_device_);
- hip_device_assert(hip_device_, hipStreamSynchronize(hip_stream_));
+ assert_success(hipStreamSynchronize(hip_stream_), "synchronize");
debug_synchronize();
return !(hip_device_->have_error());
@@ -150,9 +170,9 @@ void HIPDeviceQueue::zero_to_device(device_memory &mem)
assert(mem.device_pointer != 0);
const HIPContextScope scope(hip_device_);
- hip_device_assert(
- hip_device_,
- hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_));
+ assert_success(
+ hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_),
+ "zero_to_device");
}
void HIPDeviceQueue::copy_to_device(device_memory &mem)
@@ -173,10 +193,10 @@ void HIPDeviceQueue::copy_to_device(device_memory &mem)
/* Copy memory to device. */
const HIPContextScope scope(hip_device_);
- hip_device_assert(
- hip_device_,
+ assert_success(
hipMemcpyHtoDAsync(
- (hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_));
+ (hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_),
+ "copy_to_device");
}
void HIPDeviceQueue::copy_from_device(device_memory &mem)
@@ -192,13 +212,21 @@ void HIPDeviceQueue::copy_from_device(device_memory &mem)
/* Copy memory from device. */
const HIPContextScope scope(hip_device_);
- hip_device_assert(
- hip_device_,
+ assert_success(
hipMemcpyDtoHAsync(
- mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_));
+ mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_),
+ "copy_from_device");
+}
+
+void HIPDeviceQueue::assert_success(hipError_t result, const char *operation)
+{
+ if (result != hipSuccess) {
+ const char *name = hipewErrorString(result);
+ hip_device_->set_error(
+ string_printf("%s in HIP queue %s (%s)", name, operation, debug_active_kernels().c_str()));
+ }
}
-// TODO : (Arya) Enable this after stabilizing dev branch
unique_ptr<DeviceGraphicsInterop> HIPDeviceQueue::graphics_interop_create()
{
return make_unique<HIPDeviceGraphicsInterop>(this);
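The queue change above replaces the hard-coded state count of 14416128 with a value derived from the device: 16 path-tracer states per resident hardware thread, a fixed fallback when the device query fails, and an optional environment-variable scale factor. A standalone sketch of the same sizing rule, with illustrative device numbers rather than values read from HIPDevice, and without the VLOG reporting:

#include <algorithm>
#include <cstdlib>

static int concurrent_states(int num_multiprocessors, int max_threads_per_multiprocessor)
{
  const int max_num_threads = num_multiprocessors * max_threads_per_multiprocessor;
  /* Keep 16 states resident per hardware thread; fall back to 65536 * 16
   * when the device query returned nothing. */
  int num_states = (max_num_threads == 0) ? 65536 * 16 : max_num_threads * 16;

  /* Optional scaling, mirroring the CYCLES_CONCURRENT_STATES_FACTOR
   * environment variable, clamped so at least 1024 states remain. */
  if (const char *factor_str = std::getenv("CYCLES_CONCURRENT_STATES_FACTOR")) {
    num_states = std::max(static_cast<int>(num_states * std::atof(factor_str)), 1024);
  }
  return num_states;
}

/* Example: a GPU with 60 compute units and 2048 resident threads per unit
 * keeps 60 * 2048 * 16 = 1966080 states in flight before any scaling. */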
diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h
index 04c8a5982ce..b92f7de7e4b 100644
--- a/intern/cycles/device/hip/queue.h
+++ b/intern/cycles/device/hip/queue.h
@@ -55,12 +55,13 @@ class HIPDeviceQueue : public DeviceQueue {
return hip_stream_;
}
- // TODO : (Arya) Enable this after stabilizing the dev branch
virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override;
protected:
HIPDevice *hip_device_;
hipStream_t hip_stream_;
+
+ void assert_success(hipError_t result, const char *operation);
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/bvh/bvh_util.h b/intern/cycles/kernel/bvh/bvh_util.h
index 309f0eeb1e2..8686f887021 100644
--- a/intern/cycles/kernel/bvh/bvh_util.h
+++ b/intern/cycles/kernel/bvh/bvh_util.h
@@ -98,7 +98,7 @@ ccl_device_inline void sort_intersections_and_normals(ccl_private Intersection *
for (int j = 0; j < num_hits - 1; ++j) {
if (hits[j].t > hits[j + 1].t) {
struct Intersection tmp_hit = hits[j];
- struct float3 tmp_Ng = Ng[j];
+ float3 tmp_Ng = Ng[j];
hits[j] = hits[j + 1];
Ng[j] = Ng[j + 1];
hits[j + 1] = tmp_hit;