diff options
Diffstat (limited to 'intern/cycles/device')
-rw-r--r-- | intern/cycles/device/CMakeLists.txt | 12 | ||||
-rw-r--r-- | intern/cycles/device/cuda/device_cuda.h | 33 | ||||
-rw-r--r-- | intern/cycles/device/cuda/device_cuda_impl.cpp | 73 | ||||
-rw-r--r-- | intern/cycles/device/device.cpp | 55 | ||||
-rw-r--r-- | intern/cycles/device/device.h | 11 | ||||
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 177 | ||||
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 1 | ||||
-rw-r--r-- | intern/cycles/device/device_denoising.cpp | 10 | ||||
-rw-r--r-- | intern/cycles/device/device_denoising.h | 2 | ||||
-rw-r--r-- | intern/cycles/device/device_multi.cpp | 4 | ||||
-rw-r--r-- | intern/cycles/device/device_network.cpp | 1 | ||||
-rw-r--r-- | intern/cycles/device/device_opencl.cpp | 1 | ||||
-rw-r--r-- | intern/cycles/device/device_optix.cpp | 51 | ||||
-rw-r--r-- | intern/cycles/device/device_split_kernel.cpp | 18 | ||||
-rw-r--r-- | intern/cycles/device/device_split_kernel.h | 4 | ||||
-rw-r--r-- | intern/cycles/device/device_task.cpp | 4 | ||||
-rw-r--r-- | intern/cycles/device/device_task.h | 54 | ||||
-rw-r--r-- | intern/cycles/device/opencl/device_opencl.h | 18 | ||||
-rw-r--r-- | intern/cycles/device/opencl/device_opencl_impl.cpp | 103 |
19 files changed, 431 insertions, 201 deletions
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index aa5b65a2b73..ca366722eb7 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -99,6 +99,18 @@ if(WITH_CYCLES_DEVICE_MULTI) add_definitions(-DWITH_MULTI) endif() +if(WITH_OPENIMAGEDENOISE) + add_definitions(-DWITH_OPENIMAGEDENOISE) + add_definitions(-DOIDN_STATIC_LIB) + list(APPEND INC_SYS + ${OPENIMAGEDENOISE_INCLUDE_DIRS} + ) + list(APPEND LIB + ${OPENIMAGEDENOISE_LIBRARIES} + ${TBB_LIBRARIES} + ) +endif() + include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h index 1aa2fdd0967..e5e3e24165d 100644 --- a/intern/cycles/device/cuda/device_cuda.h +++ b/intern/cycles/device/cuda/device_cuda.h @@ -21,6 +21,7 @@ # include "device/device_split_kernel.h" # include "util/util_map.h" +# include "util/util_task.h" # ifdef WITH_CUDA_DYNLOAD # include "cuew.h" @@ -96,9 +97,9 @@ class CUDADevice : public Device { static bool have_precompiled_kernels(); - virtual bool show_samples() const; + virtual bool show_samples() const override; - virtual BVHLayoutMask get_bvh_layout_mask() const; + virtual BVHLayoutMask get_bvh_layout_mask() const override; void set_error(const string &error) override; @@ -108,7 +109,7 @@ class CUDADevice : public Device { bool support_device(const DeviceRequestedFeatures & /*requested_features*/); - bool check_peer_access(Device *peer_device); + bool check_peer_access(Device *peer_device) override; bool use_adaptive_compilation(); @@ -122,7 +123,7 @@ class CUDADevice : public Device { const char *base = "cuda", bool force_ptx = false); - virtual bool load_kernels(const DeviceRequestedFeatures &requested_features); + virtual bool load_kernels(const DeviceRequestedFeatures &requested_features) override; void load_functions(); @@ -140,19 +141,19 @@ class CUDADevice : public Device { void generic_free(device_memory &mem); - void mem_alloc(device_memory &mem); + void mem_alloc(device_memory &mem) override; - void mem_copy_to(device_memory &mem); + void mem_copy_to(device_memory &mem) override; - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem); + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override; - void mem_zero(device_memory &mem); + void mem_zero(device_memory &mem) override; - void mem_free(device_memory &mem); + void mem_free(device_memory &mem) override; - device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/); + device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override; - virtual void const_copy_to(const char *name, void *host, size_t size); + virtual void const_copy_to(const char *name, void *host, size_t size) override; void global_alloc(device_memory &mem); @@ -252,15 +253,15 @@ class CUDADevice : public Device { int dw, int dh, bool transparent, - const DeviceDrawParams &draw_params); + const DeviceDrawParams &draw_params) override; - void thread_run(DeviceTask *task); + void thread_run(DeviceTask &task); - virtual void task_add(DeviceTask &task); + virtual void task_add(DeviceTask &task) override; - virtual void task_wait(); + virtual void task_wait() override; - virtual void task_cancel(); + virtual void task_cancel() override; }; CCL_NAMESPACE_END diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp index 7aa63ff48c3..b9bbeb9a25b 100644 --- a/intern/cycles/device/cuda/device_cuda_impl.cpp +++ b/intern/cycles/device/cuda/device_cuda_impl.cpp @@ -105,7 +105,7 @@ class CUDASplitKernel : public DeviceSplitKernel { virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, const DeviceRequestedFeatures &); virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task); + virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task); }; /* Utility to push/pop CUDA context. */ @@ -243,7 +243,7 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool CUDADevice::~CUDADevice() { - task_pool.stop(); + task_pool.cancel(); delete split_kernel; @@ -2326,11 +2326,11 @@ void CUDADevice::draw_pixels(device_memory &mem, Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params); } -void CUDADevice::thread_run(DeviceTask *task) +void CUDADevice::thread_run(DeviceTask &task) { CUDAContextScope scope(this); - if (task->type == DeviceTask::RENDER) { + if (task.type == DeviceTask::RENDER) { DeviceRequestedFeatures requested_features; if (use_split_kernel()) { if (split_kernel == NULL) { @@ -2343,72 +2343,64 @@ void CUDADevice::thread_run(DeviceTask *task) /* keep rendering tiles until done */ RenderTile tile; - DenoisingTask denoising(this, *task); + DenoisingTask denoising(this, task); - while (task->acquire_tile(this, tile, task->tile_types)) { + while (task.acquire_tile(this, tile, task.tile_types)) { if (tile.task == RenderTile::PATH_TRACE) { if (use_split_kernel()) { device_only_memory<uchar> void_buffer(this, "void_buffer"); split_kernel->path_trace(task, tile, void_buffer, void_buffer); } else { - render(*task, tile, work_tiles); + render(task, tile, work_tiles); } } else if (tile.task == RenderTile::BAKE) { - render(*task, tile, work_tiles); + render(task, tile, work_tiles); } else if (tile.task == RenderTile::DENOISE) { tile.sample = tile.start_sample + tile.num_samples; denoise(tile, denoising); - task->update_progress(&tile, tile.w * tile.h); + task.update_progress(&tile, tile.w * tile.h); } - task->release_tile(tile); + task.release_tile(tile); - if (task->get_cancel()) { - if (task->need_finish_queue == false) + if (task.get_cancel()) { + if (task.need_finish_queue == false) break; } } work_tiles.free(); } - else if (task->type == DeviceTask::SHADER) { - shader(*task); + else if (task.type == DeviceTask::SHADER) { + shader(task); cuda_assert(cuCtxSynchronize()); } - else if (task->type == DeviceTask::DENOISE_BUFFER) { + else if (task.type == DeviceTask::DENOISE_BUFFER) { RenderTile tile; - tile.x = task->x; - tile.y = task->y; - tile.w = task->w; - tile.h = task->h; - tile.buffer = task->buffer; - tile.sample = task->sample + task->num_samples; - tile.num_samples = task->num_samples; - tile.start_sample = task->sample; - tile.offset = task->offset; - tile.stride = task->stride; - tile.buffers = task->buffers; - - DenoisingTask denoising(this, *task); + tile.x = task.x; + tile.y = task.y; + tile.w = task.w; + tile.h = task.h; + tile.buffer = task.buffer; + tile.sample = task.sample + task.num_samples; + tile.num_samples = task.num_samples; + tile.start_sample = task.sample; + tile.offset = task.offset; + tile.stride = task.stride; + tile.buffers = task.buffers; + + DenoisingTask denoising(this, task); denoise(tile, denoising); - task->update_progress(&tile, tile.w * tile.h); + task.update_progress(&tile, tile.w * tile.h); } } -class CUDADeviceTask : public DeviceTask { - public: - CUDADeviceTask(CUDADevice *device, DeviceTask &task) : DeviceTask(task) - { - run = function_bind(&CUDADevice::thread_run, device, this); - } -}; - void CUDADevice::task_add(DeviceTask &task) { CUDAContextScope scope(this); @@ -2424,7 +2416,10 @@ void CUDADevice::task_add(DeviceTask &task) film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); } else { - task_pool.push(new CUDADeviceTask(this, task)); + task_pool.push([=] { + DeviceTask task_copy = task; + thread_run(task_copy); + }); } } @@ -2652,7 +2647,7 @@ int2 CUDASplitKernel::split_kernel_local_size() int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg, device_memory &data, - DeviceTask * /*task*/) + DeviceTask & /*task*/) { CUDAContextScope scope(device); size_t free; diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 41dd7894d93..9dbb33980b4 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -77,7 +77,7 @@ std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &reques /* Device */ -Device::~Device() +Device::~Device() noexcept(false) { if (!background) { if (vertex_buffer != 0) { @@ -603,6 +603,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, info.has_osl = true; info.has_profiling = true; info.has_peer_memory = false; + info.denoisers = DENOISER_ALL; foreach (const DeviceInfo &device, subdevices) { /* Ensure CPU device does not slow down GPU. */ @@ -647,6 +648,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, info.has_osl &= device.has_osl; info.has_profiling &= device.has_profiling; info.has_peer_memory |= device.has_peer_memory; + info.denoisers &= device.denoisers; } return info; @@ -667,4 +669,55 @@ void Device::free_memory() network_devices.free_memory(); } +/* DeviceInfo */ + +void DeviceInfo::add_denoising_devices(DenoiserType denoiser_type) +{ + assert(denoising_devices.empty()); + + if (denoiser_type == DENOISER_OPTIX && type != DEVICE_OPTIX) { + vector<DeviceInfo> optix_devices = Device::available_devices(DEVICE_MASK_OPTIX); + if (!optix_devices.empty()) { + /* Convert to a special multi device with separate denoising devices. */ + if (multi_devices.empty()) { + multi_devices.push_back(*this); + } + + /* Try to use the same physical devices for denoising. */ + for (const DeviceInfo &cuda_device : multi_devices) { + if (cuda_device.type == DEVICE_CUDA) { + for (const DeviceInfo &optix_device : optix_devices) { + if (cuda_device.num == optix_device.num) { + id += optix_device.id; + denoising_devices.push_back(optix_device); + break; + } + } + } + } + + if (denoising_devices.empty()) { + /* Simply use the first available OptiX device. */ + const DeviceInfo optix_device = optix_devices.front(); + id += optix_device.id; /* Uniquely identify this special multi device. */ + denoising_devices.push_back(optix_device); + } + + denoisers = denoiser_type; + } + } + else if (denoiser_type == DENOISER_OPENIMAGEDENOISE && type != DEVICE_CPU) { + /* Convert to a special multi device with separate denoising devices. */ + if (multi_devices.empty()) { + multi_devices.push_back(*this); + } + + /* Add CPU denoising devices. */ + DeviceInfo cpu_device = Device::available_devices(DEVICE_MASK_CPU).front(); + denoising_devices.push_back(cpu_device); + + denoisers = denoiser_type; + } +} + CCL_NAMESPACE_END diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index dff981080a5..a5833369a17 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -83,6 +83,7 @@ class DeviceInfo { bool use_split_kernel; /* Use split or mega kernel. */ bool has_profiling; /* Supports runtime collection of profiling info. */ bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */ + DenoiserTypeMask denoisers; /* Supported denoiser types. */ int cpu_threads; vector<DeviceInfo> multi_devices; vector<DeviceInfo> denoising_devices; @@ -101,6 +102,7 @@ class DeviceInfo { use_split_kernel = false; has_profiling = false; has_peer_memory = false; + denoisers = DENOISER_NONE; } bool operator==(const DeviceInfo &info) @@ -110,6 +112,9 @@ class DeviceInfo { (type == info.type && num == info.num && description == info.description)); return id == info.id; } + + /* Add additional devices needed for the specified denoiser. */ + void add_denoising_devices(DenoiserType denoiser_type); }; class DeviceRequestedFeatures { @@ -132,6 +137,7 @@ class DeviceRequestedFeatures { /* BVH/sampling kernel features. */ bool use_hair; + bool use_hair_thick; bool use_object_motion; bool use_camera_motion; @@ -178,6 +184,7 @@ class DeviceRequestedFeatures { max_nodes_group = 0; nodes_features = 0; use_hair = false; + use_hair_thick = false; use_object_motion = false; use_camera_motion = false; use_baking = false; @@ -200,6 +207,7 @@ class DeviceRequestedFeatures { max_nodes_group == requested_features.max_nodes_group && nodes_features == requested_features.nodes_features && use_hair == requested_features.use_hair && + use_hair_thick == requested_features.use_hair_thick && use_object_motion == requested_features.use_object_motion && use_camera_motion == requested_features.use_camera_motion && use_baking == requested_features.use_baking && @@ -319,7 +327,8 @@ class Device { virtual void mem_free_sub_ptr(device_ptr /*ptr*/){}; public: - virtual ~Device(); + /* noexcept needed to silence TBB warning. */ + virtual ~Device() noexcept(false); /* info */ DeviceInfo info; diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index fc6febd8cee..8f68e66a1b4 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -51,10 +51,12 @@ #include "util/util_function.h" #include "util/util_logging.h" #include "util/util_map.h" +#include "util/util_openimagedenoise.h" #include "util/util_opengl.h" #include "util/util_optimization.h" #include "util/util_progress.h" #include "util/util_system.h" +#include "util/util_task.h" #include "util/util_thread.h" CCL_NAMESPACE_BEGIN @@ -161,7 +163,7 @@ class CPUSplitKernel : public DeviceSplitKernel { virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, const DeviceRequestedFeatures &); virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task); + virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task); virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads); }; @@ -176,6 +178,10 @@ class CPUDevice : public Device { #ifdef WITH_OSL OSLGlobals osl_globals; #endif +#ifdef WITH_OPENIMAGEDENOISE + oidn::DeviceRef oidn_device; + oidn::FilterRef oidn_filter; +#endif bool use_split_kernel; @@ -332,7 +338,7 @@ class CPUDevice : public Device { ~CPUDevice() { - task_pool.stop(); + task_pool.cancel(); texture_info.free(); } @@ -344,17 +350,6 @@ class CPUDevice : public Device { virtual BVHLayoutMask get_bvh_layout_mask() const { BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2; - if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { - bvh_layout_mask |= BVH_LAYOUT_BVH4; - } - /* MSVC does not support the -march=native switch and you always end up */ - /* with an sse2 kernel when you use WITH_KERNEL_NATIVE. We *cannot* feed */ - /* that kernel BVH8 even if the CPU flags would allow for it. */ -#if (defined(__x86_64__) || defined(_M_X64)) && !(defined(_MSC_VER) && defined(WITH_KERNEL_NATIVE)) - if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { - bvh_layout_mask |= BVH_LAYOUT_BVH8; - } -#endif #ifdef WITH_EMBREE bvh_layout_mask |= BVH_LAYOUT_EMBREE; #endif /* WITH_EMBREE */ @@ -527,26 +522,18 @@ class CPUDevice : public Device { #endif } - void thread_run(DeviceTask *task) + void thread_run(DeviceTask &task) { - if (task->type == DeviceTask::RENDER) - thread_render(*task); - else if (task->type == DeviceTask::SHADER) - thread_shader(*task); - else if (task->type == DeviceTask::FILM_CONVERT) - thread_film_convert(*task); - else if (task->type == DeviceTask::DENOISE_BUFFER) - thread_denoise(*task); + if (task.type == DeviceTask::RENDER) + thread_render(task); + else if (task.type == DeviceTask::SHADER) + thread_shader(task); + else if (task.type == DeviceTask::FILM_CONVERT) + thread_film_convert(task); + else if (task.type == DeviceTask::DENOISE_BUFFER) + thread_denoise(task); } - class CPUDeviceTask : public DeviceTask { - public: - CPUDeviceTask(CPUDevice *device, DeviceTask &task) : DeviceTask(task) - { - run = function_bind(&CPUDevice::thread_run, device, this); - } - }; - bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, @@ -961,7 +948,71 @@ class CPUDevice : public Device { } } - void denoise(DenoisingTask &denoising, RenderTile &tile) + void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile) + { +#ifdef WITH_OPENIMAGEDENOISE + assert(openimagedenoise_supported()); + + /* Only one at a time, since OpenImageDenoise itself is multithreaded. */ + static thread_mutex mutex; + thread_scoped_lock lock(mutex); + + /* Create device and filter, cached for reuse. */ + if (!oidn_device) { + oidn_device = oidn::newDevice(); + oidn_device.commit(); + } + if (!oidn_filter) { + oidn_filter = oidn_device.newFilter("RT"); + } + + /* Copy pixels from compute device to CPU (no-op for CPU device). */ + rtile.buffers->buffer.copy_from_device(); + + /* Set images with appropriate stride for our interleaved pass storage. */ + const struct { + const char *name; + int offset; + } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR}, + {"normal", task.pass_denoising_data + DENOISING_PASS_NORMAL}, + {"albedo", task.pass_denoising_data + DENOISING_PASS_ALBEDO}, + {"output", 0}, + { NULL, + 0 }}; + + for (int i = 0; passes[i].name; i++) { + const int64_t offset = rtile.offset + rtile.x + rtile.y * rtile.stride; + const int64_t buffer_offset = (offset * task.pass_stride + passes[i].offset) * sizeof(float); + const int64_t pixel_stride = task.pass_stride * sizeof(float); + const int64_t row_stride = rtile.stride * pixel_stride; + + oidn_filter.setImage(passes[i].name, + (char *)rtile.buffer + buffer_offset, + oidn::Format::Float3, + rtile.w, + rtile.h, + 0, + pixel_stride, + row_stride); + } + + /* Execute filter. */ + oidn_filter.set("hdr", true); + oidn_filter.set("srgb", false); + oidn_filter.commit(); + oidn_filter.execute(); + + /* todo: it may be possible to avoid this copy, but we have to ensure that + * when other code copies data from the device it doesn't overwrite the + * denoiser buffers. */ + rtile.buffers->buffer.copy_to_device(); +#else + (void)task; + (void)rtile; +#endif + } + + void denoise_nlm(DenoisingTask &denoising, RenderTile &tile) { ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING); @@ -1019,15 +1070,14 @@ class CPUDevice : public Device { } } - RenderTile tile; - DenoisingTask denoising(this, task); - denoising.profiler = &kg->profiler; + DenoisingTask *denoising = NULL; + RenderTile tile; while (task.acquire_tile(this, tile, task.tile_types)) { if (tile.task == RenderTile::PATH_TRACE) { if (use_split_kernel) { device_only_memory<uchar> void_buffer(this, "void_buffer"); - split_kernel->path_trace(&task, tile, kgbuffer, void_buffer); + split_kernel->path_trace(task, tile, kgbuffer, void_buffer); } else { render(task, tile, kg); @@ -1037,7 +1087,16 @@ class CPUDevice : public Device { render(task, tile, kg); } else if (tile.task == RenderTile::DENOISE) { - denoise(denoising, tile); + if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) { + denoise_openimagedenoise(task, tile); + } + else if (task.denoising.type == DENOISER_NLM) { + if (denoising == NULL) { + denoising = new DenoisingTask(this, task); + denoising->profiler = &kg->profiler; + } + denoise_nlm(*denoising, tile); + } task.update_progress(&tile, tile.w * tile.h); } @@ -1055,6 +1114,7 @@ class CPUDevice : public Device { kg->~KernelGlobals(); kgbuffer.free(); delete split_kernel; + delete denoising; } void thread_denoise(DeviceTask &task) @@ -1072,16 +1132,22 @@ class CPUDevice : public Device { tile.stride = task.stride; tile.buffers = task.buffers; - DenoisingTask denoising(this, task); + if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) { + denoise_openimagedenoise(task, tile); + } + else { + DenoisingTask denoising(this, task); - ProfilingState denoising_profiler_state; - profiler.add_state(&denoising_profiler_state); - denoising.profiler = &denoising_profiler_state; + ProfilingState denoising_profiler_state; + profiler.add_state(&denoising_profiler_state); + denoising.profiler = &denoising_profiler_state; - denoise(denoising, tile); - task.update_progress(&tile, tile.w * tile.h); + denoise_nlm(denoising, tile); + + profiler.remove_state(&denoising_profiler_state); + } - profiler.remove_state(&denoising_profiler_state); + task.update_progress(&tile, tile.w * tile.h); } void thread_film_convert(DeviceTask &task) @@ -1155,13 +1221,24 @@ class CPUDevice : public Device { /* split task into smaller ones */ list<DeviceTask> tasks; - if (task.type == DeviceTask::SHADER) + if (task.type == DeviceTask::DENOISE_BUFFER && + task.denoising.type == DENOISER_OPENIMAGEDENOISE) { + /* Denoise entire buffer at once with OIDN, it has own threading. */ + tasks.push_back(task); + } + else if (task.type == DeviceTask::SHADER) { task.split(tasks, info.cpu_threads, 256); - else + } + else { task.split(tasks, info.cpu_threads); + } - foreach (DeviceTask &task, tasks) - task_pool.push(new CPUDeviceTask(this, task)); + foreach (DeviceTask &task, tasks) { + task_pool.push([=] { + DeviceTask task_copy = task; + thread_run(task_copy); + }); + } } void task_wait() @@ -1326,7 +1403,7 @@ int2 CPUSplitKernel::split_kernel_local_size() int2 CPUSplitKernel::split_kernel_global_size(device_memory & /*kg*/, device_memory & /*data*/, - DeviceTask * /*task*/) + DeviceTask & /*task*/) { return make_int2(1, 1); } @@ -1358,6 +1435,10 @@ void device_cpu_info(vector<DeviceInfo> &devices) info.has_osl = true; info.has_half_images = true; info.has_profiling = true; + info.denoisers = DENOISER_NLM; + if (openimagedenoise_supported()) { + info.denoisers |= DENOISER_OPENIMAGEDENOISE; + } devices.insert(devices.begin(), info); } diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 04c04761311..d9ffcceb06e 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -130,6 +130,7 @@ void device_cuda_info(vector<DeviceInfo> &devices) info.has_half_images = (major >= 3); info.has_volume_decoupled = false; info.has_adaptive_stop_per_sample = false; + info.denoisers = DENOISER_NLM; /* Check if the device has P2P access to any other device in the system. */ for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) { diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp index ac17c02a427..89de80a5bcd 100644 --- a/intern/cycles/device/device_denoising.cpp +++ b/intern/cycles/device/device_denoising.cpp @@ -56,8 +56,8 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task) tile_info->frames[i] = task.denoising_frames[i - 1]; } - write_passes = task.denoising_write_passes; - do_filter = task.denoising_do_filter; + do_prefilter = task.denoising.store_passes && task.denoising.type == DENOISER_NLM; + do_filter = task.denoising.use && task.denoising.type == DENOISER_NLM; } DenoisingTask::~DenoisingTask() @@ -91,7 +91,7 @@ void DenoisingTask::set_render_buffer(RenderTile *rtiles) target_buffer.stride = rtiles[9].stride; target_buffer.ptr = rtiles[9].buffer; - if (write_passes && rtiles[9].buffers) { + if (do_prefilter && rtiles[9].buffers) { target_buffer.denoising_output_offset = rtiles[9].buffers->params.get_denoising_prefiltered_offset(); } @@ -111,7 +111,7 @@ void DenoisingTask::setup_denoising_buffer() rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3])); - buffer.use_intensity = write_passes || (tile_info->num_frames > 1); + buffer.use_intensity = do_prefilter || (tile_info->num_frames > 1); buffer.passes = buffer.use_intensity ? 15 : 14; buffer.width = rect.z - rect.x; buffer.stride = align_up(buffer.width, 4); @@ -343,7 +343,7 @@ void DenoisingTask::run_denoising(RenderTile *tile) reconstruct(); } - if (write_passes) { + if (do_prefilter) { write_buffer(); } diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h index bd1d0193dbd..4c122e981eb 100644 --- a/intern/cycles/device/device_denoising.h +++ b/intern/cycles/device/device_denoising.h @@ -60,7 +60,7 @@ class DenoisingTask { int4 rect; int4 filter_area; - bool write_passes; + bool do_prefilter; bool do_filter; struct DeviceFunctions { diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index 020b9e10e60..fd14bbdccc5 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -396,8 +396,8 @@ class MultiDevice : public Device { size_t existing_size = mem.device_size; /* This is a hack to only allocate the tile buffers on denoising devices - * Similarily the tile buffers also need to be allocated separately on all devices so any - * overlap rendered for denoising does not interfer with each other */ + * Similarly the tile buffers also need to be allocated separately on all devices so any + * overlap rendered for denoising does not interfere with each other */ if (strcmp(mem.name, "RenderBuffers") == 0) { vector<device_ptr> device_pointers; device_pointers.reserve(devices.size()); diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp index 0933d51f321..8904b517e92 100644 --- a/intern/cycles/device/device_network.cpp +++ b/intern/cycles/device/device_network.cpp @@ -313,6 +313,7 @@ void device_network_info(vector<DeviceInfo> &devices) info.has_volume_decoupled = false; info.has_adaptive_stop_per_sample = false; info.has_osl = false; + info.denoisers = DENOISER_NONE; devices.push_back(info); } diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index 8a0b128697f..39b9ef70192 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -120,6 +120,7 @@ void device_opencl_info(vector<DeviceInfo> &devices) info.use_split_kernel = true; info.has_volume_decoupled = false; info.has_adaptive_stop_per_sample = false; + info.denoisers = DENOISER_NLM; info.id = id; /* Check OpenCL extensions */ diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp index fbf6a914744..ececca3df53 100644 --- a/intern/cycles/device/device_optix.cpp +++ b/intern/cycles/device/device_optix.cpp @@ -246,7 +246,7 @@ class OptiXDevice : public CUDADevice { ~OptiXDevice() { // Stop processing any more tasks - task_pool.stop(); + task_pool.cancel(); // Make CUDA context current const CUDAContextScope scope(cuContext); @@ -428,11 +428,20 @@ class OptiXDevice : public CUDADevice { group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit"; if (requested_features.use_hair) { - // Add curve intersection programs group_descs[PG_HITD].hitgroup.moduleIS = optix_module; - group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve"; group_descs[PG_HITS].hitgroup.moduleIS = optix_module; - group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve"; + + // Add curve intersection programs + if (requested_features.use_hair_thick) { + // Slower programs for thick hair since that also slows down ribbons. + // Ideally this should not be needed. + group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_all"; + group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_all"; + } + else { + group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; + group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; + } } if (requested_features.use_subsurface || requested_features.use_shader_raytrace) { @@ -712,7 +721,7 @@ class OptiXDevice : public CUDADevice { const CUDAContextScope scope(cuContext); // Choose between OptiX and NLM denoising - if (task.denoising_use_optix) { + if (task.denoising.type == DENOISER_OPTIX) { // Map neighboring tiles onto this device, indices are as following: // Where index 4 is the center tile and index 9 is the target for the result. // 0 1 2 @@ -1436,21 +1445,21 @@ class OptiXDevice : public CUDADevice { KernelData *const data = (KernelData *)host; *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle; - update_launch_params(name, offsetof(KernelParams, data), host, size); + update_launch_params(offsetof(KernelParams, data), host, size); return; } // Update data storage pointers in launch parameters # define KERNEL_TEX(data_type, tex_name) \ if (strcmp(name, #tex_name) == 0) { \ - update_launch_params(name, offsetof(KernelParams, tex_name), host, size); \ + update_launch_params(offsetof(KernelParams, tex_name), host, size); \ return; \ } # include "kernel/kernel_textures.h" # undef KERNEL_TEX } - void update_launch_params(const char *name, size_t offset, void *data, size_t data_size) + void update_launch_params(size_t offset, void *data, size_t data_size) { const CUDAContextScope scope(cuContext); @@ -1463,15 +1472,6 @@ class OptiXDevice : public CUDADevice { void task_add(DeviceTask &task) override { - struct OptiXDeviceTask : public DeviceTask { - OptiXDeviceTask(OptiXDevice *device, DeviceTask &task, int task_index) : DeviceTask(task) - { - // Using task index parameter instead of thread index, since number of CUDA streams may - // differ from number of threads - run = function_bind(&OptiXDevice::thread_run, device, *this, task_index); - } - }; - // Upload texture information to device if it has changed since last launch load_texture_info(); @@ -1483,7 +1483,10 @@ class OptiXDevice : public CUDADevice { if (task.type == DeviceTask::DENOISE_BUFFER) { // Execute denoising in a single thread (e.g. to avoid race conditions during creation) - task_pool.push(new OptiXDeviceTask(this, task, 0)); + task_pool.push([=] { + DeviceTask task_copy = task; + thread_run(task_copy, 0); + }); return; } @@ -1493,8 +1496,15 @@ class OptiXDevice : public CUDADevice { // Queue tasks in internal task pool int task_index = 0; - for (DeviceTask &task : tasks) - task_pool.push(new OptiXDeviceTask(this, task, task_index++)); + for (DeviceTask &task : tasks) { + task_pool.push([=] { + // Using task index parameter instead of thread index, since number of CUDA streams may + // differ from number of threads + DeviceTask task_copy = task; + thread_run(task_copy, task_index); + }); + task_index++; + } } void task_wait() override @@ -1551,6 +1561,7 @@ void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo info.type = DEVICE_OPTIX; info.id += "_OptiX"; + info.denoisers |= DENOISER_OPTIX; devices.push_back(info); } diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp index f22d8761058..4c288f60c16 100644 --- a/intern/cycles/device/device_split_kernel.cpp +++ b/intern/cycles/device/device_split_kernel.cpp @@ -145,7 +145,7 @@ size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory &kg, return max_buffer_size / size_per_element; } -bool DeviceSplitKernel::path_trace(DeviceTask *task, +bool DeviceSplitKernel::path_trace(DeviceTask &task, RenderTile &tile, device_memory &kgbuffer, device_memory &kernel_data) @@ -222,9 +222,9 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, subtile.start_sample = tile.sample; subtile.num_samples = samples_per_second; - if (task->adaptive_sampling.use) { - subtile.num_samples = task->adaptive_sampling.align_dynamic_samples(subtile.start_sample, - subtile.num_samples); + if (task.adaptive_sampling.use) { + subtile.num_samples = task.adaptive_sampling.align_dynamic_samples(subtile.start_sample, + subtile.num_samples); } /* Don't go beyond requested number of samples. */ @@ -286,7 +286,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size); - if (task->get_cancel() && cancel_time == DBL_MAX) { + if (task.get_cancel() && cancel_time == DBL_MAX) { /* Wait up to twice as many seconds for current samples to finish * to avoid artifacts in render result from ending too soon. */ @@ -323,7 +323,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, } int filter_sample = tile.sample + subtile.num_samples - 1; - if (task->adaptive_sampling.use && task->adaptive_sampling.need_filter(filter_sample)) { + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { size_t buffer_size[2]; buffer_size[0] = round_up(tile.w, local_size[0]); buffer_size[1] = round_up(tile.h, local_size[1]); @@ -352,16 +352,16 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, #undef ENQUEUE_SPLIT_KERNEL tile.sample += subtile.num_samples; - task->update_progress(&tile, tile.w * tile.h * subtile.num_samples); + task.update_progress(&tile, tile.w * tile.h * subtile.num_samples); time_multiplier = min(time_multiplier << 1, 10); - if (task->get_cancel()) { + if (task.get_cancel()) { return true; } } - if (task->adaptive_sampling.use) { + if (task.adaptive_sampling.use) { /* Reset the start samples. */ RenderTile subtile = tile; subtile.start_sample = tile.start_sample; diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h index 9d6b9efdd62..07a21b10299 100644 --- a/intern/cycles/device/device_split_kernel.h +++ b/intern/cycles/device/device_split_kernel.h @@ -109,7 +109,7 @@ class DeviceSplitKernel { virtual ~DeviceSplitKernel(); bool load_kernels(const DeviceRequestedFeatures &requested_features); - bool path_trace(DeviceTask *task, + bool path_trace(DeviceTask &task, RenderTile &rtile, device_memory &kgbuffer, device_memory &kernel_data); @@ -137,7 +137,7 @@ class DeviceSplitKernel { virtual int2 split_kernel_local_size() = 0; virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, - DeviceTask *task) = 0; + DeviceTask &task) = 0; }; CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp index 7485e1b41de..6e7c184c6c9 100644 --- a/intern/cycles/device/device_task.cpp +++ b/intern/cycles/device/device_task.cpp @@ -50,7 +50,7 @@ DeviceTask::DeviceTask(Type type_) last_update_time = time_dt(); } -int DeviceTask::get_subtask_count(int num, int max_size) +int DeviceTask::get_subtask_count(int num, int max_size) const { if (max_size != 0) { int max_size_num; @@ -78,7 +78,7 @@ int DeviceTask::get_subtask_count(int num, int max_size) return num; } -void DeviceTask::split(list<DeviceTask> &tasks, int num, int max_size) +void DeviceTask::split(list<DeviceTask> &tasks, int num, int max_size) const { num = get_subtask_count(num, max_size); diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index 8c4e682adb1..600973b8100 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -21,7 +21,6 @@ #include "util/util_function.h" #include "util/util_list.h" -#include "util/util_task.h" CCL_NAMESPACE_BEGIN @@ -32,8 +31,33 @@ class RenderBuffers; class RenderTile; class Tile; +enum DenoiserType { + DENOISER_NLM = 1, + DENOISER_OPTIX = 2, + DENOISER_OPENIMAGEDENOISE = 4, + DENOISER_NUM, + + DENOISER_NONE = 0, + DENOISER_ALL = ~0, +}; + +typedef int DenoiserTypeMask; + class DenoiseParams { public: + /* Apply denoiser to image. */ + bool use; + /* Output denoising data passes (possibly without applying the denoiser). */ + bool store_passes; + + /* Denoiser type. */ + DenoiserType type; + + /* Viewport start sample. */ + int start_sample; + + /** Native Denoiser **/ + /* Pixel radius for neighboring pixels to take into account. */ int radius; /* Controls neighbor pixel weighting for the denoising filter. */ @@ -47,18 +71,36 @@ class DenoiseParams { int neighbor_frames; /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */ bool clamp_input; + + /** Optix Denoiser **/ + /* Passes handed over to the OptiX denoiser (default to color + albedo). */ int optix_input_passes; DenoiseParams() { + use = false; + store_passes = false; + + type = DENOISER_NLM; + radius = 8; strength = 0.5f; feature_strength = 0.5f; relative_pca = false; neighbor_frames = 2; clamp_input = true; + optix_input_passes = 2; + + start_sample = 0; + } + + /* Test if a denoising task needs to run, also to prefilter passes for the native + * denoiser when we are not applying denoising to the combined image. */ + bool need_denoising_task() const + { + return (use || (store_passes && type == DENOISER_NLM)); } }; @@ -75,7 +117,7 @@ class AdaptiveSampling { int min_samples; }; -class DeviceTask : public Task { +class DeviceTask { public: typedef enum { RENDER, FILM_CONVERT, SHADER, DENOISE_BUFFER } Type; Type type; @@ -98,8 +140,8 @@ class DeviceTask : public Task { explicit DeviceTask(Type type = RENDER); - int get_subtask_count(int num, int max_size = 0); - void split(list<DeviceTask> &tasks, int num, int max_size = 0); + int get_subtask_count(int num, int max_size = 0) const; + void split(list<DeviceTask> &tasks, int num, int max_size = 0) const; void update_progress(RenderTile *rtile, int pixel_samples = -1); @@ -116,10 +158,6 @@ class DeviceTask : public Task { bool denoising_from_render; vector<int> denoising_frames; - bool denoising_do_filter; - bool denoising_use_optix; - bool denoising_write_passes; - int pass_stride; int frame_stride; int target_pass_stride; diff --git a/intern/cycles/device/opencl/device_opencl.h b/intern/cycles/device/opencl/device_opencl.h index 389268e1c2a..e0140996cf0 100644 --- a/intern/cycles/device/opencl/device_opencl.h +++ b/intern/cycles/device/opencl/device_opencl.h @@ -23,6 +23,7 @@ # include "util/util_map.h" # include "util/util_param.h" # include "util/util_string.h" +# include "util/util_task.h" # include "clew.h" @@ -258,6 +259,8 @@ class OpenCLDevice : public Device { TaskPool load_required_kernel_task_pool; /* Task pool for optional kernels (feature kernels during foreground rendering) */ TaskPool load_kernel_task_pool; + std::atomic<int> load_kernel_num_compiling; + cl_context cxContext; cl_command_queue cqCommandQueue; cl_platform_id cpPlatform; @@ -455,14 +458,6 @@ class OpenCLDevice : public Device { void denoise(RenderTile &tile, DenoisingTask &denoising); - class OpenCLDeviceTask : public DeviceTask { - public: - OpenCLDeviceTask(OpenCLDevice *device, DeviceTask &task) : DeviceTask(task) - { - run = function_bind(&OpenCLDevice::thread_run, device, this); - } - }; - int get_split_task_count(DeviceTask & /*task*/) { return 1; @@ -470,7 +465,10 @@ class OpenCLDevice : public Device { void task_add(DeviceTask &task) { - task_pool.push(new OpenCLDeviceTask(this, task)); + task_pool.push([=] { + DeviceTask task_copy = task; + thread_run(task_copy); + }); } void task_wait() @@ -483,7 +481,7 @@ class OpenCLDevice : public Device { task_pool.cancel(); } - void thread_run(DeviceTask *task); + void thread_run(DeviceTask &task); virtual BVHLayoutMask get_bvh_layout_mask() const { diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp index beb3174b111..8c94815b193 100644 --- a/intern/cycles/device/opencl/device_opencl_impl.cpp +++ b/intern/cycles/device/opencl/device_opencl_impl.cpp @@ -542,7 +542,7 @@ class OpenCLSplitKernel : public DeviceSplitKernel { virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, - DeviceTask * /*task*/) + DeviceTask & /*task*/) { cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice); /* Use small global size on CPU devices as it seems to be much faster. */ @@ -610,6 +610,7 @@ void OpenCLDevice::opencl_assert_err(cl_int err, const char *where) OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) : Device(info, stats, profiler, background), + load_kernel_num_compiling(0), kernel_programs(this), preview_programs(this), memory_manager(this), @@ -684,9 +685,9 @@ OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, b OpenCLDevice::~OpenCLDevice() { - task_pool.stop(); - load_required_kernel_task_pool.stop(); - load_kernel_task_pool.stop(); + task_pool.cancel(); + load_required_kernel_task_pool.cancel(); + load_kernel_task_pool.cancel(); memory_manager.free(); @@ -798,7 +799,11 @@ bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_feature * internally within a single process. */ foreach (OpenCLProgram *program, programs) { if (!program->load()) { - load_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); + load_kernel_num_compiling++; + load_kernel_task_pool.push([=] { + program->compile(); + load_kernel_num_compiling--; + }); } } return true; @@ -868,7 +873,7 @@ bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requeste * Better to check on device level than per kernel as mixing preview and * non-preview kernels does not work due to different data types */ if (use_preview_kernels) { - use_preview_kernels = !load_kernel_task_pool.finished(); + use_preview_kernels = load_kernel_num_compiling.load() > 0; } } return split_kernel->load_kernels(requested_features); @@ -895,7 +900,7 @@ DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state() return DEVICE_KERNEL_USING_FEATURE_KERNEL; } - bool other_kernels_finished = load_kernel_task_pool.finished(); + bool other_kernels_finished = load_kernel_num_compiling.load() == 0; if (use_preview_kernels) { if (other_kernels_finished) { return DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE; @@ -1336,20 +1341,20 @@ void OpenCLDevice::flush_texture_buffers() memory_manager.alloc("texture_info", texture_info); } -void OpenCLDevice::thread_run(DeviceTask *task) +void OpenCLDevice::thread_run(DeviceTask &task) { flush_texture_buffers(); - if (task->type == DeviceTask::RENDER) { + if (task.type == DeviceTask::RENDER) { RenderTile tile; - DenoisingTask denoising(this, *task); + DenoisingTask denoising(this, task); /* Allocate buffer for kernel globals */ device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals"); kgbuffer.alloc_to_device(1); /* Keep rendering tiles until done. */ - while (task->acquire_tile(this, tile, task->tile_types)) { + while (task.acquire_tile(this, tile, task.tile_types)) { if (tile.task == RenderTile::PATH_TRACE) { assert(tile.task == RenderTile::PATH_TRACE); scoped_timer timer(&tile.buffers->render_time); @@ -1368,42 +1373,42 @@ void OpenCLDevice::thread_run(DeviceTask *task) clFinish(cqCommandQueue); } else if (tile.task == RenderTile::BAKE) { - bake(*task, tile); + bake(task, tile); } else if (tile.task == RenderTile::DENOISE) { tile.sample = tile.start_sample + tile.num_samples; denoise(tile, denoising); - task->update_progress(&tile, tile.w * tile.h); + task.update_progress(&tile, tile.w * tile.h); } - task->release_tile(tile); + task.release_tile(tile); } kgbuffer.free(); } - else if (task->type == DeviceTask::SHADER) { - shader(*task); + else if (task.type == DeviceTask::SHADER) { + shader(task); } - else if (task->type == DeviceTask::FILM_CONVERT) { - film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); + else if (task.type == DeviceTask::FILM_CONVERT) { + film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); } - else if (task->type == DeviceTask::DENOISE_BUFFER) { + else if (task.type == DeviceTask::DENOISE_BUFFER) { RenderTile tile; - tile.x = task->x; - tile.y = task->y; - tile.w = task->w; - tile.h = task->h; - tile.buffer = task->buffer; - tile.sample = task->sample + task->num_samples; - tile.num_samples = task->num_samples; - tile.start_sample = task->sample; - tile.offset = task->offset; - tile.stride = task->stride; - tile.buffers = task->buffers; - - DenoisingTask denoising(this, *task); + tile.x = task.x; + tile.y = task.y; + tile.w = task.w; + tile.h = task.h; + tile.buffer = task.buffer; + tile.sample = task.sample + task.num_samples; + tile.num_samples = task.num_samples; + tile.start_sample = task.sample; + tile.offset = task.offset; + tile.stride = task.stride; + tile.buffers = task.buffers; + + DenoisingTask denoising(this, task); denoise(tile, denoising); - task->update_progress(&tile, tile.w * tile.h); + task.update_progress(&tile, tile.w * tile.h); } } @@ -1937,10 +1942,8 @@ void OpenCLDevice::bake(DeviceTask &task, RenderTile &rtile) clFinish(cqCommandQueue); } -string OpenCLDevice::kernel_build_options(const string *debug_src) +static bool kernel_build_opencl_2(cl_device_id cdDevice) { - string build_options = "-cl-no-signed-zeros -cl-mad-enable "; - /* Build with OpenCL 2.0 if available, this improves performance * with AMD OpenCL drivers on Windows and Linux (legacy drivers). * Note that OpenCL selects the highest 1.x version by default, @@ -1948,10 +1951,36 @@ string OpenCLDevice::kernel_build_options(const string *debug_src) int version_major, version_minor; if (OpenCLInfo::get_device_version(cdDevice, &version_major, &version_minor)) { if (version_major >= 2) { - build_options += "-cl-std=CL2.0 "; + /* This appears to trigger a driver bug in Radeon RX cards with certain + * driver version, so don't use OpenCL 2.0 for those. */ + string device_name = OpenCLInfo::get_readable_device_name(cdDevice); + if (string_startswith(device_name, "Radeon RX 4") || + string_startswith(device_name, "Radeon (TM) RX 4") || + string_startswith(device_name, "Radeon RX 5") || + string_startswith(device_name, "Radeon (TM) RX 5")) { + char version[256] = ""; + int driver_major, driver_minor; + clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL); + if (sscanf(version, "OpenCL 2.0 AMD-APP (%d.%d)", &driver_major, &driver_minor) == 2) { + return !(driver_major == 3075 && driver_minor <= 12); + } + } + + return true; } } + return false; +} + +string OpenCLDevice::kernel_build_options(const string *debug_src) +{ + string build_options = "-cl-no-signed-zeros -cl-mad-enable "; + + if (kernel_build_opencl_2(cdDevice)) { + build_options += "-cl-std=CL2.0 "; + } + if (platform_name == "NVIDIA CUDA") { build_options += "-D__KERNEL_OPENCL_NVIDIA__ " |