Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/device')
-rw-r--r--intern/cycles/device/CMakeLists.txt12
-rw-r--r--intern/cycles/device/cuda/device_cuda.h33
-rw-r--r--intern/cycles/device/cuda/device_cuda_impl.cpp73
-rw-r--r--intern/cycles/device/device.cpp55
-rw-r--r--intern/cycles/device/device.h11
-rw-r--r--intern/cycles/device/device_cpu.cpp177
-rw-r--r--intern/cycles/device/device_cuda.cpp1
-rw-r--r--intern/cycles/device/device_denoising.cpp10
-rw-r--r--intern/cycles/device/device_denoising.h2
-rw-r--r--intern/cycles/device/device_multi.cpp4
-rw-r--r--intern/cycles/device/device_network.cpp1
-rw-r--r--intern/cycles/device/device_opencl.cpp1
-rw-r--r--intern/cycles/device/device_optix.cpp51
-rw-r--r--intern/cycles/device/device_split_kernel.cpp18
-rw-r--r--intern/cycles/device/device_split_kernel.h4
-rw-r--r--intern/cycles/device/device_task.cpp4
-rw-r--r--intern/cycles/device/device_task.h54
-rw-r--r--intern/cycles/device/opencl/device_opencl.h18
-rw-r--r--intern/cycles/device/opencl/device_opencl_impl.cpp103
19 files changed, 431 insertions, 201 deletions
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index aa5b65a2b73..ca366722eb7 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -99,6 +99,18 @@ if(WITH_CYCLES_DEVICE_MULTI)
add_definitions(-DWITH_MULTI)
endif()
+if(WITH_OPENIMAGEDENOISE)
+ add_definitions(-DWITH_OPENIMAGEDENOISE)
+ add_definitions(-DOIDN_STATIC_LIB)
+ list(APPEND INC_SYS
+ ${OPENIMAGEDENOISE_INCLUDE_DIRS}
+ )
+ list(APPEND LIB
+ ${OPENIMAGEDENOISE_LIBRARIES}
+ ${TBB_LIBRARIES}
+ )
+endif()
+
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
index 1aa2fdd0967..e5e3e24165d 100644
--- a/intern/cycles/device/cuda/device_cuda.h
+++ b/intern/cycles/device/cuda/device_cuda.h
@@ -21,6 +21,7 @@
# include "device/device_split_kernel.h"
# include "util/util_map.h"
+# include "util/util_task.h"
# ifdef WITH_CUDA_DYNLOAD
# include "cuew.h"
@@ -96,9 +97,9 @@ class CUDADevice : public Device {
static bool have_precompiled_kernels();
- virtual bool show_samples() const;
+ virtual bool show_samples() const override;
- virtual BVHLayoutMask get_bvh_layout_mask() const;
+ virtual BVHLayoutMask get_bvh_layout_mask() const override;
void set_error(const string &error) override;
@@ -108,7 +109,7 @@ class CUDADevice : public Device {
bool support_device(const DeviceRequestedFeatures & /*requested_features*/);
- bool check_peer_access(Device *peer_device);
+ bool check_peer_access(Device *peer_device) override;
bool use_adaptive_compilation();
@@ -122,7 +123,7 @@ class CUDADevice : public Device {
const char *base = "cuda",
bool force_ptx = false);
- virtual bool load_kernels(const DeviceRequestedFeatures &requested_features);
+ virtual bool load_kernels(const DeviceRequestedFeatures &requested_features) override;
void load_functions();
@@ -140,19 +141,19 @@ class CUDADevice : public Device {
void generic_free(device_memory &mem);
- void mem_alloc(device_memory &mem);
+ void mem_alloc(device_memory &mem) override;
- void mem_copy_to(device_memory &mem);
+ void mem_copy_to(device_memory &mem) override;
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem);
+ void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
- void mem_zero(device_memory &mem);
+ void mem_zero(device_memory &mem) override;
- void mem_free(device_memory &mem);
+ void mem_free(device_memory &mem) override;
- device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/);
+ device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
- virtual void const_copy_to(const char *name, void *host, size_t size);
+ virtual void const_copy_to(const char *name, void *host, size_t size) override;
void global_alloc(device_memory &mem);
@@ -252,15 +253,15 @@ class CUDADevice : public Device {
int dw,
int dh,
bool transparent,
- const DeviceDrawParams &draw_params);
+ const DeviceDrawParams &draw_params) override;
- void thread_run(DeviceTask *task);
+ void thread_run(DeviceTask &task);
- virtual void task_add(DeviceTask &task);
+ virtual void task_add(DeviceTask &task) override;
- virtual void task_wait();
+ virtual void task_wait() override;
- virtual void task_cancel();
+ virtual void task_cancel() override;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
index 7aa63ff48c3..b9bbeb9a25b 100644
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -105,7 +105,7 @@ class CUDASplitKernel : public DeviceSplitKernel {
virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
const DeviceRequestedFeatures &);
virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task);
+ virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task);
};
/* Utility to push/pop CUDA context. */
@@ -243,7 +243,7 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
CUDADevice::~CUDADevice()
{
- task_pool.stop();
+ task_pool.cancel();
delete split_kernel;
@@ -2326,11 +2326,11 @@ void CUDADevice::draw_pixels(device_memory &mem,
Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
}
-void CUDADevice::thread_run(DeviceTask *task)
+void CUDADevice::thread_run(DeviceTask &task)
{
CUDAContextScope scope(this);
- if (task->type == DeviceTask::RENDER) {
+ if (task.type == DeviceTask::RENDER) {
DeviceRequestedFeatures requested_features;
if (use_split_kernel()) {
if (split_kernel == NULL) {
@@ -2343,72 +2343,64 @@ void CUDADevice::thread_run(DeviceTask *task)
/* keep rendering tiles until done */
RenderTile tile;
- DenoisingTask denoising(this, *task);
+ DenoisingTask denoising(this, task);
- while (task->acquire_tile(this, tile, task->tile_types)) {
+ while (task.acquire_tile(this, tile, task.tile_types)) {
if (tile.task == RenderTile::PATH_TRACE) {
if (use_split_kernel()) {
device_only_memory<uchar> void_buffer(this, "void_buffer");
split_kernel->path_trace(task, tile, void_buffer, void_buffer);
}
else {
- render(*task, tile, work_tiles);
+ render(task, tile, work_tiles);
}
}
else if (tile.task == RenderTile::BAKE) {
- render(*task, tile, work_tiles);
+ render(task, tile, work_tiles);
}
else if (tile.task == RenderTile::DENOISE) {
tile.sample = tile.start_sample + tile.num_samples;
denoise(tile, denoising);
- task->update_progress(&tile, tile.w * tile.h);
+ task.update_progress(&tile, tile.w * tile.h);
}
- task->release_tile(tile);
+ task.release_tile(tile);
- if (task->get_cancel()) {
- if (task->need_finish_queue == false)
+ if (task.get_cancel()) {
+ if (task.need_finish_queue == false)
break;
}
}
work_tiles.free();
}
- else if (task->type == DeviceTask::SHADER) {
- shader(*task);
+ else if (task.type == DeviceTask::SHADER) {
+ shader(task);
cuda_assert(cuCtxSynchronize());
}
- else if (task->type == DeviceTask::DENOISE_BUFFER) {
+ else if (task.type == DeviceTask::DENOISE_BUFFER) {
RenderTile tile;
- tile.x = task->x;
- tile.y = task->y;
- tile.w = task->w;
- tile.h = task->h;
- tile.buffer = task->buffer;
- tile.sample = task->sample + task->num_samples;
- tile.num_samples = task->num_samples;
- tile.start_sample = task->sample;
- tile.offset = task->offset;
- tile.stride = task->stride;
- tile.buffers = task->buffers;
-
- DenoisingTask denoising(this, *task);
+ tile.x = task.x;
+ tile.y = task.y;
+ tile.w = task.w;
+ tile.h = task.h;
+ tile.buffer = task.buffer;
+ tile.sample = task.sample + task.num_samples;
+ tile.num_samples = task.num_samples;
+ tile.start_sample = task.sample;
+ tile.offset = task.offset;
+ tile.stride = task.stride;
+ tile.buffers = task.buffers;
+
+ DenoisingTask denoising(this, task);
denoise(tile, denoising);
- task->update_progress(&tile, tile.w * tile.h);
+ task.update_progress(&tile, tile.w * tile.h);
}
}
-class CUDADeviceTask : public DeviceTask {
- public:
- CUDADeviceTask(CUDADevice *device, DeviceTask &task) : DeviceTask(task)
- {
- run = function_bind(&CUDADevice::thread_run, device, this);
- }
-};
-
void CUDADevice::task_add(DeviceTask &task)
{
CUDAContextScope scope(this);
@@ -2424,7 +2416,10 @@ void CUDADevice::task_add(DeviceTask &task)
film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
}
else {
- task_pool.push(new CUDADeviceTask(this, task));
+ task_pool.push([=] {
+ DeviceTask task_copy = task;
+ thread_run(task_copy);
+ });
}
}
@@ -2652,7 +2647,7 @@ int2 CUDASplitKernel::split_kernel_local_size()
int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg,
device_memory &data,
- DeviceTask * /*task*/)
+ DeviceTask & /*task*/)
{
CUDAContextScope scope(device);
size_t free;
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 41dd7894d93..9dbb33980b4 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -77,7 +77,7 @@ std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &reques
/* Device */
-Device::~Device()
+Device::~Device() noexcept(false)
{
if (!background) {
if (vertex_buffer != 0) {
@@ -603,6 +603,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
info.has_osl = true;
info.has_profiling = true;
info.has_peer_memory = false;
+ info.denoisers = DENOISER_ALL;
foreach (const DeviceInfo &device, subdevices) {
/* Ensure CPU device does not slow down GPU. */
@@ -647,6 +648,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
info.has_osl &= device.has_osl;
info.has_profiling &= device.has_profiling;
info.has_peer_memory |= device.has_peer_memory;
+ info.denoisers &= device.denoisers;
}
return info;
@@ -667,4 +669,55 @@ void Device::free_memory()
network_devices.free_memory();
}
+/* DeviceInfo */
+
+void DeviceInfo::add_denoising_devices(DenoiserType denoiser_type)
+{
+ assert(denoising_devices.empty());
+
+ if (denoiser_type == DENOISER_OPTIX && type != DEVICE_OPTIX) {
+ vector<DeviceInfo> optix_devices = Device::available_devices(DEVICE_MASK_OPTIX);
+ if (!optix_devices.empty()) {
+ /* Convert to a special multi device with separate denoising devices. */
+ if (multi_devices.empty()) {
+ multi_devices.push_back(*this);
+ }
+
+ /* Try to use the same physical devices for denoising. */
+ for (const DeviceInfo &cuda_device : multi_devices) {
+ if (cuda_device.type == DEVICE_CUDA) {
+ for (const DeviceInfo &optix_device : optix_devices) {
+ if (cuda_device.num == optix_device.num) {
+ id += optix_device.id;
+ denoising_devices.push_back(optix_device);
+ break;
+ }
+ }
+ }
+ }
+
+ if (denoising_devices.empty()) {
+ /* Simply use the first available OptiX device. */
+ const DeviceInfo optix_device = optix_devices.front();
+ id += optix_device.id; /* Uniquely identify this special multi device. */
+ denoising_devices.push_back(optix_device);
+ }
+
+ denoisers = denoiser_type;
+ }
+ }
+ else if (denoiser_type == DENOISER_OPENIMAGEDENOISE && type != DEVICE_CPU) {
+ /* Convert to a special multi device with separate denoising devices. */
+ if (multi_devices.empty()) {
+ multi_devices.push_back(*this);
+ }
+
+ /* Add CPU denoising devices. */
+ DeviceInfo cpu_device = Device::available_devices(DEVICE_MASK_CPU).front();
+ denoising_devices.push_back(cpu_device);
+
+ denoisers = denoiser_type;
+ }
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index dff981080a5..a5833369a17 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -83,6 +83,7 @@ class DeviceInfo {
bool use_split_kernel; /* Use split or mega kernel. */
bool has_profiling; /* Supports runtime collection of profiling info. */
bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */
+ DenoiserTypeMask denoisers; /* Supported denoiser types. */
int cpu_threads;
vector<DeviceInfo> multi_devices;
vector<DeviceInfo> denoising_devices;
@@ -101,6 +102,7 @@ class DeviceInfo {
use_split_kernel = false;
has_profiling = false;
has_peer_memory = false;
+ denoisers = DENOISER_NONE;
}
bool operator==(const DeviceInfo &info)
@@ -110,6 +112,9 @@ class DeviceInfo {
(type == info.type && num == info.num && description == info.description));
return id == info.id;
}
+
+ /* Add additional devices needed for the specified denoiser. */
+ void add_denoising_devices(DenoiserType denoiser_type);
};
class DeviceRequestedFeatures {
@@ -132,6 +137,7 @@ class DeviceRequestedFeatures {
/* BVH/sampling kernel features. */
bool use_hair;
+ bool use_hair_thick;
bool use_object_motion;
bool use_camera_motion;
@@ -178,6 +184,7 @@ class DeviceRequestedFeatures {
max_nodes_group = 0;
nodes_features = 0;
use_hair = false;
+ use_hair_thick = false;
use_object_motion = false;
use_camera_motion = false;
use_baking = false;
@@ -200,6 +207,7 @@ class DeviceRequestedFeatures {
max_nodes_group == requested_features.max_nodes_group &&
nodes_features == requested_features.nodes_features &&
use_hair == requested_features.use_hair &&
+ use_hair_thick == requested_features.use_hair_thick &&
use_object_motion == requested_features.use_object_motion &&
use_camera_motion == requested_features.use_camera_motion &&
use_baking == requested_features.use_baking &&
@@ -319,7 +327,8 @@ class Device {
virtual void mem_free_sub_ptr(device_ptr /*ptr*/){};
public:
- virtual ~Device();
+ /* noexcept needed to silence TBB warning. */
+ virtual ~Device() noexcept(false);
/* info */
DeviceInfo info;
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index fc6febd8cee..8f68e66a1b4 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -51,10 +51,12 @@
#include "util/util_function.h"
#include "util/util_logging.h"
#include "util/util_map.h"
+#include "util/util_openimagedenoise.h"
#include "util/util_opengl.h"
#include "util/util_optimization.h"
#include "util/util_progress.h"
#include "util/util_system.h"
+#include "util/util_task.h"
#include "util/util_thread.h"
CCL_NAMESPACE_BEGIN
@@ -161,7 +163,7 @@ class CPUSplitKernel : public DeviceSplitKernel {
virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
const DeviceRequestedFeatures &);
virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task);
+ virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task);
virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
};
@@ -176,6 +178,10 @@ class CPUDevice : public Device {
#ifdef WITH_OSL
OSLGlobals osl_globals;
#endif
+#ifdef WITH_OPENIMAGEDENOISE
+ oidn::DeviceRef oidn_device;
+ oidn::FilterRef oidn_filter;
+#endif
bool use_split_kernel;
@@ -332,7 +338,7 @@ class CPUDevice : public Device {
~CPUDevice()
{
- task_pool.stop();
+ task_pool.cancel();
texture_info.free();
}
@@ -344,17 +350,6 @@ class CPUDevice : public Device {
virtual BVHLayoutMask get_bvh_layout_mask() const
{
BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
- if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
- bvh_layout_mask |= BVH_LAYOUT_BVH4;
- }
- /* MSVC does not support the -march=native switch and you always end up */
- /* with an sse2 kernel when you use WITH_KERNEL_NATIVE. We *cannot* feed */
- /* that kernel BVH8 even if the CPU flags would allow for it. */
-#if (defined(__x86_64__) || defined(_M_X64)) && !(defined(_MSC_VER) && defined(WITH_KERNEL_NATIVE))
- if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
- bvh_layout_mask |= BVH_LAYOUT_BVH8;
- }
-#endif
#ifdef WITH_EMBREE
bvh_layout_mask |= BVH_LAYOUT_EMBREE;
#endif /* WITH_EMBREE */
@@ -527,26 +522,18 @@ class CPUDevice : public Device {
#endif
}
- void thread_run(DeviceTask *task)
+ void thread_run(DeviceTask &task)
{
- if (task->type == DeviceTask::RENDER)
- thread_render(*task);
- else if (task->type == DeviceTask::SHADER)
- thread_shader(*task);
- else if (task->type == DeviceTask::FILM_CONVERT)
- thread_film_convert(*task);
- else if (task->type == DeviceTask::DENOISE_BUFFER)
- thread_denoise(*task);
+ if (task.type == DeviceTask::RENDER)
+ thread_render(task);
+ else if (task.type == DeviceTask::SHADER)
+ thread_shader(task);
+ else if (task.type == DeviceTask::FILM_CONVERT)
+ thread_film_convert(task);
+ else if (task.type == DeviceTask::DENOISE_BUFFER)
+ thread_denoise(task);
}
- class CPUDeviceTask : public DeviceTask {
- public:
- CPUDeviceTask(CPUDevice *device, DeviceTask &task) : DeviceTask(task)
- {
- run = function_bind(&CPUDevice::thread_run, device, this);
- }
- };
-
bool denoising_non_local_means(device_ptr image_ptr,
device_ptr guide_ptr,
device_ptr variance_ptr,
@@ -961,7 +948,71 @@ class CPUDevice : public Device {
}
}
- void denoise(DenoisingTask &denoising, RenderTile &tile)
+ void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile)
+ {
+#ifdef WITH_OPENIMAGEDENOISE
+ assert(openimagedenoise_supported());
+
+ /* Only one at a time, since OpenImageDenoise itself is multithreaded. */
+ static thread_mutex mutex;
+ thread_scoped_lock lock(mutex);
+
+ /* Create device and filter, cached for reuse. */
+ if (!oidn_device) {
+ oidn_device = oidn::newDevice();
+ oidn_device.commit();
+ }
+ if (!oidn_filter) {
+ oidn_filter = oidn_device.newFilter("RT");
+ }
+
+ /* Copy pixels from compute device to CPU (no-op for CPU device). */
+ rtile.buffers->buffer.copy_from_device();
+
+ /* Set images with appropriate stride for our interleaved pass storage. */
+ const struct {
+ const char *name;
+ int offset;
+ } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR},
+ {"normal", task.pass_denoising_data + DENOISING_PASS_NORMAL},
+ {"albedo", task.pass_denoising_data + DENOISING_PASS_ALBEDO},
+ {"output", 0},
+ { NULL,
+ 0 }};
+
+ for (int i = 0; passes[i].name; i++) {
+ const int64_t offset = rtile.offset + rtile.x + rtile.y * rtile.stride;
+ const int64_t buffer_offset = (offset * task.pass_stride + passes[i].offset) * sizeof(float);
+ const int64_t pixel_stride = task.pass_stride * sizeof(float);
+ const int64_t row_stride = rtile.stride * pixel_stride;
+
+ oidn_filter.setImage(passes[i].name,
+ (char *)rtile.buffer + buffer_offset,
+ oidn::Format::Float3,
+ rtile.w,
+ rtile.h,
+ 0,
+ pixel_stride,
+ row_stride);
+ }
+
+ /* Execute filter. */
+ oidn_filter.set("hdr", true);
+ oidn_filter.set("srgb", false);
+ oidn_filter.commit();
+ oidn_filter.execute();
+
+ /* todo: it may be possible to avoid this copy, but we have to ensure that
+ * when other code copies data from the device it doesn't overwrite the
+ * denoiser buffers. */
+ rtile.buffers->buffer.copy_to_device();
+#else
+ (void)task;
+ (void)rtile;
+#endif
+ }
+
+ void denoise_nlm(DenoisingTask &denoising, RenderTile &tile)
{
ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);
@@ -1019,15 +1070,14 @@ class CPUDevice : public Device {
}
}
- RenderTile tile;
- DenoisingTask denoising(this, task);
- denoising.profiler = &kg->profiler;
+ DenoisingTask *denoising = NULL;
+ RenderTile tile;
while (task.acquire_tile(this, tile, task.tile_types)) {
if (tile.task == RenderTile::PATH_TRACE) {
if (use_split_kernel) {
device_only_memory<uchar> void_buffer(this, "void_buffer");
- split_kernel->path_trace(&task, tile, kgbuffer, void_buffer);
+ split_kernel->path_trace(task, tile, kgbuffer, void_buffer);
}
else {
render(task, tile, kg);
@@ -1037,7 +1087,16 @@ class CPUDevice : public Device {
render(task, tile, kg);
}
else if (tile.task == RenderTile::DENOISE) {
- denoise(denoising, tile);
+ if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
+ denoise_openimagedenoise(task, tile);
+ }
+ else if (task.denoising.type == DENOISER_NLM) {
+ if (denoising == NULL) {
+ denoising = new DenoisingTask(this, task);
+ denoising->profiler = &kg->profiler;
+ }
+ denoise_nlm(*denoising, tile);
+ }
task.update_progress(&tile, tile.w * tile.h);
}
@@ -1055,6 +1114,7 @@ class CPUDevice : public Device {
kg->~KernelGlobals();
kgbuffer.free();
delete split_kernel;
+ delete denoising;
}
void thread_denoise(DeviceTask &task)
@@ -1072,16 +1132,22 @@ class CPUDevice : public Device {
tile.stride = task.stride;
tile.buffers = task.buffers;
- DenoisingTask denoising(this, task);
+ if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
+ denoise_openimagedenoise(task, tile);
+ }
+ else {
+ DenoisingTask denoising(this, task);
- ProfilingState denoising_profiler_state;
- profiler.add_state(&denoising_profiler_state);
- denoising.profiler = &denoising_profiler_state;
+ ProfilingState denoising_profiler_state;
+ profiler.add_state(&denoising_profiler_state);
+ denoising.profiler = &denoising_profiler_state;
- denoise(denoising, tile);
- task.update_progress(&tile, tile.w * tile.h);
+ denoise_nlm(denoising, tile);
+
+ profiler.remove_state(&denoising_profiler_state);
+ }
- profiler.remove_state(&denoising_profiler_state);
+ task.update_progress(&tile, tile.w * tile.h);
}
void thread_film_convert(DeviceTask &task)
@@ -1155,13 +1221,24 @@ class CPUDevice : public Device {
/* split task into smaller ones */
list<DeviceTask> tasks;
- if (task.type == DeviceTask::SHADER)
+ if (task.type == DeviceTask::DENOISE_BUFFER &&
+ task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
+ /* Denoise entire buffer at once with OIDN, it has own threading. */
+ tasks.push_back(task);
+ }
+ else if (task.type == DeviceTask::SHADER) {
task.split(tasks, info.cpu_threads, 256);
- else
+ }
+ else {
task.split(tasks, info.cpu_threads);
+ }
- foreach (DeviceTask &task, tasks)
- task_pool.push(new CPUDeviceTask(this, task));
+ foreach (DeviceTask &task, tasks) {
+ task_pool.push([=] {
+ DeviceTask task_copy = task;
+ thread_run(task_copy);
+ });
+ }
}
void task_wait()
@@ -1326,7 +1403,7 @@ int2 CPUSplitKernel::split_kernel_local_size()
int2 CPUSplitKernel::split_kernel_global_size(device_memory & /*kg*/,
device_memory & /*data*/,
- DeviceTask * /*task*/)
+ DeviceTask & /*task*/)
{
return make_int2(1, 1);
}
@@ -1358,6 +1435,10 @@ void device_cpu_info(vector<DeviceInfo> &devices)
info.has_osl = true;
info.has_half_images = true;
info.has_profiling = true;
+ info.denoisers = DENOISER_NLM;
+ if (openimagedenoise_supported()) {
+ info.denoisers |= DENOISER_OPENIMAGEDENOISE;
+ }
devices.insert(devices.begin(), info);
}
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 04c04761311..d9ffcceb06e 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -130,6 +130,7 @@ void device_cuda_info(vector<DeviceInfo> &devices)
info.has_half_images = (major >= 3);
info.has_volume_decoupled = false;
info.has_adaptive_stop_per_sample = false;
+ info.denoisers = DENOISER_NLM;
/* Check if the device has P2P access to any other device in the system. */
for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) {
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index ac17c02a427..89de80a5bcd 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -56,8 +56,8 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
tile_info->frames[i] = task.denoising_frames[i - 1];
}
- write_passes = task.denoising_write_passes;
- do_filter = task.denoising_do_filter;
+ do_prefilter = task.denoising.store_passes && task.denoising.type == DENOISER_NLM;
+ do_filter = task.denoising.use && task.denoising.type == DENOISER_NLM;
}
DenoisingTask::~DenoisingTask()
@@ -91,7 +91,7 @@ void DenoisingTask::set_render_buffer(RenderTile *rtiles)
target_buffer.stride = rtiles[9].stride;
target_buffer.ptr = rtiles[9].buffer;
- if (write_passes && rtiles[9].buffers) {
+ if (do_prefilter && rtiles[9].buffers) {
target_buffer.denoising_output_offset =
rtiles[9].buffers->params.get_denoising_prefiltered_offset();
}
@@ -111,7 +111,7 @@ void DenoisingTask::setup_denoising_buffer()
rect = rect_clip(rect,
make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3]));
- buffer.use_intensity = write_passes || (tile_info->num_frames > 1);
+ buffer.use_intensity = do_prefilter || (tile_info->num_frames > 1);
buffer.passes = buffer.use_intensity ? 15 : 14;
buffer.width = rect.z - rect.x;
buffer.stride = align_up(buffer.width, 4);
@@ -343,7 +343,7 @@ void DenoisingTask::run_denoising(RenderTile *tile)
reconstruct();
}
- if (write_passes) {
+ if (do_prefilter) {
write_buffer();
}
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
index bd1d0193dbd..4c122e981eb 100644
--- a/intern/cycles/device/device_denoising.h
+++ b/intern/cycles/device/device_denoising.h
@@ -60,7 +60,7 @@ class DenoisingTask {
int4 rect;
int4 filter_area;
- bool write_passes;
+ bool do_prefilter;
bool do_filter;
struct DeviceFunctions {
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 020b9e10e60..fd14bbdccc5 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -396,8 +396,8 @@ class MultiDevice : public Device {
size_t existing_size = mem.device_size;
/* This is a hack to only allocate the tile buffers on denoising devices
- * Similarily the tile buffers also need to be allocated separately on all devices so any
- * overlap rendered for denoising does not interfer with each other */
+ * Similarly the tile buffers also need to be allocated separately on all devices so any
+ * overlap rendered for denoising does not interfere with each other */
if (strcmp(mem.name, "RenderBuffers") == 0) {
vector<device_ptr> device_pointers;
device_pointers.reserve(devices.size());
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index 0933d51f321..8904b517e92 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -313,6 +313,7 @@ void device_network_info(vector<DeviceInfo> &devices)
info.has_volume_decoupled = false;
info.has_adaptive_stop_per_sample = false;
info.has_osl = false;
+ info.denoisers = DENOISER_NONE;
devices.push_back(info);
}
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 8a0b128697f..39b9ef70192 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -120,6 +120,7 @@ void device_opencl_info(vector<DeviceInfo> &devices)
info.use_split_kernel = true;
info.has_volume_decoupled = false;
info.has_adaptive_stop_per_sample = false;
+ info.denoisers = DENOISER_NLM;
info.id = id;
/* Check OpenCL extensions */
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index fbf6a914744..ececca3df53 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -246,7 +246,7 @@ class OptiXDevice : public CUDADevice {
~OptiXDevice()
{
// Stop processing any more tasks
- task_pool.stop();
+ task_pool.cancel();
// Make CUDA context current
const CUDAContextScope scope(cuContext);
@@ -428,11 +428,20 @@ class OptiXDevice : public CUDADevice {
group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
if (requested_features.use_hair) {
- // Add curve intersection programs
group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
- group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve";
group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
- group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve";
+
+ // Add curve intersection programs
+ if (requested_features.use_hair_thick) {
+ // Slower programs for thick hair since that also slows down ribbons.
+ // Ideally this should not be needed.
+ group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
+ group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
+ }
+ else {
+ group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
+ group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
+ }
}
if (requested_features.use_subsurface || requested_features.use_shader_raytrace) {
@@ -712,7 +721,7 @@ class OptiXDevice : public CUDADevice {
const CUDAContextScope scope(cuContext);
// Choose between OptiX and NLM denoising
- if (task.denoising_use_optix) {
+ if (task.denoising.type == DENOISER_OPTIX) {
// Map neighboring tiles onto this device, indices are as following:
// Where index 4 is the center tile and index 9 is the target for the result.
// 0 1 2
@@ -1436,21 +1445,21 @@ class OptiXDevice : public CUDADevice {
KernelData *const data = (KernelData *)host;
*(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
- update_launch_params(name, offsetof(KernelParams, data), host, size);
+ update_launch_params(offsetof(KernelParams, data), host, size);
return;
}
// Update data storage pointers in launch parameters
# define KERNEL_TEX(data_type, tex_name) \
if (strcmp(name, #tex_name) == 0) { \
- update_launch_params(name, offsetof(KernelParams, tex_name), host, size); \
+ update_launch_params(offsetof(KernelParams, tex_name), host, size); \
return; \
}
# include "kernel/kernel_textures.h"
# undef KERNEL_TEX
}
- void update_launch_params(const char *name, size_t offset, void *data, size_t data_size)
+ void update_launch_params(size_t offset, void *data, size_t data_size)
{
const CUDAContextScope scope(cuContext);
@@ -1463,15 +1472,6 @@ class OptiXDevice : public CUDADevice {
void task_add(DeviceTask &task) override
{
- struct OptiXDeviceTask : public DeviceTask {
- OptiXDeviceTask(OptiXDevice *device, DeviceTask &task, int task_index) : DeviceTask(task)
- {
- // Using task index parameter instead of thread index, since number of CUDA streams may
- // differ from number of threads
- run = function_bind(&OptiXDevice::thread_run, device, *this, task_index);
- }
- };
-
// Upload texture information to device if it has changed since last launch
load_texture_info();
@@ -1483,7 +1483,10 @@ class OptiXDevice : public CUDADevice {
if (task.type == DeviceTask::DENOISE_BUFFER) {
// Execute denoising in a single thread (e.g. to avoid race conditions during creation)
- task_pool.push(new OptiXDeviceTask(this, task, 0));
+ task_pool.push([=] {
+ DeviceTask task_copy = task;
+ thread_run(task_copy, 0);
+ });
return;
}
@@ -1493,8 +1496,15 @@ class OptiXDevice : public CUDADevice {
// Queue tasks in internal task pool
int task_index = 0;
- for (DeviceTask &task : tasks)
- task_pool.push(new OptiXDeviceTask(this, task, task_index++));
+ for (DeviceTask &task : tasks) {
+ task_pool.push([=] {
+ // Using task index parameter instead of thread index, since number of CUDA streams may
+ // differ from number of threads
+ DeviceTask task_copy = task;
+ thread_run(task_copy, task_index);
+ });
+ task_index++;
+ }
}
void task_wait() override
@@ -1551,6 +1561,7 @@ void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo
info.type = DEVICE_OPTIX;
info.id += "_OptiX";
+ info.denoisers |= DENOISER_OPTIX;
devices.push_back(info);
}
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index f22d8761058..4c288f60c16 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -145,7 +145,7 @@ size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory &kg,
return max_buffer_size / size_per_element;
}
-bool DeviceSplitKernel::path_trace(DeviceTask *task,
+bool DeviceSplitKernel::path_trace(DeviceTask &task,
RenderTile &tile,
device_memory &kgbuffer,
device_memory &kernel_data)
@@ -222,9 +222,9 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
subtile.start_sample = tile.sample;
subtile.num_samples = samples_per_second;
- if (task->adaptive_sampling.use) {
- subtile.num_samples = task->adaptive_sampling.align_dynamic_samples(subtile.start_sample,
- subtile.num_samples);
+ if (task.adaptive_sampling.use) {
+ subtile.num_samples = task.adaptive_sampling.align_dynamic_samples(subtile.start_sample,
+ subtile.num_samples);
}
/* Don't go beyond requested number of samples. */
@@ -286,7 +286,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size);
- if (task->get_cancel() && cancel_time == DBL_MAX) {
+ if (task.get_cancel() && cancel_time == DBL_MAX) {
/* Wait up to twice as many seconds for current samples to finish
* to avoid artifacts in render result from ending too soon.
*/
@@ -323,7 +323,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
}
int filter_sample = tile.sample + subtile.num_samples - 1;
- if (task->adaptive_sampling.use && task->adaptive_sampling.need_filter(filter_sample)) {
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
size_t buffer_size[2];
buffer_size[0] = round_up(tile.w, local_size[0]);
buffer_size[1] = round_up(tile.h, local_size[1]);
@@ -352,16 +352,16 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
#undef ENQUEUE_SPLIT_KERNEL
tile.sample += subtile.num_samples;
- task->update_progress(&tile, tile.w * tile.h * subtile.num_samples);
+ task.update_progress(&tile, tile.w * tile.h * subtile.num_samples);
time_multiplier = min(time_multiplier << 1, 10);
- if (task->get_cancel()) {
+ if (task.get_cancel()) {
return true;
}
}
- if (task->adaptive_sampling.use) {
+ if (task.adaptive_sampling.use) {
/* Reset the start samples. */
RenderTile subtile = tile;
subtile.start_sample = tile.start_sample;
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
index 9d6b9efdd62..07a21b10299 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -109,7 +109,7 @@ class DeviceSplitKernel {
virtual ~DeviceSplitKernel();
bool load_kernels(const DeviceRequestedFeatures &requested_features);
- bool path_trace(DeviceTask *task,
+ bool path_trace(DeviceTask &task,
RenderTile &rtile,
device_memory &kgbuffer,
device_memory &kernel_data);
@@ -137,7 +137,7 @@ class DeviceSplitKernel {
virtual int2 split_kernel_local_size() = 0;
virtual int2 split_kernel_global_size(device_memory &kg,
device_memory &data,
- DeviceTask *task) = 0;
+ DeviceTask &task) = 0;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index 7485e1b41de..6e7c184c6c9 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -50,7 +50,7 @@ DeviceTask::DeviceTask(Type type_)
last_update_time = time_dt();
}
-int DeviceTask::get_subtask_count(int num, int max_size)
+int DeviceTask::get_subtask_count(int num, int max_size) const
{
if (max_size != 0) {
int max_size_num;
@@ -78,7 +78,7 @@ int DeviceTask::get_subtask_count(int num, int max_size)
return num;
}
-void DeviceTask::split(list<DeviceTask> &tasks, int num, int max_size)
+void DeviceTask::split(list<DeviceTask> &tasks, int num, int max_size) const
{
num = get_subtask_count(num, max_size);
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 8c4e682adb1..600973b8100 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -21,7 +21,6 @@
#include "util/util_function.h"
#include "util/util_list.h"
-#include "util/util_task.h"
CCL_NAMESPACE_BEGIN
@@ -32,8 +31,33 @@ class RenderBuffers;
class RenderTile;
class Tile;
+enum DenoiserType {
+ DENOISER_NLM = 1,
+ DENOISER_OPTIX = 2,
+ DENOISER_OPENIMAGEDENOISE = 4,
+ DENOISER_NUM,
+
+ DENOISER_NONE = 0,
+ DENOISER_ALL = ~0,
+};
+
+typedef int DenoiserTypeMask;
+
class DenoiseParams {
public:
+ /* Apply denoiser to image. */
+ bool use;
+ /* Output denoising data passes (possibly without applying the denoiser). */
+ bool store_passes;
+
+ /* Denoiser type. */
+ DenoiserType type;
+
+ /* Viewport start sample. */
+ int start_sample;
+
+ /** Native Denoiser **/
+
/* Pixel radius for neighboring pixels to take into account. */
int radius;
/* Controls neighbor pixel weighting for the denoising filter. */
@@ -47,18 +71,36 @@ class DenoiseParams {
int neighbor_frames;
/* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */
bool clamp_input;
+
+ /** Optix Denoiser **/
+
/* Passes handed over to the OptiX denoiser (default to color + albedo). */
int optix_input_passes;
DenoiseParams()
{
+ use = false;
+ store_passes = false;
+
+ type = DENOISER_NLM;
+
radius = 8;
strength = 0.5f;
feature_strength = 0.5f;
relative_pca = false;
neighbor_frames = 2;
clamp_input = true;
+
optix_input_passes = 2;
+
+ start_sample = 0;
+ }
+
+ /* Test if a denoising task needs to run, also to prefilter passes for the native
+ * denoiser when we are not applying denoising to the combined image. */
+ bool need_denoising_task() const
+ {
+ return (use || (store_passes && type == DENOISER_NLM));
}
};
@@ -75,7 +117,7 @@ class AdaptiveSampling {
int min_samples;
};
-class DeviceTask : public Task {
+class DeviceTask {
public:
typedef enum { RENDER, FILM_CONVERT, SHADER, DENOISE_BUFFER } Type;
Type type;
@@ -98,8 +140,8 @@ class DeviceTask : public Task {
explicit DeviceTask(Type type = RENDER);
- int get_subtask_count(int num, int max_size = 0);
- void split(list<DeviceTask> &tasks, int num, int max_size = 0);
+ int get_subtask_count(int num, int max_size = 0) const;
+ void split(list<DeviceTask> &tasks, int num, int max_size = 0) const;
void update_progress(RenderTile *rtile, int pixel_samples = -1);
@@ -116,10 +158,6 @@ class DeviceTask : public Task {
bool denoising_from_render;
vector<int> denoising_frames;
- bool denoising_do_filter;
- bool denoising_use_optix;
- bool denoising_write_passes;
-
int pass_stride;
int frame_stride;
int target_pass_stride;
diff --git a/intern/cycles/device/opencl/device_opencl.h b/intern/cycles/device/opencl/device_opencl.h
index 389268e1c2a..e0140996cf0 100644
--- a/intern/cycles/device/opencl/device_opencl.h
+++ b/intern/cycles/device/opencl/device_opencl.h
@@ -23,6 +23,7 @@
# include "util/util_map.h"
# include "util/util_param.h"
# include "util/util_string.h"
+# include "util/util_task.h"
# include "clew.h"
@@ -258,6 +259,8 @@ class OpenCLDevice : public Device {
TaskPool load_required_kernel_task_pool;
/* Task pool for optional kernels (feature kernels during foreground rendering) */
TaskPool load_kernel_task_pool;
+ std::atomic<int> load_kernel_num_compiling;
+
cl_context cxContext;
cl_command_queue cqCommandQueue;
cl_platform_id cpPlatform;
@@ -455,14 +458,6 @@ class OpenCLDevice : public Device {
void denoise(RenderTile &tile, DenoisingTask &denoising);
- class OpenCLDeviceTask : public DeviceTask {
- public:
- OpenCLDeviceTask(OpenCLDevice *device, DeviceTask &task) : DeviceTask(task)
- {
- run = function_bind(&OpenCLDevice::thread_run, device, this);
- }
- };
-
int get_split_task_count(DeviceTask & /*task*/)
{
return 1;
@@ -470,7 +465,10 @@ class OpenCLDevice : public Device {
void task_add(DeviceTask &task)
{
- task_pool.push(new OpenCLDeviceTask(this, task));
+ task_pool.push([=] {
+ DeviceTask task_copy = task;
+ thread_run(task_copy);
+ });
}
void task_wait()
@@ -483,7 +481,7 @@ class OpenCLDevice : public Device {
task_pool.cancel();
}
- void thread_run(DeviceTask *task);
+ void thread_run(DeviceTask &task);
virtual BVHLayoutMask get_bvh_layout_mask() const
{
diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp
index beb3174b111..8c94815b193 100644
--- a/intern/cycles/device/opencl/device_opencl_impl.cpp
+++ b/intern/cycles/device/opencl/device_opencl_impl.cpp
@@ -542,7 +542,7 @@ class OpenCLSplitKernel : public DeviceSplitKernel {
virtual int2 split_kernel_global_size(device_memory &kg,
device_memory &data,
- DeviceTask * /*task*/)
+ DeviceTask & /*task*/)
{
cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
/* Use small global size on CPU devices as it seems to be much faster. */
@@ -610,6 +610,7 @@ void OpenCLDevice::opencl_assert_err(cl_int err, const char *where)
OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
: Device(info, stats, profiler, background),
+ load_kernel_num_compiling(0),
kernel_programs(this),
preview_programs(this),
memory_manager(this),
@@ -684,9 +685,9 @@ OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, b
OpenCLDevice::~OpenCLDevice()
{
- task_pool.stop();
- load_required_kernel_task_pool.stop();
- load_kernel_task_pool.stop();
+ task_pool.cancel();
+ load_required_kernel_task_pool.cancel();
+ load_kernel_task_pool.cancel();
memory_manager.free();
@@ -798,7 +799,11 @@ bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_feature
* internally within a single process. */
foreach (OpenCLProgram *program, programs) {
if (!program->load()) {
- load_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
+ load_kernel_num_compiling++;
+ load_kernel_task_pool.push([=] {
+ program->compile();
+ load_kernel_num_compiling--;
+ });
}
}
return true;
@@ -868,7 +873,7 @@ bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requeste
* Better to check on device level than per kernel as mixing preview and
* non-preview kernels does not work due to different data types */
if (use_preview_kernels) {
- use_preview_kernels = !load_kernel_task_pool.finished();
+ use_preview_kernels = load_kernel_num_compiling.load() > 0;
}
}
return split_kernel->load_kernels(requested_features);
@@ -895,7 +900,7 @@ DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state()
return DEVICE_KERNEL_USING_FEATURE_KERNEL;
}
- bool other_kernels_finished = load_kernel_task_pool.finished();
+ bool other_kernels_finished = load_kernel_num_compiling.load() == 0;
if (use_preview_kernels) {
if (other_kernels_finished) {
return DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE;
@@ -1336,20 +1341,20 @@ void OpenCLDevice::flush_texture_buffers()
memory_manager.alloc("texture_info", texture_info);
}
-void OpenCLDevice::thread_run(DeviceTask *task)
+void OpenCLDevice::thread_run(DeviceTask &task)
{
flush_texture_buffers();
- if (task->type == DeviceTask::RENDER) {
+ if (task.type == DeviceTask::RENDER) {
RenderTile tile;
- DenoisingTask denoising(this, *task);
+ DenoisingTask denoising(this, task);
/* Allocate buffer for kernel globals */
device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals");
kgbuffer.alloc_to_device(1);
/* Keep rendering tiles until done. */
- while (task->acquire_tile(this, tile, task->tile_types)) {
+ while (task.acquire_tile(this, tile, task.tile_types)) {
if (tile.task == RenderTile::PATH_TRACE) {
assert(tile.task == RenderTile::PATH_TRACE);
scoped_timer timer(&tile.buffers->render_time);
@@ -1368,42 +1373,42 @@ void OpenCLDevice::thread_run(DeviceTask *task)
clFinish(cqCommandQueue);
}
else if (tile.task == RenderTile::BAKE) {
- bake(*task, tile);
+ bake(task, tile);
}
else if (tile.task == RenderTile::DENOISE) {
tile.sample = tile.start_sample + tile.num_samples;
denoise(tile, denoising);
- task->update_progress(&tile, tile.w * tile.h);
+ task.update_progress(&tile, tile.w * tile.h);
}
- task->release_tile(tile);
+ task.release_tile(tile);
}
kgbuffer.free();
}
- else if (task->type == DeviceTask::SHADER) {
- shader(*task);
+ else if (task.type == DeviceTask::SHADER) {
+ shader(task);
}
- else if (task->type == DeviceTask::FILM_CONVERT) {
- film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
+ else if (task.type == DeviceTask::FILM_CONVERT) {
+ film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
}
- else if (task->type == DeviceTask::DENOISE_BUFFER) {
+ else if (task.type == DeviceTask::DENOISE_BUFFER) {
RenderTile tile;
- tile.x = task->x;
- tile.y = task->y;
- tile.w = task->w;
- tile.h = task->h;
- tile.buffer = task->buffer;
- tile.sample = task->sample + task->num_samples;
- tile.num_samples = task->num_samples;
- tile.start_sample = task->sample;
- tile.offset = task->offset;
- tile.stride = task->stride;
- tile.buffers = task->buffers;
-
- DenoisingTask denoising(this, *task);
+ tile.x = task.x;
+ tile.y = task.y;
+ tile.w = task.w;
+ tile.h = task.h;
+ tile.buffer = task.buffer;
+ tile.sample = task.sample + task.num_samples;
+ tile.num_samples = task.num_samples;
+ tile.start_sample = task.sample;
+ tile.offset = task.offset;
+ tile.stride = task.stride;
+ tile.buffers = task.buffers;
+
+ DenoisingTask denoising(this, task);
denoise(tile, denoising);
- task->update_progress(&tile, tile.w * tile.h);
+ task.update_progress(&tile, tile.w * tile.h);
}
}
@@ -1937,10 +1942,8 @@ void OpenCLDevice::bake(DeviceTask &task, RenderTile &rtile)
clFinish(cqCommandQueue);
}
-string OpenCLDevice::kernel_build_options(const string *debug_src)
+static bool kernel_build_opencl_2(cl_device_id cdDevice)
{
- string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
-
/* Build with OpenCL 2.0 if available, this improves performance
* with AMD OpenCL drivers on Windows and Linux (legacy drivers).
* Note that OpenCL selects the highest 1.x version by default,
@@ -1948,10 +1951,36 @@ string OpenCLDevice::kernel_build_options(const string *debug_src)
int version_major, version_minor;
if (OpenCLInfo::get_device_version(cdDevice, &version_major, &version_minor)) {
if (version_major >= 2) {
- build_options += "-cl-std=CL2.0 ";
+ /* This appears to trigger a driver bug in Radeon RX cards with certain
+ * driver version, so don't use OpenCL 2.0 for those. */
+ string device_name = OpenCLInfo::get_readable_device_name(cdDevice);
+ if (string_startswith(device_name, "Radeon RX 4") ||
+ string_startswith(device_name, "Radeon (TM) RX 4") ||
+ string_startswith(device_name, "Radeon RX 5") ||
+ string_startswith(device_name, "Radeon (TM) RX 5")) {
+ char version[256] = "";
+ int driver_major, driver_minor;
+ clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
+ if (sscanf(version, "OpenCL 2.0 AMD-APP (%d.%d)", &driver_major, &driver_minor) == 2) {
+ return !(driver_major == 3075 && driver_minor <= 12);
+ }
+ }
+
+ return true;
}
}
+ return false;
+}
+
+string OpenCLDevice::kernel_build_options(const string *debug_src)
+{
+ string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
+
+ if (kernel_build_opencl_2(cdDevice)) {
+ build_options += "-cl-std=CL2.0 ";
+ }
+
if (platform_name == "NVIDIA CUDA") {
build_options +=
"-D__KERNEL_OPENCL_NVIDIA__ "