git.blender.org/blender.git
Diffstat (limited to 'intern/cycles/device'):
 -rw-r--r--  intern/cycles/device/CMakeLists.txt            |    2
 -rw-r--r--  intern/cycles/device/device.cpp                |   12
 -rw-r--r--  intern/cycles/device/device.h                  |   13
 -rw-r--r--  intern/cycles/device/device_cpu.cpp            |  846
 -rw-r--r--  intern/cycles/device/device_cuda.cpp           |  495
 -rw-r--r--  intern/cycles/device/device_denoising.cpp      |  218
 -rw-r--r--  intern/cycles/device/device_denoising.h        |  145
 -rw-r--r--  intern/cycles/device/device_memory.h           |   44
 -rw-r--r--  intern/cycles/device/device_multi.cpp          |   54
 -rw-r--r--  intern/cycles/device/device_split_kernel.cpp   |    6
 -rw-r--r--  intern/cycles/device/device_split_kernel.h     |    6
 -rw-r--r--  intern/cycles/device/device_task.cpp            |    6
 -rw-r--r--  intern/cycles/device/device_task.h             |   14
 -rw-r--r--  intern/cycles/device/opencl/opencl.h           |   53
 -rw-r--r--  intern/cycles/device/opencl/opencl_base.cpp    |  465
 -rw-r--r--  intern/cycles/device/opencl/opencl_mega.cpp    |   54
 -rw-r--r--  intern/cycles/device/opencl/opencl_split.cpp   |   45
 -rw-r--r--  intern/cycles/device/opencl/opencl_util.cpp    |   14
 18 files changed, 1982 insertions(+), 510 deletions(-)
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 6ef2aa1caad..74ec57ddf74 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -25,6 +25,7 @@ set(SRC
device.cpp
device_cpu.cpp
device_cuda.cpp
+ device_denoising.cpp
device_multi.cpp
device_opencl.cpp
device_split_kernel.cpp
@@ -48,6 +49,7 @@ endif()
set(SRC_HEADERS
device.h
+ device_denoising.h
device_memory.h
device_intern.h
device_network.h
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 4c4e862ed1f..949c5f932a4 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -549,4 +549,16 @@ void Device::free_memory()
devices.free_memory();
}
+
+device_sub_ptr::device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type)
+ : device(device)
+{
+ ptr = device->mem_alloc_sub_ptr(mem, offset, size, type);
+}
+
+device_sub_ptr::~device_sub_ptr()
+{
+ device->mem_free_sub_ptr(ptr);
+}
+
CCL_NAMESPACE_END
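The device_sub_ptr added above is a small RAII helper: on construction it asks the device for a pointer into an existing allocation (mem_alloc_sub_ptr), and on destruction it releases it (mem_free_sub_ptr), so the denoiser can slice one large buffer into named passes without manual cleanup. A minimal standalone sketch of the same pattern, with host memory standing in for device memory (ScopedSubPointer and Allocation are illustrative names, not Cycles API):

#include <cassert>
#include <cstddef>
#include <vector>

/* Stand-in for a device allocation: one big float buffer. */
struct Allocation {
	std::vector<float> data;
};

/* Illustrative RAII sub-pointer, valid only while it is in scope,
 * mirroring how device_sub_ptr wraps mem_alloc_sub_ptr()/mem_free_sub_ptr(). */
class ScopedSubPointer {
public:
	ScopedSubPointer(Allocation &alloc, size_t offset, size_t size)
	: ptr_(alloc.data.data() + offset)
	{
		assert(offset + size <= alloc.data.size());
	}
	~ScopedSubPointer() { /* a real device backend would release the sub-allocation here */ }
	float *operator*() const { return ptr_; }
private:
	float *ptr_;
};

int main()
{
	Allocation buffer{std::vector<float>(4 * 1024)};
	{
		/* Slice pass 2 out of a buffer laid out as consecutive 1024-float passes. */
		ScopedSubPointer pass(buffer, 2 * 1024, 1024);
		(*pass)[0] = 1.0f;
	} /* Sub-pointer goes out of scope here; the underlying buffer stays alive. */
	return 0;
}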
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 21d29a801ae..c22969d7dc6 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -228,6 +228,7 @@ struct DeviceDrawParams {
};
class Device {
+ friend class device_sub_ptr;
protected:
enum {
FALLBACK_SHADER_STATUS_NONE = 0,
@@ -250,6 +251,14 @@ protected:
bool bind_fallback_display_space_shader(const float width, const float height);
+ virtual device_ptr mem_alloc_sub_ptr(device_memory& /*mem*/, int /*offset*/, int /*size*/, MemoryType /*type*/)
+ {
+ /* Only required for devices that implement denoising. */
+ assert(false);
+ return (device_ptr) 0;
+ }
+ virtual void mem_free_sub_ptr(device_ptr /*ptr*/) {};
+
public:
virtual ~Device();
@@ -278,6 +287,8 @@ public:
virtual void mem_zero(device_memory& mem) = 0;
virtual void mem_free(device_memory& mem) = 0;
+ virtual int mem_address_alignment() { return 16; }
+
/* constant memory */
virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
@@ -326,6 +337,8 @@ public:
/* multi device */
virtual void map_tile(Device * /*sub_device*/, RenderTile& /*tile*/) {}
virtual int device_number(Device * /*sub_device*/) { return 0; }
+ virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {}
+ virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {}
/* static */
static Device *create(DeviceInfo& info, Stats &stats, bool background = true);
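The new map_neighbor_tiles()/unmap_neighbor_tiles() hooks, and the RenderTile[9] arrays used later in this commit, follow a row-major 3×3 convention: index 4 is the tile being denoised and the other eight slots are its neighbors. A small sketch of that indexing, assuming a hypothetical neighbor_index() helper:

#include <cstdio>

/* Row-major index into a 3x3 neighborhood: dx, dy in {-1, 0, 1}.
 * The center tile is (0, 0) -> index 4, matching rtiles[4] = tile below. */
static int neighbor_index(int dx, int dy)
{
	return (dy + 1) * 3 + (dx + 1);
}

int main()
{
	for(int dy = -1; dy <= 1; dy++) {
		for(int dx = -1; dx <= 1; dx++) {
			printf("%d ", neighbor_index(dx, dy));
		}
		printf("\n");
	}
	/* Prints:
	 * 0 1 2
	 * 3 4 5
	 * 6 7 8
	 */
	return 0;
}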
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 84cce605182..1ecce8bd565 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -25,6 +25,7 @@
#endif
#include "device/device.h"
+#include "device/device_denoising.h"
#include "device/device_intern.h"
#include "device/device_split_kernel.h"
@@ -34,6 +35,8 @@
#include "kernel/split/kernel_split_data.h"
#include "kernel/kernel_globals.h"
+#include "kernel/filter/filter.h"
+
#include "kernel/osl/osl_shader.h"
#include "kernel/osl/osl_globals.h"
@@ -53,91 +56,107 @@ CCL_NAMESPACE_BEGIN
class CPUDevice;
-class CPUSplitKernel : public DeviceSplitKernel {
- CPUDevice *device;
-public:
- explicit CPUSplitKernel(CPUDevice *device);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
- RenderTile& rtile,
- int num_global_elements,
- device_memory& kernel_globals,
- device_memory& kernel_data_,
- device_memory& split_data,
- device_memory& ray_state,
- device_memory& queue_index,
- device_memory& use_queues_flag,
- device_memory& work_pool_wgs);
+/* Has to be outside of the class to be shared across template instantiations. */
+static const char *logged_architecture = "";
- virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
- virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
- virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
-};
-
-class CPUDevice : public Device
-{
- static unordered_map<string, void*> kernel_functions;
-
- static void register_kernel_function(const char* name, void* func)
+template<typename F>
+class KernelFunctions {
+public:
+ KernelFunctions()
{
- kernel_functions[name] = func;
+ kernel = (F)NULL;
}
- static const char* get_arch_name()
+ KernelFunctions(F kernel_default,
+ F kernel_sse2,
+ F kernel_sse3,
+ F kernel_sse41,
+ F kernel_avx,
+ F kernel_avx2)
{
+ const char *architecture_name = "default";
+ kernel = kernel_default;
+
+ /* Silence potential warnings about unused variables
+ * when compiling without some architectures. */
+ (void)kernel_sse2;
+ (void)kernel_sse3;
+ (void)kernel_sse41;
+ (void)kernel_avx;
+ (void)kernel_avx2;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
if(system_cpu_support_avx2()) {
- return "cpu_avx2";
+ architecture_name = "AVX2";
+ kernel = kernel_avx2;
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
- return "cpu_avx";
+ architecture_name = "AVX";
+ kernel = kernel_avx;
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
- return "cpu_sse41";
+ architecture_name = "SSE4.1";
+ kernel = kernel_sse41;
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
- return "cpu_sse3";
+ architecture_name = "SSE3";
+ kernel = kernel_sse3;
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if(system_cpu_support_sse2()) {
- return "cpu_sse2";
+ architecture_name = "SSE2";
+ kernel = kernel_sse2;
}
- else
#endif
- {
- return "cpu";
+
+ if(strcmp(architecture_name, logged_architecture) != 0) {
+ VLOG(1) << "Will be using " << architecture_name << " kernels.";
+ logged_architecture = architecture_name;
}
}
- template<typename F>
- static F get_kernel_function(string name)
- {
- name = string("kernel_") + get_arch_name() + "_" + name;
-
- unordered_map<string, void*>::iterator it = kernel_functions.find(name);
+ inline F operator()() const {
+ assert(kernel);
+ return kernel;
+ }
+protected:
+ F kernel;
+};
- if(it == kernel_functions.end()) {
- assert(!"kernel function not found");
- return NULL;
- }
+class CPUSplitKernel : public DeviceSplitKernel {
+ CPUDevice *device;
+public:
+ explicit CPUSplitKernel(CPUDevice *device);
- return (F)it->second;
- }
+ virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+ RenderTile& rtile,
+ int num_global_elements,
+ device_memory& kernel_globals,
+ device_memory& kernel_data_,
+ device_memory& split_data,
+ device_memory& ray_state,
+ device_memory& queue_index,
+ device_memory& use_queues_flag,
+ device_memory& work_pool_wgs);
- friend class CPUSplitKernel;
+ virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
+ virtual int2 split_kernel_local_size();
+ virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+ virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+};
+class CPUDevice : public Device
+{
public:
TaskPool task_pool;
KernelGlobals kernel_globals;
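The KernelFunctions template above replaces the old string-keyed kernel_functions map with typed function pointers selected once at construction, based on the widest instruction set the CPU supports; every later call goes straight through the stored pointer. A simplified sketch of the same dispatch pattern (the has_avx2()/has_sse2() probes and the add_* kernels are placeholders for Cycles' system_cpu_support_*() checks and real kernels):

#include <cassert>
#include <cstdio>

/* Placeholder CPU-capability probes; Cycles uses system_cpu_support_avx2() etc. */
static bool has_avx2() { return false; }
static bool has_sse2() { return true; }

template<typename F>
class DispatchedKernel {
public:
	DispatchedKernel(F kernel_default, F kernel_sse2, F kernel_avx2)
	{
		/* Pick the widest supported variant once; calls then go through
		 * a plain function pointer with no per-call string lookup. */
		kernel = kernel_default;
		if(has_avx2())      kernel = kernel_avx2;
		else if(has_sse2()) kernel = kernel_sse2;
	}
	F operator()() const { assert(kernel); return kernel; }
private:
	F kernel;
};

/* Three illustrative kernel variants sharing one signature. */
static void add_default(const float *a, const float *b, float *out, int n) { for(int i = 0; i < n; i++) out[i] = a[i] + b[i]; }
static void add_sse2(const float *a, const float *b, float *out, int n)    { add_default(a, b, out, n); }
static void add_avx2(const float *a, const float *b, float *out, int n)    { add_default(a, b, out, n); }

int main()
{
	DispatchedKernel<void(*)(const float*, const float*, float*, int)> add_kernel(add_default, add_sse2, add_avx2);
	float a[4] = {1, 2, 3, 4}, b[4] = {4, 3, 2, 1}, out[4];
	add_kernel()(a, b, out, 4);  /* operator() returns the chosen pointer, then we call it. */
	printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
	return 0;
}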
@@ -149,77 +168,89 @@ public:
bool use_split_kernel;
DeviceRequestedFeatures requested_features;
-
+
+ KernelFunctions<void(*)(KernelGlobals *, float *, unsigned int *, int, int, int, int, int)> path_trace_kernel;
+ KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel;
+ KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel;
+ KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel;
+
+ KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel;
+ KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int, bool)> filter_get_feature_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
+
+ KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
+
+ KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel;
+ KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel;
+
+ KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
+ ccl_global uint*, int, int, int, int, int, int, int, int, ccl_global int*, int,
+ ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)> data_init_kernel;
+ unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels;
+
+#define KERNEL_FUNCTIONS(name) \
+ KERNEL_NAME_EVAL(cpu, name), \
+ KERNEL_NAME_EVAL(cpu_sse2, name), \
+ KERNEL_NAME_EVAL(cpu_sse3, name), \
+ KERNEL_NAME_EVAL(cpu_sse41, name), \
+ KERNEL_NAME_EVAL(cpu_avx, name), \
+ KERNEL_NAME_EVAL(cpu_avx2, name)
+
CPUDevice(DeviceInfo& info, Stats &stats, bool background)
- : Device(info, stats, background)
+ : Device(info, stats, background),
+#define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name))
+ REGISTER_KERNEL(path_trace),
+ REGISTER_KERNEL(convert_to_half_float),
+ REGISTER_KERNEL(convert_to_byte),
+ REGISTER_KERNEL(shader),
+ REGISTER_KERNEL(filter_divide_shadow),
+ REGISTER_KERNEL(filter_get_feature),
+ REGISTER_KERNEL(filter_combine_halves),
+ REGISTER_KERNEL(filter_nlm_calc_difference),
+ REGISTER_KERNEL(filter_nlm_blur),
+ REGISTER_KERNEL(filter_nlm_calc_weight),
+ REGISTER_KERNEL(filter_nlm_update_output),
+ REGISTER_KERNEL(filter_nlm_normalize),
+ REGISTER_KERNEL(filter_construct_transform),
+ REGISTER_KERNEL(filter_nlm_construct_gramian),
+ REGISTER_KERNEL(filter_finalize),
+ REGISTER_KERNEL(data_init)
+#undef REGISTER_KERNEL
{
#ifdef WITH_OSL
kernel_globals.osl = &osl_globals;
#endif
-
- /* do now to avoid thread issues */
- system_cpu_support_sse2();
- system_cpu_support_sse3();
- system_cpu_support_sse41();
- system_cpu_support_avx();
- system_cpu_support_avx2();
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- VLOG(1) << "Will be using AVX2 kernels.";
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- VLOG(1) << "Will be using AVX kernels.";
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- VLOG(1) << "Will be using SSE4.1 kernels.";
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- VLOG(1) << "Will be using SSE3kernels.";
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- VLOG(1) << "Will be using SSE2 kernels.";
- }
- else
-#endif
- {
- VLOG(1) << "Will be using regular kernels.";
- }
-
use_split_kernel = DebugFlags().cpu.split_kernel;
if(use_split_kernel) {
VLOG(1) << "Will be using split kernel.";
}
- kernel_cpu_register_functions(register_kernel_function);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- kernel_cpu_sse2_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- kernel_cpu_sse3_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- kernel_cpu_sse41_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- kernel_cpu_avx_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- kernel_cpu_avx2_register_functions(register_kernel_function);
-#endif
+#define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name))
+ REGISTER_SPLIT_KERNEL(path_init);
+ REGISTER_SPLIT_KERNEL(scene_intersect);
+ REGISTER_SPLIT_KERNEL(lamp_emission);
+ REGISTER_SPLIT_KERNEL(do_volume);
+ REGISTER_SPLIT_KERNEL(queue_enqueue);
+ REGISTER_SPLIT_KERNEL(indirect_background);
+ REGISTER_SPLIT_KERNEL(shader_setup);
+ REGISTER_SPLIT_KERNEL(shader_sort);
+ REGISTER_SPLIT_KERNEL(shader_eval);
+ REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
+ REGISTER_SPLIT_KERNEL(subsurface_scatter);
+ REGISTER_SPLIT_KERNEL(direct_lighting);
+ REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
+ REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
+ REGISTER_SPLIT_KERNEL(next_iteration_setup);
+ REGISTER_SPLIT_KERNEL(indirect_subsurface);
+ REGISTER_SPLIT_KERNEL(buffer_update);
+#undef REGISTER_SPLIT_KERNEL
+#undef KERNEL_FUNCTIONS
}
~CPUDevice()
@@ -273,13 +304,17 @@ public:
if(!mem.data_pointer) {
free((void*)mem.device_pointer);
}
-
mem.device_pointer = 0;
stats.mem_free(mem.device_size);
mem.device_size = 0;
}
}
+ virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
+ {
+ return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
+ }
+
void const_copy_to(const char *name, void *host, size_t size)
{
kernel_const_copy(&kernel_globals, name, host, size);
@@ -326,13 +361,8 @@ public:
void thread_run(DeviceTask *task)
{
- if(task->type == DeviceTask::PATH_TRACE) {
- if(!use_split_kernel) {
- thread_path_trace(*task);
- }
- else {
- thread_path_trace_split(*task);
- }
+ if(task->type == DeviceTask::RENDER) {
+ thread_render(*task);
}
else if(task->type == DeviceTask::FILM_CONVERT)
thread_film_convert(*task);
@@ -349,116 +379,319 @@ public:
}
};
- void thread_path_trace(DeviceTask& task)
+ bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
{
- if(task_pool.canceled()) {
- if(task.need_finish_queue == false)
- return;
+ mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);
+
+ TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
+ for(int i = 0; i < 9; i++) {
+ tiles->buffers[i] = buffers[i];
}
- KernelGlobals kg = thread_kernel_globals_init();
- RenderTile tile;
+ return true;
+ }
- void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);
+ bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
+ DenoisingTask *task)
+ {
+ int4 rect = task->rect;
+ int r = task->nlm_state.r;
+ int f = task->nlm_state.f;
+ float a = task->nlm_state.a;
+ float k_2 = task->nlm_state.k_2;
+
+ int w = align_up(rect.z-rect.x, 4);
+ int h = rect.w-rect.y;
+
+ float *blurDifference = (float*) task->nlm_state.temporary_1_ptr;
+ float *difference = (float*) task->nlm_state.temporary_2_ptr;
+ float *weightAccum = (float*) task->nlm_state.temporary_3_ptr;
+
+ memset(weightAccum, 0, sizeof(float)*w*h);
+ memset((float*) out_ptr, 0, sizeof(float)*w*h);
+
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+
+ int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)};
+ filter_nlm_calc_difference_kernel()(dx, dy,
+ (float*) guide_ptr,
+ (float*) variance_ptr,
+ difference,
+ local_rect,
+ w, 0,
+ a, k_2);
+
+ filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f);
+ filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
+ filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f);
+
+ filter_nlm_update_output_kernel()(dx, dy,
+ blurDifference,
+ (float*) image_ptr,
+ (float*) out_ptr,
+ weightAccum,
+ local_rect,
+ w, f);
+ }
+
+ int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y};
+ filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- path_trace_kernel = kernel_cpu_avx2_path_trace;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- path_trace_kernel = kernel_cpu_avx_path_trace;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- path_trace_kernel = kernel_cpu_sse41_path_trace;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- path_trace_kernel = kernel_cpu_sse3_path_trace;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- path_trace_kernel = kernel_cpu_sse2_path_trace;
- }
- else
-#endif
- {
- path_trace_kernel = kernel_cpu_path_trace;
+ return true;
+ }
+
+ bool denoising_construct_transform(DenoisingTask *task)
+ {
+ for(int y = 0; y < task->filter_area.w; y++) {
+ for(int x = 0; x < task->filter_area.z; x++) {
+ filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
+ x + task->filter_area.x,
+ y + task->filter_area.y,
+ y*task->filter_area.z + x,
+ (float*) task->storage.transform.device_pointer,
+ (int*) task->storage.rank.device_pointer,
+ &task->rect.x,
+ task->buffer.pass_stride,
+ task->radius,
+ task->pca_threshold);
+ }
}
+ return true;
+ }
- while(task.acquire_tile(this, tile)) {
- float *render_buffer = (float*)tile.buffer;
- uint *rng_state = (uint*)tile.rng_state;
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
-
- for(int sample = start_sample; sample < end_sample; sample++) {
- if(task.get_cancel() || task_pool.canceled()) {
- if(task.need_finish_queue == false)
- break;
- }
+ bool denoising_reconstruct(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr guide_ptr,
+ device_ptr guide_variance_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+ {
+ mem_zero(task->storage.XtWX);
+ mem_zero(task->storage.XtWY);
+
+ float *difference = (float*) task->reconstruction_state.temporary_1_ptr;
+ float *blurDifference = (float*) task->reconstruction_state.temporary_2_ptr;
+
+ int r = task->radius;
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+
+ int local_rect[4] = {max(0, -dx), max(0, -dy),
+ task->reconstruction_state.source_w - max(0, dx),
+ task->reconstruction_state.source_h - max(0, dy)};
+ filter_nlm_calc_difference_kernel()(dx, dy,
+ (float*) guide_ptr,
+ (float*) guide_variance_ptr,
+ difference,
+ local_rect,
+ task->buffer.w,
+ task->buffer.pass_stride,
+ 1.0f,
+ task->nlm_k_2);
+ filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
+ filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4);
+ filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
+ filter_nlm_construct_gramian_kernel()(dx, dy,
+ blurDifference,
+ (float*) task->buffer.mem.device_pointer,
+ (float*) color_ptr,
+ (float*) color_variance_ptr,
+ (float*) task->storage.transform.device_pointer,
+ (int*) task->storage.rank.device_pointer,
+ (float*) task->storage.XtWX.device_pointer,
+ (float3*) task->storage.XtWY.device_pointer,
+ local_rect,
+ &task->reconstruction_state.filter_rect.x,
+ task->buffer.w,
+ task->buffer.h,
+ 4,
+ task->buffer.pass_stride);
+ }
+ for(int y = 0; y < task->filter_area.w; y++) {
+ for(int x = 0; x < task->filter_area.z; x++) {
+ filter_finalize_kernel()(x,
+ y,
+ y*task->filter_area.z + x,
+ task->buffer.w,
+ task->buffer.h,
+ (float*) output_ptr,
+ (int*) task->storage.rank.device_pointer,
+ (float*) task->storage.XtWX.device_pointer,
+ (float3*) task->storage.XtWY.device_pointer,
+ &task->reconstruction_state.buffer_params.x,
+ task->render_buffer.samples);
+ }
+ }
+ return true;
+ }
- for(int y = tile.y; y < tile.y + tile.h; y++) {
- for(int x = tile.x; x < tile.x + tile.w; x++) {
- path_trace_kernel(&kg, render_buffer, rng_state,
- sample, x, y, tile.offset, tile.stride);
- }
- }
+ bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
+ device_ptr mean_ptr, device_ptr variance_ptr,
+ int r, int4 rect, DenoisingTask *task)
+ {
+ (void) task;
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ filter_combine_halves_kernel()(x, y,
+ (float*) mean_ptr,
+ (float*) variance_ptr,
+ (float*) a_ptr,
+ (float*) b_ptr,
+ &rect.x,
+ r);
+ }
+ }
+ return true;
+ }
- tile.sample = sample + 1;
+ bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
+ device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr, DenoisingTask *task)
+ {
+ for(int y = task->rect.y; y < task->rect.w; y++) {
+ for(int x = task->rect.x; x < task->rect.z; x++) {
+ filter_divide_shadow_kernel()(task->render_buffer.samples,
+ task->tiles,
+ x, y,
+ (float*) a_ptr,
+ (float*) b_ptr,
+ (float*) sample_variance_ptr,
+ (float*) sv_variance_ptr,
+ (float*) buffer_variance_ptr,
+ &task->rect.x,
+ task->render_buffer.pass_stride,
+ task->render_buffer.denoising_data_offset,
+ use_split_kernel);
+ }
+ }
+ return true;
+ }
- task.update_progress(&tile, tile.w*tile.h);
+ bool denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ DenoisingTask *task)
+ {
+ for(int y = task->rect.y; y < task->rect.w; y++) {
+ for(int x = task->rect.x; x < task->rect.z; x++) {
+ filter_get_feature_kernel()(task->render_buffer.samples,
+ task->tiles,
+ mean_offset,
+ variance_offset,
+ x, y,
+ (float*) mean_ptr,
+ (float*) variance_ptr,
+ &task->rect.x,
+ task->render_buffer.pass_stride,
+ task->render_buffer.denoising_data_offset,
+ use_split_kernel);
}
+ }
+ return true;
+ }
- task.release_tile(tile);
+ void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
+ {
+ float *render_buffer = (float*)tile.buffer;
+ uint *rng_state = (uint*)tile.rng_state;
+ int start_sample = tile.start_sample;
+ int end_sample = tile.start_sample + tile.num_samples;
- if(task_pool.canceled()) {
+ for(int sample = start_sample; sample < end_sample; sample++) {
+ if(task.get_cancel() || task_pool.canceled()) {
if(task.need_finish_queue == false)
break;
}
+
+ for(int y = tile.y; y < tile.y + tile.h; y++) {
+ for(int x = tile.x; x < tile.x + tile.w; x++) {
+ path_trace_kernel()(kg, render_buffer, rng_state,
+ sample, x, y, tile.offset, tile.stride);
+ }
+ }
+
+ tile.sample = sample + 1;
+
+ task.update_progress(&tile, tile.w*tile.h);
}
+ }
+
+ void denoise(DeviceTask &task, RenderTile &tile)
+ {
+ tile.sample = tile.start_sample + tile.num_samples;
+
+ DenoisingTask denoising(this);
- thread_kernel_globals_free(&kg);
+ denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
+ denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+ denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.set_tiles = function_bind(&CPUDevice::denoising_set_tiles, this, _1, &denoising);
+
+ denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
+ denoising.render_buffer.samples = tile.sample;
+
+ RenderTile rtiles[9];
+ rtiles[4] = tile;
+ task.map_neighbor_tiles(rtiles, this);
+ denoising.tiles_from_rendertiles(rtiles);
+
+ denoising.init_from_devicetask(task);
+
+ denoising.run_denoising();
+
+ task.unmap_neighbor_tiles(rtiles, this);
+
+ task.update_progress(&tile, tile.w*tile.h);
}
- void thread_path_trace_split(DeviceTask& task)
+ void thread_render(DeviceTask& task)
{
if(task_pool.canceled()) {
if(task.need_finish_queue == false)
return;
}
- RenderTile tile;
-
- CPUSplitKernel split_kernel(this);
-
/* allocate buffer for kernel globals */
- device_memory kgbuffer;
- kgbuffer.resize(sizeof(KernelGlobals));
+ device_only_memory<KernelGlobals> kgbuffer;
+ kgbuffer.resize(1);
mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);
KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init());
- requested_features.max_closure = MAX_CLOSURE;
- if(!split_kernel.load_kernels(requested_features)) {
- thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
- mem_free(kgbuffer);
+ CPUSplitKernel *split_kernel = NULL;
+ if(use_split_kernel) {
+ split_kernel = new CPUSplitKernel(this);
+ requested_features.max_closure = MAX_CLOSURE;
+ if(!split_kernel->load_kernels(requested_features)) {
+ thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
+ mem_free(kgbuffer);
- return;
+ delete split_kernel;
+ return;
+ }
}
+ RenderTile tile;
while(task.acquire_tile(this, tile)) {
- device_memory data;
- split_kernel.path_trace(&task, tile, kgbuffer, data);
+ if(tile.task == RenderTile::PATH_TRACE) {
+ if(use_split_kernel) {
+ device_memory data;
+ split_kernel->path_trace(&task, tile, kgbuffer, data);
+ }
+ else {
+ path_trace(task, tile, kg);
+ }
+ }
+ else if(tile.task == RenderTile::DENOISE) {
+ denoise(task, tile);
+ }
task.release_tile(tile);
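In denoise() above, DenoisingTask stays device-agnostic: the device fills in a table of callbacks (functions.divide_shadow, functions.non_local_means, ...) via function_bind, binding its member functions plus the task pointer while leaving the per-call arguments open as placeholders. A reduced sketch of the same wiring using std::function/std::bind (FakeDevice, FakeTask and the single callback are illustrative, not the Cycles types):

#include <cstdio>
#include <functional>

using namespace std::placeholders;  /* _1, _2, ... */

struct FakeTask;

struct FakeDevice {
	/* Device-specific implementation of one denoising step. */
	bool divide_shadow(int a_ptr, int b_ptr, FakeTask *task)
	{
		(void)task;
		printf("divide_shadow(%d, %d)\n", a_ptr, b_ptr);
		return true;
	}
};

/* Device-agnostic task: it only knows the callback signatures. */
struct FakeTask {
	struct {
		std::function<bool(int, int)> divide_shadow;
	} functions;

	bool run()
	{
		/* The task drives the algorithm; the device supplies the kernels. */
		return functions.divide_shadow(1, 2);
	}
};

int main()
{
	FakeDevice device;
	FakeTask task;
	/* Bind the member function, the device instance and the task pointer;
	 * _1/_2 stay open for the per-call buffer arguments. */
	task.functions.divide_shadow = std::bind(&FakeDevice::divide_shadow, &device, _1, _2, &task);
	return task.run() ? 0 : 1;
}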
@@ -470,6 +703,7 @@ public:
thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
mem_free(kgbuffer);
+ delete split_kernel;
}
void thread_film_convert(DeviceTask& task)
@@ -477,86 +711,16 @@ public:
float sample_scale = 1.0f/(task.sample + 1);
if(task.rgba_half) {
- void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float;
- }
- else
-#endif
- {
- convert_to_half_float_kernel = kernel_cpu_convert_to_half_float;
- }
-
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
- convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
- sample_scale, x, y, task.offset, task.stride);
+ convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+ sample_scale, x, y, task.offset, task.stride);
}
else {
- void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte;
- }
- else
-#endif
- {
- convert_to_byte_kernel = kernel_cpu_convert_to_byte;
- }
-
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
- convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
- sample_scale, x, y, task.offset, task.stride);
+ convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+ sample_scale, x, y, task.offset, task.stride);
}
}
@@ -568,53 +732,17 @@ public:
#ifdef WITH_OSL
OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
- void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int);
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- shader_kernel = kernel_cpu_avx2_shader;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- shader_kernel = kernel_cpu_avx_shader;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- shader_kernel = kernel_cpu_sse41_shader;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- shader_kernel = kernel_cpu_sse3_shader;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- shader_kernel = kernel_cpu_sse2_shader;
- }
- else
-#endif
- {
- shader_kernel = kernel_cpu_shader;
- }
-
for(int sample = 0; sample < task.num_samples; sample++) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
- shader_kernel(&kg,
- (uint4*)task.shader_input,
- (float4*)task.shader_output,
- (float*)task.shader_output_luma,
- task.shader_eval_type,
- task.shader_filter,
- x,
- task.offset,
- sample);
+ shader_kernel()(&kg,
+ (uint4*)task.shader_input,
+ (float4*)task.shader_output,
+ (float*)task.shader_output_luma,
+ task.shader_eval_type,
+ task.shader_filter,
+ x,
+ task.offset,
+ sample);
if(task.get_cancel() || task_pool.canceled())
break;
@@ -751,58 +879,6 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
device_memory& use_queues_flags,
device_memory& work_pool_wgs)
{
- typedef void(*data_init_t)(KernelGlobals *kg,
- ccl_constant KernelData *data,
- ccl_global void *split_data_buffer,
- int num_elements,
- ccl_global char *ray_state,
- ccl_global uint *rng_state,
- int start_sample,
- int end_sample,
- int sx, int sy, int sw, int sh, int offset, int stride,
- ccl_global int *Queue_index,
- int queuesize,
- ccl_global char *use_queues_flag,
- ccl_global unsigned int *work_pool_wgs,
- unsigned int num_samples,
- ccl_global float *buffer);
-
- data_init_t data_init;
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- data_init = kernel_cpu_avx2_data_init;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- data_init = kernel_cpu_avx_data_init;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- data_init = kernel_cpu_sse41_data_init;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- data_init = kernel_cpu_sse3_data_init;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- data_init = kernel_cpu_sse2_data_init;
- }
- else
-#endif
- {
- data_init = kernel_cpu_data_init;
- }
-
KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
@@ -810,26 +886,26 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
for(int x = 0; x < dim.global_size[0]; x++) {
kg->global_id = make_int2(x, y);
- data_init((KernelGlobals*)kernel_globals.device_pointer,
- (KernelData*)data.device_pointer,
- (void*)split_data.device_pointer,
- num_global_elements,
- (char*)ray_state.device_pointer,
- (uint*)rtile.rng_state,
- rtile.start_sample,
- rtile.start_sample + rtile.num_samples,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- rtile.offset,
- rtile.stride,
- (int*)queue_index.device_pointer,
- dim.global_size[0] * dim.global_size[1],
- (char*)use_queues_flags.device_pointer,
- (uint*)work_pool_wgs.device_pointer,
- rtile.num_samples,
- (float*)rtile.buffer);
+ device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer,
+ (KernelData*)data.device_pointer,
+ (void*)split_data.device_pointer,
+ num_global_elements,
+ (char*)ray_state.device_pointer,
+ (uint*)rtile.rng_state,
+ rtile.start_sample,
+ rtile.start_sample + rtile.num_samples,
+ rtile.x,
+ rtile.y,
+ rtile.w,
+ rtile.h,
+ rtile.offset,
+ rtile.stride,
+ (int*)queue_index.device_pointer,
+ dim.global_size[0] * dim.global_size[1],
+ (char*)use_queues_flags.device_pointer,
+ (uint*)work_pool_wgs.device_pointer,
+ rtile.num_samples,
+ (float*)rtile.buffer);
}
}
@@ -840,7 +916,7 @@ SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_nam
{
CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
- kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name);
+ kernel->func = device->split_kernels[kernel_name]();
if(!kernel->func) {
delete kernel;
return NULL;
@@ -864,8 +940,6 @@ uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device
return split_data_buffer_size(kg, num_threads);
}
-unordered_map<string, void*> CPUDevice::kernel_functions;
-
Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
{
return new CPUDevice(info, stats, background);
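Both the CPU denoising loops above and the CUDA launches below walk the full (2r+1)×(2r+1) non-local-means search window with one flat loop, decoding each index i into a pixel offset (dx, dy) and clamping the processed rectangle so the shifted image stays in bounds. A small sketch of that decoding and clamping (the fixed r, w and h values are illustrative):

#include <algorithm>
#include <cstdio>

int main()
{
	const int r = 2;              /* search radius, task->radius in Cycles */
	const int w = 16, h = 16;     /* extent of the rectangle being filtered */

	for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
		/* Same decoding as the CPU and CUDA loops: row-major over the window. */
		int dy = i / (2*r+1) - r;
		int dx = i % (2*r+1) - r;

		/* Clamp so that both a pixel and its (dx, dy)-shifted partner
		 * fall inside the rectangle. */
		int x0 = std::max(0, -dx), y0 = std::max(0, -dy);
		int x1 = w - std::max(0, dx), y1 = h - std::max(0, dy);

		if(i < 3 || i == (2*r+1)*(2*r+1)/2) {
			printf("i=%2d -> (dx, dy)=(%2d, %2d), rect=[%d, %d)x[%d, %d)\n",
			       i, dx, dy, x0, x1, y0, y1);
		}
	}
	return 0;
}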
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index e497ec6b0e1..9a8537a6722 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -21,11 +21,14 @@
#include <string.h>
#include "device/device.h"
+#include "device/device_denoising.h"
#include "device/device_intern.h"
#include "device/device_split_kernel.h"
#include "render/buffers.h"
+#include "kernel/filter/filter_defines.h"
+
#ifdef WITH_CUDA_DYNLOAD
# include "cuew.h"
#else
@@ -113,7 +116,7 @@ public:
DedicatedTaskPool task_pool;
CUdevice cuDevice;
CUcontext cuContext;
- CUmodule cuModule;
+ CUmodule cuModule, cuFilterModule;
map<device_ptr, bool> tex_interp_map;
map<device_ptr, uint> tex_bindless_map;
int cuDevId;
@@ -170,7 +173,7 @@ public:
CUresult result = stmt; \
\
if(result != CUDA_SUCCESS) { \
- string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+ string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
if(error_msg == "") \
error_msg = message; \
fprintf(stderr, "%s\n", message.c_str()); \
@@ -301,7 +304,8 @@ public:
* kernel sources md5 and only depends on compiler or compilation settings.
*/
string compile_kernel_get_common_cflags(
- const DeviceRequestedFeatures& requested_features, bool split=false)
+ const DeviceRequestedFeatures& requested_features,
+ bool filter=false, bool split=false)
{
const int cuda_version = cuewCompilerVersion();
const int machine = system_cpu_bits();
@@ -316,7 +320,7 @@ public:
machine,
cuda_version,
include_path.c_str());
- if(use_adaptive_compilation()) {
+ if(!filter && use_adaptive_compilation()) {
cflags += " " + requested_features.get_build_options();
}
const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
@@ -364,8 +368,22 @@ public:
return true;
}
- string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
+ string compile_kernel(const DeviceRequestedFeatures& requested_features,
+ bool filter=false, bool split=false)
{
+ const char *name, *source;
+ if(filter) {
+ name = "filter";
+ source = "filter.cu";
+ }
+ else if(split) {
+ name = "kernel_split";
+ source = "kernel_split.cu";
+ }
+ else {
+ name = "kernel";
+ source = "kernel.cu";
+ }
/* Compute cubin name. */
int major, minor;
cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
@@ -373,9 +391,8 @@ public:
/* Attempt to use kernel provided with Blender. */
if(!use_adaptive_compilation()) {
- const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
- : "lib/kernel_sm_%d%d.cubin",
- major, minor));
+ const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin",
+ name, major, minor));
VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
if(path_exists(cubin)) {
VLOG(1) << "Using precompiled kernel.";
@@ -384,7 +401,7 @@ public:
}
const string common_cflags =
- compile_kernel_get_common_cflags(requested_features, split);
+ compile_kernel_get_common_cflags(requested_features, filter, split);
/* Try to use locally compiled kernel. */
const string source_path = path_get("source");
@@ -395,9 +412,8 @@ public:
*/
const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
- const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
- : "cycles_kernel_sm%d%d_%s.cubin",
- major, minor,
+ const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin",
+ name, major, minor,
cubin_md5.c_str());
const string cubin = path_cache_get(path_join("kernels", cubin_file));
VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
@@ -432,7 +448,7 @@ public:
const string kernel = path_join(
path_join(source_path, "kernel"),
path_join("kernels",
- path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
+ path_join("cuda", source)));
double starttime = time_dt();
printf("Compiling CUDA kernel ...\n");
@@ -480,11 +496,14 @@ public:
return false;
/* get kernel */
- string cubin = compile_kernel(requested_features, use_split_kernel());
-
+ string cubin = compile_kernel(requested_features, false, use_split_kernel());
if(cubin == "")
return false;
+ string filter_cubin = compile_kernel(requested_features, true, false);
+ if(filter_cubin == "")
+ return false;
+
/* open module */
cuda_push_context();
@@ -499,6 +518,14 @@ public:
if(cuda_error_(result, "cuModuleLoad"))
cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
+ if(path_read_text(filter_cubin, cubin_data))
+ result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
+ else
+ result = CUDA_ERROR_FILE_NOT_FOUND;
+
+ if(cuda_error_(result, "cuModuleLoad"))
+ cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
+
cuda_pop_context();
return (result == CUDA_SUCCESS);
@@ -581,6 +608,11 @@ public:
}
}
+ virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
+ {
+ return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
+ }
+
void const_copy_to(const char *name, void *host, size_t size)
{
CUdeviceptr mem;
@@ -881,6 +913,368 @@ public:
}
}
+ bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
+ {
+ mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);
+
+ TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
+ for(int i = 0; i < 9; i++) {
+ tiles->buffers[i] = buffers[i];
+ }
+
+ mem_copy_to(task->tiles_mem);
+
+ return !have_error();
+ }
+
+#define CUDA_GET_BLOCKSIZE(func, w, h) \
+ int threads_per_block; \
+ cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+ int threads = (int)sqrt((float)threads_per_block); \
+ int xblocks = ((w) + threads - 1)/threads; \
+ int yblocks = ((h) + threads - 1)/threads;
+
+#define CUDA_LAUNCH_KERNEL(func, args) \
+ cuda_assert(cuLaunchKernel(func, \
+ xblocks, yblocks, 1, \
+ threads, threads, 1, \
+ 0, 0, args, 0));
+
+ bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ int4 rect = task->rect;
+ int w = rect.z-rect.x;
+ int h = rect.w-rect.y;
+ int r = task->nlm_state.r;
+ int f = task->nlm_state.f;
+ float a = task->nlm_state.a;
+ float k_2 = task->nlm_state.k_2;
+
+ CUdeviceptr difference = task->nlm_state.temporary_1_ptr;
+ CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr;
+ CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr;
+
+ cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
+ cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h));
+
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize;
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
+ cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
+
+ CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y);
+
+ int dx, dy;
+ int4 local_rect;
+ int channel_offset = 0;
+ void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2};
+ void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f};
+ void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f};
+ void *update_output_args[] = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f};
+
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ dy = i / (2*r+1) - r;
+ dx = i % (2*r+1) - r;
+ local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
+
+ CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
+ CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
+ CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args);
+ }
+
+ local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y);
+ void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w};
+ CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_construct_transform(DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ CUfunction cuFilterConstructTransform;
+ cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
+ CUDA_GET_BLOCKSIZE(cuFilterConstructTransform,
+ task->storage.w,
+ task->storage.h);
+
+ void *args[] = {&task->buffer.mem.device_pointer,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->filter_area,
+ &task->rect,
+ &task->radius,
+ &task->pca_threshold,
+ &task->buffer.pass_stride};
+ CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_reconstruct(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr guide_ptr,
+ device_ptr guide_variance_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ mem_zero(task->storage.XtWX);
+ mem_zero(task->storage.XtWY);
+
+ cuda_push_context();
+
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize;
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+ cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+ cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+
+ CUDA_GET_BLOCKSIZE(cuNLMCalcDifference,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ CUdeviceptr difference = task->reconstruction_state.temporary_1_ptr;
+ CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr;
+
+ int r = task->radius;
+ int f = 4;
+ float a = 1.0f;
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+
+ int local_rect[4] = {max(0, -dx), max(0, -dy),
+ task->reconstruction_state.source_w - max(0, dx),
+ task->reconstruction_state.source_h - max(0, dy)};
+
+ void *calc_difference_args[] = {&dx, &dy,
+ &guide_ptr,
+ &guide_variance_ptr,
+ &difference,
+ &local_rect,
+ &task->buffer.w,
+ &task->buffer.pass_stride,
+ &a,
+ &task->nlm_k_2};
+ CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
+
+ void *blur_args[] = {&difference,
+ &blurDifference,
+ &local_rect,
+ &task->buffer.w,
+ &f};
+ CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+
+ void *calc_weight_args[] = {&blurDifference,
+ &difference,
+ &local_rect,
+ &task->buffer.w,
+ &f};
+ CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
+
+ /* Reuse previous arguments. */
+ CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+
+ void *construct_gramian_args[] = {&dx, &dy,
+ &blurDifference,
+ &task->buffer.mem.device_pointer,
+ &color_ptr,
+ &color_variance_ptr,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &local_rect,
+ &task->reconstruction_state.filter_rect,
+ &task->buffer.w,
+ &task->buffer.h,
+ &f,
+ &task->buffer.pass_stride};
+ CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args);
+ }
+
+ void *finalize_args[] = {&task->buffer.w,
+ &task->buffer.h,
+ &output_ptr,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &task->filter_area,
+ &task->reconstruction_state.buffer_params.x,
+ &task->render_buffer.samples};
+ CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
+ device_ptr mean_ptr, device_ptr variance_ptr,
+ int r, int4 rect, DenoisingTask *task)
+ {
+ (void) task;
+
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ CUfunction cuFilterCombineHalves;
+ cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterCombineHalves,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ void *args[] = {&mean_ptr,
+ &variance_ptr,
+ &a_ptr,
+ &b_ptr,
+ &rect,
+ &r};
+ CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
+ device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr, DenoisingTask *task)
+ {
+ (void) task;
+
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ CUfunction cuFilterDivideShadow;
+ cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterDivideShadow,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ bool use_split_variance = use_split_kernel();
+ void *args[] = {&task->render_buffer.samples,
+ &task->tiles_mem.device_pointer,
+ &a_ptr,
+ &b_ptr,
+ &sample_variance_ptr,
+ &sv_variance_ptr,
+ &buffer_variance_ptr,
+ &task->rect,
+ &task->render_buffer.pass_stride,
+ &task->render_buffer.denoising_data_offset,
+ &use_split_variance};
+ CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ CUfunction cuFilterGetFeature;
+ cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterGetFeature,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ bool use_split_variance = use_split_kernel();
+ void *args[] = {&task->render_buffer.samples,
+ &task->tiles_mem.device_pointer,
+ &mean_offset,
+ &variance_offset,
+ &mean_ptr,
+ &variance_ptr,
+ &task->rect,
+ &task->render_buffer.pass_stride,
+ &task->render_buffer.denoising_data_offset,
+ &use_split_variance};
+ CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ void denoise(RenderTile &rtile, const DeviceTask &task)
+ {
+ DenoisingTask denoising(this);
+
+ denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
+ denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+ denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising);
+
+ denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+ denoising.render_buffer.samples = rtile.sample;
+
+ RenderTile rtiles[9];
+ rtiles[4] = rtile;
+ task.map_neighbor_tiles(rtiles, this);
+ denoising.tiles_from_rendertiles(rtiles);
+
+ denoising.init_from_devicetask(task);
+
+ denoising.run_denoising();
+
+ task.unmap_neighbor_tiles(rtiles, this);
+ }
+
void path_trace(RenderTile& rtile, int sample, bool branched)
{
if(have_error())
@@ -1326,7 +1720,7 @@ public:
void thread_run(DeviceTask *task)
{
- if(task->type == DeviceTask::PATH_TRACE) {
+ if(task->type == DeviceTask::RENDER) {
RenderTile tile;
bool branched = task->integrator_branched;
@@ -1334,30 +1728,8 @@ public:
/* Upload Bindless Mapping */
load_bindless_mapping();
- if(!use_split_kernel()) {
- /* keep rendering tiles until done */
- while(task->acquire_tile(this, tile)) {
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
-
- for(int sample = start_sample; sample < end_sample; sample++) {
- if(task->get_cancel()) {
- if(task->need_finish_queue == false)
- break;
- }
-
- path_trace(tile, sample, branched);
-
- tile.sample = sample + 1;
-
- task->update_progress(&tile, tile.w*tile.h);
- }
-
- task->release_tile(tile);
- }
- }
- else {
- DeviceRequestedFeatures requested_features;
+ DeviceRequestedFeatures requested_features;
+ if(use_split_kernel()) {
if(!use_adaptive_compilation()) {
requested_features.max_closure = 64;
}
@@ -1366,18 +1738,47 @@ public:
split_kernel = new CUDASplitKernel(this);
split_kernel->load_kernels(requested_features);
}
+ }
+
+ /* keep rendering tiles until done */
+ while(task->acquire_tile(this, tile)) {
+ if(tile.task == RenderTile::PATH_TRACE) {
+ if(use_split_kernel()) {
+ device_memory void_buffer;
+ split_kernel->path_trace(task, tile, void_buffer, void_buffer);
+ }
+ else {
+ int start_sample = tile.start_sample;
+ int end_sample = tile.start_sample + tile.num_samples;
+
+ for(int sample = start_sample; sample < end_sample; sample++) {
+ if(task->get_cancel()) {
+ if(task->need_finish_queue == false)
+ break;
+ }
- while(task->acquire_tile(this, tile)) {
- device_memory void_buffer;
- split_kernel->path_trace(task, tile, void_buffer, void_buffer);
+ path_trace(tile, sample, branched);
- task->release_tile(tile);
+ tile.sample = sample + 1;
- if(task->get_cancel()) {
- if(task->need_finish_queue == false)
- break;
+ task->update_progress(&tile, tile.w*tile.h);
+ }
}
}
+ else if(tile.task == RenderTile::DENOISE) {
+ tile.sample = tile.start_sample + tile.num_samples;
+
+ denoise(tile, *task);
+
+ task->update_progress(&tile, tile.w*tile.h);
+ }
+
+ task->release_tile(tile);
+
+ if(task->get_cancel()) {
+ if(task->need_finish_queue == false)
+ break;
+ }
}
}
else if(task->type == DeviceTask::SHADER) {
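The CUDA_GET_BLOCKSIZE macro above derives a square 2D block from the kernel's reported CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK and covers the filter rectangle with ceil-divided grid dimensions. A host-only sketch of that launch-geometry arithmetic, with the attribute value hard-coded instead of queried (no CUDA calls, assumed example numbers):

#include <cmath>
#include <cstdio>

int main()
{
	/* Pretend cuFuncGetAttribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK) returned 1024. */
	const int threads_per_block = 1024;
	const int w = 250, h = 130;  /* rectangle covered by the filter kernel */

	/* Square block: sqrt(1024) = 32 -> 32x32 threads per block. */
	int threads = (int)sqrt((float)threads_per_block);

	/* Ceil division so the grid covers the whole rectangle. */
	int xblocks = (w + threads - 1) / threads;
	int yblocks = (h + threads - 1) / threads;

	printf("block %dx%d, grid %dx%d (covers %dx%d pixels)\n",
	       threads, threads, xblocks, yblocks, xblocks*threads, yblocks*threads);
	/* block 32x32, grid 8x5 (covers 256x160 pixels) */
	return 0;
}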
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
new file mode 100644
index 00000000000..39c8cf30105
--- /dev/null
+++ b/intern/cycles/device/device_denoising.cpp
@@ -0,0 +1,218 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_denoising.h"
+
+#include "kernel/filter/filter_defines.h"
+
+CCL_NAMESPACE_BEGIN
+
+void DenoisingTask::init_from_devicetask(const DeviceTask &task)
+{
+ radius = task.denoising_radius;
+ nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising_strength));
+ if(task.denoising_relative_pca) {
+ pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising_feature_strength));
+ }
+ else {
+ pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength));
+ }
+
+ render_buffer.pass_stride = task.pass_stride;
+ render_buffer.denoising_data_offset = task.pass_denoising_data;
+ render_buffer.denoising_clean_offset = task.pass_denoising_clean;
+
+ /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */
+ rect = make_int4(max(tiles->x[0], filter_area.x - radius),
+ max(tiles->y[0], filter_area.y - radius),
+ min(tiles->x[3], filter_area.x + filter_area.z + radius),
+ min(tiles->y[3], filter_area.y + filter_area.w + radius));
+}
+
+void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles)
+{
+ tiles = (TilesInfo*) tiles_mem.resize(sizeof(TilesInfo)/sizeof(int));
+
+ device_ptr buffers[9];
+ for(int i = 0; i < 9; i++) {
+ buffers[i] = rtiles[i].buffer;
+ tiles->offsets[i] = rtiles[i].offset;
+ tiles->strides[i] = rtiles[i].stride;
+ }
+ tiles->x[0] = rtiles[3].x;
+ tiles->x[1] = rtiles[4].x;
+ tiles->x[2] = rtiles[5].x;
+ tiles->x[3] = rtiles[5].x + rtiles[5].w;
+ tiles->y[0] = rtiles[1].y;
+ tiles->y[1] = rtiles[4].y;
+ tiles->y[2] = rtiles[7].y;
+ tiles->y[3] = rtiles[7].y + rtiles[7].h;
+
+ render_buffer.offset = rtiles[4].offset;
+ render_buffer.stride = rtiles[4].stride;
+ render_buffer.ptr = rtiles[4].buffer;
+
+ functions.set_tiles(buffers);
+}
+
+bool DenoisingTask::run_denoising()
+{
+ /* Allocate denoising buffer. */
+ buffer.passes = 14;
+ buffer.w = align_up(rect.z - rect.x, 4);
+ buffer.h = rect.w - rect.y;
+ buffer.pass_stride = align_up(buffer.w * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float)));
+ buffer.mem.resize(buffer.pass_stride * buffer.passes);
+ device->mem_alloc("Denoising Pixel Buffer", buffer.mem, MEM_READ_WRITE);
+
+ device_ptr null_ptr = (device_ptr) 0;
+
+ /* Prefilter shadow feature. */
+ {
+ device_sub_ptr unfiltered_a (device, buffer.mem, 0, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr unfiltered_b (device, buffer.mem, 1*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr sample_var (device, buffer.mem, 2*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr sample_var_var (device, buffer.mem, 3*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr buffer_var (device, buffer.mem, 5*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr filtered_var (device, buffer.mem, 6*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr nlm_temporary_1(device, buffer.mem, 7*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr nlm_temporary_2(device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr nlm_temporary_3(device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+
+ nlm_state.temporary_1_ptr = *nlm_temporary_1;
+ nlm_state.temporary_2_ptr = *nlm_temporary_2;
+ nlm_state.temporary_3_ptr = *nlm_temporary_3;
+
+ /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
+ functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
+
+ /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
+ nlm_state.set_parameters(6, 3, 4.0f, 1.0f);
+ functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
+
+ /* Reuse memory; the previous data isn't needed anymore. */
+ device_ptr filtered_a = *buffer_var,
+ filtered_b = *sample_var;
+ /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
+ nlm_state.set_parameters(5, 3, 1.0f, 0.25f);
+ functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
+ functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
+
+ device_ptr residual_var = *sample_var_var;
+ /* Estimate the residual variance between the two filtered halves. */
+ functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect);
+
+ device_ptr final_a = *unfiltered_a,
+ final_b = *unfiltered_b;
+ /* Use the residual variance for a second filter pass. */
+ nlm_state.set_parameters(4, 2, 1.0f, 0.5f);
+ functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
+ functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
+
+ /* Combine the two double-filtered halves to a final shadow feature. */
+ device_sub_ptr shadow_pass(device, buffer.mem, 4*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect);
+ }
+
+ /* Prefilter general features. */
+ {
+ device_sub_ptr unfiltered (device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr variance (device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr nlm_temporary_1(device, buffer.mem, 10*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr nlm_temporary_2(device, buffer.mem, 11*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr nlm_temporary_3(device, buffer.mem, 12*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+
+ nlm_state.temporary_1_ptr = *nlm_temporary_1;
+ nlm_state.temporary_2_ptr = *nlm_temporary_2;
+ nlm_state.temporary_3_ptr = *nlm_temporary_3;
+
+ int mean_from[] = { 0, 1, 2, 6, 7, 8, 12 };
+ int variance_from[] = { 3, 4, 5, 9, 10, 11, 13 };
+ int pass_to[] = { 1, 2, 3, 0, 5, 6, 7 };
+ for(int pass = 0; pass < 7; pass++) {
+ device_sub_ptr feature_pass(device, buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ /* Get the unfiltered pass and its variance from the RenderBuffers. */
+ functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance);
+ /* Smooth the pass and store the result in the denoising buffers. */
+ nlm_state.set_parameters(2, 2, 1.0f, 0.25f);
+ functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
+ }
+ }
+
+ /* Copy color passes. */
+ {
+ int mean_from[] = {20, 21, 22};
+ int variance_from[] = {23, 24, 25};
+ int mean_to[] = { 8, 9, 10};
+ int variance_to[] = {11, 12, 13};
+ int num_color_passes = 3;
+ for(int pass = 0; pass < num_color_passes; pass++) {
+ device_sub_ptr color_pass (device, buffer.mem, mean_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr color_var_pass(device, buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass);
+ }
+ }
+
+ storage.w = filter_area.z;
+ storage.h = filter_area.w;
+ storage.transform.resize(storage.w*storage.h*TRANSFORM_SIZE);
+ storage.rank.resize(storage.w*storage.h);
+ device->mem_alloc("Denoising Transform", storage.transform, MEM_READ_WRITE);
+ device->mem_alloc("Denoising Rank", storage.rank, MEM_READ_WRITE);
+
+ functions.construct_transform();
+
+ device_only_memory<float> temporary_1;
+ device_only_memory<float> temporary_2;
+ temporary_1.resize(buffer.w*buffer.h);
+ temporary_2.resize(buffer.w*buffer.h);
+ device->mem_alloc("Denoising NLM temporary 1", temporary_1, MEM_READ_WRITE);
+ device->mem_alloc("Denoising NLM temporary 2", temporary_2, MEM_READ_WRITE);
+ reconstruction_state.temporary_1_ptr = temporary_1.device_pointer;
+ reconstruction_state.temporary_2_ptr = temporary_2.device_pointer;
+
+ storage.XtWX.resize(storage.w*storage.h*XTWX_SIZE);
+ storage.XtWY.resize(storage.w*storage.h*XTWY_SIZE);
+ device->mem_alloc("Denoising XtWX", storage.XtWX, MEM_READ_WRITE);
+ device->mem_alloc("Denoising XtWY", storage.XtWY, MEM_READ_WRITE);
+
+ reconstruction_state.filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
+ int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x;
+ reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset,
+ render_buffer.stride,
+ render_buffer.pass_stride,
+ render_buffer.denoising_clean_offset);
+ reconstruction_state.source_w = rect.z-rect.x;
+ reconstruction_state.source_h = rect.w-rect.y;
+
+ {
+ device_sub_ptr color_ptr (device, buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr color_var_ptr(device, buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
+ functions.reconstruct(*color_ptr, *color_var_ptr, *color_ptr, *color_var_ptr, render_buffer.ptr);
+ }
+
+ device->mem_free(storage.XtWX);
+ device->mem_free(storage.XtWY);
+ device->mem_free(storage.transform);
+ device->mem_free(storage.rank);
+ device->mem_free(temporary_1);
+ device->mem_free(temporary_2);
+ device->mem_free(buffer.mem);
+ device->mem_free(tiles_mem);
+ return true;
+}
+
+CCL_NAMESPACE_END
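(Editorial note: to make the buffer layout in run_denoising() concrete, the 14 denoising passes live in a single flat float allocation, each pass occupying pass_stride floats, with pass_stride padded so every pass starts at an offset the device can sub-allocate from. A standalone sketch of that arithmetic with hypothetical numbers; align_up/divide_up are assumed to behave like the Cycles utility versions, i.e. round-up-to-multiple and ceiling division:

    #include <cstddef>
    #include <cstdio>

    // Ceiling division and round-up-to-multiple, matching the assumed Cycles utility semantics.
    static size_t divide_up(size_t x, size_t y) { return (x + y - 1) / y; }
    static size_t align_up(size_t x, size_t y)  { return divide_up(x, y) * y; }

    int main()
    {
        // Hypothetical denoising rect of 260x270 pixels and a 4096-byte device alignment.
        size_t rect_w = 260, rect_h = 270, mem_address_alignment = 4096;

        size_t buffer_w = align_up(rect_w, 4);              /* 260, already a multiple of 4 */
        size_t buffer_h = rect_h;                           /* 270 */
        size_t pass_stride = align_up(buffer_w * buffer_h,  /* 70200 floats ... */
                                      divide_up(mem_address_alignment, sizeof(float)));
                                                            /* ... padded up to 70656 floats */
        size_t total_floats = pass_stride * 14;             /* 14 passes in one allocation */

        printf("pass_stride = %zu floats, total = %zu floats\n", pass_stride, total_floats);
        return 0;
    }
)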
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
new file mode 100644
index 00000000000..86d8eb64386
--- /dev/null
+++ b/intern/cycles/device/device_denoising.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __DEVICE_DENOISING_H__
+#define __DEVICE_DENOISING_H__
+
+#include "device/device.h"
+
+#include "render/buffers.h"
+
+#include "kernel/filter/filter_defines.h"
+
+CCL_NAMESPACE_BEGIN
+
+class DenoisingTask {
+public:
+ /* Parameters of the denoising algorithm. */
+ int radius;
+ float nlm_k_2;
+ float pca_threshold;
+
+ /* Pointer and parameters of the RenderBuffers. */
+ struct RenderBuffers {
+ int denoising_data_offset;
+ int denoising_clean_offset;
+ int pass_stride;
+ int offset;
+ int stride;
+ device_ptr ptr;
+ int samples;
+ } render_buffer;
+
+ TilesInfo *tiles;
+ device_vector<int> tiles_mem;
+ void tiles_from_rendertiles(RenderTile *rtiles);
+
+ int4 rect;
+ int4 filter_area;
+
+ struct DeviceFunctions {
+ function<bool(device_ptr image_ptr, /* Contains the values that are smoothed. */
+ device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */
+ device_ptr variance_ptr, /* Contains the variance of the guide image. */
+ device_ptr out_ptr /* The filtered output is written into this image. */
+ )> non_local_means;
+ function<bool(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr guide_ptr,
+ device_ptr guide_variance_ptr,
+ device_ptr output_ptr
+ )> reconstruct;
+ function<bool()> construct_transform;
+
+ function<bool(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ int r,
+ int4 rect
+ )> combine_halves;
+ function<bool(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr sample_variance_ptr,
+ device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr
+ )> divide_shadow;
+ function<bool(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr
+ )> get_feature;
+ function<bool(device_ptr*)> set_tiles;
+ } functions;
+
+ /* Stores state of the current Reconstruction operation,
+ * which is accessed by the device in order to perform the operation. */
+ struct ReconstructionState {
+ device_ptr temporary_1_ptr; /* These two images are used as temporary storage. */
+ device_ptr temporary_2_ptr;
+
+ int4 filter_rect;
+ int4 buffer_params;
+
+ int source_w;
+ int source_h;
+ } reconstruction_state;
+
+ /* Stores state of the current NLM operation,
+ * which is accessed by the device in order to perform the operation. */
+ struct NLMState {
+ device_ptr temporary_1_ptr; /* These three images are used as temporary storage. */
+ device_ptr temporary_2_ptr;
+ device_ptr temporary_3_ptr;
+
+ int r; /* Search radius of the filter. */
+ int f; /* Patch size of the filter. */
+ float a; /* Variance compensation factor in the MSE estimation. */
+ float k_2; /* Squared value of the k parameter of the filter. */
+
+ void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_; k_2 = k_2_; }
+ } nlm_state;
+
+ struct Storage {
+ device_only_memory<float> transform;
+ device_only_memory<int> rank;
+ device_only_memory<float> XtWX;
+ device_only_memory<float3> XtWY;
+ int w;
+ int h;
+ } storage;
+
+ DenoisingTask(Device *device) : device(device) {}
+
+ void init_from_devicetask(const DeviceTask &task);
+
+ bool run_denoising();
+
+ struct DenoiseBuffers {
+ int pass_stride;
+ int passes;
+ int w;
+ int h;
+ device_only_memory<float> mem;
+ } buffer;
+
+protected:
+ Device *device;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __DEVICE_DENOISING_H__ */
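(Editorial note: the intended call sequence for DenoisingTask is what OpenCLDeviceBase::denoise() does further down in this patch. A compressed sketch of that sequence, assuming a hypothetical SomeDevice backend with device, rtile and task in scope; it is not standalone code:

    DenoisingTask denoising(device);

    /* The backend provides the seven kernel entry points the generic algorithm calls into,
     * e.g. (the remaining six are bound the same way): */
    denoising.functions.construct_transform = function_bind(&SomeDevice::denoising_construct_transform, device, &denoising);

    denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
    denoising.render_buffer.samples = rtile.sample;

    RenderTile rtiles[9];          /* center tile plus its eight neighbors */
    rtiles[4] = rtile;
    task.map_neighbor_tiles(rtiles, device);
    denoising.tiles_from_rendertiles(rtiles);

    denoising.init_from_devicetask(task);
    denoising.run_denoising();

    task.unmap_neighbor_tiles(rtiles, device);
)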
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 4b10514a9d2..b63dd00068b 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -35,6 +35,8 @@
CCL_NAMESPACE_BEGIN
+class Device;
+
enum MemoryType {
MEM_READ_ONLY,
MEM_WRITE_ONLY,
@@ -144,7 +146,7 @@ template<> struct device_type_traits<float2> {
template<> struct device_type_traits<float3> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 3;
+ static const int num_elements = 4;
};
template<> struct device_type_traits<float4> {
@@ -173,6 +175,9 @@ class device_memory
{
public:
size_t memory_size() { return data_size*data_elements*datatype_size(data_type); }
+ size_t memory_elements_size(int elements) {
+ return elements*data_elements*datatype_size(data_type);
+ }
/* data information */
DataType data_type;
@@ -213,6 +218,22 @@ protected:
device_memory& operator = (const device_memory&);
};
+template<typename T>
+class device_only_memory : public device_memory
+{
+public:
+ device_only_memory()
+ {
+ data_type = device_type_traits<T>::data_type;
+ data_elements = max(device_type_traits<T>::num_elements, 1);
+ }
+
+ void resize(size_t num)
+ {
+ device_memory::resize(num*sizeof(T));
+ }
+};
+
/* Device Vector */
template<typename T> class device_vector : public device_memory
@@ -299,6 +320,27 @@ private:
array<T> data;
};
+/* A device_sub_ptr is a pointer into another existing memory allocation.
+ * Therefore, it is not allocated separately, but just created from the already allocated base memory.
+ * It is freed automatically when it goes out of scope, which should happen before the base memory is freed.
+ * Note that some devices require the offset and size of the sub_ptr to be properly aligned. */
+class device_sub_ptr
+{
+public:
+ device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type);
+ ~device_sub_ptr();
+ /* No copying. */
+ device_sub_ptr& operator = (const device_sub_ptr&);
+
+ device_ptr operator*() const
+ {
+ return ptr;
+ }
+protected:
+ Device *device;
+ device_ptr ptr;
+};
+
CCL_NAMESPACE_END
#endif /* __DEVICE_MEMORY_H__ */
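(Editorial note: a minimal usage sketch for device_sub_ptr, assuming a device and an already-allocated device_only_memory<float> buffer. This mirrors how run_denoising() slices its pass buffer earlier in the patch; some_kernel_taking_a_device_ptr is a hypothetical placeholder and the snippet is not compilable outside the Cycles device code:

    device_only_memory<float> buffer;
    buffer.resize(pass_stride * num_passes);
    device->mem_alloc("Example buffer", buffer, MEM_READ_WRITE);

    {
        /* RAII view of pass 2: offset and size are given in elements, not bytes. */
        device_sub_ptr pass2(device, buffer, 2*pass_stride, pass_stride, MEM_READ_WRITE);
        some_kernel_taking_a_device_ptr(*pass2);
    }   /* The sub-pointer is released here, before the base allocation is freed. */

    device->mem_free(buffer);
)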
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 8616e31d3b9..35ae0303d6e 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -302,6 +302,60 @@ public:
return -1;
}
+ void map_neighbor_tiles(Device *sub_device, RenderTile *tiles)
+ {
+ for(int i = 0; i < 9; i++) {
+ if(!tiles[i].buffers) {
+ continue;
+ }
+ /* If the tile was rendered on another device, copy its memory to
+ * the current device now, for the duration of the denoising task.
+ * Note that this temporarily modifies the RenderBuffers and calls
+ * the device, so this function is not thread safe. */
+ if(tiles[i].buffers->device != sub_device) {
+ device_vector<float> &mem = tiles[i].buffers->buffer;
+
+ tiles[i].buffers->copy_from_device();
+ device_ptr original_ptr = mem.device_pointer;
+ mem.device_pointer = 0;
+ sub_device->mem_alloc("Temporary memory for neighboring tile", mem, MEM_READ_WRITE);
+ sub_device->mem_copy_to(mem);
+ tiles[i].buffer = mem.device_pointer;
+ mem.device_pointer = original_ptr;
+ }
+ }
+ }
+
+ void unmap_neighbor_tiles(Device *sub_device, RenderTile *tiles)
+ {
+ for(int i = 0; i < 9; i++) {
+ if(!tiles[i].buffers) {
+ continue;
+ }
+ if(tiles[i].buffers->device != sub_device) {
+ device_vector<float> &mem = tiles[i].buffers->buffer;
+
+ device_ptr original_ptr = mem.device_pointer;
+ mem.device_pointer = tiles[i].buffer;
+
+ /* Copy denoised tile to the host. */
+ if(i == 4) {
+ tiles[i].buffers->copy_from_device(sub_device);
+ }
+
+ size_t mem_size = mem.device_size;
+ sub_device->mem_free(mem);
+ mem.device_pointer = original_ptr;
+ mem.device_size = mem_size;
+
+ /* Copy denoised tile to the original device. */
+ if(i == 4) {
+ tiles[i].buffers->device->mem_copy_to(mem);
+ }
+ }
+ }
+ }
+
int get_split_task_count(DeviceTask& task)
{
int total_tasks = 0;
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index 9118793aad6..dddd19f179f 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -166,13 +166,13 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
unsigned int max_work_groups = num_global_elements / work_pool_size + 1;
/* Allocate work_pool_wgs memory. */
- work_pool_wgs.resize(max_work_groups * sizeof(unsigned int));
+ work_pool_wgs.resize(max_work_groups);
device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE);
- queue_index.resize(NUM_QUEUES * sizeof(int));
+ queue_index.resize(NUM_QUEUES);
device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE);
- use_queues_flag.resize(sizeof(char));
+ use_queues_flag.resize(1);
device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE);
ray_state.resize(num_global_elements);
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
index 58c2fdbb077..68c2ba974a5 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -80,16 +80,16 @@ private:
*/
device_memory split_data;
device_vector<uchar> ray_state;
- device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */
+ device_only_memory<int> queue_index; /* Array of size num_queues that tracks the size of each queue. */
/* Flag to make sceneintersect and lampemission kernel use queues. */
- device_memory use_queues_flag;
+ device_only_memory<char> use_queues_flag;
/* Approximate time it takes to complete one sample */
double avg_time_per_sample;
/* Work pool with respect to each work group. */
- device_memory work_pool_wgs;
+ device_only_memory<unsigned int> work_pool_wgs;
/* clos_max value for which the kernels have been loaded currently. */
int current_max_closure;
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index ca303365627..3bc4c310283 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -56,7 +56,7 @@ int DeviceTask::get_subtask_count(int num, int max_size)
if(type == SHADER) {
num = min(shader_w, num);
}
- else if(type == PATH_TRACE) {
+ else if(type == RENDER) {
}
else {
num = min(h, num);
@@ -82,7 +82,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
tasks.push_back(task);
}
}
- else if(type == PATH_TRACE) {
+ else if(type == RENDER) {
for(int i = 0; i < num; i++)
tasks.push_back(*this);
}
@@ -103,7 +103,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
{
- if((type != PATH_TRACE) &&
+ if((type != RENDER) &&
(type != SHADER))
return;
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index feee89fd6e4..44a1efff1f5 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -34,7 +34,7 @@ class Tile;
class DeviceTask : public Task {
public:
- typedef enum { PATH_TRACE, FILM_CONVERT, SHADER } Type;
+ typedef enum { RENDER, FILM_CONVERT, SHADER } Type;
Type type;
int x, y, w, h;
@@ -53,7 +53,7 @@ public:
int passes_size;
- explicit DeviceTask(Type type = PATH_TRACE);
+ explicit DeviceTask(Type type = RENDER);
int get_subtask_count(int num, int max_size = 0);
void split(list<DeviceTask>& tasks, int num, int max_size = 0);
@@ -65,6 +65,16 @@ public:
function<void(RenderTile&)> update_tile_sample;
function<void(RenderTile&)> release_tile;
function<bool(void)> get_cancel;
+ function<void(RenderTile*, Device*)> map_neighbor_tiles;
+ function<void(RenderTile*, Device*)> unmap_neighbor_tiles;
+
+ int denoising_radius;
+ float denoising_strength;
+ float denoising_feature_strength;
+ bool denoising_relative_pca;
+ int pass_stride;
+ int pass_denoising_data;
+ int pass_denoising_clean;
bool need_finish_queue;
bool integrator_branched;
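(Editorial note: for a feel of how the new denoising parameters are interpreted, DenoisingTask::init_from_devicetask() earlier in this patch maps the 0..1 strength values onto exponential ranges. A standalone reproduction of that arithmetic with hypothetical slider values:

    #include <cmath>
    #include <cstdio>

    static float lerp(float a, float b, float t) { return a + t * (b - a); }

    int main()
    {
        float denoising_strength = 0.5f;          /* hypothetical slider values */
        float denoising_feature_strength = 0.5f;
        bool  denoising_relative_pca = false;

        /* Same mapping as DenoisingTask::init_from_devicetask(). */
        float nlm_k_2 = std::pow(2.0f, lerp(-5.0f, 3.0f, denoising_strength));          /* 2^-1 = 0.5  */
        float pca_threshold = denoising_relative_pca
                ? -std::pow(10.0f, lerp(-8.0f, 0.0f, denoising_feature_strength))       /* -10^-4      */
                :  std::pow(10.0f, lerp(-5.0f, 3.0f, denoising_feature_strength));      /* 10^-1 = 0.1 */

        printf("nlm_k_2 = %g, pca_threshold = %g\n", nlm_k_2, pca_threshold);
        return 0;
    }
)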
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index d061973dcb7..a458ca6bf64 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -17,6 +17,7 @@
#ifdef WITH_OPENCL
#include "device/device.h"
+#include "device/device_denoising.h"
#include "util/util_map.h"
#include "util/util_param.h"
@@ -129,6 +130,8 @@ public:
cl_int* error = NULL);
static cl_device_type get_device_type(cl_device_id device_id);
+ static int mem_address_alignment(cl_device_id device_id);
+
/* Get somewhat more readable device name.
* Main difference is AMD OpenCL here which only gives code name
* for the regular device name. This will give more sane device
@@ -218,7 +221,7 @@ public:
cl_int err = stmt; \
\
if(err != CL_SUCCESS) { \
- string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
+ string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
if(error_msg == "") \
error_msg = message; \
fprintf(stderr, "%s\n", message.c_str()); \
@@ -282,7 +285,7 @@ public:
map<ustring, cl_kernel> kernels;
};
- OpenCLProgram base_program;
+ OpenCLProgram base_program, denoising_program;
typedef map<string, device_vector<uchar>*> ConstMemMap;
typedef map<string, device_ptr> MemMap;
@@ -320,6 +323,9 @@ public:
void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
void mem_zero(device_memory& mem);
void mem_free(device_memory& mem);
+
+ int mem_address_alignment();
+
void const_copy_to(const char *name, void *host, size_t size);
void tex_alloc(const char *name,
device_memory& mem,
@@ -328,12 +334,14 @@ public:
void tex_free(device_memory& mem);
size_t global_size_round_up(int group_size, int global_size);
- void enqueue_kernel(cl_kernel kernel, size_t w, size_t h);
+ void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1);
void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
void shader(DeviceTask& task);
+ void denoise(RenderTile& tile, const DeviceTask& task);
+
class OpenCLDeviceTask : public DeviceTask {
public:
OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
@@ -367,9 +375,48 @@ public:
virtual void thread_run(DeviceTask * /*task*/) = 0;
+ virtual bool is_split_kernel() = 0;
+
protected:
string kernel_build_options(const string *debug_src = NULL);
+ void mem_zero_kernel(device_ptr ptr, size_t size);
+
+ bool denoising_non_local_means(device_ptr image_ptr,
+ device_ptr guide_ptr,
+ device_ptr variance_ptr,
+ device_ptr out_ptr,
+ DenoisingTask *task);
+ bool denoising_construct_transform(DenoisingTask *task);
+ bool denoising_reconstruct(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr guide_ptr,
+ device_ptr guide_variance_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task);
+ bool denoising_combine_halves(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ int r, int4 rect,
+ DenoisingTask *task);
+ bool denoising_divide_shadow(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr sample_variance_ptr,
+ device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr,
+ DenoisingTask *task);
+ bool denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ DenoisingTask *task);
+ bool denoising_set_tiles(device_ptr *buffers,
+ DenoisingTask *task);
+
+ device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type);
+ void mem_free_sub_ptr(device_ptr ptr);
+
class ArgumentWrapper {
public:
ArgumentWrapper() : size(0), pointer(NULL)
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index 22aeaddcde8..ae1a7b917c3 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -213,8 +213,23 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
base_program.add_kernel(ustring("bake"));
base_program.add_kernel(ustring("zero_buffer"));
+ denoising_program = OpenCLProgram(this, "denoising", "filter.cl", "");
+ denoising_program.add_kernel(ustring("filter_divide_shadow"));
+ denoising_program.add_kernel(ustring("filter_get_feature"));
+ denoising_program.add_kernel(ustring("filter_combine_halves"));
+ denoising_program.add_kernel(ustring("filter_construct_transform"));
+ denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
+ denoising_program.add_kernel(ustring("filter_nlm_blur"));
+ denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
+ denoising_program.add_kernel(ustring("filter_nlm_update_output"));
+ denoising_program.add_kernel(ustring("filter_nlm_normalize"));
+ denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
+ denoising_program.add_kernel(ustring("filter_finalize"));
+ denoising_program.add_kernel(ustring("filter_set_tiles"));
+
vector<OpenCLProgram*> programs;
programs.push_back(&base_program);
+ programs.push_back(&denoising_program);
/* Call actual class to fill the vector with its programs. */
if(!load_kernels(requested_features, programs)) {
return false;
@@ -322,37 +337,42 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in
NULL, NULL));
}
-void OpenCLDeviceBase::mem_zero(device_memory& mem)
+void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size)
{
- if(mem.device_pointer) {
- if(base_program.is_loaded()) {
- cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
+ cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
- size_t global_size[] = {1024, 1024};
- size_t num_threads = global_size[0] * global_size[1];
+ size_t global_size[] = {1024, 1024};
+ size_t num_threads = global_size[0] * global_size[1];
- cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer);
- cl_ulong d_offset = 0;
- cl_ulong d_size = 0;
+ cl_mem d_buffer = CL_MEM_PTR(mem);
+ cl_ulong d_offset = 0;
+ cl_ulong d_size = 0;
- while(d_offset < mem.memory_size()) {
- d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset);
+ while(d_offset < size) {
+ d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset);
- kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
+ kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
- ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
- ckZeroBuffer,
- 2,
- NULL,
- global_size,
- NULL,
- 0,
- NULL,
- NULL);
- opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
+ ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
+ ckZeroBuffer,
+ 2,
+ NULL,
+ global_size,
+ NULL,
+ 0,
+ NULL,
+ NULL);
+ opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
- d_offset += d_size;
- }
+ d_offset += d_size;
+ }
+}
+
+void OpenCLDeviceBase::mem_zero(device_memory& mem)
+{
+ if(mem.device_pointer) {
+ if(base_program.is_loaded()) {
+ mem_zero_kernel(mem.device_pointer, mem.memory_size());
}
if(mem.data_pointer) {
@@ -396,6 +416,41 @@ void OpenCLDeviceBase::mem_free(device_memory& mem)
}
}
+int OpenCLDeviceBase::mem_address_alignment()
+{
+ return OpenCLInfo::mem_address_alignment(cdDevice);
+}
+
+device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type)
+{
+ cl_mem_flags mem_flag;
+ if(type == MEM_READ_ONLY)
+ mem_flag = CL_MEM_READ_ONLY;
+ else if(type == MEM_WRITE_ONLY)
+ mem_flag = CL_MEM_WRITE_ONLY;
+ else
+ mem_flag = CL_MEM_READ_WRITE;
+
+ cl_buffer_region info;
+ info.origin = mem.memory_elements_size(offset);
+ info.size = mem.memory_elements_size(size);
+
+ device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer),
+ mem_flag,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ &info,
+ &ciErr);
+ opencl_assert_err(ciErr, "clCreateSubBuffer");
+ return sub_buf;
+}
+
+void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer)
+{
+ if(device_pointer && device_pointer != null_mem) {
+ opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
+ }
+}
+
void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
{
ConstMemMap::iterator i = const_mem_map.find(name);
@@ -449,7 +504,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
return global_size + ((r == 0)? 0: group_size - r);
}
-void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
+void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size)
{
size_t workgroup_size, max_work_items[3];
@@ -458,6 +513,10 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
clGetDeviceInfo(cdDevice,
CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
+ if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
+ workgroup_size = max_workgroup_size;
+ }
+
/* Try to divide evenly over 2 dimensions. */
size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
@@ -543,6 +602,362 @@ set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
}
+bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
+ device_ptr guide_ptr,
+ device_ptr variance_ptr,
+ device_ptr out_ptr,
+ DenoisingTask *task)
+{
+ int4 rect = task->rect;
+ int w = rect.z-rect.x;
+ int h = rect.w-rect.y;
+ int r = task->nlm_state.r;
+ int f = task->nlm_state.f;
+ float a = task->nlm_state.a;
+ float k_2 = task->nlm_state.k_2;
+
+ cl_mem difference = CL_MEM_PTR(task->nlm_state.temporary_1_ptr);
+ cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr);
+ cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
+
+ cl_mem image_mem = CL_MEM_PTR(image_ptr);
+ cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
+ cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+ cl_mem out_mem = CL_MEM_PTR(out_ptr);
+
+ mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h);
+ mem_zero_kernel(out_ptr, sizeof(float)*w*h);
+
+ cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
+ cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
+ cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
+ cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output"));
+ cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize"));
+
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+ int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
+ kernel_set_args(ckNLMCalcDifference, 0,
+ dx, dy, guide_mem, variance_mem,
+ difference, local_rect, w, 0, a, k_2);
+ kernel_set_args(ckNLMBlur, 0,
+ difference, blurDifference, local_rect, w, f);
+ kernel_set_args(ckNLMCalcWeight, 0,
+ blurDifference, difference, local_rect, w, f);
+ kernel_set_args(ckNLMUpdateOutput, 0,
+ dx, dy, blurDifference, image_mem,
+ out_mem, weightAccum, local_rect, w, f);
+
+ enqueue_kernel(ckNLMCalcDifference, w, h);
+ enqueue_kernel(ckNLMBlur, w, h);
+ enqueue_kernel(ckNLMCalcWeight, w, h);
+ enqueue_kernel(ckNLMBlur, w, h);
+ enqueue_kernel(ckNLMUpdateOutput, w, h);
+ }
+
+ int4 local_rect = make_int4(0, 0, w, h);
+ kernel_set_args(ckNLMNormalize, 0,
+ out_mem, weightAccum, local_rect, w);
+ enqueue_kernel(ckNLMNormalize, w, h);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
+{
+ cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
+ cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
+ cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+
+ cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
+
+ kernel_set_args(ckFilterConstructTransform, 0,
+ buffer_mem,
+ transform_mem,
+ rank_mem,
+ task->filter_area,
+ task->rect,
+ task->buffer.pass_stride,
+ task->radius,
+ task->pca_threshold);
+
+ enqueue_kernel(ckFilterConstructTransform,
+ task->storage.w,
+ task->storage.h,
+ 256);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr guide_ptr,
+ device_ptr guide_variance_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+{
+ mem_zero(task->storage.XtWX);
+ mem_zero(task->storage.XtWY);
+
+ cl_mem color_mem = CL_MEM_PTR(color_ptr);
+ cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
+ cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
+ cl_mem guide_variance_mem = CL_MEM_PTR(guide_variance_ptr);
+ cl_mem output_mem = CL_MEM_PTR(output_ptr);
+
+ cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
+ cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
+ cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+ cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
+ cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
+
+ cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
+ cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
+ cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
+ cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
+ cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
+
+ cl_mem difference = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr);
+ cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr);
+
+ int r = task->radius;
+ int f = 4;
+ float a = 1.0f;
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+
+ int local_rect[4] = {max(0, -dx), max(0, -dy),
+ task->reconstruction_state.source_w - max(0, dx),
+ task->reconstruction_state.source_h - max(0, dy)};
+
+ kernel_set_args(ckNLMCalcDifference, 0,
+ dx, dy,
+ guide_mem,
+ guide_variance_mem,
+ difference,
+ local_rect,
+ task->buffer.w,
+ task->buffer.pass_stride,
+ a, task->nlm_k_2);
+ enqueue_kernel(ckNLMCalcDifference,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ kernel_set_args(ckNLMBlur, 0,
+ difference,
+ blurDifference,
+ local_rect,
+ task->buffer.w,
+ f);
+ enqueue_kernel(ckNLMBlur,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ kernel_set_args(ckNLMCalcWeight, 0,
+ blurDifference,
+ difference,
+ local_rect,
+ task->buffer.w,
+ f);
+ enqueue_kernel(ckNLMCalcWeight,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ /* Reuse previous arguments. */
+ enqueue_kernel(ckNLMBlur,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ kernel_set_args(ckNLMConstructGramian, 0,
+ dx, dy,
+ blurDifference,
+ buffer_mem,
+ color_mem,
+ color_variance_mem,
+ transform_mem,
+ rank_mem,
+ XtWX_mem,
+ XtWY_mem,
+ local_rect,
+ task->reconstruction_state.filter_rect,
+ task->buffer.w,
+ task->buffer.h,
+ f,
+ task->buffer.pass_stride);
+ enqueue_kernel(ckNLMConstructGramian,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h,
+ 256);
+ }
+
+ kernel_set_args(ckFinalize, 0,
+ task->buffer.w,
+ task->buffer.h,
+ output_mem,
+ rank_mem,
+ XtWX_mem,
+ XtWY_mem,
+ task->filter_area,
+ task->reconstruction_state.buffer_params,
+ task->render_buffer.samples);
+ enqueue_kernel(ckFinalize,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ int r, int4 rect,
+ DenoisingTask *task)
+{
+ (void) task;
+
+ cl_mem a_mem = CL_MEM_PTR(a_ptr);
+ cl_mem b_mem = CL_MEM_PTR(b_ptr);
+ cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
+ cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+
+ cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
+
+ kernel_set_args(ckFilterCombineHalves, 0,
+ mean_mem,
+ variance_mem,
+ a_mem,
+ b_mem,
+ rect,
+ r);
+ enqueue_kernel(ckFilterCombineHalves,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr sample_variance_ptr,
+ device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr,
+ DenoisingTask *task)
+{
+ (void) task;
+
+ cl_mem a_mem = CL_MEM_PTR(a_ptr);
+ cl_mem b_mem = CL_MEM_PTR(b_ptr);
+ cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
+ cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
+ cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
+
+ cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
+
+ cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
+
+ char split_kernel = is_split_kernel()? 1 : 0;
+ kernel_set_args(ckFilterDivideShadow, 0,
+ task->render_buffer.samples,
+ tiles_mem,
+ a_mem,
+ b_mem,
+ sample_variance_mem,
+ sv_variance_mem,
+ buffer_variance_mem,
+ task->rect,
+ task->render_buffer.pass_stride,
+ task->render_buffer.denoising_data_offset,
+ split_kernel);
+ enqueue_kernel(ckFilterDivideShadow,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ DenoisingTask *task)
+{
+ cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
+ cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+
+ cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
+
+ cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
+
+ char split_kernel = is_split_kernel()? 1 : 0;
+ kernel_set_args(ckFilterGetFeature, 0,
+ task->render_buffer.samples,
+ tiles_mem,
+ mean_offset,
+ variance_offset,
+ mean_mem,
+ variance_mem,
+ task->rect,
+ task->render_buffer.pass_stride,
+ task->render_buffer.denoising_data_offset,
+ split_kernel);
+ enqueue_kernel(ckFilterGetFeature,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers,
+ DenoisingTask *task)
+{
+ mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_WRITE);
+ mem_copy_to(task->tiles_mem);
+
+ cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
+
+ cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles"));
+
+ kernel_set_args(ckFilterSetTiles, 0, tiles_mem);
+ for(int i = 0; i < 9; i++) {
+ cl_mem buffer_mem = CL_MEM_PTR(buffers[i]);
+ kernel_set_args(ckFilterSetTiles, i+1, buffer_mem);
+ }
+
+ enqueue_kernel(ckFilterSetTiles, 1, 1);
+
+ return true;
+}
+
+void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task)
+{
+ DenoisingTask denoising(this);
+
+ denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising);
+ denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
+ denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+ denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+
+ denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+ denoising.render_buffer.samples = rtile.sample;
+
+ RenderTile rtiles[9];
+ rtiles[4] = rtile;
+ task.map_neighbor_tiles(rtiles, this);
+ denoising.tiles_from_rendertiles(rtiles);
+
+ denoising.init_from_devicetask(task);
+
+ denoising.run_denoising();
+
+ task.unmap_neighbor_tiles(rtiles, this);
+}
+
void OpenCLDeviceBase::shader(DeviceTask& task)
{
/* cast arguments to cl types */
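(Editorial note: the non-local-means loops above visit every offset in a (2r+1)x(2r+1) search window by flattening it into a single index i. A standalone illustration of that index-to-offset decoding; r = 2 is an arbitrary example, the kernels use task->nlm_state.r or task->radius:

    #include <cstdio>

    int main()
    {
        int r = 2;                        /* search radius */
        int side = 2*r + 1;               /* the window is side x side offsets */

        for(int i = 0; i < side*side; i++) {
            int dy = i / side - r;        /* same decoding as the OpenCL denoising loops */
            int dx = i % side - r;
            printf("i=%2d -> (dx, dy) = (%2d, %2d)\n", i, dx, dy);
        }
        return 0;
    }
)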
diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp
index a2fd1d71156..06c15bcf401 100644
--- a/intern/cycles/device/opencl/opencl_mega.cpp
+++ b/intern/cycles/device/opencl/opencl_mega.cpp
@@ -108,41 +108,53 @@ public:
else if(task->type == DeviceTask::SHADER) {
shader(*task);
}
- else if(task->type == DeviceTask::PATH_TRACE) {
+ else if(task->type == DeviceTask::RENDER) {
RenderTile tile;
/* Keep rendering tiles until done. */
while(task->acquire_tile(this, tile)) {
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
+ if(tile.task == RenderTile::PATH_TRACE) {
+ int start_sample = tile.start_sample;
+ int end_sample = tile.start_sample + tile.num_samples;
- for(int sample = start_sample; sample < end_sample; sample++) {
- if(task->get_cancel()) {
- if(task->need_finish_queue == false)
- break;
- }
+ for(int sample = start_sample; sample < end_sample; sample++) {
+ if(task->get_cancel()) {
+ if(task->need_finish_queue == false)
+ break;
+ }
+
+ path_trace(tile, sample);
- path_trace(tile, sample);
+ tile.sample = sample + 1;
- tile.sample = sample + 1;
+ task->update_progress(&tile, tile.w*tile.h);
+ }
+ /* Complete kernel execution before releasing the tile. */
+ /* This helps in multi-device render;
+ * The device that reaches the critical-section function
+ * release_tile waits (stalling other devices from entering
+ * release_tile) for all kernels to complete. If device1 (a
+ * slow-render device) reaches release_tile first then it would
+ * stall device2 (a fast-render device) from proceeding to render
+ * next tile.
+ */
+ clFinish(cqCommandQueue);
+ }
+ else if(tile.task == RenderTile::DENOISE) {
+ tile.sample = tile.start_sample + tile.num_samples;
+ denoise(tile, *task);
task->update_progress(&tile, tile.w*tile.h);
}
- /* Complete kernel execution before release tile */
- /* This helps in multi-device render;
- * The device that reaches the critical-section function
- * release_tile waits (stalling other devices from entering
- * release_tile) for all kernels to complete. If device1 (a
- * slow-render device) reaches release_tile first then it would
- * stall device2 (a fast-render device) from proceeding to render
- * next tile.
- */
- clFinish(cqCommandQueue);
-
task->release_tile(tile);
}
}
}
+
+ bool is_split_kernel()
+ {
+ return false;
+ }
};
Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background)
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index d175aae137a..76dcbd6fc9a 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -104,7 +104,7 @@ public:
else if(task->type == DeviceTask::SHADER) {
shader(*task);
}
- else if(task->type == DeviceTask::PATH_TRACE) {
+ else if(task->type == DeviceTask::RENDER) {
RenderTile tile;
/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
@@ -127,21 +127,29 @@ public:
/* Keep rendering tiles until done. */
while(task->acquire_tile(this, tile)) {
- split_kernel->path_trace(task,
- tile,
- kgbuffer,
- *const_mem_map["__data"]);
-
- /* Complete kernel execution before release tile. */
- /* This helps in multi-device render;
- * The device that reaches the critical-section function
- * release_tile waits (stalling other devices from entering
- * release_tile) for all kernels to complete. If device1 (a
- * slow-render device) reaches release_tile first then it would
- * stall device2 (a fast-render device) from proceeding to render
- * next tile.
- */
- clFinish(cqCommandQueue);
+ if(tile.task == RenderTile::PATH_TRACE) {
+ assert(tile.task == RenderTile::PATH_TRACE);
+ split_kernel->path_trace(task,
+ tile,
+ kgbuffer,
+ *const_mem_map["__data"]);
+
+ /* Complete kernel execution before releasing the tile. */
+ /* This helps in multi-device render;
+ * The device that reaches the critical-section function
+ * release_tile waits (stalling other devices from entering
+ * release_tile) for all kernels to complete. If device1 (a
+ * slow-render device) reaches release_tile first then it would
+ * stall device2 (a fast-render device) from proceeding to render
+ * next tile.
+ */
+ clFinish(cqCommandQueue);
+ }
+ else if(tile.task == RenderTile::DENOISE) {
+ tile.sample = tile.start_sample + tile.num_samples;
+ denoise(tile, *task);
+ task->update_progress(&tile, tile.w*tile.h);
+ }
task->release_tile(tile);
}
@@ -150,6 +158,11 @@ public:
}
}
+ bool is_split_kernel()
+ {
+ return true;
+ }
+
protected:
/* ** These are for working around some compiler-specific bugs ** */
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
index 38003dd1e1e..642c1bfa11c 100644
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -1073,6 +1073,20 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
return get_device_name(device_id);
}
+int OpenCLInfo::mem_address_alignment(cl_device_id device_id)
+{
+ int base_align_bits;
+ if(clGetDeviceInfo(device_id,
+ CL_DEVICE_MEM_BASE_ADDR_ALIGN,
+ sizeof(int),
+ &base_align_bits,
+ NULL) == CL_SUCCESS)
+ {
+ return base_align_bits/8;
+ }
+ return 1;
+}
+
CCL_NAMESPACE_END
#endif