diff options
Diffstat (limited to 'intern/cycles/device')
-rw-r--r-- | intern/cycles/device/CMakeLists.txt | 2 | ||||
-rw-r--r-- | intern/cycles/device/device.cpp | 12 | ||||
-rw-r--r-- | intern/cycles/device/device.h | 13 | ||||
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 846 | ||||
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 495 | ||||
-rw-r--r-- | intern/cycles/device/device_denoising.cpp | 218 | ||||
-rw-r--r-- | intern/cycles/device/device_denoising.h | 145 | ||||
-rw-r--r-- | intern/cycles/device/device_memory.h | 44 | ||||
-rw-r--r-- | intern/cycles/device/device_multi.cpp | 54 | ||||
-rw-r--r-- | intern/cycles/device/device_split_kernel.cpp | 6 | ||||
-rw-r--r-- | intern/cycles/device/device_split_kernel.h | 6 | ||||
-rw-r--r-- | intern/cycles/device/device_task.cpp | 6 | ||||
-rw-r--r-- | intern/cycles/device/device_task.h | 14 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl.h | 53 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_base.cpp | 465 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_mega.cpp | 54 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_split.cpp | 45 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_util.cpp | 14 |
18 files changed, 1982 insertions, 510 deletions
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 6ef2aa1caad..74ec57ddf74 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -25,6 +25,7 @@ set(SRC device.cpp device_cpu.cpp device_cuda.cpp + device_denoising.cpp device_multi.cpp device_opencl.cpp device_split_kernel.cpp @@ -48,6 +49,7 @@ endif() set(SRC_HEADERS device.h + device_denoising.h device_memory.h device_intern.h device_network.h diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 4c4e862ed1f..949c5f932a4 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -549,4 +549,16 @@ void Device::free_memory() devices.free_memory(); } + +device_sub_ptr::device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type) + : device(device) +{ + ptr = device->mem_alloc_sub_ptr(mem, offset, size, type); +} + +device_sub_ptr::~device_sub_ptr() +{ + device->mem_free_sub_ptr(ptr); +} + CCL_NAMESPACE_END diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 21d29a801ae..c22969d7dc6 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -228,6 +228,7 @@ struct DeviceDrawParams { }; class Device { + friend class device_sub_ptr; protected: enum { FALLBACK_SHADER_STATUS_NONE = 0, @@ -250,6 +251,14 @@ protected: bool bind_fallback_display_space_shader(const float width, const float height); + virtual device_ptr mem_alloc_sub_ptr(device_memory& /*mem*/, int /*offset*/, int /*size*/, MemoryType /*type*/) + { + /* Only required for devices that implement denoising. */ + assert(false); + return (device_ptr) 0; + } + virtual void mem_free_sub_ptr(device_ptr /*ptr*/) {}; + public: virtual ~Device(); @@ -278,6 +287,8 @@ public: virtual void mem_zero(device_memory& mem) = 0; virtual void mem_free(device_memory& mem) = 0; + virtual int mem_address_alignment() { return 16; } + /* constant memory */ virtual void const_copy_to(const char *name, void *host, size_t size) = 0; @@ -326,6 +337,8 @@ public: /* multi device */ virtual void map_tile(Device * /*sub_device*/, RenderTile& /*tile*/) {} virtual int device_number(Device * /*sub_device*/) { return 0; } + virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {} + virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {} /* static */ static Device *create(DeviceInfo& info, Stats &stats, bool background = true); diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 84cce605182..1ecce8bd565 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -25,6 +25,7 @@ #endif #include "device/device.h" +#include "device/device_denoising.h" #include "device/device_intern.h" #include "device/device_split_kernel.h" @@ -34,6 +35,8 @@ #include "kernel/split/kernel_split_data.h" #include "kernel/kernel_globals.h" +#include "kernel/filter/filter.h" + #include "kernel/osl/osl_shader.h" #include "kernel/osl/osl_globals.h" @@ -53,91 +56,107 @@ CCL_NAMESPACE_BEGIN class CPUDevice; -class CPUSplitKernel : public DeviceSplitKernel { - CPUDevice *device; -public: - explicit CPUSplitKernel(CPUDevice *device); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, - RenderTile& rtile, - int num_global_elements, - device_memory& kernel_globals, - device_memory& kernel_data_, - device_memory& split_data, - device_memory& ray_state, - device_memory& queue_index, - device_memory& use_queues_flag, - device_memory& work_pool_wgs); +/* Has to be outside of the class to be shared across template instantiations. */ +static const char *logged_architecture = ""; - virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&); - virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); - virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); -}; - -class CPUDevice : public Device -{ - static unordered_map<string, void*> kernel_functions; - - static void register_kernel_function(const char* name, void* func) +template<typename F> +class KernelFunctions { +public: + KernelFunctions() { - kernel_functions[name] = func; + kernel = (F)NULL; } - static const char* get_arch_name() + KernelFunctions(F kernel_default, + F kernel_sse2, + F kernel_sse3, + F kernel_sse41, + F kernel_avx, + F kernel_avx2) { + const char *architecture_name = "default"; + kernel = kernel_default; + + /* Silence potential warnings about unused variables + * when compiling without some architectures. */ + (void)kernel_sse2; + (void)kernel_sse3; + (void)kernel_sse41; + (void)kernel_avx; + (void)kernel_avx2; #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 if(system_cpu_support_avx2()) { - return "cpu_avx2"; + architecture_name = "AVX2"; + kernel = kernel_avx2; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX if(system_cpu_support_avx()) { - return "cpu_avx"; + architecture_name = "AVX"; + kernel = kernel_avx; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { - return "cpu_sse41"; + architecture_name = "SSE4.1"; + kernel = kernel_sse41; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 if(system_cpu_support_sse3()) { - return "cpu_sse3"; + architecture_name = "SSE3"; + kernel = kernel_sse3; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 if(system_cpu_support_sse2()) { - return "cpu_sse2"; + architecture_name = "SSE2"; + kernel = kernel_sse2; } - else #endif - { - return "cpu"; + + if(strstr(architecture_name, logged_architecture) != 0) { + VLOG(1) << "Will be using " << architecture_name << " kernels."; + logged_architecture = architecture_name; } } - template<typename F> - static F get_kernel_function(string name) - { - name = string("kernel_") + get_arch_name() + "_" + name; - - unordered_map<string, void*>::iterator it = kernel_functions.find(name); + inline F operator()() const { + assert(kernel); + return kernel; + } +protected: + F kernel; +}; - if(it == kernel_functions.end()) { - assert(!"kernel function not found"); - return NULL; - } +class CPUSplitKernel : public DeviceSplitKernel { + CPUDevice *device; +public: + explicit CPUSplitKernel(CPUDevice *device); - return (F)it->second; - } + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data_, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs); - friend class CPUSplitKernel; + virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&); + virtual int2 split_kernel_local_size(); + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); + virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); +}; +class CPUDevice : public Device +{ public: TaskPool task_pool; KernelGlobals kernel_globals; @@ -149,77 +168,89 @@ public: bool use_split_kernel; DeviceRequestedFeatures requested_features; - + + KernelFunctions<void(*)(KernelGlobals *, float *, unsigned int *, int, int, int, int, int)> path_trace_kernel; + KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel; + KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel; + KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel; + + KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel; + KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int, bool)> filter_get_feature_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel; + + KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel; + KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; + KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel; + KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; + + KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel; + KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; + + KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*, + ccl_global uint*, int, int, int, int, int, int, int, int, ccl_global int*, int, + ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)> data_init_kernel; + unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels; + +#define KERNEL_FUNCTIONS(name) \ + KERNEL_NAME_EVAL(cpu, name), \ + KERNEL_NAME_EVAL(cpu_sse2, name), \ + KERNEL_NAME_EVAL(cpu_sse3, name), \ + KERNEL_NAME_EVAL(cpu_sse41, name), \ + KERNEL_NAME_EVAL(cpu_avx, name), \ + KERNEL_NAME_EVAL(cpu_avx2, name) + CPUDevice(DeviceInfo& info, Stats &stats, bool background) - : Device(info, stats, background) + : Device(info, stats, background), +#define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name)) + REGISTER_KERNEL(path_trace), + REGISTER_KERNEL(convert_to_half_float), + REGISTER_KERNEL(convert_to_byte), + REGISTER_KERNEL(shader), + REGISTER_KERNEL(filter_divide_shadow), + REGISTER_KERNEL(filter_get_feature), + REGISTER_KERNEL(filter_combine_halves), + REGISTER_KERNEL(filter_nlm_calc_difference), + REGISTER_KERNEL(filter_nlm_blur), + REGISTER_KERNEL(filter_nlm_calc_weight), + REGISTER_KERNEL(filter_nlm_update_output), + REGISTER_KERNEL(filter_nlm_normalize), + REGISTER_KERNEL(filter_construct_transform), + REGISTER_KERNEL(filter_nlm_construct_gramian), + REGISTER_KERNEL(filter_finalize), + REGISTER_KERNEL(data_init) +#undef REGISTER_KERNEL { #ifdef WITH_OSL kernel_globals.osl = &osl_globals; #endif - - /* do now to avoid thread issues */ - system_cpu_support_sse2(); - system_cpu_support_sse3(); - system_cpu_support_sse41(); - system_cpu_support_avx(); - system_cpu_support_avx2(); - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - VLOG(1) << "Will be using AVX2 kernels."; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - VLOG(1) << "Will be using AVX kernels."; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - VLOG(1) << "Will be using SSE4.1 kernels."; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - VLOG(1) << "Will be using SSE3kernels."; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - VLOG(1) << "Will be using SSE2 kernels."; - } - else -#endif - { - VLOG(1) << "Will be using regular kernels."; - } - use_split_kernel = DebugFlags().cpu.split_kernel; if(use_split_kernel) { VLOG(1) << "Will be using split kernel."; } - kernel_cpu_register_functions(register_kernel_function); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - kernel_cpu_sse2_register_functions(register_kernel_function); -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - kernel_cpu_sse3_register_functions(register_kernel_function); -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - kernel_cpu_sse41_register_functions(register_kernel_function); -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - kernel_cpu_avx_register_functions(register_kernel_function); -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - kernel_cpu_avx2_register_functions(register_kernel_function); -#endif +#define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name)) + REGISTER_SPLIT_KERNEL(path_init); + REGISTER_SPLIT_KERNEL(scene_intersect); + REGISTER_SPLIT_KERNEL(lamp_emission); + REGISTER_SPLIT_KERNEL(do_volume); + REGISTER_SPLIT_KERNEL(queue_enqueue); + REGISTER_SPLIT_KERNEL(indirect_background); + REGISTER_SPLIT_KERNEL(shader_setup); + REGISTER_SPLIT_KERNEL(shader_sort); + REGISTER_SPLIT_KERNEL(shader_eval); + REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao); + REGISTER_SPLIT_KERNEL(subsurface_scatter); + REGISTER_SPLIT_KERNEL(direct_lighting); + REGISTER_SPLIT_KERNEL(shadow_blocked_ao); + REGISTER_SPLIT_KERNEL(shadow_blocked_dl); + REGISTER_SPLIT_KERNEL(next_iteration_setup); + REGISTER_SPLIT_KERNEL(indirect_subsurface); + REGISTER_SPLIT_KERNEL(buffer_update); +#undef REGISTER_SPLIT_KERNEL +#undef KERNEL_FUNCTIONS } ~CPUDevice() @@ -273,13 +304,17 @@ public: if(!mem.data_pointer) { free((void*)mem.device_pointer); } - mem.device_pointer = 0; stats.mem_free(mem.device_size); mem.device_size = 0; } } + virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/) + { + return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset)); + } + void const_copy_to(const char *name, void *host, size_t size) { kernel_const_copy(&kernel_globals, name, host, size); @@ -326,13 +361,8 @@ public: void thread_run(DeviceTask *task) { - if(task->type == DeviceTask::PATH_TRACE) { - if(!use_split_kernel) { - thread_path_trace(*task); - } - else { - thread_path_trace_split(*task); - } + if(task->type == DeviceTask::RENDER) { + thread_render(*task); } else if(task->type == DeviceTask::FILM_CONVERT) thread_film_convert(*task); @@ -349,116 +379,319 @@ public: } }; - void thread_path_trace(DeviceTask& task) + bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task) { - if(task_pool.canceled()) { - if(task.need_finish_queue == false) - return; + mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY); + + TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer; + for(int i = 0; i < 9; i++) { + tiles->buffers[i] = buffers[i]; } - KernelGlobals kg = thread_kernel_globals_init(); - RenderTile tile; + return true; + } - void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int); + bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, + DenoisingTask *task) + { + int4 rect = task->rect; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + int w = align_up(rect.z-rect.x, 4); + int h = rect.w-rect.y; + + float *blurDifference = (float*) task->nlm_state.temporary_1_ptr; + float *difference = (float*) task->nlm_state.temporary_2_ptr; + float *weightAccum = (float*) task->nlm_state.temporary_3_ptr; + + memset(weightAccum, 0, sizeof(float)*w*h); + memset((float*) out_ptr, 0, sizeof(float)*w*h); + + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)}; + filter_nlm_calc_difference_kernel()(dx, dy, + (float*) guide_ptr, + (float*) variance_ptr, + difference, + local_rect, + w, 0, + a, k_2); + + filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); + filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); + filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); + + filter_nlm_update_output_kernel()(dx, dy, + blurDifference, + (float*) image_ptr, + (float*) out_ptr, + weightAccum, + local_rect, + w, f); + } + + int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y}; + filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - path_trace_kernel = kernel_cpu_avx2_path_trace; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - path_trace_kernel = kernel_cpu_avx_path_trace; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - path_trace_kernel = kernel_cpu_sse41_path_trace; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - path_trace_kernel = kernel_cpu_sse3_path_trace; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - path_trace_kernel = kernel_cpu_sse2_path_trace; - } - else -#endif - { - path_trace_kernel = kernel_cpu_path_trace; + return true; + } + + bool denoising_construct_transform(DenoisingTask *task) + { + for(int y = 0; y < task->filter_area.w; y++) { + for(int x = 0; x < task->filter_area.z; x++) { + filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer, + x + task->filter_area.x, + y + task->filter_area.y, + y*task->filter_area.z + x, + (float*) task->storage.transform.device_pointer, + (int*) task->storage.rank.device_pointer, + &task->rect.x, + task->buffer.pass_stride, + task->radius, + task->pca_threshold); + } } + return true; + } - while(task.acquire_tile(this, tile)) { - float *render_buffer = (float*)tile.buffer; - uint *rng_state = (uint*)tile.rng_state; - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; - - for(int sample = start_sample; sample < end_sample; sample++) { - if(task.get_cancel() || task_pool.canceled()) { - if(task.need_finish_queue == false) - break; - } + bool denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr guide_ptr, + device_ptr guide_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); + + float *difference = (float*) task->reconstruction_state.temporary_1_ptr; + float *blurDifference = (float*) task->reconstruction_state.temporary_2_ptr; + + int r = task->radius; + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + filter_nlm_calc_difference_kernel()(dx, dy, + (float*) guide_ptr, + (float*) guide_variance_ptr, + difference, + local_rect, + task->buffer.w, + task->buffer.pass_stride, + 1.0f, + task->nlm_k_2); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4); + filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4); + filter_nlm_construct_gramian_kernel()(dx, dy, + blurDifference, + (float*) task->buffer.mem.device_pointer, + (float*) color_ptr, + (float*) color_variance_ptr, + (float*) task->storage.transform.device_pointer, + (int*) task->storage.rank.device_pointer, + (float*) task->storage.XtWX.device_pointer, + (float3*) task->storage.XtWY.device_pointer, + local_rect, + &task->reconstruction_state.filter_rect.x, + task->buffer.w, + task->buffer.h, + 4, + task->buffer.pass_stride); + } + for(int y = 0; y < task->filter_area.w; y++) { + for(int x = 0; x < task->filter_area.z; x++) { + filter_finalize_kernel()(x, + y, + y*task->filter_area.z + x, + task->buffer.w, + task->buffer.h, + (float*) output_ptr, + (int*) task->storage.rank.device_pointer, + (float*) task->storage.XtWX.device_pointer, + (float3*) task->storage.XtWY.device_pointer, + &task->reconstruction_state.buffer_params.x, + task->render_buffer.samples); + } + } + return true; + } - for(int y = tile.y; y < tile.y + tile.h; y++) { - for(int x = tile.x; x < tile.x + tile.w; x++) { - path_trace_kernel(&kg, render_buffer, rng_state, - sample, x, y, tile.offset, tile.stride); - } - } + bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, + device_ptr mean_ptr, device_ptr variance_ptr, + int r, int4 rect, DenoisingTask *task) + { + (void) task; + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + filter_combine_halves_kernel()(x, y, + (float*) mean_ptr, + (float*) variance_ptr, + (float*) a_ptr, + (float*) b_ptr, + &rect.x, + r); + } + } + return true; + } - tile.sample = sample + 1; + bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr, + device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, DenoisingTask *task) + { + for(int y = task->rect.y; y < task->rect.w; y++) { + for(int x = task->rect.x; x < task->rect.z; x++) { + filter_divide_shadow_kernel()(task->render_buffer.samples, + task->tiles, + x, y, + (float*) a_ptr, + (float*) b_ptr, + (float*) sample_variance_ptr, + (float*) sv_variance_ptr, + (float*) buffer_variance_ptr, + &task->rect.x, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + use_split_kernel); + } + } + return true; + } - task.update_progress(&tile, tile.w*tile.h); + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) + { + for(int y = task->rect.y; y < task->rect.w; y++) { + for(int x = task->rect.x; x < task->rect.z; x++) { + filter_get_feature_kernel()(task->render_buffer.samples, + task->tiles, + mean_offset, + variance_offset, + x, y, + (float*) mean_ptr, + (float*) variance_ptr, + &task->rect.x, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + use_split_kernel); } + } + return true; + } - task.release_tile(tile); + void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) + { + float *render_buffer = (float*)tile.buffer; + uint *rng_state = (uint*)tile.rng_state; + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; - if(task_pool.canceled()) { + for(int sample = start_sample; sample < end_sample; sample++) { + if(task.get_cancel() || task_pool.canceled()) { if(task.need_finish_queue == false) break; } + + for(int y = tile.y; y < tile.y + tile.h; y++) { + for(int x = tile.x; x < tile.x + tile.w; x++) { + path_trace_kernel()(kg, render_buffer, rng_state, + sample, x, y, tile.offset, tile.stride); + } + } + + tile.sample = sample + 1; + + task.update_progress(&tile, tile.w*tile.h); } + } + + void denoise(DeviceTask &task, RenderTile &tile) + { + tile.sample = tile.start_sample + tile.num_samples; + + DenoisingTask denoising(this); - thread_kernel_globals_free(&kg); + denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.set_tiles = function_bind(&CPUDevice::denoising_set_tiles, this, _1, &denoising); + + denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h); + denoising.render_buffer.samples = tile.sample; + + RenderTile rtiles[9]; + rtiles[4] = tile; + task.map_neighbor_tiles(rtiles, this); + denoising.tiles_from_rendertiles(rtiles); + + denoising.init_from_devicetask(task); + + denoising.run_denoising(); + + task.unmap_neighbor_tiles(rtiles, this); + + task.update_progress(&tile, tile.w*tile.h); } - void thread_path_trace_split(DeviceTask& task) + void thread_render(DeviceTask& task) { if(task_pool.canceled()) { if(task.need_finish_queue == false) return; } - RenderTile tile; - - CPUSplitKernel split_kernel(this); - /* allocate buffer for kernel globals */ - device_memory kgbuffer; - kgbuffer.resize(sizeof(KernelGlobals)); + device_only_memory<KernelGlobals> kgbuffer; + kgbuffer.resize(1); mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE); KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init()); - requested_features.max_closure = MAX_CLOSURE; - if(!split_kernel.load_kernels(requested_features)) { - thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); - mem_free(kgbuffer); + CPUSplitKernel *split_kernel = NULL; + if(use_split_kernel) { + split_kernel = new CPUSplitKernel(this); + requested_features.max_closure = MAX_CLOSURE; + if(!split_kernel->load_kernels(requested_features)) { + thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); + mem_free(kgbuffer); - return; + delete split_kernel; + return; + } } + RenderTile tile; while(task.acquire_tile(this, tile)) { - device_memory data; - split_kernel.path_trace(&task, tile, kgbuffer, data); + if(tile.task == RenderTile::PATH_TRACE) { + if(use_split_kernel) { + device_memory data; + split_kernel->path_trace(&task, tile, kgbuffer, data); + } + else { + path_trace(task, tile, kg); + } + } + else if(tile.task == RenderTile::DENOISE) { + denoise(task, tile); + } task.release_tile(tile); @@ -470,6 +703,7 @@ public: thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); mem_free(kgbuffer); + delete split_kernel; } void thread_film_convert(DeviceTask& task) @@ -477,86 +711,16 @@ public: float sample_scale = 1.0f/(task.sample + 1); if(task.rgba_half) { - void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float; - } - else -#endif - { - convert_to_half_float_kernel = kernel_cpu_convert_to_half_float; - } - for(int y = task.y; y < task.y + task.h; y++) for(int x = task.x; x < task.x + task.w; x++) - convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); + convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); } else { - void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte; - } - else -#endif - { - convert_to_byte_kernel = kernel_cpu_convert_to_byte; - } - for(int y = task.y; y < task.y + task.h; y++) for(int x = task.x; x < task.x + task.w; x++) - convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); + convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); } } @@ -568,53 +732,17 @@ public: #ifdef WITH_OSL OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); #endif - void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int); - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - shader_kernel = kernel_cpu_avx2_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - shader_kernel = kernel_cpu_avx_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - shader_kernel = kernel_cpu_sse41_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - shader_kernel = kernel_cpu_sse3_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - shader_kernel = kernel_cpu_sse2_shader; - } - else -#endif - { - shader_kernel = kernel_cpu_shader; - } - for(int sample = 0; sample < task.num_samples; sample++) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) - shader_kernel(&kg, - (uint4*)task.shader_input, - (float4*)task.shader_output, - (float*)task.shader_output_luma, - task.shader_eval_type, - task.shader_filter, - x, - task.offset, - sample); + shader_kernel()(&kg, + (uint4*)task.shader_input, + (float4*)task.shader_output, + (float*)task.shader_output_luma, + task.shader_eval_type, + task.shader_filter, + x, + task.offset, + sample); if(task.get_cancel() || task_pool.canceled()) break; @@ -751,58 +879,6 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, device_memory& use_queues_flags, device_memory& work_pool_wgs) { - typedef void(*data_init_t)(KernelGlobals *kg, - ccl_constant KernelData *data, - ccl_global void *split_data_buffer, - int num_elements, - ccl_global char *ray_state, - ccl_global uint *rng_state, - int start_sample, - int end_sample, - int sx, int sy, int sw, int sh, int offset, int stride, - ccl_global int *Queue_index, - int queuesize, - ccl_global char *use_queues_flag, - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, - ccl_global float *buffer); - - data_init_t data_init; - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - data_init = kernel_cpu_avx2_data_init; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - data_init = kernel_cpu_avx_data_init; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - data_init = kernel_cpu_sse41_data_init; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - data_init = kernel_cpu_sse3_data_init; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - data_init = kernel_cpu_sse2_data_init; - } - else -#endif - { - data_init = kernel_cpu_data_init; - } - KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); @@ -810,26 +886,26 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, for(int x = 0; x < dim.global_size[0]; x++) { kg->global_id = make_int2(x, y); - data_init((KernelGlobals*)kernel_globals.device_pointer, - (KernelData*)data.device_pointer, - (void*)split_data.device_pointer, - num_global_elements, - (char*)ray_state.device_pointer, - (uint*)rtile.rng_state, - rtile.start_sample, - rtile.start_sample + rtile.num_samples, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - rtile.offset, - rtile.stride, - (int*)queue_index.device_pointer, - dim.global_size[0] * dim.global_size[1], - (char*)use_queues_flags.device_pointer, - (uint*)work_pool_wgs.device_pointer, - rtile.num_samples, - (float*)rtile.buffer); + device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer, + (KernelData*)data.device_pointer, + (void*)split_data.device_pointer, + num_global_elements, + (char*)ray_state.device_pointer, + (uint*)rtile.rng_state, + rtile.start_sample, + rtile.start_sample + rtile.num_samples, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + rtile.offset, + rtile.stride, + (int*)queue_index.device_pointer, + dim.global_size[0] * dim.global_size[1], + (char*)use_queues_flags.device_pointer, + (uint*)work_pool_wgs.device_pointer, + rtile.num_samples, + (float*)rtile.buffer); } } @@ -840,7 +916,7 @@ SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_nam { CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device); - kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name); + kernel->func = device->split_kernels[kernel_name](); if(!kernel->func) { delete kernel; return NULL; @@ -864,8 +940,6 @@ uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device return split_data_buffer_size(kg, num_threads); } -unordered_map<string, void*> CPUDevice::kernel_functions; - Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background) { return new CPUDevice(info, stats, background); diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index e497ec6b0e1..9a8537a6722 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -21,11 +21,14 @@ #include <string.h> #include "device/device.h" +#include "device/device_denoising.h" #include "device/device_intern.h" #include "device/device_split_kernel.h" #include "render/buffers.h" +#include "kernel/filter/filter_defines.h" + #ifdef WITH_CUDA_DYNLOAD # include "cuew.h" #else @@ -113,7 +116,7 @@ public: DedicatedTaskPool task_pool; CUdevice cuDevice; CUcontext cuContext; - CUmodule cuModule; + CUmodule cuModule, cuFilterModule; map<device_ptr, bool> tex_interp_map; map<device_ptr, uint> tex_bindless_map; int cuDevId; @@ -170,7 +173,7 @@ public: CUresult result = stmt; \ \ if(result != CUDA_SUCCESS) { \ - string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ + string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \ if(error_msg == "") \ error_msg = message; \ fprintf(stderr, "%s\n", message.c_str()); \ @@ -301,7 +304,8 @@ public: * kernel sources md5 and only depends on compiler or compilation settings. */ string compile_kernel_get_common_cflags( - const DeviceRequestedFeatures& requested_features, bool split=false) + const DeviceRequestedFeatures& requested_features, + bool filter=false, bool split=false) { const int cuda_version = cuewCompilerVersion(); const int machine = system_cpu_bits(); @@ -316,7 +320,7 @@ public: machine, cuda_version, include_path.c_str()); - if(use_adaptive_compilation()) { + if(!filter && use_adaptive_compilation()) { cflags += " " + requested_features.get_build_options(); } const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); @@ -364,8 +368,22 @@ public: return true; } - string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false) + string compile_kernel(const DeviceRequestedFeatures& requested_features, + bool filter=false, bool split=false) { + const char *name, *source; + if(filter) { + name = "filter"; + source = "filter.cu"; + } + else if(split) { + name = "kernel_split"; + source = "kernel_split.cu"; + } + else { + name = "kernel"; + source = "kernel.cu"; + } /* Compute cubin name. */ int major, minor; cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); @@ -373,9 +391,8 @@ public: /* Attempt to use kernel provided with Blender. */ if(!use_adaptive_compilation()) { - const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin" - : "lib/kernel_sm_%d%d.cubin", - major, minor)); + const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", + name, major, minor)); VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; if(path_exists(cubin)) { VLOG(1) << "Using precompiled kernel."; @@ -384,7 +401,7 @@ public: } const string common_cflags = - compile_kernel_get_common_cflags(requested_features, split); + compile_kernel_get_common_cflags(requested_features, filter, split); /* Try to use locally compiled kernel. */ const string source_path = path_get("source"); @@ -395,9 +412,8 @@ public: */ const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags); - const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin" - : "cycles_kernel_sm%d%d_%s.cubin", - major, minor, + const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin", + name, major, minor, cubin_md5.c_str()); const string cubin = path_cache_get(path_join("kernels", cubin_file)); VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; @@ -432,7 +448,7 @@ public: const string kernel = path_join( path_join(source_path, "kernel"), path_join("kernels", - path_join("cuda", split ? "kernel_split.cu" : "kernel.cu"))); + path_join("cuda", source))); double starttime = time_dt(); printf("Compiling CUDA kernel ...\n"); @@ -480,11 +496,14 @@ public: return false; /* get kernel */ - string cubin = compile_kernel(requested_features, use_split_kernel()); - + string cubin = compile_kernel(requested_features, false, use_split_kernel()); if(cubin == "") return false; + string filter_cubin = compile_kernel(requested_features, true, false); + if(filter_cubin == "") + return false; + /* open module */ cuda_push_context(); @@ -499,6 +518,14 @@ public: if(cuda_error_(result, "cuModuleLoad")) cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str())); + if(path_read_text(filter_cubin, cubin_data)) + result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str()); + else + result = CUDA_ERROR_FILE_NOT_FOUND; + + if(cuda_error_(result, "cuModuleLoad")) + cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str())); + cuda_pop_context(); return (result == CUDA_SUCCESS); @@ -581,6 +608,11 @@ public: } } + virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/) + { + return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset)); + } + void const_copy_to(const char *name, void *host, size_t size) { CUdeviceptr mem; @@ -881,6 +913,368 @@ public: } } + bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task) + { + mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY); + + TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer; + for(int i = 0; i < 9; i++) { + tiles->buffers[i] = buffers[i]; + } + + mem_copy_to(task->tiles_mem); + + return !have_error(); + } + +#define CUDA_GET_BLOCKSIZE(func, w, h) \ + int threads_per_block; \ + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int threads = (int)sqrt((float)threads_per_block); \ + int xblocks = ((w) + threads - 1)/threads; \ + int yblocks = ((h) + threads - 1)/threads; + +#define CUDA_LAUNCH_KERNEL(func, args) \ + cuda_assert(cuLaunchKernel(func, \ + xblocks, yblocks, 1, \ + threads, threads, 1, \ + 0, 0, args, 0)); + + bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + int4 rect = task->rect; + int w = rect.z-rect.x; + int h = rect.w-rect.y; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + CUdeviceptr difference = task->nlm_state.temporary_1_ptr; + CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr; + CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr; + + cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h)); + cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h)); + + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize; + cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); + cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); + + CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y); + + int dx, dy; + int4 local_rect; + int channel_offset = 0; + void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2}; + void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f}; + void *update_output_args[] = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f}; + + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + dy = i / (2*r+1) - r; + dx = i % (2*r+1) - r; + local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)); + + CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args); + } + + local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y); + void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w}; + CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_construct_transform(DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterConstructTransform; + cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); + cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); + CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, + task->storage.w, + task->storage.h); + + void *args[] = {&task->buffer.mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->filter_area, + &task->rect, + &task->radius, + &task->pca_threshold, + &task->buffer.pass_stride}; + CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr guide_ptr, + device_ptr guide_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); + + cuda_push_context(); + + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize; + cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); + cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); + cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); + + CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + CUdeviceptr difference = task->reconstruction_state.temporary_1_ptr; + CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr; + + int r = task->radius; + int f = 4; + float a = 1.0f; + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + + void *calc_difference_args[] = {&dx, &dy, + &guide_ptr, + &guide_variance_ptr, + &difference, + &local_rect, + &task->buffer.w, + &task->buffer.pass_stride, + &a, + &task->nlm_k_2}; + CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args); + + void *blur_args[] = {&difference, + &blurDifference, + &local_rect, + &task->buffer.w, + &f}; + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + + void *calc_weight_args[] = {&blurDifference, + &difference, + &local_rect, + &task->buffer.w, + &f}; + CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args); + + /* Reuse previous arguments. */ + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + + void *construct_gramian_args[] = {&dx, &dy, + &blurDifference, + &task->buffer.mem.device_pointer, + &color_ptr, + &color_variance_ptr, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &local_rect, + &task->reconstruction_state.filter_rect, + &task->buffer.w, + &task->buffer.h, + &f, + &task->buffer.pass_stride}; + CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args); + } + + void *finalize_args[] = {&task->buffer.w, + &task->buffer.h, + &output_ptr, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->filter_area, + &task->reconstruction_state.buffer_params.x, + &task->render_buffer.samples}; + CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, + device_ptr mean_ptr, device_ptr variance_ptr, + int r, int4 rect, DenoisingTask *task) + { + (void) task; + + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterCombineHalves; + cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves")); + cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterCombineHalves, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + void *args[] = {&mean_ptr, + &variance_ptr, + &a_ptr, + &b_ptr, + &rect, + &r}; + CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr, + device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, DenoisingTask *task) + { + (void) task; + + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterDivideShadow; + cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); + cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterDivideShadow, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + bool use_split_variance = use_split_kernel(); + void *args[] = {&task->render_buffer.samples, + &task->tiles_mem.device_pointer, + &a_ptr, + &b_ptr, + &sample_variance_ptr, + &sv_variance_ptr, + &buffer_variance_ptr, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.denoising_data_offset, + &use_split_variance}; + CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterGetFeature; + cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); + cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterGetFeature, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + bool use_split_variance = use_split_kernel(); + void *args[] = {&task->render_buffer.samples, + &task->tiles_mem.device_pointer, + &mean_offset, + &variance_offset, + &mean_ptr, + &variance_ptr, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.denoising_data_offset, + &use_split_variance}; + CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + void denoise(RenderTile &rtile, const DeviceTask &task) + { + DenoisingTask denoising(this); + + denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising); + + denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + + RenderTile rtiles[9]; + rtiles[4] = rtile; + task.map_neighbor_tiles(rtiles, this); + denoising.tiles_from_rendertiles(rtiles); + + denoising.init_from_devicetask(task); + + denoising.run_denoising(); + + task.unmap_neighbor_tiles(rtiles, this); + } + void path_trace(RenderTile& rtile, int sample, bool branched) { if(have_error()) @@ -1326,7 +1720,7 @@ public: void thread_run(DeviceTask *task) { - if(task->type == DeviceTask::PATH_TRACE) { + if(task->type == DeviceTask::RENDER) { RenderTile tile; bool branched = task->integrator_branched; @@ -1334,30 +1728,8 @@ public: /* Upload Bindless Mapping */ load_bindless_mapping(); - if(!use_split_kernel()) { - /* keep rendering tiles until done */ - while(task->acquire_tile(this, tile)) { - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; - - for(int sample = start_sample; sample < end_sample; sample++) { - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; - } - - path_trace(tile, sample, branched); - - tile.sample = sample + 1; - - task->update_progress(&tile, tile.w*tile.h); - } - - task->release_tile(tile); - } - } - else { - DeviceRequestedFeatures requested_features; + DeviceRequestedFeatures requested_features; + if(use_split_kernel()) { if(!use_adaptive_compilation()) { requested_features.max_closure = 64; } @@ -1366,18 +1738,47 @@ public: split_kernel = new CUDASplitKernel(this); split_kernel->load_kernels(requested_features); } + } + + /* keep rendering tiles until done */ + while(task->acquire_tile(this, tile)) { + if(tile.task == RenderTile::PATH_TRACE) { + if(use_split_kernel()) { + device_memory void_buffer; + split_kernel->path_trace(task, tile, void_buffer, void_buffer); + } + else { + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; + + for(int sample = start_sample; sample < end_sample; sample++) { + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } - while(task->acquire_tile(this, tile)) { - device_memory void_buffer; - split_kernel->path_trace(task, tile, void_buffer, void_buffer); + path_trace(tile, sample, branched); - task->release_tile(tile); + tile.sample = sample + 1; - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; + task->update_progress(&tile, tile.w*tile.h); + } } } + else if(tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + + denoise(tile, *task); + + task->update_progress(&tile, tile.w*tile.h); + } + + task->release_tile(tile); + + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } } } else if(task->type == DeviceTask::SHADER) { diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp new file mode 100644 index 00000000000..39c8cf30105 --- /dev/null +++ b/intern/cycles/device/device_denoising.cpp @@ -0,0 +1,218 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/device_denoising.h" + +#include "kernel/filter/filter_defines.h" + +CCL_NAMESPACE_BEGIN + +void DenoisingTask::init_from_devicetask(const DeviceTask &task) +{ + radius = task.denoising_radius; + nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising_strength)); + if(task.denoising_relative_pca) { + pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising_feature_strength)); + } + else { + pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength)); + } + + render_buffer.pass_stride = task.pass_stride; + render_buffer.denoising_data_offset = task.pass_denoising_data; + render_buffer.denoising_clean_offset = task.pass_denoising_clean; + + /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */ + rect = make_int4(max(tiles->x[0], filter_area.x - radius), + max(tiles->y[0], filter_area.y - radius), + min(tiles->x[3], filter_area.x + filter_area.z + radius), + min(tiles->y[3], filter_area.y + filter_area.w + radius)); +} + +void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles) +{ + tiles = (TilesInfo*) tiles_mem.resize(sizeof(TilesInfo)/sizeof(int)); + + device_ptr buffers[9]; + for(int i = 0; i < 9; i++) { + buffers[i] = rtiles[i].buffer; + tiles->offsets[i] = rtiles[i].offset; + tiles->strides[i] = rtiles[i].stride; + } + tiles->x[0] = rtiles[3].x; + tiles->x[1] = rtiles[4].x; + tiles->x[2] = rtiles[5].x; + tiles->x[3] = rtiles[5].x + rtiles[5].w; + tiles->y[0] = rtiles[1].y; + tiles->y[1] = rtiles[4].y; + tiles->y[2] = rtiles[7].y; + tiles->y[3] = rtiles[7].y + rtiles[7].h; + + render_buffer.offset = rtiles[4].offset; + render_buffer.stride = rtiles[4].stride; + render_buffer.ptr = rtiles[4].buffer; + + functions.set_tiles(buffers); +} + +bool DenoisingTask::run_denoising() +{ + /* Allocate denoising buffer. */ + buffer.passes = 14; + buffer.w = align_up(rect.z - rect.x, 4); + buffer.h = rect.w - rect.y; + buffer.pass_stride = align_up(buffer.w * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float))); + buffer.mem.resize(buffer.pass_stride * buffer.passes); + device->mem_alloc("Denoising Pixel Buffer", buffer.mem, MEM_READ_WRITE); + + device_ptr null_ptr = (device_ptr) 0; + + /* Prefilter shadow feature. */ + { + device_sub_ptr unfiltered_a (device, buffer.mem, 0, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr unfiltered_b (device, buffer.mem, 1*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr sample_var (device, buffer.mem, 2*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr sample_var_var (device, buffer.mem, 3*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr buffer_var (device, buffer.mem, 5*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr filtered_var (device, buffer.mem, 6*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_1(device, buffer.mem, 7*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_2(device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_3(device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + + nlm_state.temporary_1_ptr = *nlm_temporary_1; + nlm_state.temporary_2_ptr = *nlm_temporary_2; + nlm_state.temporary_3_ptr = *nlm_temporary_3; + + /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */ + functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var); + + /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */ + nlm_state.set_parameters(6, 3, 4.0f, 1.0f); + functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var); + + /* Reuse memory, the previous data isn't needed anymore. */ + device_ptr filtered_a = *buffer_var, + filtered_b = *sample_var; + /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */ + nlm_state.set_parameters(5, 3, 1.0f, 0.25f); + functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a); + functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b); + + device_ptr residual_var = *sample_var_var; + /* Estimate the residual variance between the two filtered halves. */ + functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect); + + device_ptr final_a = *unfiltered_a, + final_b = *unfiltered_b; + /* Use the residual variance for a second filter pass. */ + nlm_state.set_parameters(4, 2, 1.0f, 0.5f); + functions.non_local_means(filtered_a, filtered_b, residual_var, final_a); + functions.non_local_means(filtered_b, filtered_a, residual_var, final_b); + + /* Combine the two double-filtered halves to a final shadow feature. */ + device_sub_ptr shadow_pass(device, buffer.mem, 4*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect); + } + + /* Prefilter general features. */ + { + device_sub_ptr unfiltered (device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr variance (device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_1(device, buffer.mem, 10*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_2(device, buffer.mem, 11*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_3(device, buffer.mem, 12*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + + nlm_state.temporary_1_ptr = *nlm_temporary_1; + nlm_state.temporary_2_ptr = *nlm_temporary_2; + nlm_state.temporary_3_ptr = *nlm_temporary_3; + + int mean_from[] = { 0, 1, 2, 6, 7, 8, 12 }; + int variance_from[] = { 3, 4, 5, 9, 10, 11, 13 }; + int pass_to[] = { 1, 2, 3, 0, 5, 6, 7 }; + for(int pass = 0; pass < 7; pass++) { + device_sub_ptr feature_pass(device, buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + /* Get the unfiltered pass and its variance from the RenderBuffers. */ + functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance); + /* Smooth the pass and store the result in the denoising buffers. */ + nlm_state.set_parameters(2, 2, 1.0f, 0.25f); + functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass); + } + } + + /* Copy color passes. */ + { + int mean_from[] = {20, 21, 22}; + int variance_from[] = {23, 24, 25}; + int mean_to[] = { 8, 9, 10}; + int variance_to[] = {11, 12, 13}; + int num_color_passes = 3; + for(int pass = 0; pass < num_color_passes; pass++) { + device_sub_ptr color_pass (device, buffer.mem, mean_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr color_var_pass(device, buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass); + } + } + + storage.w = filter_area.z; + storage.h = filter_area.w; + storage.transform.resize(storage.w*storage.h*TRANSFORM_SIZE); + storage.rank.resize(storage.w*storage.h); + device->mem_alloc("Denoising Transform", storage.transform, MEM_READ_WRITE); + device->mem_alloc("Denoising Rank", storage.rank, MEM_READ_WRITE); + + functions.construct_transform(); + + device_only_memory<float> temporary_1; + device_only_memory<float> temporary_2; + temporary_1.resize(buffer.w*buffer.h); + temporary_2.resize(buffer.w*buffer.h); + device->mem_alloc("Denoising NLM temporary 1", temporary_1, MEM_READ_WRITE); + device->mem_alloc("Denoising NLM temporary 2", temporary_2, MEM_READ_WRITE); + reconstruction_state.temporary_1_ptr = temporary_1.device_pointer; + reconstruction_state.temporary_2_ptr = temporary_2.device_pointer; + + storage.XtWX.resize(storage.w*storage.h*XTWX_SIZE); + storage.XtWY.resize(storage.w*storage.h*XTWY_SIZE); + device->mem_alloc("Denoising XtWX", storage.XtWX, MEM_READ_WRITE); + device->mem_alloc("Denoising XtWY", storage.XtWY, MEM_READ_WRITE); + + reconstruction_state.filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h); + int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x; + reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset, + render_buffer.stride, + render_buffer.pass_stride, + render_buffer.denoising_clean_offset); + reconstruction_state.source_w = rect.z-rect.x; + reconstruction_state.source_h = rect.w-rect.y; + + { + device_sub_ptr color_ptr (device, buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr color_var_ptr(device, buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE); + functions.reconstruct(*color_ptr, *color_var_ptr, *color_ptr, *color_var_ptr, render_buffer.ptr); + } + + device->mem_free(storage.XtWX); + device->mem_free(storage.XtWY); + device->mem_free(storage.transform); + device->mem_free(storage.rank); + device->mem_free(temporary_1); + device->mem_free(temporary_2); + device->mem_free(buffer.mem); + device->mem_free(tiles_mem); + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h new file mode 100644 index 00000000000..86d8eb64386 --- /dev/null +++ b/intern/cycles/device/device_denoising.h @@ -0,0 +1,145 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEVICE_DENOISING_H__ +#define __DEVICE_DENOISING_H__ + +#include "device/device.h" + +#include "render/buffers.h" + +#include "kernel/filter/filter_defines.h" + +CCL_NAMESPACE_BEGIN + +class DenoisingTask { +public: + /* Parameters of the denoising algorithm. */ + int radius; + float nlm_k_2; + float pca_threshold; + + /* Pointer and parameters of the RenderBuffers. */ + struct RenderBuffers { + int denoising_data_offset; + int denoising_clean_offset; + int pass_stride; + int offset; + int stride; + device_ptr ptr; + int samples; + } render_buffer; + + TilesInfo *tiles; + device_vector<int> tiles_mem; + void tiles_from_rendertiles(RenderTile *rtiles); + + int4 rect; + int4 filter_area; + + struct DeviceFunctions { + function<bool(device_ptr image_ptr, /* Contains the values that are smoothed. */ + device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */ + device_ptr variance_ptr, /* Contains the variance of the guide image. */ + device_ptr out_ptr /* The filtered output is written into this image. */ + )> non_local_means; + function<bool(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr guide_ptr, + device_ptr guide_variance_ptr, + device_ptr output_ptr + )> reconstruct; + function<bool()> construct_transform; + + function<bool(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, + int4 rect + )> combine_halves; + function<bool(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr + )> divide_shadow; + function<bool(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr + )> get_feature; + function<bool(device_ptr*)> set_tiles; + } functions; + + /* Stores state of the current Reconstruction operation, + * which is accessed by the device in order to perform the operation. */ + struct ReconstructionState { + device_ptr temporary_1_ptr; /* There two images are used as temporary storage. */ + device_ptr temporary_2_ptr; + + int4 filter_rect; + int4 buffer_params; + + int source_w; + int source_h; + } reconstruction_state; + + /* Stores state of the current NLM operation, + * which is accessed by the device in order to perform the operation. */ + struct NLMState { + device_ptr temporary_1_ptr; /* There three images are used as temporary storage. */ + device_ptr temporary_2_ptr; + device_ptr temporary_3_ptr; + + int r; /* Search radius of the filter. */ + int f; /* Patch size of the filter. */ + float a; /* Variance compensation factor in the MSE estimation. */ + float k_2; /* Squared value of the k parameter of the filter. */ + + void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_, k_2 = k_2_; } + } nlm_state; + + struct Storage { + device_only_memory<float> transform; + device_only_memory<int> rank; + device_only_memory<float> XtWX; + device_only_memory<float3> XtWY; + int w; + int h; + } storage; + + DenoisingTask(Device *device) : device(device) {} + + void init_from_devicetask(const DeviceTask &task); + + bool run_denoising(); + + struct DenoiseBuffers { + int pass_stride; + int passes; + int w; + int h; + device_only_memory<float> mem; + } buffer; + +protected: + Device *device; +}; + +CCL_NAMESPACE_END + +#endif /* __DEVICE_DENOISING_H__ */ diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index 4b10514a9d2..b63dd00068b 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -35,6 +35,8 @@ CCL_NAMESPACE_BEGIN +class Device; + enum MemoryType { MEM_READ_ONLY, MEM_WRITE_ONLY, @@ -144,7 +146,7 @@ template<> struct device_type_traits<float2> { template<> struct device_type_traits<float3> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 3; + static const int num_elements = 4; }; template<> struct device_type_traits<float4> { @@ -173,6 +175,9 @@ class device_memory { public: size_t memory_size() { return data_size*data_elements*datatype_size(data_type); } + size_t memory_elements_size(int elements) { + return elements*data_elements*datatype_size(data_type); + } /* data information */ DataType data_type; @@ -213,6 +218,22 @@ protected: device_memory& operator = (const device_memory&); }; +template<typename T> +class device_only_memory : public device_memory +{ +public: + device_only_memory() + { + data_type = device_type_traits<T>::data_type; + data_elements = max(device_type_traits<T>::num_elements, 1); + } + + void resize(size_t num) + { + device_memory::resize(num*sizeof(T)); + } +}; + /* Device Vector */ template<typename T> class device_vector : public device_memory @@ -299,6 +320,27 @@ private: array<T> data; }; +/* A device_sub_ptr is a pointer into another existing memory. + * Therefore, it is not allocated separately, but just created from the already allocated base memory. + * It is freed automatically when it goes out of scope, which should happen before the base memory is freed. + * Note that some devices require the offset and size of the sub_ptr to be properly aligned. */ +class device_sub_ptr +{ +public: + device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type); + ~device_sub_ptr(); + /* No copying. */ + device_sub_ptr& operator = (const device_sub_ptr&); + + device_ptr operator*() const + { + return ptr; + } +protected: + Device *device; + device_ptr ptr; +}; + CCL_NAMESPACE_END #endif /* __DEVICE_MEMORY_H__ */ diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index 8616e31d3b9..35ae0303d6e 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -302,6 +302,60 @@ public: return -1; } + void map_neighbor_tiles(Device *sub_device, RenderTile *tiles) + { + for(int i = 0; i < 9; i++) { + if(!tiles[i].buffers) { + continue; + } + /* If the tile was rendered on another device, copy its memory to + * to the current device now, for the duration of the denoising task. + * Note that this temporarily modifies the RenderBuffers and calls + * the device, so this function is not thread safe. */ + if(tiles[i].buffers->device != sub_device) { + device_vector<float> &mem = tiles[i].buffers->buffer; + + tiles[i].buffers->copy_from_device(); + device_ptr original_ptr = mem.device_pointer; + mem.device_pointer = 0; + sub_device->mem_alloc("Temporary memory for neighboring tile", mem, MEM_READ_WRITE); + sub_device->mem_copy_to(mem); + tiles[i].buffer = mem.device_pointer; + mem.device_pointer = original_ptr; + } + } + } + + void unmap_neighbor_tiles(Device * sub_device, RenderTile * tiles) + { + for(int i = 0; i < 9; i++) { + if(!tiles[i].buffers) { + continue; + } + if(tiles[i].buffers->device != sub_device) { + device_vector<float> &mem = tiles[i].buffers->buffer; + + device_ptr original_ptr = mem.device_pointer; + mem.device_pointer = tiles[i].buffer; + + /* Copy denoised tile to the host. */ + if(i == 4) { + tiles[i].buffers->copy_from_device(sub_device); + } + + size_t mem_size = mem.device_size; + sub_device->mem_free(mem); + mem.device_pointer = original_ptr; + mem.device_size = mem_size; + + /* Copy denoised tile to the original device. */ + if(i == 4) { + tiles[i].buffers->device->mem_copy_to(mem); + } + } + } + } + int get_split_task_count(DeviceTask& task) { int total_tasks = 0; diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp index 9118793aad6..dddd19f179f 100644 --- a/intern/cycles/device/device_split_kernel.cpp +++ b/intern/cycles/device/device_split_kernel.cpp @@ -166,13 +166,13 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, unsigned int max_work_groups = num_global_elements / work_pool_size + 1; /* Allocate work_pool_wgs memory. */ - work_pool_wgs.resize(max_work_groups * sizeof(unsigned int)); + work_pool_wgs.resize(max_work_groups); device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE); - queue_index.resize(NUM_QUEUES * sizeof(int)); + queue_index.resize(NUM_QUEUES); device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE); - use_queues_flag.resize(sizeof(char)); + use_queues_flag.resize(1); device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE); ray_state.resize(num_global_elements); diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h index 58c2fdbb077..68c2ba974a5 100644 --- a/intern/cycles/device/device_split_kernel.h +++ b/intern/cycles/device/device_split_kernel.h @@ -80,16 +80,16 @@ private: */ device_memory split_data; device_vector<uchar> ray_state; - device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */ + device_only_memory<int> queue_index; /* Array of size num_queues that tracks the size of each queue. */ /* Flag to make sceneintersect and lampemission kernel use queues. */ - device_memory use_queues_flag; + device_only_memory<char> use_queues_flag; /* Approximate time it takes to complete one sample */ double avg_time_per_sample; /* Work pool with respect to each work group. */ - device_memory work_pool_wgs; + device_only_memory<unsigned int> work_pool_wgs; /* clos_max value for which the kernels have been loaded currently. */ int current_max_closure; diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp index ca303365627..3bc4c310283 100644 --- a/intern/cycles/device/device_task.cpp +++ b/intern/cycles/device/device_task.cpp @@ -56,7 +56,7 @@ int DeviceTask::get_subtask_count(int num, int max_size) if(type == SHADER) { num = min(shader_w, num); } - else if(type == PATH_TRACE) { + else if(type == RENDER) { } else { num = min(h, num); @@ -82,7 +82,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size) tasks.push_back(task); } } - else if(type == PATH_TRACE) { + else if(type == RENDER) { for(int i = 0; i < num; i++) tasks.push_back(*this); } @@ -103,7 +103,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size) void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples) { - if((type != PATH_TRACE) && + if((type != RENDER) && (type != SHADER)) return; diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index feee89fd6e4..44a1efff1f5 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -34,7 +34,7 @@ class Tile; class DeviceTask : public Task { public: - typedef enum { PATH_TRACE, FILM_CONVERT, SHADER } Type; + typedef enum { RENDER, FILM_CONVERT, SHADER } Type; Type type; int x, y, w, h; @@ -53,7 +53,7 @@ public: int passes_size; - explicit DeviceTask(Type type = PATH_TRACE); + explicit DeviceTask(Type type = RENDER); int get_subtask_count(int num, int max_size = 0); void split(list<DeviceTask>& tasks, int num, int max_size = 0); @@ -65,6 +65,16 @@ public: function<void(RenderTile&)> update_tile_sample; function<void(RenderTile&)> release_tile; function<bool(void)> get_cancel; + function<void(RenderTile*, Device*)> map_neighbor_tiles; + function<void(RenderTile*, Device*)> unmap_neighbor_tiles; + + int denoising_radius; + float denoising_strength; + float denoising_feature_strength; + bool denoising_relative_pca; + int pass_stride; + int pass_denoising_data; + int pass_denoising_clean; bool need_finish_queue; bool integrator_branched; diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index d061973dcb7..a458ca6bf64 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -17,6 +17,7 @@ #ifdef WITH_OPENCL #include "device/device.h" +#include "device/device_denoising.h" #include "util/util_map.h" #include "util/util_param.h" @@ -129,6 +130,8 @@ public: cl_int* error = NULL); static cl_device_type get_device_type(cl_device_id device_id); + static int mem_address_alignment(cl_device_id device_id); + /* Get somewhat more readable device name. * Main difference is AMD OpenCL here which only gives code name * for the regular device name. This will give more sane device @@ -218,7 +221,7 @@ public: cl_int err = stmt; \ \ if(err != CL_SUCCESS) { \ - string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \ + string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ if(error_msg == "") \ error_msg = message; \ fprintf(stderr, "%s\n", message.c_str()); \ @@ -282,7 +285,7 @@ public: map<ustring, cl_kernel> kernels; }; - OpenCLProgram base_program; + OpenCLProgram base_program, denoising_program; typedef map<string, device_vector<uchar>*> ConstMemMap; typedef map<string, device_ptr> MemMap; @@ -320,6 +323,9 @@ public: void mem_copy_from(device_memory& mem, int y, int w, int h, int elem); void mem_zero(device_memory& mem); void mem_free(device_memory& mem); + + int mem_address_alignment(); + void const_copy_to(const char *name, void *host, size_t size); void tex_alloc(const char *name, device_memory& mem, @@ -328,12 +334,14 @@ public: void tex_free(device_memory& mem); size_t global_size_round_up(int group_size, int global_size); - void enqueue_kernel(cl_kernel kernel, size_t w, size_t h); + void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1); void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name); void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half); void shader(DeviceTask& task); + void denoise(RenderTile& tile, const DeviceTask& task); + class OpenCLDeviceTask : public DeviceTask { public: OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task) @@ -367,9 +375,48 @@ public: virtual void thread_run(DeviceTask * /*task*/) = 0; + virtual bool is_split_kernel() = 0; + protected: string kernel_build_options(const string *debug_src = NULL); + void mem_zero_kernel(device_ptr ptr, size_t size); + + bool denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task); + bool denoising_construct_transform(DenoisingTask *task); + bool denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr guide_ptr, + device_ptr guide_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task); + bool denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, int4 rect, + DenoisingTask *task); + bool denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task); + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task); + bool denoising_set_tiles(device_ptr *buffers, + DenoisingTask *task); + + device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type); + void mem_free_sub_ptr(device_ptr ptr); + class ArgumentWrapper { public: ArgumentWrapper() : size(0), pointer(NULL) diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index 22aeaddcde8..ae1a7b917c3 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -213,8 +213,23 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea base_program.add_kernel(ustring("bake")); base_program.add_kernel(ustring("zero_buffer")); + denoising_program = OpenCLProgram(this, "denoising", "filter.cl", ""); + denoising_program.add_kernel(ustring("filter_divide_shadow")); + denoising_program.add_kernel(ustring("filter_get_feature")); + denoising_program.add_kernel(ustring("filter_combine_halves")); + denoising_program.add_kernel(ustring("filter_construct_transform")); + denoising_program.add_kernel(ustring("filter_nlm_calc_difference")); + denoising_program.add_kernel(ustring("filter_nlm_blur")); + denoising_program.add_kernel(ustring("filter_nlm_calc_weight")); + denoising_program.add_kernel(ustring("filter_nlm_update_output")); + denoising_program.add_kernel(ustring("filter_nlm_normalize")); + denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); + denoising_program.add_kernel(ustring("filter_finalize")); + denoising_program.add_kernel(ustring("filter_set_tiles")); + vector<OpenCLProgram*> programs; programs.push_back(&base_program); + programs.push_back(&denoising_program); /* Call actual class to fill the vector with its programs. */ if(!load_kernels(requested_features, programs)) { return false; @@ -322,37 +337,42 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in NULL, NULL)); } -void OpenCLDeviceBase::mem_zero(device_memory& mem) +void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size) { - if(mem.device_pointer) { - if(base_program.is_loaded()) { - cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); + cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); - size_t global_size[] = {1024, 1024}; - size_t num_threads = global_size[0] * global_size[1]; + size_t global_size[] = {1024, 1024}; + size_t num_threads = global_size[0] * global_size[1]; - cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer); - cl_ulong d_offset = 0; - cl_ulong d_size = 0; + cl_mem d_buffer = CL_MEM_PTR(mem); + cl_ulong d_offset = 0; + cl_ulong d_size = 0; - while(d_offset < mem.memory_size()) { - d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset); + while(d_offset < size) { + d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset); - kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); + kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); - ciErr = clEnqueueNDRangeKernel(cqCommandQueue, - ckZeroBuffer, - 2, - NULL, - global_size, - NULL, - 0, - NULL, - NULL); - opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); + ciErr = clEnqueueNDRangeKernel(cqCommandQueue, + ckZeroBuffer, + 2, + NULL, + global_size, + NULL, + 0, + NULL, + NULL); + opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); - d_offset += d_size; - } + d_offset += d_size; + } +} + +void OpenCLDeviceBase::mem_zero(device_memory& mem) +{ + if(mem.device_pointer) { + if(base_program.is_loaded()) { + mem_zero_kernel(mem.device_pointer, mem.memory_size()); } if(mem.data_pointer) { @@ -396,6 +416,41 @@ void OpenCLDeviceBase::mem_free(device_memory& mem) } } +int OpenCLDeviceBase::mem_address_alignment() +{ + return OpenCLInfo::mem_address_alignment(cdDevice); +} + +device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type) +{ + cl_mem_flags mem_flag; + if(type == MEM_READ_ONLY) + mem_flag = CL_MEM_READ_ONLY; + else if(type == MEM_WRITE_ONLY) + mem_flag = CL_MEM_WRITE_ONLY; + else + mem_flag = CL_MEM_READ_WRITE; + + cl_buffer_region info; + info.origin = mem.memory_elements_size(offset); + info.size = mem.memory_elements_size(size); + + device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer), + mem_flag, + CL_BUFFER_CREATE_TYPE_REGION, + &info, + &ciErr); + opencl_assert_err(ciErr, "clCreateSubBuffer"); + return sub_buf; +} + +void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer) +{ + if(device_pointer && device_pointer != null_mem) { + opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); + } +} + void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size) { ConstMemMap::iterator i = const_mem_map.find(name); @@ -449,7 +504,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size) return global_size + ((r == 0)? 0: group_size - r); } -void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h) +void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size) { size_t workgroup_size, max_work_items[3]; @@ -458,6 +513,10 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h) clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL); + if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { + workgroup_size = max_workgroup_size; + } + /* Try to divide evenly over 2 dimensions. */ size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size}; @@ -543,6 +602,362 @@ set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name); enqueue_kernel(ckFilmConvertKernel, d_w, d_h); } +bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task) +{ + int4 rect = task->rect; + int w = rect.z-rect.x; + int h = rect.w-rect.y; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + cl_mem difference = CL_MEM_PTR(task->nlm_state.temporary_1_ptr); + cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr); + cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr); + + cl_mem image_mem = CL_MEM_PTR(image_ptr); + cl_mem guide_mem = CL_MEM_PTR(guide_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + cl_mem out_mem = CL_MEM_PTR(out_ptr); + + mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h); + mem_zero_kernel(out_ptr, sizeof(float)*w*h); + + cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); + cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); + + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)); + kernel_set_args(ckNLMCalcDifference, 0, + dx, dy, guide_mem, variance_mem, + difference, local_rect, w, 0, a, k_2); + kernel_set_args(ckNLMBlur, 0, + difference, blurDifference, local_rect, w, f); + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference, difference, local_rect, w, f); + kernel_set_args(ckNLMUpdateOutput, 0, + dx, dy, blurDifference, image_mem, + out_mem, weightAccum, local_rect, w, f); + + enqueue_kernel(ckNLMCalcDifference, w, h); + enqueue_kernel(ckNLMBlur, w, h); + enqueue_kernel(ckNLMCalcWeight, w, h); + enqueue_kernel(ckNLMBlur, w, h); + enqueue_kernel(ckNLMUpdateOutput, w, h); + } + + int4 local_rect = make_int4(0, 0, w, h); + kernel_set_args(ckNLMNormalize, 0, + out_mem, weightAccum, local_rect, w); + enqueue_kernel(ckNLMNormalize, w, h); + + return true; +} + +bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) +{ + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + + cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); + + kernel_set_args(ckFilterConstructTransform, 0, + buffer_mem, + transform_mem, + rank_mem, + task->filter_area, + task->rect, + task->buffer.pass_stride, + task->radius, + task->pca_threshold); + + enqueue_kernel(ckFilterConstructTransform, + task->storage.w, + task->storage.h, + 256); + + return true; +} + +bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr guide_ptr, + device_ptr guide_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) +{ + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); + + cl_mem color_mem = CL_MEM_PTR(color_ptr); + cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); + cl_mem guide_mem = CL_MEM_PTR(guide_ptr); + cl_mem guide_variance_mem = CL_MEM_PTR(guide_variance_ptr); + cl_mem output_mem = CL_MEM_PTR(output_ptr); + + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); + cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); + + cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); + cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); + + cl_mem difference = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr); + cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr); + + int r = task->radius; + int f = 4; + float a = 1.0f; + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + + kernel_set_args(ckNLMCalcDifference, 0, + dx, dy, + guide_mem, + guide_variance_mem, + difference, + local_rect, + task->buffer.w, + task->buffer.pass_stride, + a, task->nlm_k_2); + enqueue_kernel(ckNLMCalcDifference, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + kernel_set_args(ckNLMBlur, 0, + difference, + blurDifference, + local_rect, + task->buffer.w, + f); + enqueue_kernel(ckNLMBlur, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference, + difference, + local_rect, + task->buffer.w, + f); + enqueue_kernel(ckNLMCalcWeight, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + /* Reuse previous arguments. */ + enqueue_kernel(ckNLMBlur, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + kernel_set_args(ckNLMConstructGramian, 0, + dx, dy, + blurDifference, + buffer_mem, + color_mem, + color_variance_mem, + transform_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + local_rect, + task->reconstruction_state.filter_rect, + task->buffer.w, + task->buffer.h, + f, + task->buffer.pass_stride); + enqueue_kernel(ckNLMConstructGramian, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h, + 256); + } + + kernel_set_args(ckFinalize, 0, + task->buffer.w, + task->buffer.h, + output_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + task->filter_area, + task->reconstruction_state.buffer_params, + task->render_buffer.samples); + enqueue_kernel(ckFinalize, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + return true; +} + +bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, int4 rect, + DenoisingTask *task) +{ + (void) task; + + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem mean_mem = CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); + + kernel_set_args(ckFilterCombineHalves, 0, + mean_mem, + variance_mem, + a_mem, + b_mem, + rect, + r); + enqueue_kernel(ckFilterCombineHalves, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task) +{ + (void) task; + + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr); + cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); + cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); + + char split_kernel = is_split_kernel()? 1 : 0; + kernel_set_args(ckFilterDivideShadow, 0, + task->render_buffer.samples, + tiles_mem, + a_mem, + b_mem, + sample_variance_mem, + sv_variance_mem, + buffer_variance_mem, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + split_kernel); + enqueue_kernel(ckFilterDivideShadow, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) +{ + cl_mem mean_mem = CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); + + char split_kernel = is_split_kernel()? 1 : 0; + kernel_set_args(ckFilterGetFeature, 0, + task->render_buffer.samples, + tiles_mem, + mean_offset, + variance_offset, + mean_mem, + variance_mem, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + split_kernel); + enqueue_kernel(ckFilterGetFeature, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers, + DenoisingTask *task) +{ + mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_WRITE); + mem_copy_to(task->tiles_mem); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles")); + + kernel_set_args(ckFilterSetTiles, 0, tiles_mem); + for(int i = 0; i < 9; i++) { + cl_mem buffer_mem = CL_MEM_PTR(buffers[i]); + kernel_set_args(ckFilterSetTiles, i+1, buffer_mem); + } + + enqueue_kernel(ckFilterSetTiles, 1, 1); + + return true; +} + +void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task) +{ + DenoisingTask denoising(this); + + denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising); + denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + + denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + + RenderTile rtiles[9]; + rtiles[4] = rtile; + task.map_neighbor_tiles(rtiles, this); + denoising.tiles_from_rendertiles(rtiles); + + denoising.init_from_devicetask(task); + + denoising.run_denoising(); + + task.unmap_neighbor_tiles(rtiles, this); +} + void OpenCLDeviceBase::shader(DeviceTask& task) { /* cast arguments to cl types */ diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp index a2fd1d71156..06c15bcf401 100644 --- a/intern/cycles/device/opencl/opencl_mega.cpp +++ b/intern/cycles/device/opencl/opencl_mega.cpp @@ -108,41 +108,53 @@ public: else if(task->type == DeviceTask::SHADER) { shader(*task); } - else if(task->type == DeviceTask::PATH_TRACE) { + else if(task->type == DeviceTask::RENDER) { RenderTile tile; /* Keep rendering tiles until done. */ while(task->acquire_tile(this, tile)) { - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; + if(tile.task == RenderTile::PATH_TRACE) { + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; - for(int sample = start_sample; sample < end_sample; sample++) { - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; - } + for(int sample = start_sample; sample < end_sample; sample++) { + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } + + path_trace(tile, sample); - path_trace(tile, sample); + tile.sample = sample + 1; - tile.sample = sample + 1; + task->update_progress(&tile, tile.w*tile.h); + } + /* Complete kernel execution before release tile */ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. + */ + clFinish(cqCommandQueue); + } + else if(tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + denoise(tile, *task); task->update_progress(&tile, tile.w*tile.h); } - /* Complete kernel execution before release tile */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. - */ - clFinish(cqCommandQueue); - task->release_tile(tile); } } } + + bool is_split_kernel() + { + return false; + } }; Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background) diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp index d175aae137a..76dcbd6fc9a 100644 --- a/intern/cycles/device/opencl/opencl_split.cpp +++ b/intern/cycles/device/opencl/opencl_split.cpp @@ -104,7 +104,7 @@ public: else if(task->type == DeviceTask::SHADER) { shader(*task); } - else if(task->type == DeviceTask::PATH_TRACE) { + else if(task->type == DeviceTask::RENDER) { RenderTile tile; /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to @@ -127,21 +127,29 @@ public: /* Keep rendering tiles until done. */ while(task->acquire_tile(this, tile)) { - split_kernel->path_trace(task, - tile, - kgbuffer, - *const_mem_map["__data"]); - - /* Complete kernel execution before release tile. */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. - */ - clFinish(cqCommandQueue); + if(tile.task == RenderTile::PATH_TRACE) { + assert(tile.task == RenderTile::PATH_TRACE); + split_kernel->path_trace(task, + tile, + kgbuffer, + *const_mem_map["__data"]); + + /* Complete kernel execution before release tile. */ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. + */ + clFinish(cqCommandQueue); + } + else if(tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + denoise(tile, *task); + task->update_progress(&tile, tile.w*tile.h); + } task->release_tile(tile); } @@ -150,6 +158,11 @@ public: } } + bool is_split_kernel() + { + return true; + } + protected: /* ** Those guys are for workign around some compiler-specific bugs ** */ diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp index 38003dd1e1e..642c1bfa11c 100644 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ b/intern/cycles/device/opencl/opencl_util.cpp @@ -1073,6 +1073,20 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id) return get_device_name(device_id); } +int OpenCLInfo::mem_address_alignment(cl_device_id device_id) +{ + int base_align_bits; + if(clGetDeviceInfo(device_id, + CL_DEVICE_MEM_BASE_ADDR_ALIGN, + sizeof(int), + &base_align_bits, + NULL) == CL_SUCCESS) + { + return base_align_bits/8; + } + return 1; +} + CCL_NAMESPACE_END #endif |