diff options
117 files changed, 6434 insertions, 1135 deletions
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index affeef994d4..b2694a285b1 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -238,3 +238,15 @@ def register_passes(engine, scene, srl): if crl.pass_debug_bvh_traversed_instances: engine.register_pass(scene, srl, "Debug BVH Traversed Instances", 1, "X", 'VALUE') if crl.pass_debug_bvh_intersections: engine.register_pass(scene, srl, "Debug BVH Intersections", 1, "X", 'VALUE') if crl.pass_debug_ray_bounces: engine.register_pass(scene, srl, "Debug Ray Bounces", 1, "X", 'VALUE') + + if crl.use_denoising and crl.denoising_store_passes: + engine.register_pass(scene, srl, "Denoising Normal", 3, "XYZ", 'VECTOR'); + engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR'); + engine.register_pass(scene, srl, "Denoising Albedo", 3, "RGB", 'COLOR'); + engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR'); + engine.register_pass(scene, srl, "Denoising Depth", 1, "Z", 'VALUE'); + engine.register_pass(scene, srl, "Denoising Depth Variance", 1, "Z", 'VALUE'); + engine.register_pass(scene, srl, "Denoising Shadow A", 3, "XYV", 'VECTOR'); + engine.register_pass(scene, srl, "Denoising Shadow B", 3, "XYV", 'VECTOR'); + engine.register_pass(scene, srl, "Denoising Image", 3, "RGB", 'COLOR'); + engine.register_pass(scene, srl, "Denoising Image Variance", 3, "RGB", 'COLOR');
\ No newline at end of file diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index a8a0f0bfc70..2ac1a1aacdf 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -1195,6 +1195,80 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup): default=False, ) + cls.use_denoising = BoolProperty( + name="Use Denoising", + description="Denoise the rendered image", + default=False, + ) + cls.denoising_diffuse_direct = BoolProperty( + name="Diffuse Direct", + description="Denoise the direct diffuse lighting", + default=True, + ) + cls.denoising_diffuse_indirect = BoolProperty( + name="Diffuse Indirect", + description="Denoise the indirect diffuse lighting", + default=True, + ) + cls.denoising_glossy_direct = BoolProperty( + name="Glossy Direct", + description="Denoise the direct glossy lighting", + default=True, + ) + cls.denoising_glossy_indirect = BoolProperty( + name="Glossy Indirect", + description="Denoise the indirect glossy lighting", + default=True, + ) + cls.denoising_transmission_direct = BoolProperty( + name="Transmission Direct", + description="Denoise the direct transmission lighting", + default=True, + ) + cls.denoising_transmission_indirect = BoolProperty( + name="Transmission Indirect", + description="Denoise the indirect transmission lighting", + default=True, + ) + cls.denoising_subsurface_direct = BoolProperty( + name="Subsurface Direct", + description="Denoise the direct subsurface lighting", + default=True, + ) + cls.denoising_subsurface_indirect = BoolProperty( + name="Subsurface Indirect", + description="Denoise the indirect subsurface lighting", + default=True, + ) + cls.denoising_strength = FloatProperty( + name="Denoising Strength", + description="Controls neighbor pixel weighting for the denoising filter (lower values preserve more detail, but aren't as smooth)", + min=0.0, max=1.0, + default=0.5, + ) + cls.denoising_feature_strength = 
FloatProperty( + name="Denoising Feature Strength", + description="Controls removal of noisy image feature passes (lower values preserve more detail, but aren't as smooth)", + min=0.0, max=1.0, + default=0.5, + ) + cls.denoising_radius = IntProperty( + name="Denoising Radius", + description="Size of the image area that's used to denoise a pixel (higher values are smoother, but might lose detail and are slower)", + min=1, max=50, + default=8, + ) + cls.denoising_relative_pca = BoolProperty( + name="Relative filter", + description="When removing pixels that don't carry information, use a relative threshold instead of an absolute one (can help to reduce artifacts, but might cause detail loss around edges)", + default=False, + ) + cls.denoising_store_passes = BoolProperty( + name="Store denoising passes", + description="Store the denoising feature passes and the noisy image", + default=False, + ) + @classmethod def unregister(cls): del bpy.types.SceneRenderLayer.cycles diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index ea5bc8979c0..4ed3ccd9a2c 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -530,6 +530,12 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel): col.prop(rl, "use_pass_emit", text="Emission") col.prop(rl, "use_pass_environment") + if context.scene.cycles.feature_set == 'EXPERIMENTAL': + col.separator() + sub = col.column() + sub.active = crl.use_denoising + sub.prop(crl, "denoising_store_passes", text="Denoising") + if _cycles.with_cycles_debug: col = layout.column() col.prop(crl, "pass_debug_bvh_traversed_nodes") @@ -581,6 +587,71 @@ class CyclesRender_PT_views(CyclesButtonsPanel, Panel): row.prop(rv, "camera_suffix", text="") + +class CyclesRender_PT_denoising(CyclesButtonsPanel, Panel): + bl_label = "Denoising" + bl_context = "render_layer" + bl_options = {'DEFAULT_CLOSED'} + + def draw_header(self, context): + rd = context.scene.render + rl = rd.layers.active + crl = 
rl.cycles + cscene = context.scene.cycles + layout = self.layout + + layout.active = not cscene.use_progressive_refine + layout.prop(crl, "use_denoising", text="") + + def draw(self, context): + layout = self.layout + + scene = context.scene + cscene = scene.cycles + rd = scene.render + rl = rd.layers.active + crl = rl.cycles + + layout.active = crl.use_denoising and not cscene.use_progressive_refine + + split = layout.split() + + col = split.column() + sub = col.column(align=True) + sub.prop(crl, "denoising_radius", text="Radius") + sub.prop(crl, "denoising_strength", slider=True, text="Strength") + + col = split.column() + sub = col.column(align=True) + sub.prop(crl, "denoising_feature_strength", slider=True, text="Feature Strength") + sub.prop(crl, "denoising_relative_pca") + + layout.separator() + + row = layout.row() + row.label(text="Diffuse:") + sub = row.row(align=True) + sub.prop(crl, "denoising_diffuse_direct", text="Direct", toggle=True) + sub.prop(crl, "denoising_diffuse_indirect", text="Indirect", toggle=True) + + row = layout.row() + row.label(text="Glossy:") + sub = row.row(align=True) + sub.prop(crl, "denoising_glossy_direct", text="Direct", toggle=True) + sub.prop(crl, "denoising_glossy_indirect", text="Indirect", toggle=True) + + row = layout.row() + row.label(text="Transmission:") + sub = row.row(align=True) + sub.prop(crl, "denoising_transmission_direct", text="Direct", toggle=True) + sub.prop(crl, "denoising_transmission_indirect", text="Indirect", toggle=True) + + row = layout.row() + row.label(text="Subsurface:") + sub = row.row(align=True) + sub.prop(crl, "denoising_subsurface_direct", text="Direct", toggle=True) + sub.prop(crl, "denoising_subsurface_indirect", text="Indirect", toggle=True) + + class Cycles_PT_post_processing(CyclesButtonsPanel, Panel): bl_label = "Post Processing" bl_options = {'DEFAULT_CLOSED'} @@ -1734,6 +1805,7 @@ classes = ( CyclesRender_PT_layer_options, CyclesRender_PT_layer_passes, CyclesRender_PT_views, + 
CyclesRender_PT_denoising, Cycles_PT_post_processing, CyclesCamera_PT_dof, Cycles_PT_context_material, diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index dfd6dc8db34..ec71800a9ea 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -299,12 +299,13 @@ static BL::RenderResult begin_render_result(BL::RenderEngine& b_engine, static void end_render_result(BL::RenderEngine& b_engine, BL::RenderResult& b_rr, bool cancel, + bool highlight, bool do_merge_results) { - b_engine.end_result(b_rr, (int)cancel, (int)do_merge_results); + b_engine.end_result(b_rr, (int)cancel, (int) highlight, (int)do_merge_results); } -void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only) +void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight) { BufferParams& params = rtile.buffers->params; int x = params.full_x - session->tile_manager.params.full_x; @@ -340,37 +341,37 @@ void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_upda update_render_result(b_rr, b_rlay, rtile); } - end_render_result(b_engine, b_rr, true, true); + end_render_result(b_engine, b_rr, true, highlight, true); } else { /* write result */ write_render_result(b_rr, b_rlay, rtile); - end_render_result(b_engine, b_rr, false, true); + end_render_result(b_engine, b_rr, false, false, true); } } void BlenderSession::write_render_tile(RenderTile& rtile) { - do_write_update_render_tile(rtile, false); + do_write_update_render_tile(rtile, false, false); } -void BlenderSession::update_render_tile(RenderTile& rtile) +void BlenderSession::update_render_tile(RenderTile& rtile, bool highlight) { /* use final write for preview renders, otherwise render result wouldn't be * be updated in blender side * would need to be investigated a bit further, but for now shall be fine */ if(!b_engine.is_preview()) - 
do_write_update_render_tile(rtile, true); + do_write_update_render_tile(rtile, true, highlight); else - do_write_update_render_tile(rtile, false); + do_write_update_render_tile(rtile, false, false); } void BlenderSession::render() { /* set callback to write out render results */ session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1); - session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1); + session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1, _2); /* get buffer parameters */ SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background); @@ -391,7 +392,7 @@ void BlenderSession::render() /* layer will be missing if it was disabled in the UI */ if(b_single_rlay == b_rr.layers.end()) { - end_render_result(b_engine, b_rr, true, false); + end_render_result(b_engine, b_rr, true, true, false); continue; } @@ -407,6 +408,29 @@ void BlenderSession::render() } buffer_params.passes = passes; + + PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles"); + bool use_denoising = !session_params.progressive_refine && get_boolean(crl, "use_denoising"); + buffer_params.denoising_data_pass = use_denoising; + session->tile_manager.schedule_denoising = use_denoising; + session->params.use_denoising = use_denoising; + scene->film->denoising_data_pass = buffer_params.denoising_data_pass; + scene->film->denoising_flags = 0; + if(!get_boolean(crl, "denoising_diffuse_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_DIR; + if(!get_boolean(crl, "denoising_diffuse_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_IND; + if(!get_boolean(crl, "denoising_glossy_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_DIR; + if(!get_boolean(crl, "denoising_glossy_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_IND; + if(!get_boolean(crl, "denoising_transmission_direct")) 
scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_DIR; + if(!get_boolean(crl, "denoising_transmission_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_IND; + if(!get_boolean(crl, "denoising_subsurface_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_DIR; + if(!get_boolean(crl, "denoising_subsurface_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_IND; + scene->film->denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES); + buffer_params.denoising_clean_pass = scene->film->denoising_clean_pass; + session->params.denoising_radius = get_int(crl, "denoising_radius"); + session->params.denoising_strength = get_float(crl, "denoising_strength"); + session->params.denoising_feature_strength = get_float(crl, "denoising_feature_strength"); + session->params.denoising_relative_pca = get_boolean(crl, "denoising_relative_pca"); + scene->film->pass_alpha_threshold = b_layer_iter->pass_alpha_threshold(); scene->film->tag_passes_update(scene, passes); scene->film->tag_update(scene); @@ -460,7 +484,7 @@ void BlenderSession::render() } /* free result without merging */ - end_render_result(b_engine, b_rr, true, false); + end_render_result(b_engine, b_rr, true, true, false); if(session->progress.get_cancel()) break; @@ -666,6 +690,12 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult& b_rr, /* copy pixels */ read = buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0]); } + else { + int denoising_offset = BlenderSync::get_denoising_pass(b_pass); + if(denoising_offset >= 0) { + read = buffers->get_denoising_pass_rect(denoising_offset, exposure, sample, components, &pixels[0]); + } + } if(!read) { memset(&pixels[0], 0, pixels.size()*sizeof(float)); diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h index 22b21a18f2e..536808c5b18 100644 --- a/intern/cycles/blender/blender_session.h +++ 
b/intern/cycles/blender/blender_session.h @@ -79,7 +79,7 @@ public: void update_render_result(BL::RenderResult& b_rr, BL::RenderLayer& b_rlay, RenderTile& rtile); - void update_render_tile(RenderTile& rtile); + void update_render_tile(RenderTile& rtile, bool highlight); /* interactive updates */ void synchronize(); @@ -147,7 +147,7 @@ protected: BL::RenderLayer& b_rlay, RenderTile& rtile, bool do_update_only); - void do_write_update_render_tile(RenderTile& rtile, bool do_update_only); + void do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight); int builtin_image_frame(const string &builtin_name); void builtin_image_info(const string &builtin_name, diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index f362eade954..08ba535f282 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -525,6 +525,30 @@ PassType BlenderSync::get_pass_type(BL::RenderPass& b_pass) return PASS_NONE; } +int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass) +{ + string name = b_pass.name(); + if(name.substr(0, 10) != "Denoising ") { + return -1; + } + name = name.substr(10); + +#define MAP_PASS(passname, offset) if(name == passname) return offset; + MAP_PASS("Normal", DENOISING_PASS_NORMAL); + MAP_PASS("Normal Variance", DENOISING_PASS_NORMAL_VAR); + MAP_PASS("Albedo", DENOISING_PASS_ALBEDO); + MAP_PASS("Albedo Variance", DENOISING_PASS_ALBEDO_VAR); + MAP_PASS("Depth", DENOISING_PASS_DEPTH); + MAP_PASS("Depth Variance", DENOISING_PASS_DEPTH_VAR); + MAP_PASS("Shadow A", DENOISING_PASS_SHADOW_A); + MAP_PASS("Shadow B", DENOISING_PASS_SHADOW_B); + MAP_PASS("Image", DENOISING_PASS_COLOR); + MAP_PASS("Image Variance", DENOISING_PASS_COLOR_VAR); +#undef MAP_PASS + + return -1; +} + array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay, BL::SceneRenderLayer& b_srlay) { @@ -544,8 +568,20 @@ array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay, 
Pass::add(pass_type, passes); } -#ifdef __KERNEL_DEBUG__ PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles"); + if(get_boolean(crp, "denoising_store_passes")) { + b_engine.add_pass("Denoising Normal", 3, "XYZ", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Albedo", 3, "RGB", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Depth", 1, "Z", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Depth Variance", 1, "Z", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Shadow A", 3, "XYV", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Shadow B", 3, "XYV", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Image", 3, "RGB", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Image Variance", 3, "RGB", b_srlay.name().c_str()); + } +#ifdef __KERNEL_DEBUG__ if(get_boolean(crp, "pass_debug_bvh_traversed_nodes")) { b_engine.add_pass("Debug BVH Traversed Nodes", 1, "X", b_srlay.name().c_str()); Pass::add(PASS_BVH_TRAVERSED_NODES, passes); diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h index aee39a5652a..0950285d976 100644 --- a/intern/cycles/blender/blender_sync.h +++ b/intern/cycles/blender/blender_sync.h @@ -96,6 +96,7 @@ public: int width, int height); static PassType get_pass_type(BL::RenderPass& b_pass); + static int get_denoising_pass(BL::RenderPass& b_pass); private: /* sync */ diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 6ef2aa1caad..74ec57ddf74 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -25,6 +25,7 @@ set(SRC device.cpp device_cpu.cpp device_cuda.cpp + device_denoising.cpp device_multi.cpp device_opencl.cpp device_split_kernel.cpp @@ -48,6 +49,7 @@ endif() set(SRC_HEADERS device.h + device_denoising.h 
device_memory.h device_intern.h device_network.h diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index c024021b4b3..0603ecb3afb 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -402,4 +402,16 @@ void Device::free_memory() devices.free_memory(); } + +device_sub_ptr::device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type) + : device(device) +{ + ptr = device->mem_alloc_sub_ptr(mem, offset, size, type); +} + +device_sub_ptr::~device_sub_ptr() +{ + device->mem_free_sub_ptr(ptr); +} + CCL_NAMESPACE_END diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 6051dd8b3eb..527940e8f50 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -228,6 +228,7 @@ struct DeviceDrawParams { }; class Device { + friend class device_sub_ptr; protected: Device(DeviceInfo& info_, Stats &stats_, bool background) : background(background), vertex_buffer(0), info(info_), stats(stats_) {} @@ -237,6 +238,14 @@ protected: /* used for real time display */ unsigned int vertex_buffer; + virtual device_ptr mem_alloc_sub_ptr(device_memory& /*mem*/, int /*offset*/, int /*size*/, MemoryType /*type*/) + { + /* Only required for devices that implement denoising. 
*/ + assert(false); + return (device_ptr) 0; + } + virtual void mem_free_sub_ptr(device_ptr /*ptr*/) {}; + public: virtual ~Device(); @@ -265,6 +274,8 @@ public: virtual void mem_zero(device_memory& mem) = 0; virtual void mem_free(device_memory& mem) = 0; + virtual int mem_address_alignment() { return 16; } + /* constant memory */ virtual void const_copy_to(const char *name, void *host, size_t size) = 0; @@ -312,6 +323,8 @@ public: /* multi device */ virtual void map_tile(Device * /*sub_device*/, RenderTile& /*tile*/) {} virtual int device_number(Device * /*sub_device*/) { return 0; } + virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {} + virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {} /* static */ static Device *create(DeviceInfo& info, Stats &stats, bool background = true); diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 84cce605182..1ecce8bd565 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -25,6 +25,7 @@ #endif #include "device/device.h" +#include "device/device_denoising.h" #include "device/device_intern.h" #include "device/device_split_kernel.h" @@ -34,6 +35,8 @@ #include "kernel/split/kernel_split_data.h" #include "kernel/kernel_globals.h" +#include "kernel/filter/filter.h" + #include "kernel/osl/osl_shader.h" #include "kernel/osl/osl_globals.h" @@ -53,91 +56,107 @@ CCL_NAMESPACE_BEGIN class CPUDevice; -class CPUSplitKernel : public DeviceSplitKernel { - CPUDevice *device; -public: - explicit CPUSplitKernel(CPUDevice *device); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, - RenderTile& rtile, - int num_global_elements, - device_memory& kernel_globals, - device_memory& kernel_data_, - device_memory& split_data, - device_memory& ray_state, - device_memory& queue_index, - device_memory& use_queues_flag, - device_memory& work_pool_wgs); +/* Has to be outside of the class 
to be shared across template instantiations. */ +static const char *logged_architecture = ""; - virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&); - virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); - virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); -}; - -class CPUDevice : public Device -{ - static unordered_map<string, void*> kernel_functions; - - static void register_kernel_function(const char* name, void* func) +template<typename F> +class KernelFunctions { +public: + KernelFunctions() { - kernel_functions[name] = func; + kernel = (F)NULL; } - static const char* get_arch_name() + KernelFunctions(F kernel_default, + F kernel_sse2, + F kernel_sse3, + F kernel_sse41, + F kernel_avx, + F kernel_avx2) { + const char *architecture_name = "default"; + kernel = kernel_default; + + /* Silence potential warnings about unused variables + * when compiling without some architectures. 
*/ + (void)kernel_sse2; + (void)kernel_sse3; + (void)kernel_sse41; + (void)kernel_avx; + (void)kernel_avx2; #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 if(system_cpu_support_avx2()) { - return "cpu_avx2"; + architecture_name = "AVX2"; + kernel = kernel_avx2; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX if(system_cpu_support_avx()) { - return "cpu_avx"; + architecture_name = "AVX"; + kernel = kernel_avx; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { - return "cpu_sse41"; + architecture_name = "SSE4.1"; + kernel = kernel_sse41; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 if(system_cpu_support_sse3()) { - return "cpu_sse3"; + architecture_name = "SSE3"; + kernel = kernel_sse3; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 if(system_cpu_support_sse2()) { - return "cpu_sse2"; + architecture_name = "SSE2"; + kernel = kernel_sse2; } - else #endif - { - return "cpu"; + + if(strstr(architecture_name, logged_architecture) != 0) { + VLOG(1) << "Will be using " << architecture_name << " kernels."; + logged_architecture = architecture_name; } } - template<typename F> - static F get_kernel_function(string name) - { - name = string("kernel_") + get_arch_name() + "_" + name; - - unordered_map<string, void*>::iterator it = kernel_functions.find(name); + inline F operator()() const { + assert(kernel); + return kernel; + } +protected: + F kernel; +}; - if(it == kernel_functions.end()) { - assert(!"kernel function not found"); - return NULL; - } +class CPUSplitKernel : public DeviceSplitKernel { + CPUDevice *device; +public: + explicit CPUSplitKernel(CPUDevice *device); - return (F)it->second; - } + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data_, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& 
work_pool_wgs); - friend class CPUSplitKernel; + virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&); + virtual int2 split_kernel_local_size(); + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); + virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); +}; +class CPUDevice : public Device +{ public: TaskPool task_pool; KernelGlobals kernel_globals; @@ -149,77 +168,89 @@ public: bool use_split_kernel; DeviceRequestedFeatures requested_features; - + + KernelFunctions<void(*)(KernelGlobals *, float *, unsigned int *, int, int, int, int, int)> path_trace_kernel; + KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel; + KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel; + KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel; + + KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel; + KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int, bool)> filter_get_feature_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel; + + KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel; + KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; + KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel; + KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; + + KernelFunctions<void(*)(float*, int, int, int, float*, 
int*, int*, int, int, float)> filter_construct_transform_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel; + KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; + + KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*, + ccl_global uint*, int, int, int, int, int, int, int, int, ccl_global int*, int, + ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)> data_init_kernel; + unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels; + +#define KERNEL_FUNCTIONS(name) \ + KERNEL_NAME_EVAL(cpu, name), \ + KERNEL_NAME_EVAL(cpu_sse2, name), \ + KERNEL_NAME_EVAL(cpu_sse3, name), \ + KERNEL_NAME_EVAL(cpu_sse41, name), \ + KERNEL_NAME_EVAL(cpu_avx, name), \ + KERNEL_NAME_EVAL(cpu_avx2, name) + CPUDevice(DeviceInfo& info, Stats &stats, bool background) - : Device(info, stats, background) + : Device(info, stats, background), +#define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name)) + REGISTER_KERNEL(path_trace), + REGISTER_KERNEL(convert_to_half_float), + REGISTER_KERNEL(convert_to_byte), + REGISTER_KERNEL(shader), + REGISTER_KERNEL(filter_divide_shadow), + REGISTER_KERNEL(filter_get_feature), + REGISTER_KERNEL(filter_combine_halves), + REGISTER_KERNEL(filter_nlm_calc_difference), + REGISTER_KERNEL(filter_nlm_blur), + REGISTER_KERNEL(filter_nlm_calc_weight), + REGISTER_KERNEL(filter_nlm_update_output), + REGISTER_KERNEL(filter_nlm_normalize), + REGISTER_KERNEL(filter_construct_transform), + REGISTER_KERNEL(filter_nlm_construct_gramian), + REGISTER_KERNEL(filter_finalize), + REGISTER_KERNEL(data_init) +#undef REGISTER_KERNEL { #ifdef WITH_OSL kernel_globals.osl = &osl_globals; #endif - - /* do now to avoid thread issues */ - system_cpu_support_sse2(); - 
system_cpu_support_sse3(); - system_cpu_support_sse41(); - system_cpu_support_avx(); - system_cpu_support_avx2(); - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - VLOG(1) << "Will be using AVX2 kernels."; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - VLOG(1) << "Will be using AVX kernels."; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - VLOG(1) << "Will be using SSE4.1 kernels."; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - VLOG(1) << "Will be using SSE3kernels."; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - VLOG(1) << "Will be using SSE2 kernels."; - } - else -#endif - { - VLOG(1) << "Will be using regular kernels."; - } - use_split_kernel = DebugFlags().cpu.split_kernel; if(use_split_kernel) { VLOG(1) << "Will be using split kernel."; } - kernel_cpu_register_functions(register_kernel_function); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - kernel_cpu_sse2_register_functions(register_kernel_function); -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - kernel_cpu_sse3_register_functions(register_kernel_function); -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - kernel_cpu_sse41_register_functions(register_kernel_function); -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - kernel_cpu_avx_register_functions(register_kernel_function); -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - kernel_cpu_avx2_register_functions(register_kernel_function); -#endif +#define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name)) + REGISTER_SPLIT_KERNEL(path_init); + REGISTER_SPLIT_KERNEL(scene_intersect); + REGISTER_SPLIT_KERNEL(lamp_emission); + REGISTER_SPLIT_KERNEL(do_volume); + REGISTER_SPLIT_KERNEL(queue_enqueue); + REGISTER_SPLIT_KERNEL(indirect_background); + 
REGISTER_SPLIT_KERNEL(shader_setup); + REGISTER_SPLIT_KERNEL(shader_sort); + REGISTER_SPLIT_KERNEL(shader_eval); + REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao); + REGISTER_SPLIT_KERNEL(subsurface_scatter); + REGISTER_SPLIT_KERNEL(direct_lighting); + REGISTER_SPLIT_KERNEL(shadow_blocked_ao); + REGISTER_SPLIT_KERNEL(shadow_blocked_dl); + REGISTER_SPLIT_KERNEL(next_iteration_setup); + REGISTER_SPLIT_KERNEL(indirect_subsurface); + REGISTER_SPLIT_KERNEL(buffer_update); +#undef REGISTER_SPLIT_KERNEL +#undef KERNEL_FUNCTIONS } ~CPUDevice() @@ -273,13 +304,17 @@ public: if(!mem.data_pointer) { free((void*)mem.device_pointer); } - mem.device_pointer = 0; stats.mem_free(mem.device_size); mem.device_size = 0; } } + virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/) + { + return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset)); + } + void const_copy_to(const char *name, void *host, size_t size) { kernel_const_copy(&kernel_globals, name, host, size); @@ -326,13 +361,8 @@ public: void thread_run(DeviceTask *task) { - if(task->type == DeviceTask::PATH_TRACE) { - if(!use_split_kernel) { - thread_path_trace(*task); - } - else { - thread_path_trace_split(*task); - } + if(task->type == DeviceTask::RENDER) { + thread_render(*task); } else if(task->type == DeviceTask::FILM_CONVERT) thread_film_convert(*task); @@ -349,116 +379,319 @@ public: } }; - void thread_path_trace(DeviceTask& task) + bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task) { - if(task_pool.canceled()) { - if(task.need_finish_queue == false) - return; + mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY); + + TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer; + for(int i = 0; i < 9; i++) { + tiles->buffers[i] = buffers[i]; } - KernelGlobals kg = thread_kernel_globals_init(); - RenderTile tile; + return true; + } - void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, 
int, int, int, int, int); + bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, + DenoisingTask *task) + { + int4 rect = task->rect; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + int w = align_up(rect.z-rect.x, 4); + int h = rect.w-rect.y; + + float *blurDifference = (float*) task->nlm_state.temporary_1_ptr; + float *difference = (float*) task->nlm_state.temporary_2_ptr; + float *weightAccum = (float*) task->nlm_state.temporary_3_ptr; + + memset(weightAccum, 0, sizeof(float)*w*h); + memset((float*) out_ptr, 0, sizeof(float)*w*h); + + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)}; + filter_nlm_calc_difference_kernel()(dx, dy, + (float*) guide_ptr, + (float*) variance_ptr, + difference, + local_rect, + w, 0, + a, k_2); + + filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); + filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); + filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); + + filter_nlm_update_output_kernel()(dx, dy, + blurDifference, + (float*) image_ptr, + (float*) out_ptr, + weightAccum, + local_rect, + w, f); + } + + int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y}; + filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - path_trace_kernel = kernel_cpu_avx2_path_trace; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - path_trace_kernel = kernel_cpu_avx_path_trace; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - path_trace_kernel = kernel_cpu_sse41_path_trace; - } - else -#endif 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - path_trace_kernel = kernel_cpu_sse3_path_trace; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - path_trace_kernel = kernel_cpu_sse2_path_trace; - } - else -#endif - { - path_trace_kernel = kernel_cpu_path_trace; + return true; + } + + bool denoising_construct_transform(DenoisingTask *task) + { + for(int y = 0; y < task->filter_area.w; y++) { + for(int x = 0; x < task->filter_area.z; x++) { + filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer, + x + task->filter_area.x, + y + task->filter_area.y, + y*task->filter_area.z + x, + (float*) task->storage.transform.device_pointer, + (int*) task->storage.rank.device_pointer, + &task->rect.x, + task->buffer.pass_stride, + task->radius, + task->pca_threshold); + } } + return true; + } - while(task.acquire_tile(this, tile)) { - float *render_buffer = (float*)tile.buffer; - uint *rng_state = (uint*)tile.rng_state; - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; - - for(int sample = start_sample; sample < end_sample; sample++) { - if(task.get_cancel() || task_pool.canceled()) { - if(task.need_finish_queue == false) - break; - } + bool denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr guide_ptr, + device_ptr guide_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); + + float *difference = (float*) task->reconstruction_state.temporary_1_ptr; + float *blurDifference = (float*) task->reconstruction_state.temporary_2_ptr; + + int r = task->radius; + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + 
filter_nlm_calc_difference_kernel()(dx, dy, + (float*) guide_ptr, + (float*) guide_variance_ptr, + difference, + local_rect, + task->buffer.w, + task->buffer.pass_stride, + 1.0f, + task->nlm_k_2); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4); + filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4); + filter_nlm_construct_gramian_kernel()(dx, dy, + blurDifference, + (float*) task->buffer.mem.device_pointer, + (float*) color_ptr, + (float*) color_variance_ptr, + (float*) task->storage.transform.device_pointer, + (int*) task->storage.rank.device_pointer, + (float*) task->storage.XtWX.device_pointer, + (float3*) task->storage.XtWY.device_pointer, + local_rect, + &task->reconstruction_state.filter_rect.x, + task->buffer.w, + task->buffer.h, + 4, + task->buffer.pass_stride); + } + for(int y = 0; y < task->filter_area.w; y++) { + for(int x = 0; x < task->filter_area.z; x++) { + filter_finalize_kernel()(x, + y, + y*task->filter_area.z + x, + task->buffer.w, + task->buffer.h, + (float*) output_ptr, + (int*) task->storage.rank.device_pointer, + (float*) task->storage.XtWX.device_pointer, + (float3*) task->storage.XtWY.device_pointer, + &task->reconstruction_state.buffer_params.x, + task->render_buffer.samples); + } + } + return true; + } - for(int y = tile.y; y < tile.y + tile.h; y++) { - for(int x = tile.x; x < tile.x + tile.w; x++) { - path_trace_kernel(&kg, render_buffer, rng_state, - sample, x, y, tile.offset, tile.stride); - } - } + bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, + device_ptr mean_ptr, device_ptr variance_ptr, + int r, int4 rect, DenoisingTask *task) + { + (void) task; + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + filter_combine_halves_kernel()(x, y, + (float*) mean_ptr, + (float*) variance_ptr, + (float*) a_ptr, + (float*) b_ptr, + 
&rect.x, + r); + } + } + return true; + } - tile.sample = sample + 1; + bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr, + device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, DenoisingTask *task) + { + for(int y = task->rect.y; y < task->rect.w; y++) { + for(int x = task->rect.x; x < task->rect.z; x++) { + filter_divide_shadow_kernel()(task->render_buffer.samples, + task->tiles, + x, y, + (float*) a_ptr, + (float*) b_ptr, + (float*) sample_variance_ptr, + (float*) sv_variance_ptr, + (float*) buffer_variance_ptr, + &task->rect.x, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + use_split_kernel); + } + } + return true; + } - task.update_progress(&tile, tile.w*tile.h); + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) + { + for(int y = task->rect.y; y < task->rect.w; y++) { + for(int x = task->rect.x; x < task->rect.z; x++) { + filter_get_feature_kernel()(task->render_buffer.samples, + task->tiles, + mean_offset, + variance_offset, + x, y, + (float*) mean_ptr, + (float*) variance_ptr, + &task->rect.x, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + use_split_kernel); } + } + return true; + } - task.release_tile(tile); + void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) + { + float *render_buffer = (float*)tile.buffer; + uint *rng_state = (uint*)tile.rng_state; + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; - if(task_pool.canceled()) { + for(int sample = start_sample; sample < end_sample; sample++) { + if(task.get_cancel() || task_pool.canceled()) { if(task.need_finish_queue == false) break; } + + for(int y = tile.y; y < tile.y + tile.h; y++) { + for(int x = tile.x; x < tile.x + tile.w; x++) { + path_trace_kernel()(kg, render_buffer, rng_state, + sample, x, y, tile.offset, 
tile.stride); + } + } + + tile.sample = sample + 1; + + task.update_progress(&tile, tile.w*tile.h); } + } + + void denoise(DeviceTask &task, RenderTile &tile) + { + tile.sample = tile.start_sample + tile.num_samples; + + DenoisingTask denoising(this); - thread_kernel_globals_free(&kg); + denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.set_tiles = function_bind(&CPUDevice::denoising_set_tiles, this, _1, &denoising); + + denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h); + denoising.render_buffer.samples = tile.sample; + + RenderTile rtiles[9]; + rtiles[4] = tile; + task.map_neighbor_tiles(rtiles, this); + denoising.tiles_from_rendertiles(rtiles); + + denoising.init_from_devicetask(task); + + denoising.run_denoising(); + + task.unmap_neighbor_tiles(rtiles, this); + + task.update_progress(&tile, tile.w*tile.h); } - void thread_path_trace_split(DeviceTask& task) + void thread_render(DeviceTask& task) { if(task_pool.canceled()) { if(task.need_finish_queue == false) return; } - RenderTile tile; - - CPUSplitKernel split_kernel(this); - /* allocate buffer for kernel globals */ - device_memory kgbuffer; - kgbuffer.resize(sizeof(KernelGlobals)); + device_only_memory<KernelGlobals> kgbuffer; + kgbuffer.resize(1); mem_alloc("kernel_globals", 
kgbuffer, MEM_READ_WRITE); KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init()); - requested_features.max_closure = MAX_CLOSURE; - if(!split_kernel.load_kernels(requested_features)) { - thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); - mem_free(kgbuffer); + CPUSplitKernel *split_kernel = NULL; + if(use_split_kernel) { + split_kernel = new CPUSplitKernel(this); + requested_features.max_closure = MAX_CLOSURE; + if(!split_kernel->load_kernels(requested_features)) { + thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); + mem_free(kgbuffer); - return; + delete split_kernel; + return; + } } + RenderTile tile; while(task.acquire_tile(this, tile)) { - device_memory data; - split_kernel.path_trace(&task, tile, kgbuffer, data); + if(tile.task == RenderTile::PATH_TRACE) { + if(use_split_kernel) { + device_memory data; + split_kernel->path_trace(&task, tile, kgbuffer, data); + } + else { + path_trace(task, tile, kg); + } + } + else if(tile.task == RenderTile::DENOISE) { + denoise(task, tile); + } task.release_tile(tile); @@ -470,6 +703,7 @@ public: thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); mem_free(kgbuffer); + delete split_kernel; } void thread_film_convert(DeviceTask& task) @@ -477,86 +711,16 @@ public: float sample_scale = 1.0f/(task.sample + 1); if(task.rgba_half) { - void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float; - } - else -#endif 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float; - } - else -#endif - { - convert_to_half_float_kernel = kernel_cpu_convert_to_half_float; - } - for(int y = task.y; y < task.y + task.h; y++) for(int x = task.x; x < task.x + task.w; x++) - convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); + convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); } else { - void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte; - } - else -#endif - { - convert_to_byte_kernel = kernel_cpu_convert_to_byte; - } - for(int y = task.y; y < task.y + task.h; y++) for(int x = task.x; x < task.x + task.w; x++) - convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); + 
convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); } } @@ -568,53 +732,17 @@ public: #ifdef WITH_OSL OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); #endif - void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int); - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - shader_kernel = kernel_cpu_avx2_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - shader_kernel = kernel_cpu_avx_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - shader_kernel = kernel_cpu_sse41_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - shader_kernel = kernel_cpu_sse3_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - shader_kernel = kernel_cpu_sse2_shader; - } - else -#endif - { - shader_kernel = kernel_cpu_shader; - } - for(int sample = 0; sample < task.num_samples; sample++) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) - shader_kernel(&kg, - (uint4*)task.shader_input, - (float4*)task.shader_output, - (float*)task.shader_output_luma, - task.shader_eval_type, - task.shader_filter, - x, - task.offset, - sample); + shader_kernel()(&kg, + (uint4*)task.shader_input, + (float4*)task.shader_output, + (float*)task.shader_output_luma, + task.shader_eval_type, + task.shader_filter, + x, + task.offset, + sample); if(task.get_cancel() || task_pool.canceled()) break; @@ -751,58 +879,6 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, device_memory& use_queues_flags, device_memory& work_pool_wgs) { - typedef void(*data_init_t)(KernelGlobals *kg, - ccl_constant KernelData *data, - ccl_global void *split_data_buffer, - int num_elements, - ccl_global char *ray_state, - ccl_global 
uint *rng_state, - int start_sample, - int end_sample, - int sx, int sy, int sw, int sh, int offset, int stride, - ccl_global int *Queue_index, - int queuesize, - ccl_global char *use_queues_flag, - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, - ccl_global float *buffer); - - data_init_t data_init; - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - data_init = kernel_cpu_avx2_data_init; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - data_init = kernel_cpu_avx_data_init; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - data_init = kernel_cpu_sse41_data_init; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - data_init = kernel_cpu_sse3_data_init; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - data_init = kernel_cpu_sse2_data_init; - } - else -#endif - { - data_init = kernel_cpu_data_init; - } - KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); @@ -810,26 +886,26 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, for(int x = 0; x < dim.global_size[0]; x++) { kg->global_id = make_int2(x, y); - data_init((KernelGlobals*)kernel_globals.device_pointer, - (KernelData*)data.device_pointer, - (void*)split_data.device_pointer, - num_global_elements, - (char*)ray_state.device_pointer, - (uint*)rtile.rng_state, - rtile.start_sample, - rtile.start_sample + rtile.num_samples, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - rtile.offset, - rtile.stride, - (int*)queue_index.device_pointer, - dim.global_size[0] * dim.global_size[1], - (char*)use_queues_flags.device_pointer, - (uint*)work_pool_wgs.device_pointer, - rtile.num_samples, - (float*)rtile.buffer); + 
device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer, + (KernelData*)data.device_pointer, + (void*)split_data.device_pointer, + num_global_elements, + (char*)ray_state.device_pointer, + (uint*)rtile.rng_state, + rtile.start_sample, + rtile.start_sample + rtile.num_samples, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + rtile.offset, + rtile.stride, + (int*)queue_index.device_pointer, + dim.global_size[0] * dim.global_size[1], + (char*)use_queues_flags.device_pointer, + (uint*)work_pool_wgs.device_pointer, + rtile.num_samples, + (float*)rtile.buffer); } } @@ -840,7 +916,7 @@ SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_nam { CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device); - kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name); + kernel->func = device->split_kernels[kernel_name](); if(!kernel->func) { delete kernel; return NULL; @@ -864,8 +940,6 @@ uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device return split_data_buffer_size(kg, num_threads); } -unordered_map<string, void*> CPUDevice::kernel_functions; - Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background) { return new CPUDevice(info, stats, background); diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index a971170318e..968ee5bc487 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -21,11 +21,14 @@ #include <string.h> #include "device/device.h" +#include "device/device_denoising.h" #include "device/device_intern.h" #include "device/device_split_kernel.h" #include "render/buffers.h" +#include "kernel/filter/filter_defines.h" + #ifdef WITH_CUDA_DYNLOAD # include "cuew.h" #else @@ -113,7 +116,7 @@ public: DedicatedTaskPool task_pool; CUdevice cuDevice; CUcontext cuContext; - CUmodule cuModule; + CUmodule cuModule, cuFilterModule; map<device_ptr, bool> tex_interp_map; 
map<device_ptr, uint> tex_bindless_map; int cuDevId; @@ -170,7 +173,7 @@ public: CUresult result = stmt; \ \ if(result != CUDA_SUCCESS) { \ - string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ + string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \ if(error_msg == "") \ error_msg = message; \ fprintf(stderr, "%s\n", message.c_str()); \ @@ -301,7 +304,8 @@ public: * kernel sources md5 and only depends on compiler or compilation settings. */ string compile_kernel_get_common_cflags( - const DeviceRequestedFeatures& requested_features, bool split=false) + const DeviceRequestedFeatures& requested_features, + bool filter=false, bool split=false) { const int cuda_version = cuewCompilerVersion(); const int machine = system_cpu_bits(); @@ -316,7 +320,7 @@ public: machine, cuda_version, include_path.c_str()); - if(use_adaptive_compilation()) { + if(!filter && use_adaptive_compilation()) { cflags += " " + requested_features.get_build_options(); } const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); @@ -364,8 +368,22 @@ public: return true; } - string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false) + string compile_kernel(const DeviceRequestedFeatures& requested_features, + bool filter=false, bool split=false) { + const char *name, *source; + if(filter) { + name = "filter"; + source = "filter.cu"; + } + else if(split) { + name = "kernel_split"; + source = "kernel_split.cu"; + } + else { + name = "kernel"; + source = "kernel.cu"; + } /* Compute cubin name. */ int major, minor; cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); @@ -373,9 +391,8 @@ public: /* Attempt to use kernel provided with Blender. */ if(!use_adaptive_compilation()) { - const string cubin = path_get(string_printf(split ? 
"lib/kernel_split_sm_%d%d.cubin" - : "lib/kernel_sm_%d%d.cubin", - major, minor)); + const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", + name, major, minor)); VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; if(path_exists(cubin)) { VLOG(1) << "Using precompiled kernel."; @@ -384,7 +401,7 @@ public: } const string common_cflags = - compile_kernel_get_common_cflags(requested_features, split); + compile_kernel_get_common_cflags(requested_features, filter, split); /* Try to use locally compiled kernel. */ const string source_path = path_get("source"); @@ -395,9 +412,8 @@ public: */ const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags); - const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin" - : "cycles_kernel_sm%d%d_%s.cubin", - major, minor, + const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin", + name, major, minor, cubin_md5.c_str()); const string cubin = path_cache_get(path_join("kernels", cubin_file)); VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; @@ -432,7 +448,7 @@ public: const string kernel = path_join( path_join(source_path, "kernel"), path_join("kernels", - path_join("cuda", split ? 
"kernel_split.cu" : "kernel.cu"))); + path_join("cuda", source))); double starttime = time_dt(); printf("Compiling CUDA kernel ...\n"); @@ -480,11 +496,14 @@ public: return false; /* get kernel */ - string cubin = compile_kernel(requested_features, use_split_kernel()); - + string cubin = compile_kernel(requested_features, false, use_split_kernel()); if(cubin == "") return false; + string filter_cubin = compile_kernel(requested_features, true, false); + if(filter_cubin == "") + return false; + /* open module */ cuda_push_context(); @@ -499,6 +518,14 @@ public: if(cuda_error_(result, "cuModuleLoad")) cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str())); + if(path_read_text(filter_cubin, cubin_data)) + result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str()); + else + result = CUDA_ERROR_FILE_NOT_FOUND; + + if(cuda_error_(result, "cuModuleLoad")) + cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str())); + cuda_pop_context(); return (result == CUDA_SUCCESS); @@ -581,6 +608,11 @@ public: } } + virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/) + { + return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset)); + } + void const_copy_to(const char *name, void *host, size_t size) { CUdeviceptr mem; @@ -881,6 +913,368 @@ public: } } + bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task) + { + mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY); + + TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer; + for(int i = 0; i < 9; i++) { + tiles->buffers[i] = buffers[i]; + } + + mem_copy_to(task->tiles_mem); + + return !have_error(); + } + +#define CUDA_GET_BLOCKSIZE(func, w, h) \ + int threads_per_block; \ + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int threads = (int)sqrt((float)threads_per_block); \ + int xblocks = ((w) + threads - 
1)/threads; \ + int yblocks = ((h) + threads - 1)/threads; + +#define CUDA_LAUNCH_KERNEL(func, args) \ + cuda_assert(cuLaunchKernel(func, \ + xblocks, yblocks, 1, \ + threads, threads, 1, \ + 0, 0, args, 0)); + + bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + int4 rect = task->rect; + int w = rect.z-rect.x; + int h = rect.w-rect.y; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + CUdeviceptr difference = task->nlm_state.temporary_1_ptr; + CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr; + CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr; + + cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h)); + cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h)); + + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize; + cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); + cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); + + 
CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y); + + int dx, dy; + int4 local_rect; + int channel_offset = 0; + void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2}; + void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f}; + void *update_output_args[] = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f}; + + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + dy = i / (2*r+1) - r; + dx = i % (2*r+1) - r; + local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)); + + CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args); + } + + local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y); + void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w}; + CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_construct_transform(DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterConstructTransform; + cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); + cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); + CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, + task->storage.w, + task->storage.h); + + void *args[] = {&task->buffer.mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->filter_area, + &task->rect, + &task->radius, + &task->pca_threshold, + 
&task->buffer.pass_stride}; + CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr guide_ptr, + device_ptr guide_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); + + cuda_push_context(); + + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize; + cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); + cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); + cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); + + CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + CUdeviceptr difference = task->reconstruction_state.temporary_1_ptr; + CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr; + + int r = task->radius; + int f = 4; + float a = 1.0f; + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, 
-dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + + void *calc_difference_args[] = {&dx, &dy, + &guide_ptr, + &guide_variance_ptr, + &difference, + &local_rect, + &task->buffer.w, + &task->buffer.pass_stride, + &a, + &task->nlm_k_2}; + CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args); + + void *blur_args[] = {&difference, + &blurDifference, + &local_rect, + &task->buffer.w, + &f}; + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + + void *calc_weight_args[] = {&blurDifference, + &difference, + &local_rect, + &task->buffer.w, + &f}; + CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args); + + /* Reuse previous arguments. */ + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + + void *construct_gramian_args[] = {&dx, &dy, + &blurDifference, + &task->buffer.mem.device_pointer, + &color_ptr, + &color_variance_ptr, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &local_rect, + &task->reconstruction_state.filter_rect, + &task->buffer.w, + &task->buffer.h, + &f, + &task->buffer.pass_stride}; + CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args); + } + + void *finalize_args[] = {&task->buffer.w, + &task->buffer.h, + &output_ptr, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->filter_area, + &task->reconstruction_state.buffer_params.x, + &task->render_buffer.samples}; + CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, + device_ptr mean_ptr, device_ptr variance_ptr, + int r, int4 rect, DenoisingTask *task) + { + (void) task; + + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterCombineHalves; + 
cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves")); + cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterCombineHalves, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + void *args[] = {&mean_ptr, + &variance_ptr, + &a_ptr, + &b_ptr, + &rect, + &r}; + CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr, + device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, DenoisingTask *task) + { + (void) task; + + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterDivideShadow; + cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); + cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterDivideShadow, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + bool use_split_variance = use_split_kernel(); + void *args[] = {&task->render_buffer.samples, + &task->tiles_mem.device_pointer, + &a_ptr, + &b_ptr, + &sample_variance_ptr, + &sv_variance_ptr, + &buffer_variance_ptr, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.denoising_data_offset, + &use_split_variance}; + CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterGetFeature; + cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); + 
cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterGetFeature, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + bool use_split_variance = use_split_kernel(); + void *args[] = {&task->render_buffer.samples, + &task->tiles_mem.device_pointer, + &mean_offset, + &variance_offset, + &mean_ptr, + &variance_ptr, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.denoising_data_offset, + &use_split_variance}; + CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + void denoise(RenderTile &rtile, const DeviceTask &task) + { + DenoisingTask denoising(this); + + denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising); + + denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + + RenderTile rtiles[9]; + rtiles[4] = rtile; + task.map_neighbor_tiles(rtiles, this); + denoising.tiles_from_rendertiles(rtiles); + + denoising.init_from_devicetask(task); + + denoising.run_denoising(); + + task.unmap_neighbor_tiles(rtiles, this); + } + void 
path_trace(RenderTile& rtile, int sample, bool branched) { if(have_error()) @@ -1305,7 +1699,7 @@ public: void thread_run(DeviceTask *task) { - if(task->type == DeviceTask::PATH_TRACE) { + if(task->type == DeviceTask::RENDER) { RenderTile tile; bool branched = task->integrator_branched; @@ -1313,30 +1707,8 @@ public: /* Upload Bindless Mapping */ load_bindless_mapping(); - if(!use_split_kernel()) { - /* keep rendering tiles until done */ - while(task->acquire_tile(this, tile)) { - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; - - for(int sample = start_sample; sample < end_sample; sample++) { - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; - } - - path_trace(tile, sample, branched); - - tile.sample = sample + 1; - - task->update_progress(&tile, tile.w*tile.h); - } - - task->release_tile(tile); - } - } - else { - DeviceRequestedFeatures requested_features; + DeviceRequestedFeatures requested_features; + if(use_split_kernel()) { if(!use_adaptive_compilation()) { requested_features.max_closure = 64; } @@ -1345,18 +1717,47 @@ public: split_kernel = new CUDASplitKernel(this); split_kernel->load_kernels(requested_features); } + } + + /* keep rendering tiles until done */ + while(task->acquire_tile(this, tile)) { + if(tile.task == RenderTile::PATH_TRACE) { + if(use_split_kernel()) { + device_memory void_buffer; + split_kernel->path_trace(task, tile, void_buffer, void_buffer); + } + else { + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; + + for(int sample = start_sample; sample < end_sample; sample++) { + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } - while(task->acquire_tile(this, tile)) { - device_memory void_buffer; - split_kernel->path_trace(task, tile, void_buffer, void_buffer); + path_trace(tile, sample, branched); - task->release_tile(tile); + tile.sample = sample + 1; - if(task->get_cancel()) { - 
if(task->need_finish_queue == false) - break; + task->update_progress(&tile, tile.w*tile.h); + } } } + else if(tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + + denoise(tile, *task); + + task->update_progress(&tile, tile.w*tile.h); + } + + task->release_tile(tile); + + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } } } else if(task->type == DeviceTask::SHADER) { diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp new file mode 100644 index 00000000000..39c8cf30105 --- /dev/null +++ b/intern/cycles/device/device_denoising.cpp @@ -0,0 +1,218 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/device_denoising.h" + +#include "kernel/filter/filter_defines.h" + +CCL_NAMESPACE_BEGIN + +void DenoisingTask::init_from_devicetask(const DeviceTask &task) +{ + radius = task.denoising_radius; + nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising_strength)); + if(task.denoising_relative_pca) { + pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising_feature_strength)); + } + else { + pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength)); + } + + render_buffer.pass_stride = task.pass_stride; + render_buffer.denoising_data_offset = task.pass_denoising_data; + render_buffer.denoising_clean_offset = task.pass_denoising_clean; + + /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */ + rect = make_int4(max(tiles->x[0], filter_area.x - radius), + max(tiles->y[0], filter_area.y - radius), + min(tiles->x[3], filter_area.x + filter_area.z + radius), + min(tiles->y[3], filter_area.y + filter_area.w + radius)); +} + +void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles) +{ + tiles = (TilesInfo*) tiles_mem.resize(sizeof(TilesInfo)/sizeof(int)); + + device_ptr buffers[9]; + for(int i = 0; i < 9; i++) { + buffers[i] = rtiles[i].buffer; + tiles->offsets[i] = rtiles[i].offset; + tiles->strides[i] = rtiles[i].stride; + } + tiles->x[0] = rtiles[3].x; + tiles->x[1] = rtiles[4].x; + tiles->x[2] = rtiles[5].x; + tiles->x[3] = rtiles[5].x + rtiles[5].w; + tiles->y[0] = rtiles[1].y; + tiles->y[1] = rtiles[4].y; + tiles->y[2] = rtiles[7].y; + tiles->y[3] = rtiles[7].y + rtiles[7].h; + + render_buffer.offset = rtiles[4].offset; + render_buffer.stride = rtiles[4].stride; + render_buffer.ptr = rtiles[4].buffer; + + functions.set_tiles(buffers); +} + +bool DenoisingTask::run_denoising() +{ + /* Allocate denoising buffer. 
*/ + buffer.passes = 14; + buffer.w = align_up(rect.z - rect.x, 4); + buffer.h = rect.w - rect.y; + buffer.pass_stride = align_up(buffer.w * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float))); + buffer.mem.resize(buffer.pass_stride * buffer.passes); + device->mem_alloc("Denoising Pixel Buffer", buffer.mem, MEM_READ_WRITE); + + device_ptr null_ptr = (device_ptr) 0; + + /* Prefilter shadow feature. */ + { + device_sub_ptr unfiltered_a (device, buffer.mem, 0, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr unfiltered_b (device, buffer.mem, 1*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr sample_var (device, buffer.mem, 2*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr sample_var_var (device, buffer.mem, 3*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr buffer_var (device, buffer.mem, 5*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr filtered_var (device, buffer.mem, 6*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_1(device, buffer.mem, 7*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_2(device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_3(device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + + nlm_state.temporary_1_ptr = *nlm_temporary_1; + nlm_state.temporary_2_ptr = *nlm_temporary_2; + nlm_state.temporary_3_ptr = *nlm_temporary_3; + + /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */ + functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var); + + /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. 
*/ + nlm_state.set_parameters(6, 3, 4.0f, 1.0f); + functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var); + + /* Reuse memory, the previous data isn't needed anymore. */ + device_ptr filtered_a = *buffer_var, + filtered_b = *sample_var; + /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */ + nlm_state.set_parameters(5, 3, 1.0f, 0.25f); + functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a); + functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b); + + device_ptr residual_var = *sample_var_var; + /* Estimate the residual variance between the two filtered halves. */ + functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect); + + device_ptr final_a = *unfiltered_a, + final_b = *unfiltered_b; + /* Use the residual variance for a second filter pass. */ + nlm_state.set_parameters(4, 2, 1.0f, 0.5f); + functions.non_local_means(filtered_a, filtered_b, residual_var, final_a); + functions.non_local_means(filtered_b, filtered_a, residual_var, final_b); + + /* Combine the two double-filtered halves to a final shadow feature. */ + device_sub_ptr shadow_pass(device, buffer.mem, 4*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect); + } + + /* Prefilter general features. 
*/ + { + device_sub_ptr unfiltered (device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr variance (device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_1(device, buffer.mem, 10*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_2(device, buffer.mem, 11*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_3(device, buffer.mem, 12*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + + nlm_state.temporary_1_ptr = *nlm_temporary_1; + nlm_state.temporary_2_ptr = *nlm_temporary_2; + nlm_state.temporary_3_ptr = *nlm_temporary_3; + + int mean_from[] = { 0, 1, 2, 6, 7, 8, 12 }; + int variance_from[] = { 3, 4, 5, 9, 10, 11, 13 }; + int pass_to[] = { 1, 2, 3, 0, 5, 6, 7 }; + for(int pass = 0; pass < 7; pass++) { + device_sub_ptr feature_pass(device, buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + /* Get the unfiltered pass and its variance from the RenderBuffers. */ + functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance); + /* Smooth the pass and store the result in the denoising buffers. */ + nlm_state.set_parameters(2, 2, 1.0f, 0.25f); + functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass); + } + } + + /* Copy color passes. 
*/ + { + int mean_from[] = {20, 21, 22}; + int variance_from[] = {23, 24, 25}; + int mean_to[] = { 8, 9, 10}; + int variance_to[] = {11, 12, 13}; + int num_color_passes = 3; + for(int pass = 0; pass < num_color_passes; pass++) { + device_sub_ptr color_pass (device, buffer.mem, mean_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr color_var_pass(device, buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass); + } + } + + storage.w = filter_area.z; + storage.h = filter_area.w; + storage.transform.resize(storage.w*storage.h*TRANSFORM_SIZE); + storage.rank.resize(storage.w*storage.h); + device->mem_alloc("Denoising Transform", storage.transform, MEM_READ_WRITE); + device->mem_alloc("Denoising Rank", storage.rank, MEM_READ_WRITE); + + functions.construct_transform(); + + device_only_memory<float> temporary_1; + device_only_memory<float> temporary_2; + temporary_1.resize(buffer.w*buffer.h); + temporary_2.resize(buffer.w*buffer.h); + device->mem_alloc("Denoising NLM temporary 1", temporary_1, MEM_READ_WRITE); + device->mem_alloc("Denoising NLM temporary 2", temporary_2, MEM_READ_WRITE); + reconstruction_state.temporary_1_ptr = temporary_1.device_pointer; + reconstruction_state.temporary_2_ptr = temporary_2.device_pointer; + + storage.XtWX.resize(storage.w*storage.h*XTWX_SIZE); + storage.XtWY.resize(storage.w*storage.h*XTWY_SIZE); + device->mem_alloc("Denoising XtWX", storage.XtWX, MEM_READ_WRITE); + device->mem_alloc("Denoising XtWY", storage.XtWY, MEM_READ_WRITE); + + reconstruction_state.filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h); + int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x; + reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset, + render_buffer.stride, + render_buffer.pass_stride, + 
render_buffer.denoising_clean_offset); + reconstruction_state.source_w = rect.z-rect.x; + reconstruction_state.source_h = rect.w-rect.y; + + { + device_sub_ptr color_ptr (device, buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr color_var_ptr(device, buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE); + functions.reconstruct(*color_ptr, *color_var_ptr, *color_ptr, *color_var_ptr, render_buffer.ptr); + } + + device->mem_free(storage.XtWX); + device->mem_free(storage.XtWY); + device->mem_free(storage.transform); + device->mem_free(storage.rank); + device->mem_free(temporary_1); + device->mem_free(temporary_2); + device->mem_free(buffer.mem); + device->mem_free(tiles_mem); + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h new file mode 100644 index 00000000000..86d8eb64386 --- /dev/null +++ b/intern/cycles/device/device_denoising.h @@ -0,0 +1,145 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEVICE_DENOISING_H__ +#define __DEVICE_DENOISING_H__ + +#include "device/device.h" + +#include "render/buffers.h" + +#include "kernel/filter/filter_defines.h" + +CCL_NAMESPACE_BEGIN + +class DenoisingTask { +public: + /* Parameters of the denoising algorithm. 
*/ + int radius; + float nlm_k_2; + float pca_threshold; + + /* Pointer and parameters of the RenderBuffers. */ + struct RenderBuffers { + int denoising_data_offset; + int denoising_clean_offset; + int pass_stride; + int offset; + int stride; + device_ptr ptr; + int samples; + } render_buffer; + + TilesInfo *tiles; + device_vector<int> tiles_mem; + void tiles_from_rendertiles(RenderTile *rtiles); + + int4 rect; + int4 filter_area; + + struct DeviceFunctions { + function<bool(device_ptr image_ptr, /* Contains the values that are smoothed. */ + device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */ + device_ptr variance_ptr, /* Contains the variance of the guide image. */ + device_ptr out_ptr /* The filtered output is written into this image. */ + )> non_local_means; + function<bool(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr guide_ptr, + device_ptr guide_variance_ptr, + device_ptr output_ptr + )> reconstruct; + function<bool()> construct_transform; + + function<bool(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, + int4 rect + )> combine_halves; + function<bool(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr + )> divide_shadow; + function<bool(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr + )> get_feature; + function<bool(device_ptr*)> set_tiles; + } functions; + + /* Stores state of the current Reconstruction operation, + * which is accessed by the device in order to perform the operation. */ + struct ReconstructionState { + device_ptr temporary_1_ptr; /* There two images are used as temporary storage. 
*/ + device_ptr temporary_2_ptr; + + int4 filter_rect; + int4 buffer_params; + + int source_w; + int source_h; + } reconstruction_state; + + /* Stores state of the current NLM operation, + * which is accessed by the device in order to perform the operation. */ + struct NLMState { + device_ptr temporary_1_ptr; /* There three images are used as temporary storage. */ + device_ptr temporary_2_ptr; + device_ptr temporary_3_ptr; + + int r; /* Search radius of the filter. */ + int f; /* Patch size of the filter. */ + float a; /* Variance compensation factor in the MSE estimation. */ + float k_2; /* Squared value of the k parameter of the filter. */ + + void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_, k_2 = k_2_; } + } nlm_state; + + struct Storage { + device_only_memory<float> transform; + device_only_memory<int> rank; + device_only_memory<float> XtWX; + device_only_memory<float3> XtWY; + int w; + int h; + } storage; + + DenoisingTask(Device *device) : device(device) {} + + void init_from_devicetask(const DeviceTask &task); + + bool run_denoising(); + + struct DenoiseBuffers { + int pass_stride; + int passes; + int w; + int h; + device_only_memory<float> mem; + } buffer; + +protected: + Device *device; +}; + +CCL_NAMESPACE_END + +#endif /* __DEVICE_DENOISING_H__ */ diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index 4b10514a9d2..b63dd00068b 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -35,6 +35,8 @@ CCL_NAMESPACE_BEGIN +class Device; + enum MemoryType { MEM_READ_ONLY, MEM_WRITE_ONLY, @@ -144,7 +146,7 @@ template<> struct device_type_traits<float2> { template<> struct device_type_traits<float3> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 3; + static const int num_elements = 4; }; template<> struct device_type_traits<float4> { @@ -173,6 +175,9 @@ class device_memory { public: size_t memory_size() { return 
data_size*data_elements*datatype_size(data_type); } + size_t memory_elements_size(int elements) { + return elements*data_elements*datatype_size(data_type); + } /* data information */ DataType data_type; @@ -213,6 +218,22 @@ protected: device_memory& operator = (const device_memory&); }; +template<typename T> +class device_only_memory : public device_memory +{ +public: + device_only_memory() + { + data_type = device_type_traits<T>::data_type; + data_elements = max(device_type_traits<T>::num_elements, 1); + } + + void resize(size_t num) + { + device_memory::resize(num*sizeof(T)); + } +}; + /* Device Vector */ template<typename T> class device_vector : public device_memory @@ -299,6 +320,27 @@ private: array<T> data; }; +/* A device_sub_ptr is a pointer into another existing memory. + * Therefore, it is not allocated separately, but just created from the already allocated base memory. + * It is freed automatically when it goes out of scope, which should happen before the base memory is freed. + * Note that some devices require the offset and size of the sub_ptr to be properly aligned. */ +class device_sub_ptr +{ +public: + device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type); + ~device_sub_ptr(); + /* No copying. 
*/ + device_sub_ptr& operator = (const device_sub_ptr&); + + device_ptr operator*() const + { + return ptr; + } +protected: + Device *device; + device_ptr ptr; +}; + CCL_NAMESPACE_END #endif /* __DEVICE_MEMORY_H__ */ diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index 624260a81c8..bc505b676fc 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -299,6 +299,60 @@ public: return -1; } + void map_neighbor_tiles(Device *sub_device, RenderTile *tiles) + { + for(int i = 0; i < 9; i++) { + if(!tiles[i].buffers) { + continue; + } + /* If the tile was rendered on another device, copy its memory to + * to the current device now, for the duration of the denoising task. + * Note that this temporarily modifies the RenderBuffers and calls + * the device, so this function is not thread safe. */ + if(tiles[i].buffers->device != sub_device) { + device_vector<float> &mem = tiles[i].buffers->buffer; + + tiles[i].buffers->copy_from_device(); + device_ptr original_ptr = mem.device_pointer; + mem.device_pointer = 0; + sub_device->mem_alloc("Temporary memory for neighboring tile", mem, MEM_READ_WRITE); + sub_device->mem_copy_to(mem); + tiles[i].buffer = mem.device_pointer; + mem.device_pointer = original_ptr; + } + } + } + + void unmap_neighbor_tiles(Device * sub_device, RenderTile * tiles) + { + for(int i = 0; i < 9; i++) { + if(!tiles[i].buffers) { + continue; + } + if(tiles[i].buffers->device != sub_device) { + device_vector<float> &mem = tiles[i].buffers->buffer; + + device_ptr original_ptr = mem.device_pointer; + mem.device_pointer = tiles[i].buffer; + + /* Copy denoised tile to the host. */ + if(i == 4) { + tiles[i].buffers->copy_from_device(sub_device); + } + + size_t mem_size = mem.device_size; + sub_device->mem_free(mem); + mem.device_pointer = original_ptr; + mem.device_size = mem_size; + + /* Copy denoised tile to the original device. 
*/ + if(i == 4) { + tiles[i].buffers->device->mem_copy_to(mem); + } + } + } + } + int get_split_task_count(DeviceTask& task) { int total_tasks = 0; diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp index 9118793aad6..dddd19f179f 100644 --- a/intern/cycles/device/device_split_kernel.cpp +++ b/intern/cycles/device/device_split_kernel.cpp @@ -166,13 +166,13 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, unsigned int max_work_groups = num_global_elements / work_pool_size + 1; /* Allocate work_pool_wgs memory. */ - work_pool_wgs.resize(max_work_groups * sizeof(unsigned int)); + work_pool_wgs.resize(max_work_groups); device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE); - queue_index.resize(NUM_QUEUES * sizeof(int)); + queue_index.resize(NUM_QUEUES); device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE); - use_queues_flag.resize(sizeof(char)); + use_queues_flag.resize(1); device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE); ray_state.resize(num_global_elements); diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h index 58c2fdbb077..68c2ba974a5 100644 --- a/intern/cycles/device/device_split_kernel.h +++ b/intern/cycles/device/device_split_kernel.h @@ -80,16 +80,16 @@ private: */ device_memory split_data; device_vector<uchar> ray_state; - device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */ + device_only_memory<int> queue_index; /* Array of size num_queues that tracks the size of each queue. */ /* Flag to make sceneintersect and lampemission kernel use queues. */ - device_memory use_queues_flag; + device_only_memory<char> use_queues_flag; /* Approximate time it takes to complete one sample */ double avg_time_per_sample; /* Work pool with respect to each work group. 
*/ - device_memory work_pool_wgs; + device_only_memory<unsigned int> work_pool_wgs; /* clos_max value for which the kernels have been loaded currently. */ int current_max_closure; diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp index ca303365627..3bc4c310283 100644 --- a/intern/cycles/device/device_task.cpp +++ b/intern/cycles/device/device_task.cpp @@ -56,7 +56,7 @@ int DeviceTask::get_subtask_count(int num, int max_size) if(type == SHADER) { num = min(shader_w, num); } - else if(type == PATH_TRACE) { + else if(type == RENDER) { } else { num = min(h, num); @@ -82,7 +82,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size) tasks.push_back(task); } } - else if(type == PATH_TRACE) { + else if(type == RENDER) { for(int i = 0; i < num; i++) tasks.push_back(*this); } @@ -103,7 +103,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size) void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples) { - if((type != PATH_TRACE) && + if((type != RENDER) && (type != SHADER)) return; diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index feee89fd6e4..44a1efff1f5 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -34,7 +34,7 @@ class Tile; class DeviceTask : public Task { public: - typedef enum { PATH_TRACE, FILM_CONVERT, SHADER } Type; + typedef enum { RENDER, FILM_CONVERT, SHADER } Type; Type type; int x, y, w, h; @@ -53,7 +53,7 @@ public: int passes_size; - explicit DeviceTask(Type type = PATH_TRACE); + explicit DeviceTask(Type type = RENDER); int get_subtask_count(int num, int max_size = 0); void split(list<DeviceTask>& tasks, int num, int max_size = 0); @@ -65,6 +65,16 @@ public: function<void(RenderTile&)> update_tile_sample; function<void(RenderTile&)> release_tile; function<bool(void)> get_cancel; + function<void(RenderTile*, Device*)> map_neighbor_tiles; + function<void(RenderTile*, Device*)> 
unmap_neighbor_tiles; + + int denoising_radius; + float denoising_strength; + float denoising_feature_strength; + bool denoising_relative_pca; + int pass_stride; + int pass_denoising_data; + int pass_denoising_clean; bool need_finish_queue; bool integrator_branched; diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index d061973dcb7..a458ca6bf64 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -17,6 +17,7 @@ #ifdef WITH_OPENCL #include "device/device.h" +#include "device/device_denoising.h" #include "util/util_map.h" #include "util/util_param.h" @@ -129,6 +130,8 @@ public: cl_int* error = NULL); static cl_device_type get_device_type(cl_device_id device_id); + static int mem_address_alignment(cl_device_id device_id); + /* Get somewhat more readable device name. * Main difference is AMD OpenCL here which only gives code name * for the regular device name. This will give more sane device @@ -218,7 +221,7 @@ public: cl_int err = stmt; \ \ if(err != CL_SUCCESS) { \ - string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \ + string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ if(error_msg == "") \ error_msg = message; \ fprintf(stderr, "%s\n", message.c_str()); \ @@ -282,7 +285,7 @@ public: map<ustring, cl_kernel> kernels; }; - OpenCLProgram base_program; + OpenCLProgram base_program, denoising_program; typedef map<string, device_vector<uchar>*> ConstMemMap; typedef map<string, device_ptr> MemMap; @@ -320,6 +323,9 @@ public: void mem_copy_from(device_memory& mem, int y, int w, int h, int elem); void mem_zero(device_memory& mem); void mem_free(device_memory& mem); + + int mem_address_alignment(); + void const_copy_to(const char *name, void *host, size_t size); void tex_alloc(const char *name, device_memory& mem, @@ -328,12 +334,14 @@ public: void tex_free(device_memory& mem); size_t 
global_size_round_up(int group_size, int global_size); - void enqueue_kernel(cl_kernel kernel, size_t w, size_t h); + void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1); void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name); void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half); void shader(DeviceTask& task); + void denoise(RenderTile& tile, const DeviceTask& task); + class OpenCLDeviceTask : public DeviceTask { public: OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task) @@ -367,9 +375,48 @@ public: virtual void thread_run(DeviceTask * /*task*/) = 0; + virtual bool is_split_kernel() = 0; + protected: string kernel_build_options(const string *debug_src = NULL); + void mem_zero_kernel(device_ptr ptr, size_t size); + + bool denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task); + bool denoising_construct_transform(DenoisingTask *task); + bool denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr guide_ptr, + device_ptr guide_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task); + bool denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, int4 rect, + DenoisingTask *task); + bool denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task); + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task); + bool denoising_set_tiles(device_ptr *buffers, + DenoisingTask *task); + + device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type); + void mem_free_sub_ptr(device_ptr ptr); + class ArgumentWrapper { public: ArgumentWrapper() : 
size(0), pointer(NULL) diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index 22aeaddcde8..ae1a7b917c3 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -213,8 +213,23 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea base_program.add_kernel(ustring("bake")); base_program.add_kernel(ustring("zero_buffer")); + denoising_program = OpenCLProgram(this, "denoising", "filter.cl", ""); + denoising_program.add_kernel(ustring("filter_divide_shadow")); + denoising_program.add_kernel(ustring("filter_get_feature")); + denoising_program.add_kernel(ustring("filter_combine_halves")); + denoising_program.add_kernel(ustring("filter_construct_transform")); + denoising_program.add_kernel(ustring("filter_nlm_calc_difference")); + denoising_program.add_kernel(ustring("filter_nlm_blur")); + denoising_program.add_kernel(ustring("filter_nlm_calc_weight")); + denoising_program.add_kernel(ustring("filter_nlm_update_output")); + denoising_program.add_kernel(ustring("filter_nlm_normalize")); + denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); + denoising_program.add_kernel(ustring("filter_finalize")); + denoising_program.add_kernel(ustring("filter_set_tiles")); + vector<OpenCLProgram*> programs; programs.push_back(&base_program); + programs.push_back(&denoising_program); /* Call actual class to fill the vector with its programs. 
*/ if(!load_kernels(requested_features, programs)) { return false; @@ -322,37 +337,42 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in NULL, NULL)); } -void OpenCLDeviceBase::mem_zero(device_memory& mem) +void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size) { - if(mem.device_pointer) { - if(base_program.is_loaded()) { - cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); + cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); - size_t global_size[] = {1024, 1024}; - size_t num_threads = global_size[0] * global_size[1]; + size_t global_size[] = {1024, 1024}; + size_t num_threads = global_size[0] * global_size[1]; - cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer); - cl_ulong d_offset = 0; - cl_ulong d_size = 0; + cl_mem d_buffer = CL_MEM_PTR(mem); + cl_ulong d_offset = 0; + cl_ulong d_size = 0; - while(d_offset < mem.memory_size()) { - d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset); + while(d_offset < size) { + d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset); - kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); + kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); - ciErr = clEnqueueNDRangeKernel(cqCommandQueue, - ckZeroBuffer, - 2, - NULL, - global_size, - NULL, - 0, - NULL, - NULL); - opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); + ciErr = clEnqueueNDRangeKernel(cqCommandQueue, + ckZeroBuffer, + 2, + NULL, + global_size, + NULL, + 0, + NULL, + NULL); + opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); - d_offset += d_size; - } + d_offset += d_size; + } +} + +void OpenCLDeviceBase::mem_zero(device_memory& mem) +{ + if(mem.device_pointer) { + if(base_program.is_loaded()) { + mem_zero_kernel(mem.device_pointer, mem.memory_size()); } if(mem.data_pointer) { @@ -396,6 +416,41 @@ void OpenCLDeviceBase::mem_free(device_memory& mem) } } +int OpenCLDeviceBase::mem_address_alignment() +{ + return 
OpenCLInfo::mem_address_alignment(cdDevice); +} + +device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type) +{ + cl_mem_flags mem_flag; + if(type == MEM_READ_ONLY) + mem_flag = CL_MEM_READ_ONLY; + else if(type == MEM_WRITE_ONLY) + mem_flag = CL_MEM_WRITE_ONLY; + else + mem_flag = CL_MEM_READ_WRITE; + + cl_buffer_region info; + info.origin = mem.memory_elements_size(offset); + info.size = mem.memory_elements_size(size); + + device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer), + mem_flag, + CL_BUFFER_CREATE_TYPE_REGION, + &info, + &ciErr); + opencl_assert_err(ciErr, "clCreateSubBuffer"); + return sub_buf; +} + +void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer) +{ + if(device_pointer && device_pointer != null_mem) { + opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); + } +} + void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size) { ConstMemMap::iterator i = const_mem_map.find(name); @@ -449,7 +504,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size) return global_size + ((r == 0)? 0: group_size - r); } -void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h) +void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size) { size_t workgroup_size, max_work_items[3]; @@ -458,6 +513,10 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h) clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL); + if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { + workgroup_size = max_workgroup_size; + } + /* Try to divide evenly over 2 dimensions. 
*/ size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size}; @@ -543,6 +602,362 @@ set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name); enqueue_kernel(ckFilmConvertKernel, d_w, d_h); } +bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task) +{ + int4 rect = task->rect; + int w = rect.z-rect.x; + int h = rect.w-rect.y; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + cl_mem difference = CL_MEM_PTR(task->nlm_state.temporary_1_ptr); + cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr); + cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr); + + cl_mem image_mem = CL_MEM_PTR(image_ptr); + cl_mem guide_mem = CL_MEM_PTR(guide_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + cl_mem out_mem = CL_MEM_PTR(out_ptr); + + mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h); + mem_zero_kernel(out_ptr, sizeof(float)*w*h); + + cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); + cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); + + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)); + kernel_set_args(ckNLMCalcDifference, 0, + dx, dy, guide_mem, variance_mem, + difference, local_rect, w, 0, a, k_2); + kernel_set_args(ckNLMBlur, 0, + difference, blurDifference, local_rect, w, f); + 
kernel_set_args(ckNLMCalcWeight, 0, + blurDifference, difference, local_rect, w, f); + kernel_set_args(ckNLMUpdateOutput, 0, + dx, dy, blurDifference, image_mem, + out_mem, weightAccum, local_rect, w, f); + + enqueue_kernel(ckNLMCalcDifference, w, h); + enqueue_kernel(ckNLMBlur, w, h); + enqueue_kernel(ckNLMCalcWeight, w, h); + enqueue_kernel(ckNLMBlur, w, h); + enqueue_kernel(ckNLMUpdateOutput, w, h); + } + + int4 local_rect = make_int4(0, 0, w, h); + kernel_set_args(ckNLMNormalize, 0, + out_mem, weightAccum, local_rect, w); + enqueue_kernel(ckNLMNormalize, w, h); + + return true; +} + +bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) +{ + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + + cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); + + kernel_set_args(ckFilterConstructTransform, 0, + buffer_mem, + transform_mem, + rank_mem, + task->filter_area, + task->rect, + task->buffer.pass_stride, + task->radius, + task->pca_threshold); + + enqueue_kernel(ckFilterConstructTransform, + task->storage.w, + task->storage.h, + 256); + + return true; +} + +bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr guide_ptr, + device_ptr guide_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) +{ + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); + + cl_mem color_mem = CL_MEM_PTR(color_ptr); + cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); + cl_mem guide_mem = CL_MEM_PTR(guide_ptr); + cl_mem guide_variance_mem = CL_MEM_PTR(guide_variance_ptr); + cl_mem output_mem = CL_MEM_PTR(output_ptr); + + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = 
CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); + cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); + + cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); + cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); + + cl_mem difference = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr); + cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr); + + int r = task->radius; + int f = 4; + float a = 1.0f; + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + + kernel_set_args(ckNLMCalcDifference, 0, + dx, dy, + guide_mem, + guide_variance_mem, + difference, + local_rect, + task->buffer.w, + task->buffer.pass_stride, + a, task->nlm_k_2); + enqueue_kernel(ckNLMCalcDifference, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + kernel_set_args(ckNLMBlur, 0, + difference, + blurDifference, + local_rect, + task->buffer.w, + f); + enqueue_kernel(ckNLMBlur, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference, + difference, + local_rect, + task->buffer.w, + f); + enqueue_kernel(ckNLMCalcWeight, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + /* Reuse previous arguments. 
*/ + enqueue_kernel(ckNLMBlur, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + kernel_set_args(ckNLMConstructGramian, 0, + dx, dy, + blurDifference, + buffer_mem, + color_mem, + color_variance_mem, + transform_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + local_rect, + task->reconstruction_state.filter_rect, + task->buffer.w, + task->buffer.h, + f, + task->buffer.pass_stride); + enqueue_kernel(ckNLMConstructGramian, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h, + 256); + } + + kernel_set_args(ckFinalize, 0, + task->buffer.w, + task->buffer.h, + output_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + task->filter_area, + task->reconstruction_state.buffer_params, + task->render_buffer.samples); + enqueue_kernel(ckFinalize, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + return true; +} + +bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, int4 rect, + DenoisingTask *task) +{ + (void) task; + + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem mean_mem = CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); + + kernel_set_args(ckFilterCombineHalves, 0, + mean_mem, + variance_mem, + a_mem, + b_mem, + rect, + r); + enqueue_kernel(ckFilterCombineHalves, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task) +{ + (void) task; + + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr); + cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); + cl_mem 
buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); + + char split_kernel = is_split_kernel()? 1 : 0; + kernel_set_args(ckFilterDivideShadow, 0, + task->render_buffer.samples, + tiles_mem, + a_mem, + b_mem, + sample_variance_mem, + sv_variance_mem, + buffer_variance_mem, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + split_kernel); + enqueue_kernel(ckFilterDivideShadow, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) +{ + cl_mem mean_mem = CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); + + char split_kernel = is_split_kernel()? 
1 : 0; + kernel_set_args(ckFilterGetFeature, 0, + task->render_buffer.samples, + tiles_mem, + mean_offset, + variance_offset, + mean_mem, + variance_mem, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + split_kernel); + enqueue_kernel(ckFilterGetFeature, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers, + DenoisingTask *task) +{ + mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_WRITE); + mem_copy_to(task->tiles_mem); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles")); + + kernel_set_args(ckFilterSetTiles, 0, tiles_mem); + for(int i = 0; i < 9; i++) { + cl_mem buffer_mem = CL_MEM_PTR(buffers[i]); + kernel_set_args(ckFilterSetTiles, i+1, buffer_mem); + } + + enqueue_kernel(ckFilterSetTiles, 1, 1); + + return true; +} + +void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task) +{ + DenoisingTask denoising(this); + + denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising); + denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + 
+ denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + + RenderTile rtiles[9]; + rtiles[4] = rtile; + task.map_neighbor_tiles(rtiles, this); + denoising.tiles_from_rendertiles(rtiles); + + denoising.init_from_devicetask(task); + + denoising.run_denoising(); + + task.unmap_neighbor_tiles(rtiles, this); +} + void OpenCLDeviceBase::shader(DeviceTask& task) { /* cast arguments to cl types */ diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp index a2fd1d71156..06c15bcf401 100644 --- a/intern/cycles/device/opencl/opencl_mega.cpp +++ b/intern/cycles/device/opencl/opencl_mega.cpp @@ -108,41 +108,53 @@ public: else if(task->type == DeviceTask::SHADER) { shader(*task); } - else if(task->type == DeviceTask::PATH_TRACE) { + else if(task->type == DeviceTask::RENDER) { RenderTile tile; /* Keep rendering tiles until done. */ while(task->acquire_tile(this, tile)) { - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; + if(tile.task == RenderTile::PATH_TRACE) { + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; - for(int sample = start_sample; sample < end_sample; sample++) { - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; - } + for(int sample = start_sample; sample < end_sample; sample++) { + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } + + path_trace(tile, sample); - path_trace(tile, sample); + tile.sample = sample + 1; - tile.sample = sample + 1; + task->update_progress(&tile, tile.w*tile.h); + } + /* Complete kernel execution before release tile */ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. 
If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. + */ + clFinish(cqCommandQueue); + } + else if(tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + denoise(tile, *task); task->update_progress(&tile, tile.w*tile.h); } - /* Complete kernel execution before release tile */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. - */ - clFinish(cqCommandQueue); - task->release_tile(tile); } } } + + bool is_split_kernel() + { + return false; + } }; Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background) diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp index d175aae137a..76dcbd6fc9a 100644 --- a/intern/cycles/device/opencl/opencl_split.cpp +++ b/intern/cycles/device/opencl/opencl_split.cpp @@ -104,7 +104,7 @@ public: else if(task->type == DeviceTask::SHADER) { shader(*task); } - else if(task->type == DeviceTask::PATH_TRACE) { + else if(task->type == DeviceTask::RENDER) { RenderTile tile; /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to @@ -127,21 +127,29 @@ public: /* Keep rendering tiles until done. */ while(task->acquire_tile(this, tile)) { - split_kernel->path_trace(task, - tile, - kgbuffer, - *const_mem_map["__data"]); - - /* Complete kernel execution before release tile. */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. 
If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. - */ - clFinish(cqCommandQueue); + if(tile.task == RenderTile::PATH_TRACE) { + assert(tile.task == RenderTile::PATH_TRACE); + split_kernel->path_trace(task, + tile, + kgbuffer, + *const_mem_map["__data"]); + + /* Complete kernel execution before release tile. */ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. + */ + clFinish(cqCommandQueue); + } + else if(tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + denoise(tile, *task); + task->update_progress(&tile, tile.w*tile.h); + } task->release_tile(tile); } @@ -150,6 +158,11 @@ public: } } + bool is_split_kernel() + { + return true; + } + protected: /* ** Those guys are for workign around some compiler-specific bugs ** */ diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp index 38003dd1e1e..642c1bfa11c 100644 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ b/intern/cycles/device/opencl/opencl_util.cpp @@ -1073,6 +1073,20 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id) return get_device_name(device_id); } +int OpenCLInfo::mem_address_alignment(cl_device_id device_id) +{ + int base_align_bits; + if(clGetDeviceInfo(device_id, + CL_DEVICE_MEM_BASE_ADDR_ALIGN, + sizeof(int), + &base_align_bits, + NULL) == CL_SUCCESS) + { + return base_align_bits/8; + } + return 1; +} + CCL_NAMESPACE_END #endif diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 9bb0455b9d5..bef869f34b4 100644 --- 
a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -10,7 +10,23 @@ set(INC_SYS set(SRC kernels/cpu/kernel.cpp + kernels/cpu/kernel_sse2.cpp + kernels/cpu/kernel_sse3.cpp + kernels/cpu/kernel_sse41.cpp + kernels/cpu/kernel_avx.cpp + kernels/cpu/kernel_avx2.cpp kernels/cpu/kernel_split.cpp + kernels/cpu/kernel_split_sse2.cpp + kernels/cpu/kernel_split_sse3.cpp + kernels/cpu/kernel_split_sse41.cpp + kernels/cpu/kernel_split_avx.cpp + kernels/cpu/kernel_split_avx2.cpp + kernels/cpu/filter.cpp + kernels/cpu/filter_sse2.cpp + kernels/cpu/filter_sse3.cpp + kernels/cpu/filter_sse41.cpp + kernels/cpu/filter_avx.cpp + kernels/cpu/filter_avx2.cpp kernels/opencl/kernel.cl kernels/opencl/kernel_state_buffer_size.cl kernels/opencl/kernel_split.cl @@ -32,8 +48,10 @@ set(SRC kernels/opencl/kernel_next_iteration_setup.cl kernels/opencl/kernel_indirect_subsurface.cl kernels/opencl/kernel_buffer_update.cl + kernels/opencl/filter.cl kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu + kernels/cuda/filter.cu ) set(SRC_BVH_HEADERS @@ -95,6 +113,8 @@ set(SRC_KERNELS_CPU_HEADERS kernels/cpu/kernel_cpu.h kernels/cpu/kernel_cpu_impl.h kernels/cpu/kernel_cpu_image.h + kernels/cpu/filter_cpu.h + kernels/cpu/filter_cpu_impl.h ) set(SRC_KERNELS_CUDA_HEADERS @@ -190,6 +210,21 @@ set(SRC_GEOM_HEADERS geom/geom_volume.h ) +set(SRC_FILTER_HEADERS + filter/filter.h + filter/filter_defines.h + filter/filter_features.h + filter/filter_features_sse.h + filter/filter_kernel.h + filter/filter_nlm_cpu.h + filter/filter_nlm_gpu.h + filter/filter_prefilter.h + filter/filter_reconstruction.h + filter/filter_transform.h + filter/filter_transform_gpu.h + filter/filter_transform_sse.h +) + set(SRC_UTIL_HEADERS ../util/util_atomic.h ../util/util_color.h @@ -204,6 +239,7 @@ set(SRC_UTIL_HEADERS ../util/util_math_int2.h ../util/util_math_int3.h ../util/util_math_int4.h + ../util/util_math_matrix.h ../util/util_static_assert.h ../util/util_transform.h ../util/util_texture.h @@ 
-295,23 +331,21 @@ if(WITH_CYCLES_CUDA_BINARIES) ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS} ) + set(cuda_filter_sources kernels/cuda/filter.cu + ${SRC_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} + ${SRC_FILTER_HEADERS} + ${SRC_UTIL_HEADERS} + ) set(cuda_cubins) - macro(CYCLES_CUDA_KERNEL_ADD arch split experimental) - if(${split}) - set(cuda_extra_flags "-D__SPLIT__") - set(cuda_cubin kernel_split) - else() - set(cuda_extra_flags "") - set(cuda_cubin kernel) - endif() - + macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental) if(${experimental}) - set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__) - set(cuda_cubin ${cuda_cubin}_experimental) + set(flags ${flags} -D__KERNEL_EXPERIMENTAL__) + set(name ${name}_experimental) endif() - set(cuda_cubin ${cuda_cubin}_${arch}.cubin) + set(cuda_cubin ${name}_${arch}.cubin) if(WITH_CYCLES_DEBUG) set(cuda_debug_flags "-D__KERNEL_DEBUG__") @@ -325,11 +359,7 @@ if(WITH_CYCLES_CUDA_BINARIES) set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}") set(cuda_math_flags "--use_fast_math") - if(split) - set(cuda_kernel_src "/kernels/cuda/kernel_split.cu") - else() - set(cuda_kernel_src "/kernels/cuda/kernel.cu") - endif() + set(cuda_kernel_src "/kernels/cuda/${name}.cu") add_custom_command( OUTPUT ${cuda_cubin} @@ -343,13 +373,13 @@ if(WITH_CYCLES_CUDA_BINARIES) ${cuda_arch_flags} ${cuda_version_flags} ${cuda_math_flags} - ${cuda_extra_flags} + ${flags} ${cuda_debug_flags} -I${CMAKE_CURRENT_SOURCE_DIR}/.. 
-DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC - DEPENDS ${cuda_sources}) + DEPENDS ${sources}) delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib) list(APPEND cuda_cubins ${cuda_cubin}) @@ -363,11 +393,12 @@ if(WITH_CYCLES_CUDA_BINARIES) foreach(arch ${CYCLES_CUDA_BINARIES_ARCH}) # Compile regular kernel - CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE) if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES) # Compile split kernel - CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D__SPLIT__" ${cuda_sources} FALSE) endif() endforeach() @@ -388,41 +419,30 @@ include_directories(SYSTEM ${INC_SYS}) set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") +set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") if(CXX_HAS_SSE) - list(APPEND SRC - kernels/cpu/kernel_sse2.cpp - kernels/cpu/kernel_sse3.cpp - kernels/cpu/kernel_sse41.cpp - kernels/cpu/kernel_split_sse2.cpp - kernels/cpu/kernel_split_sse3.cpp - kernels/cpu/kernel_split_sse41.cpp - ) - set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") 
set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX) - list(APPEND SRC - kernels/cpu/kernel_avx.cpp - kernels/cpu/kernel_split_avx.cpp - ) set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX2) - list(APPEND SRC - kernels/cpu/kernel_avx2.cpp - kernels/cpu/kernel_split_avx2.cpp - ) set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") endif() add_library(cycles_kernel @@ -432,6 +452,7 @@ add_library(cycles_kernel ${SRC_KERNELS_CUDA_HEADERS} ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} + ${SRC_FILTER_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_SPLIT_HEADERS} @@ -472,12 +493,15 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocke delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) 
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/filter.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util) diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index a6bba8bf74d..a04c157dc40 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -435,5 +435,23 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b) #endif } +/* Classifies a closure as diffuse-like or specular-like. 
+ * This is needed for the denoising feature pass generation, + * which are written on the first bounce where more than 25% + * of the sampling weight belongs to diffuse-line closures. */ +ccl_device_inline bool bsdf_is_specular_like(ShaderClosure *sc) +{ + if(CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { + return true; + } + + if(CLOSURE_IS_BSDF_MICROFACET(sc->type)) { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*) sc; + return (bsdf->alpha_x*bsdf->alpha_y <= 0.075f*0.075f); + } + + return false; +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h index 7e0f5a7ec75..a5ba2cb2972 100644 --- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h @@ -40,7 +40,6 @@ typedef ccl_addr_space struct VelvetBsdf { float sigma; float invsigma2; - float3 N; } VelvetBsdf; ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf) diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h index dcd187f9305..ec6f1f20996 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse.h @@ -37,7 +37,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct DiffuseBsdf { SHADER_CLOSURE_BASE; - float3 N; } DiffuseBsdf; /* DIFFUSE */ diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h index 2d982a95fe4..24f40af46a3 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h @@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct DiffuseRampBsdf { SHADER_CLOSURE_BASE; - float3 N; float3 *colors; } DiffuseRampBsdf; diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h index 3fe7572e4ce..30cc8b90330 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet.h +++ 
b/intern/cycles/kernel/closure/bsdf_microfacet.h @@ -46,7 +46,6 @@ typedef ccl_addr_space struct MicrofacetBsdf { float alpha_x, alpha_y, ior; MicrofacetExtra *extra; float3 T; - float3 N; } MicrofacetBsdf; /* Beckmann and GGX microfacet importance sampling. */ diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h index 57f1e733ee7..30644946840 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h @@ -42,7 +42,7 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha) /* Sample slope distribution (based on page 14 of the supplemental implementation). */ ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU) { - if(cosI > 0.9999f || cosI < 1e-6f) { + if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) { const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f)); const float phi = M_2PI_F * randU.y; return make_float2(r*cosf(phi), r*sinf(phi)); diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h index cb342a026ef..6b770fc0c16 100644 --- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h +++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h @@ -22,7 +22,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct OrenNayarBsdf { SHADER_CLOSURE_BASE; - float3 N; float roughness; float a; float b; diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h index e152a8780db..420f94755ee 100644 --- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h @@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct PhongRampBsdf { SHADER_CLOSURE_BASE; - float3 N; float exponent; float3 *colors; } PhongRampBsdf; diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h index 
8a116693bdb..215c32e1ffb 100644 --- a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h +++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h @@ -28,7 +28,6 @@ typedef ccl_addr_space struct PrincipledDiffuseBsdf { SHADER_CLOSURE_BASE; float roughness; - float3 N; } PrincipledDiffuseBsdf; ccl_device float3 calculate_principled_diffuse_brdf(const PrincipledDiffuseBsdf *bsdf, diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h index 58df4f7ddbb..f4476bfecd0 100644 --- a/intern/cycles/kernel/closure/bsdf_principled_sheen.h +++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h @@ -26,7 +26,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct PrincipledSheenBsdf { SHADER_CLOSURE_BASE; - float3 N; } PrincipledSheenBsdf; ccl_device float3 calculate_principled_sheen_brdf(const PrincipledSheenBsdf *bsdf, diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h index 28e775bcbc8..d8b6d8ddead 100644 --- a/intern/cycles/kernel/closure/bsdf_toon.h +++ b/intern/cycles/kernel/closure/bsdf_toon.h @@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct ToonBsdf { SHADER_CLOSURE_BASE; - float3 N; float size; float smooth; } ToonBsdf; diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h index f9236a6e52c..f733ea4c517 100644 --- a/intern/cycles/kernel/closure/bssrdf.h +++ b/intern/cycles/kernel/closure/bssrdf.h @@ -28,7 +28,6 @@ typedef ccl_addr_space struct Bssrdf { float texture_blur; float albedo; float roughness; - float3 N; } Bssrdf; /* Planar Truncated Gaussian diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h new file mode 100644 index 00000000000..f6e474d6702 --- /dev/null +++ b/intern/cycles/kernel/filter/filter.h @@ -0,0 +1,52 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not 
use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __FILTER_H__ +#define __FILTER_H__ + +/* CPU Filter Kernel Interface */ + +#include "util/util_types.h" + +#include "kernel/filter/filter_defines.h" + +CCL_NAMESPACE_BEGIN + +#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z +#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name) +#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) + +#define KERNEL_ARCH cpu +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/filter_cpu.h" + +CCL_NAMESPACE_END + +#endif /* __FILTER_H__ */ diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h new file mode 100644 index 00000000000..ce96f733aff --- /dev/null +++ b/intern/cycles/kernel/filter/filter_defines.h @@ -0,0 +1,38 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __FILTER_DEFINES_H__ +#define __FILTER_DEFINES_H__ + +#define DENOISE_FEATURES 10 +#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES) +#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2) +#define XTWY_SIZE (DENOISE_FEATURES+1) + +typedef struct TilesInfo { + int offsets[9]; + int strides[9]; + int x[4]; + int y[4]; + /* TODO(lukas): CUDA doesn't have uint64_t... */ +#ifdef __KERNEL_OPENCL__ + ccl_global float *buffers[9]; +#else + long long int buffers[9]; +#endif +} TilesInfo; + +#endif /* __FILTER_DEFINES_H__*/ diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h new file mode 100644 index 00000000000..f5a40d49997 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_features.h @@ -0,0 +1,120 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + CCL_NAMESPACE_BEGIN + +#define ccl_get_feature(buffer, pass) buffer[(pass)*pass_stride] + +/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y). + * pixel_buffer always points to the current pixel in the first pass. */ +#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) { + +#define END_FOR_PIXEL_WINDOW } \ + pixel_buffer += buffer_w - (high.x - low.x); \ + } + +ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *features, float ccl_restrict_ptr mean, int pass_stride) +{ + features[0] = pixel.x; + features[1] = pixel.y; + features[2] = ccl_get_feature(buffer, 0); + features[3] = ccl_get_feature(buffer, 1); + features[4] = ccl_get_feature(buffer, 2); + features[5] = ccl_get_feature(buffer, 3); + features[6] = ccl_get_feature(buffer, 4); + features[7] = ccl_get_feature(buffer, 5); + features[8] = ccl_get_feature(buffer, 6); + features[9] = ccl_get_feature(buffer, 7); + if(mean) { + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] -= mean[i]; + } +} + +ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *scales, float ccl_restrict_ptr mean, int pass_stride) +{ + scales[0] = fabsf(pixel.x - mean[0]); + scales[1] = fabsf(pixel.y - mean[1]); + scales[2] = fabsf(ccl_get_feature(buffer, 0) - mean[2]); + scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3], + ccl_get_feature(buffer, 2) - mean[4], + ccl_get_feature(buffer, 3) - mean[5])); + scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]); + scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7], + ccl_get_feature(buffer, 6) - mean[8], + ccl_get_feature(buffer, 7) - mean[9])); +} + +ccl_device_inline void filter_calculate_scale(float *scale) +{ + scale[0] = 
1.0f/max(scale[0], 0.01f); + scale[1] = 1.0f/max(scale[1], 0.01f); + scale[2] = 1.0f/max(scale[2], 0.01f); + scale[6] = 1.0f/max(scale[4], 0.01f); + scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f); + scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f); +} + +ccl_device_inline float3 filter_get_pixel_color(ccl_global float ccl_restrict_ptr buffer, int pass_stride) +{ + return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)); +} + +ccl_device_inline float filter_get_pixel_variance(ccl_global float ccl_restrict_ptr buffer, int pass_stride) +{ + return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2))); +} + +ccl_device_inline void design_row_add(float *design_row, + int rank, + ccl_global float ccl_restrict_ptr transform, + int stride, + int row, + float feature) +{ + for(int i = 0; i < rank; i++) { + design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature; + } +} + +/* Fill the design row. 
*/ +ccl_device_inline void filter_get_design_row_transform(int2 p_pixel, + ccl_global float ccl_restrict_ptr p_buffer, + int2 q_pixel, + ccl_global float ccl_restrict_ptr q_buffer, + int pass_stride, + int rank, + float *design_row, + ccl_global float ccl_restrict_ptr transform, + int stride) +{ + design_row[0] = 1.0f; + math_vector_zero(design_row+1, rank); + design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x); + design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y); + design_row_add(design_row, rank, transform, stride, 2, ccl_get_feature(q_buffer, 0) - ccl_get_feature(p_buffer, 0)); + design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1)); + design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2)); + design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3)); + design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4)); + design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5)); + design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6)); + design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7)); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h new file mode 100644 index 00000000000..303c8f482e3 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -0,0 +1,95 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride) + +/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. + * pixel_buffer always points to the first of the 4 current pixel in the first pass. + * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */ + +#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + __m128 y4 = _mm_set1_ps(pixel.y); \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ + __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \ + __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x)); + +#define END_FOR_PIXEL_WINDOW_SSE } \ + pixel_buffer += buffer_w - (pixel.x - low.x); \ + } + +ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *features, __m128 ccl_restrict_ptr mean, int pass_stride) +{ + features[0] = x; + features[1] = y; + features[2] = ccl_get_feature_sse(0); + features[3] = ccl_get_feature_sse(1); + features[4] = ccl_get_feature_sse(2); + features[5] = ccl_get_feature_sse(3); + features[6] = ccl_get_feature_sse(4); + features[7] = ccl_get_feature_sse(5); + features[8] = ccl_get_feature_sse(6); + features[9] = ccl_get_feature_sse(7); + if(mean) { + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] = 
_mm_sub_ps(features[i], mean[i]); + } + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] = _mm_mask_ps(features[i], active_pixels); +} + +ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *scales, __m128 ccl_restrict_ptr mean, int pass_stride) +{ + scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels); + scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels); + + scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(0), mean[2])), active_pixels); + + __m128 diff, scale; + diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]); + scale = _mm_mul_ps(diff, diff); + diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + scales[3] = _mm_mask_ps(scale, active_pixels); + + scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels); + + diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]); + scale = _mm_mul_ps(diff, diff); + diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + scales[5] = _mm_mask_ps(scale, active_pixels); +} + +ccl_device_inline void filter_calculate_scale_sse(__m128 *scale) +{ + scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f))); + scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f))); + scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f))); + scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f))); + + scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f))); + scale[3] = scale[4] = scale[5] = 
_mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f))); +} + + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h new file mode 100644 index 00000000000..2ef03dc0a02 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_kernel.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "util/util_color.h" +#include "util/util_math.h" +#include "util/util_math_fast.h" +#include "util/util_texture.h" + +#include "util/util_atomic.h" +#include "util/util_math_matrix.h" + +#include "kernel/filter/filter_defines.h" + +#include "kernel/filter/filter_features.h" +#ifdef __KERNEL_SSE3__ +# include "kernel/filter/filter_features_sse.h" +#endif + +#include "kernel/filter/filter_prefilter.h" + +#ifdef __KERNEL_GPU__ +# include "kernel/filter/filter_transform_gpu.h" +#else +# ifdef __KERNEL_SSE3__ +# include "kernel/filter/filter_transform_sse.h" +# else +# include "kernel/filter/filter_transform.h" +# endif +#endif + +#include "kernel/filter/filter_reconstruction.h" + +#ifdef __KERNEL_CPU__ +# include "kernel/filter/filter_nlm_cpu.h" +#else +# include "kernel/filter/filter_nlm_gpu.h" +#endif diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h new file mode 100644 index 00000000000..1a314b100be --- /dev/null +++ 
b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -0,0 +1,163 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, float ccl_restrict_ptr weightImage, float ccl_restrict_ptr varianceImage, float *differenceImage, int4 rect, int w, int channel_offset, float a, float k_2) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + float diff = 0.0f; + int numChannels = channel_offset? 
3 : 1; + for(int c = 0; c < numChannels; c++) { + float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)]; + float pvar = varianceImage[c*channel_offset + y*w+x]; + float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)]; + diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); + } + if(numChannels > 1) { + diff *= 1.0f/numChannels; + } + differenceImage[y*w+x] = diff; + } + } +} + +ccl_device_inline void kernel_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) +{ +#ifdef __KERNEL_SSE3__ + int aligned_lowx = (rect.x & ~(3)); + int aligned_highx = ((rect.z + 3) & ~(3)); +#endif + for(int y = rect.y; y < rect.w; y++) { + const int low = max(rect.y, y-f); + const int high = min(rect.w, y+f+1); + for(int x = rect.x; x < rect.z; x++) { + outImage[y*w+x] = 0.0f; + } + for(int y1 = low; y1 < high; y1++) { +#ifdef __KERNEL_SSE3__ + for(int x = aligned_lowx; x < aligned_highx; x+=4) { + _mm_store_ps(outImage + y*w+x, _mm_add_ps(_mm_load_ps(outImage + y*w+x), _mm_load_ps(differenceImage + y1*w+x))); + } +#else + for(int x = rect.x; x < rect.z; x++) { + outImage[y*w+x] += differenceImage[y1*w+x]; + } +#endif + } + for(int x = rect.x; x < rect.z; x++) { + outImage[y*w+x] *= 1.0f/(high - low); + } + } +} + +ccl_device_inline void kernel_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + outImage[y*w+x] = 0.0f; + } + } + for(int dx = -f; dx <= f; dx++) { + int pos_dx = max(0, dx); + int neg_dx = min(0, dx); + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) { + outImage[y*w+x] += differenceImage[y*w+dx+x]; + } + } + } + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + 
outImage[y*w+x] = expf(-max(outImage[y*w+x] * (1.0f/(high - low)), 0.0f)); + } + } +} + +ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float ccl_restrict_ptr differenceImage, float ccl_restrict_ptr image, float *outImage, float *accumImage, int4 rect, int w, int f) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += differenceImage[y*w+x1]; + } + float weight = sum * (1.0f/(high - low)); + accumImage[y*w+x] += weight; + outImage[y*w+x] += weight*image[(y+dy)*w+(x+dx)]; + } + } +} + +ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, + float ccl_restrict_ptr differenceImage, + float ccl_restrict_ptr buffer, + float *color_pass, + float *variance_pass, + float *transform, + int *rank, + float *XtWX, + float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, int h, int f, + int pass_stride) +{ + /* fx and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. 
*/ + for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) { + int y = fy + filter_rect.y; + for(int fx = max(0, rect.x-filter_rect.x); fx < min(filter_rect.z, rect.z-filter_rect.x); fx++) { + int x = fx + filter_rect.x; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += differenceImage[y*w+x1]; + } + float weight = sum * (1.0f/(high - low)); + + int storage_ofs = fy*filter_rect.z + fx; + float *l_transform = transform + storage_ofs*TRANSFORM_SIZE; + float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE; + float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE; + int *l_rank = rank + storage_ofs; + + kernel_filter_construct_gramian(x, y, 1, + dx, dy, w, h, + pass_stride, + buffer, + color_pass, variance_pass, + l_transform, l_rank, + weight, l_XtWX, l_XtWY, 0); + } + } +} + +ccl_device_inline void kernel_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + outImage[y*w+x] /= accumImage[y*w+x]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h new file mode 100644 index 00000000000..b5ba7cf51a5 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h @@ -0,0 +1,147 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, + int dx, int dy, + ccl_global float ccl_restrict_ptr weightImage, + ccl_global float ccl_restrict_ptr varianceImage, + ccl_global float *differenceImage, + int4 rect, int w, + int channel_offset, + float a, float k_2) +{ + float diff = 0.0f; + int numChannels = channel_offset? 3 : 1; + for(int c = 0; c < numChannels; c++) { + float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)]; + float pvar = varianceImage[c*channel_offset + y*w+x]; + float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)]; + diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); + } + if(numChannels > 1) { + diff *= 1.0f/numChannels; + } + differenceImage[y*w+x] = diff; +} + +ccl_device_inline void kernel_filter_nlm_blur(int x, int y, + ccl_global float ccl_restrict_ptr differenceImage, + ccl_global float *outImage, + int4 rect, int w, int f) +{ + float sum = 0.0f; + const int low = max(rect.y, y-f); + const int high = min(rect.w, y+f+1); + for(int y1 = low; y1 < high; y1++) { + sum += differenceImage[y1*w+x]; + } + sum *= 1.0f/(high-low); + outImage[y*w+x] = sum; +} + +ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, + ccl_global float ccl_restrict_ptr differenceImage, + ccl_global float *outImage, + int4 rect, int w, int f) +{ + float sum = 0.0f; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + for(int x1 = low; x1 < high; x1++) { + sum += differenceImage[y*w+x1]; + } + sum *= 1.0f/(high-low); + outImage[y*w+x] = expf(-max(sum, 0.0f)); +} + +ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, + int dx, int dy, + ccl_global float ccl_restrict_ptr differenceImage, + ccl_global float ccl_restrict_ptr image, + ccl_global float *outImage, + ccl_global 
float *accumImage, + int4 rect, int w, int f) +{ + float sum = 0.0f; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + for(int x1 = low; x1 < high; x1++) { + sum += differenceImage[y*w+x1]; + } + sum *= 1.0f/(high-low); + if(outImage) { + accumImage[y*w+x] += sum; + outImage[y*w+x] += sum*image[(y+dy)*w+(x+dx)]; + } + else { + accumImage[y*w+x] = sum; + } +} + +ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, + int dx, int dy, + ccl_global float ccl_restrict_ptr differenceImage, + ccl_global float ccl_restrict_ptr buffer, + ccl_global float *color_pass, + ccl_global float *variance_pass, + ccl_global float ccl_restrict_ptr transform, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, int h, int f, + int pass_stride, + int localIdx) +{ + int y = fy + filter_rect.y; + int x = fx + filter_rect.x; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += differenceImage[y*w+x1]; + } + float weight = sum * (1.0f/(high - low)); + + int storage_ofs = fy*filter_rect.z + fx; + transform += storage_ofs; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + + kernel_filter_construct_gramian(x, y, + filter_rect.z*filter_rect.w, + dx, dy, w, h, + pass_stride, + buffer, + color_pass, variance_pass, + transform, rank, + weight, XtWX, XtWY, + localIdx); +} + +ccl_device_inline void kernel_filter_nlm_normalize(int x, int y, + ccl_global float *outImage, + ccl_global float ccl_restrict_ptr accumImage, + int4 rect, int w) +{ + outImage[y*w+x] /= accumImage[y*w+x]; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h new file mode 100644 index 00000000000..54bcf888052 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -0,0 +1,145 @@ +/* + * Copyright 2011-2017 Blender 
Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* First step of the shadow prefiltering, performs the shadow division and stores all data + * in a nice and easy rectangular array that can be passed to the NLM filter. + * + * Calculates: + * unfiltered: Contains the two half images of the shadow feature pass + * sampleVariance: The sample-based variance calculated in the kernel. Note: This calculation is biased in general, and especially here since the variance of the ratio can only be approximated. + * sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves) + * bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy. + */ +ccl_device void kernel_filter_divide_shadow(int sample, + ccl_global TilesInfo *tiles, + int x, int y, + ccl_global float *unfilteredA, + ccl_global float *unfilteredB, + ccl_global float *sampleVariance, + ccl_global float *sampleVarianceV, + ccl_global float *bufferVariance, + int4 rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ + int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); + int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 
1: 2); + int tile = ytile*3+xtile; + + int offset = tiles->offsets[tile]; + int stride = tiles->strides[tile]; + ccl_global float ccl_restrict_ptr center_buffer = (ccl_global float*) tiles->buffers[tile]; + center_buffer += (y*stride + x + offset)*buffer_pass_stride; + center_buffer += buffer_denoising_offset + 14; + + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f); + unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f); + + float varA = center_buffer[2]; + float varB = center_buffer[5]; + int odd_sample = (sample+1)/2; + int even_sample = sample/2; + if(use_split_variance) { + varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample); + varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample); + } + varA /= (odd_sample - 1); + varB /= (even_sample - 1); + + sampleVariance[idx] = 0.5f*(varA + varB) / sample; + sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample); + bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]); +} + +/* Load a regular feature from the render buffers into the denoise buffer. + * Parameters: + * - sample: The sample amount in the buffer, used to normalize the buffer. + * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature. + * - x, y: Current pixel + * - mean, variance: Target denoise buffers. + * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive). + */ +ccl_device void kernel_filter_get_feature(int sample, + ccl_global TilesInfo *tiles, + int m_offset, int v_offset, + int x, int y, + ccl_global float *mean, + ccl_global float *variance, + int4 rect, int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ + int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); + int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 
1: 2); + int tile = ytile*3+xtile; + ccl_global float *center_buffer = ((ccl_global float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset; + + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + + mean[idx] = center_buffer[m_offset] / sample; + if(use_split_variance) { + variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); + } + else { + variance[idx] = center_buffer[v_offset] / (sample * (sample-1)); + } +} + +/* Combine A/B buffers. + * Calculates the combined mean and the buffer variance. */ +ccl_device void kernel_filter_combine_halves(int x, int y, + ccl_global float *mean, + ccl_global float *variance, + ccl_global float *a, + ccl_global float *b, + int4 rect, int r) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + + if(mean) mean[idx] = 0.5f * (a[idx]+b[idx]); + if(variance) { + if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]); + else { + variance[idx] = 0.0f; + float values[25]; + int numValues = 0; + for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) { + for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) { + int pidx = (py-rect.y)*buffer_w + (px-rect.x); + values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]); + } + } + /* Insertion-sort the variances (fast enough for 25 elements). 
*/ + for(int i = 1; i < numValues; i++) { + float v = values[i]; + int j; + for(j = i-1; j >= 0 && values[j] > v; j--) + values[j+1] = values[j]; + values[j+1] = v; + } + variance[idx] = values[(7*numValues)/8]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h new file mode 100644 index 00000000000..02f3802fa0c --- /dev/null +++ b/intern/cycles/kernel/filter/filter_reconstruction.h @@ -0,0 +1,103 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_filter_construct_gramian(int x, int y, + int storage_stride, + int dx, int dy, + int w, int h, + int pass_stride, + ccl_global float ccl_restrict_ptr buffer, + ccl_global float *color_pass, + ccl_global float *variance_pass, + ccl_global float ccl_restrict_ptr transform, + ccl_global int *rank, + float weight, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int localIdx) +{ + int p_offset = y *w + x; + int q_offset = (y+dy)*w + (x+dx); + +#ifdef __KERNEL_CPU__ + const int stride = 1; + (void)storage_stride; + (void)localIdx; + float design_row[DENOISE_FEATURES+1]; +#elif defined(__KERNEL_CUDA__) + const int stride = storage_stride; + ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE]; + ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1); +#else + const int stride = storage_stride; + float design_row[DENOISE_FEATURES+1]; +#endif + + float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride); + float3 q_color = filter_get_pixel_color(color_pass + q_offset, pass_stride); + + float p_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + p_offset, pass_stride)); + float q_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + q_offset, pass_stride)); + + if(average(fabs(p_color - q_color)) > 3.0f*(p_std_dev + q_std_dev + 1e-3f)) { + return; + } + + filter_get_design_row_transform(make_int2(x, y), buffer + p_offset, + make_int2(x+dx, y+dy), buffer + q_offset, + pass_stride, *rank, design_row, transform, stride); + + math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride); + math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride); +} + +ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h, + ccl_global float *buffer, + ccl_global int *rank, + int storage_stride, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 buffer_params, + int sample) +{ +#ifdef 
__KERNEL_CPU__ + const int stride = 1; + (void)storage_stride; +#else + const int stride = storage_stride; +#endif + + math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride); + + float3 final_color = XtWY[0]; + + ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z; + final_color *= sample; + if(buffer_params.w) { + final_color.x += combined_buffer[buffer_params.w+0]; + final_color.y += combined_buffer[buffer_params.w+1]; + final_color.z += combined_buffer[buffer_params.w+2]; + } + combined_buffer[0] = final_color.x; + combined_buffer[1] = final_color.y; + combined_buffer[2] = final_color.z; +} + +#undef STORAGE_TYPE + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h new file mode 100644 index 00000000000..139dc402d21 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform.h @@ -0,0 +1,113 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, + int x, int y, int4 rect, + int pass_stride, + float *transform, int *rank, + int radius, float pca_threshold) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + + float features[DENOISE_FEATURES]; + + /* Temporary storage, used in different steps of the algorithm. 
 */ + float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES]; + float tempvector[2*DENOISE_FEATURES]; + float ccl_restrict_ptr pixel_buffer; + int2 pixel; + + + + + /* === Calculate denoising window. === */ + int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + + + + + /* === Shift feature passes to have mean 0. === */ + float feature_means[DENOISE_FEATURES]; + math_vector_zero(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride); + math_vector_add(feature_means, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x)); + math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES); + + /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ + float *feature_scale = tempvector; + math_vector_zero(feature_scale, DENOISE_FEATURES); + + FOR_PIXEL_WINDOW { + filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_max(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + filter_calculate_scale(feature_scale); + + + /* === Generate the feature transformation. === + * This transformation maps the DENOISE_FEATURES-dimensional feature space to a reduced feature (r-feature) space + * which generally has fewer dimensions. This mainly helps to prevent overfitting. 
*/ + float* feature_matrix = tempmatrix; + math_matrix_zero(feature_matrix, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul(features, feature_scale, DENOISE_FEATURES); + math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f); + } END_FOR_PIXEL_WINDOW + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); + *rank = 0; + if(pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + } + } + else { + for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + /* Bake the feature scaling into the transformation matrix. */ + for(int i = 0; i < (*rank); i++) { + math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES); + } + math_matrix_transpose(transform, DENOISE_FEATURES, 1); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h new file mode 100644 index 00000000000..f7414aeed8a --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform_gpu.h @@ -0,0 +1,117 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer, + int x, int y, int4 rect, + int pass_stride, + ccl_global float *transform, + ccl_global int *rank, + int radius, float pca_threshold, + int transform_stride, int localIdx) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + +#ifdef __KERNEL_CUDA__ + ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE]; + ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES; +#else + float features[DENOISE_FEATURES]; +#endif + + /* === Calculate denoising window. === */ + int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + ccl_global float ccl_restrict_ptr pixel_buffer; + int2 pixel; + + + + + /* === Shift feature passes to have mean 0. === */ + float feature_means[DENOISE_FEATURES]; + math_vector_zero(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride); + math_vector_add(feature_means, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x)); + math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES); + + /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. 
 === */ + float feature_scale[DENOISE_FEATURES]; + math_vector_zero(feature_scale, DENOISE_FEATURES); + + FOR_PIXEL_WINDOW { + filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_max(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + filter_calculate_scale(feature_scale); + + + + /* === Generate the feature transformation. === + * This transformation maps the DENOISE_FEATURES-dimensional feature space to a reduced feature (r-feature) space + * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ + float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_zero(feature_matrix, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul(features, feature_scale, DENOISE_FEATURES); + math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f); + } END_FOR_PIXEL_WINDOW + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride); + *rank = 0; + if(pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + } + } + else { + for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride); + + /* Bake the feature scaling into the transformation matrix. 
*/ + for(int i = 0; i < DENOISE_FEATURES; i++) { + for(int j = 0; j < (*rank); j++) { + transform[i*DENOISE_FEATURES + j] *= feature_scale[i]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h new file mode 100644 index 00000000000..846f3ab3afa --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -0,0 +1,110 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, + int x, int y, int4 rect, + int pass_stride, + float *transform, int *rank, + int radius, float pca_threshold) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + + __m128 features[DENOISE_FEATURES]; + float ccl_restrict_ptr pixel_buffer; + int2 pixel; + + int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + + __m128 feature_means[DENOISE_FEATURES]; + math_vector_zero_sse(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride); + math_vector_add_sse(feature_means, DENOISE_FEATURES, features); + } END_FOR_PIXEL_WINDOW_SSE + + __m128 pixel_scale = _mm_set1_ps(1.0f / ((high.y - low.y) * (high.x - low.x))); + for(int i = 0; i < DENOISE_FEATURES; i++) { + feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale); + } + + __m128 feature_scale[DENOISE_FEATURES]; + math_vector_zero_sse(feature_scale, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); + math_vector_max_sse(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW_SSE + + filter_calculate_scale_sse(feature_scale); + + __m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale); + math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f)); + } END_FOR_PIXEL_WINDOW_SSE + + float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_hsum(feature_matrix, DENOISE_FEATURES, 
feature_matrix_sse); + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); + + *rank = 0; + if(pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + /* Bake the feature scaling into the transformation matrix. */ + for(int j = 0; j < DENOISE_FEATURES; j++) { + transform[(*rank)*DENOISE_FEATURES + j] *= _mm_cvtss_f32(feature_scale[j]); + } + } + } + else { + for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + /* Bake the feature scaling into the transformation matrix. */ + for(int j = 0; j < DENOISE_FEATURES; j++) { + transform[(*rank)*DENOISE_FEATURES + j] *= _mm_cvtss_f32(feature_scale[j]); + } + } + } + + math_matrix_transpose(transform, DENOISE_FEATURES, 1); + + /* Bake the feature scaling into the transformation matrix. 
*/ + for(int i = 0; i < DENOISE_FEATURES; i++) { + math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index 47778553b94..105aee8da15 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ b/intern/cycles/kernel/geom/geom_triangle.h @@ -76,7 +76,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3 /* Interpolate smooth vertex normal from vertices */ -ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v) +ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v) { /* load triangle vertices */ const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); @@ -84,7 +84,9 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y)); float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z)); - return normalize((1.0f - u - v)*n2 + u*n0 + v*n1); + float3 N = safe_normalize((1.0f - u - v)*n2 + u*n0 + v*n1); + + return is_zero(N)? 
Ng: N; } /* Ray differentials on triangle */ diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index 06c0fb2fbca..84a988f1dbc 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -50,30 +50,20 @@ void kernel_tex_copy(KernelGlobals *kg, #define KERNEL_ARCH cpu #include "kernel/kernels/cpu/kernel_cpu.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# define KERNEL_ARCH cpu_sse2 -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# define KERNEL_ARCH cpu_sse3 -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# define KERNEL_ARCH cpu_sse41 -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# define KERNEL_ARCH cpu_avx -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# define KERNEL_ARCH cpu_avx2 -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu.h" CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h index 823d30dde78..06728415c15 100644 --- a/intern/cycles/kernel/kernel_accumulate.h +++ b/intern/cycles/kernel/kernel_accumulate.h @@ -222,6 +222,12 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass) L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f); L->shadow_color = 
make_float3(0.0f, 0.0f, 0.0f); #endif + +#ifdef __DENOISING_FEATURES__ + L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f); + L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f); + L->denoising_depth = 0.0f; +#endif /* __DENOISING_FEATURES__ */ } ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput, @@ -277,15 +283,15 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro } ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, + ccl_addr_space PathState *state, float3 throughput, float3 alpha, float3 bsdf, - float3 ao, - int bounce) + float3 ao) { #ifdef __PASSES__ if(L->use_light_pass) { - if(bounce == 0) { + if(state->bounce == 0) { /* directly visible lighting */ L->direct_diffuse += throughput*bsdf*ao; L->ao += alpha*throughput*ao; @@ -302,31 +308,43 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, } #ifdef __SHADOW_TRICKS__ - float3 light = throughput * bsdf; - L->path_total += light; - L->path_total_shaded += ao * light; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + float3 light = throughput * bsdf; + L->path_total += light; + L->path_total_shaded += ao * light; + } #endif } ccl_device_inline void path_radiance_accum_total_ao( PathRadiance *L, + ccl_addr_space PathState *state, float3 throughput, float3 bsdf) { #ifdef __SHADOW_TRICKS__ - L->path_total += throughput * bsdf; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * bsdf; + } #else (void) L; + (void) state; (void) throughput; (void) bsdf; #endif } -ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp) +ccl_device_inline void path_radiance_accum_light(PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput, + BsdfEval *bsdf_eval, + float3 shadow, + float shadow_fac, + bool is_lamp) { #ifdef __PASSES__ if(L->use_light_pass) { - if(bounce == 0) 
{ + if(state->bounce == 0) { /* directly visible lighting */ L->direct_diffuse += throughput*bsdf_eval->diffuse*shadow; L->direct_glossy += throughput*bsdf_eval->glossy*shadow; @@ -352,21 +370,27 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through } #ifdef __SHADOW_TRICKS__ - float3 light = throughput * bsdf_eval->sum_no_mis; - L->path_total += light; - L->path_total_shaded += shadow * light; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + float3 light = throughput * bsdf_eval->sum_no_mis; + L->path_total += light; + L->path_total_shaded += shadow * light; + } #endif } ccl_device_inline void path_radiance_accum_total_light( PathRadiance *L, + ccl_addr_space PathState *state, float3 throughput, const BsdfEval *bsdf_eval) { #ifdef __SHADOW_TRICKS__ - L->path_total += throughput * bsdf_eval->sum_no_mis; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * bsdf_eval->sum_no_mis; + } #else (void) L; + (void) state; (void) throughput; (void) bsdf_eval; #endif @@ -393,11 +417,17 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L, } #ifdef __SHADOW_TRICKS__ - L->path_total += throughput * value; - if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) { - L->path_total_shaded += throughput * value; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * value; + if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) { + L->path_total_shaded += throughput * value; + } } #endif + +#ifdef __DENOISING_FEATURES__ + L->denoising_albedo += state->denoising_feature_weight * value; +#endif /* __DENOISING_FEATURES__ */ } ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L) @@ -555,6 +585,38 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi return L_sum; } +ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadiance *L, float3 *noisy, float3 *clean) +{ +#ifdef __PASSES__ + kernel_assert(L->use_light_pass); + + *clean = 
L->emission + L->background; + *noisy = L->direct_scatter + L->indirect_scatter; + +# define ADD_COMPONENT(flag, component) \ + if(kernel_data.film.denoising_flags & flag) \ + *clean += component; \ + else \ + *noisy += component; + + ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse); + ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse); + ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy); + ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy); + ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission); + ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission); + ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR, L->direct_subsurface); + ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND, L->indirect_subsurface); +# undef ADD_COMPONENT +#else + *noisy = L->emission; + *clean = make_float3(0.0f, 0.0f, 0.0f); +#endif + + *noisy = ensure_finite3(*noisy); + *clean = ensure_finite3(*clean); +} + ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples) { float fac = 1.0f/num_samples; @@ -595,12 +657,12 @@ ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L) /* Calculate final light sum and transparency for shadow catcher object. 
*/ ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg, const PathRadiance *L, - ccl_addr_space float* L_transparent) + float* alpha) { const float shadow = path_radiance_sum_shadow(L); float3 L_sum; if(kernel_data.background.transparent) { - *L_transparent = shadow; + *alpha = 1.0f-shadow; L_sum = make_float3(0.0f, 0.0f, 0.0f); } else { diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index 21da180bb8e..7595e74e2d5 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -42,6 +42,8 @@ #include "util/util_types.h" #include "util/util_texture.h" +#define ccl_restrict_ptr const * __restrict + #define ccl_addr_space #define ccl_local_id(d) 0 diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index c375d17a95f..80d7401fbcf 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -55,6 +55,10 @@ #define ccl_restrict __restrict__ #define ccl_align(n) __align__(n) +#define ccl_restrict_ptr const * __restrict__ +#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH) + + /* No assert supported for CUDA */ #define kernel_assert(cond) diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index c2263ac0d49..15cf4b81b21 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -50,6 +50,8 @@ # define ccl_addr_space #endif +#define ccl_restrict_ptr const * __restrict__ + #define ccl_local_id(d) get_local_id(d) #define ccl_global_id(d) get_global_id(d) diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index a2909cec1a1..9baa9d54957 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -102,7 +102,7 @@ ccl_device_inline float area_light_sample(float3 P, float cu = 1.0f / 
sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f); cu = clamp(cu, -1.0f, 1.0f); /* Compute xu. */ - float xu = -(cu * z0) / sqrtf(1.0f - cu * cu); + float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f); xu = clamp(xu, x0, x1); /* Compute yv. */ float z0sq = z0 * z0; diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index ed523696571..8ab4c724829 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -60,6 +60,135 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa #endif /* __SPLIT_KERNEL__ */ } +#ifdef __DENOISING_FEATURES__ +ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, int sample, float value) +{ + kernel_write_pass_float(buffer, sample, value); + + /* The online one-pass variance update that's used for the megakernel can't easily be implemented + * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */ +# ifdef __SPLIT_KERNEL__ + kernel_write_pass_float(buffer+1, sample, value*value); +# else + if(sample == 0) { + kernel_write_pass_float(buffer+1, sample, 0.0f); + } + else { + float new_mean = buffer[0] * (1.0f / (sample + 1)); + float old_mean = (buffer[0] - value) * (1.0f / sample); + kernel_write_pass_float(buffer+1, sample, (value - new_mean) * (value - old_mean)); + } +# endif +} + +# if defined(__SPLIT_KERNEL__) +# define kernel_write_pass_float3_unaligned kernel_write_pass_float3 +# else +ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, int sample, float3 value) +{ + buffer[0] = (sample == 0)? value.x: buffer[0] + value.x; + buffer[1] = (sample == 0)? value.y: buffer[1] + value.y; + buffer[2] = (sample == 0)? 
value.z: buffer[2] + value.z; +} +# endif + +ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, int sample, float3 value) +{ + kernel_write_pass_float3_unaligned(buffer, sample, value); +# ifdef __SPLIT_KERNEL__ + kernel_write_pass_float3_unaligned(buffer+3, sample, value*value); +# else + if(sample == 0) { + kernel_write_pass_float3_unaligned(buffer+3, sample, make_float3(0.0f, 0.0f, 0.0f)); + } + else { + float3 sum = make_float3(buffer[0], buffer[1], buffer[2]); + float3 new_mean = sum * (1.0f / (sample + 1)); + float3 old_mean = (sum - value) * (1.0f / sample); + kernel_write_pass_float3_unaligned(buffer+3, sample, (value - new_mean) * (value - old_mean)); + } +# endif +} + +ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer, + int sample, float path_total, float path_total_shaded) +{ + if(kernel_data.film.pass_denoising_data == 0) + return; + + buffer += (sample & 1)? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A; + + path_total = ensure_finite(path_total); + path_total_shaded = ensure_finite(path_total_shaded); + + kernel_write_pass_float(buffer, sample/2, path_total); + kernel_write_pass_float(buffer+1, sample/2, path_total_shaded); + + float value = path_total_shaded / max(path_total, 1e-7f); +# ifdef __SPLIT_KERNEL__ + kernel_write_pass_float(buffer+2, sample/2, value*value); +# else + if(sample < 2) { + kernel_write_pass_float(buffer+2, sample/2, 0.0f); + } + else { + float old_value = (buffer[1] - path_total_shaded) / max(buffer[0] - path_total, 1e-7f); + float new_value = buffer[1] / max(buffer[0], 1e-7f); + kernel_write_pass_float(buffer+2, sample, (value - new_value) * (value - old_value)); + } +# endif +} +#endif /* __DENOISING_FEATURES__ */ + +ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg, + ShaderData *sd, + ccl_global PathState *state, + PathRadiance *L) +{ +#ifdef __DENOISING_FEATURES__ + if(state->denoising_feature_weight == 0.0f) { + 
return; + } + + L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length); + + float3 normal = make_float3(0.0f, 0.0f, 0.0f); + float3 albedo = make_float3(0.0f, 0.0f, 0.0f); + float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + + if(!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) + continue; + + /* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */ + normal += sc->N * sc->sample_weight; + sum_weight += sc->sample_weight; + if(!bsdf_is_specular_like(sc)) { + albedo += sc->weight; + sum_nonspecular_weight += sc->sample_weight; + } + } + + /* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. */ + if((sum_weight == 0.0f) || (sum_nonspecular_weight*4.0f > sum_weight)) { + if(sum_weight != 0.0f) { + normal /= sum_weight; + } + L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal); + L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo); + + state->denoising_feature_weight = 0.0f; + } +#else + (void) kg; + (void) sd; + (void) state; + (void) L; +#endif /* __DENOISING_FEATURES__ */ +} + ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput) { @@ -199,5 +328,79 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f #endif } +ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *buffer, + int sample, PathRadiance *L, float alpha, bool is_shadow_catcher) +{ + if(L) { + float3 L_sum; +#ifdef __SHADOW_TRICKS__ + if(is_shadow_catcher) { + L_sum = path_radiance_sum_shadowcatcher(kg, L, &alpha); + } + else +#endif /* __SHADOW_TRICKS__ */ + { + L_sum = path_radiance_clamp_and_sum(kg, L); + } + + kernel_write_pass_float4(buffer, sample, make_float4(L_sum.x, 
L_sum.y, L_sum.z, alpha)); + + kernel_write_light_passes(kg, buffer, L, sample); + +#ifdef __DENOISING_FEATURES__ + if(kernel_data.film.pass_denoising_data) { +# ifdef __SHADOW_TRICKS__ + kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, average(L->path_total), average(L->path_total_shaded)); +# else + kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f); +# endif + if(kernel_data.film.pass_denoising_clean) { + float3 noisy, clean; + path_radiance_split_denoising(kg, L, &noisy, &clean); + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, + sample, noisy); + kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean, + sample, clean); + } + else { + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, + sample, L_sum); + } + + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, + sample, L->denoising_normal); + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, + sample, L->denoising_albedo); + kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH, + sample, L->denoising_depth); + } +#endif /* __DENOISING_FEATURES__ */ + } + else { + kernel_write_pass_float4(buffer, sample, make_float4(0.0f, 0.0f, 0.0f, 0.0f)); + +#ifdef __DENOISING_FEATURES__ + if(kernel_data.film.pass_denoising_data) { + kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f); + + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, + sample, make_float3(0.0f, 0.0f, 0.0f)); + + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, + sample, make_float3(0.0f, 0.0f, 0.0f)); + 
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, + sample, make_float3(0.0f, 0.0f, 0.0f)); + kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH, + sample, 0.0f); + + if(kernel_data.film.pass_denoising_clean) { + kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean, + sample, make_float3(0.0f, 0.0f, 0.0f)); + } + } +#endif /* __DENOISING_FEATURES__ */ + } +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index 58da141aed3..0d31ae32aa6 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -90,10 +90,10 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, light_ray.dD = differential3_zero(); if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { - path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce); + path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow); } else { - path_radiance_accum_total_ao(L, throughput, ao_bsdf); + path_radiance_accum_total_ao(L, state, throughput, ao_bsdf); } } } @@ -366,6 +366,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, throughput /= probability; } + kernel_update_denoising_features(kg, sd, state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) { @@ -427,18 +429,19 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, } -ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, - RNG *rng, - int sample, - Ray ray, - ccl_global float *buffer) +ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, + RNG *rng, + int sample, + Ray ray, + ccl_global float *buffer, + PathRadiance *L, + bool *is_shadow_catcher) { /* initialize */ - PathRadiance L; float3 throughput = make_float3(1.0f, 1.0f, 1.0f); float L_transparent = 0.0f; - 
path_radiance_init(&L, kernel_data.film.use_light_pass); + path_radiance_init(L, kernel_data.film.use_light_pass); /* shader data memory used for both volumes and surfaces, saves stack space */ ShaderData sd; @@ -517,7 +520,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, float3 emission; if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission)) - path_radiance_accum_emission(&L, throughput, emission, state.bounce); + path_radiance_accum_emission(L, throughput, emission, state.bounce); } #endif /* __LAMP_MIS__ */ @@ -549,7 +552,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, /* emission */ if(volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); + path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce); /* scattering */ VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; @@ -559,7 +562,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, /* direct light sampling */ kernel_branched_path_volume_connect_light(kg, rng, &sd, - &emission_sd, throughput, &state, &L, all, + &emission_sd, throughput, &state, L, all, &volume_ray, &volume_segment); /* indirect sample. 
if we use distance sampling and take just @@ -577,7 +580,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, kernel_volume_decoupled_free(kg, &volume_segment); if(result == VOLUME_PATH_SCATTERED) { - if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) + if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) continue; else break; @@ -591,15 +594,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, { /* integrate along volume segment with distance sampling */ VolumeIntegrateResult result = kernel_volume_integrate( - kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous); + kg, &state, &sd, &volume_ray, L, &throughput, rng, heterogeneous); # ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* direct lighting */ - kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L); + kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L); /* indirect light bounce */ - if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) + if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) continue; else break; @@ -623,7 +626,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #ifdef __BACKGROUND__ /* sample background shader */ float3 L_background = indirect_background(kg, &emission_sd, &state, &ray); - path_radiance_accum_background(&L, &state, throughput, L_background); + path_radiance_accum_background(L, &state, throughput, L_background); #endif /* __BACKGROUND__ */ break; @@ -640,10 +643,10 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #ifdef __SHADOW_TRICKS__ if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) { if(state.flag & PATH_RAY_CAMERA) { - state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY); + state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO); state.catcher_object = 
sd.object; if(!kernel_data.background.transparent) { - L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray); + L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray); } } } @@ -677,7 +680,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #endif /* __HOLDOUT__ */ /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput); + kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput); /* blurring of bsdf after bounces, for rays that have a small likelihood * of following this particular path (diffuse, rough glossy) */ @@ -695,7 +698,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, if(sd.flag & SD_EMISSION) { /* todo: is isect.t wrong here for transparent surfaces? */ float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); - path_radiance_accum_emission(&L, throughput, emission, state.bounce); + path_radiance_accum_emission(L, throughput, emission, state.bounce); } #endif /* __EMISSION__ */ @@ -715,10 +718,12 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, throughput /= probability; } + kernel_update_denoising_features(kg, &sd, &state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd)); + kernel_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd)); } #endif /* __AO__ */ @@ -729,7 +734,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, if(kernel_path_subsurface_scatter(kg, &sd, &emission_sd, - &L, + L, &state, rng, &ray, @@ -742,15 +747,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #endif /* __SUBSURFACE__ */ /* direct lighting */ - kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L); + 
kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L); /* compute direct lighting and next bounce */ - if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) + if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) break; } #ifdef __SUBSURFACE__ - kernel_path_subsurface_accum_indirect(&ss_indirect, &L); + kernel_path_subsurface_accum_indirect(&ss_indirect, L); /* Trace indirect subsurface rays by restarting the loop. this uses less * stack memory than invoking kernel_path_indirect. @@ -760,7 +765,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, &ss_indirect, &state, &ray, - &L, + L, &throughput); } else { @@ -769,24 +774,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, } #endif /* __SUBSURFACE__ */ - float3 L_sum; #ifdef __SHADOW_TRICKS__ - if(state.flag & PATH_RAY_SHADOW_CATCHER) { - L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent); - } - else + *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER); #endif /* __SHADOW_TRICKS__ */ - { - L_sum = path_radiance_clamp_and_sum(kg, &L); - } - - kernel_write_light_passes(kg, buffer, &L, sample); #ifdef __KERNEL_DEBUG__ kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); #endif /* __KERNEL_DEBUG__ */ - return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); + return 1.0f - L_transparent; } ccl_device void kernel_path_trace(KernelGlobals *kg, @@ -807,15 +803,16 @@ ccl_device void kernel_path_trace(KernelGlobals *kg, kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray); /* integrate */ - float4 L; - - if(ray.t != 0.0f) - L = kernel_path_integrate(kg, &rng, sample, ray, buffer); - else - L = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + PathRadiance L; + bool is_shadow_catcher; - /* accumulate result in output buffer */ - kernel_write_pass_float4(buffer, sample, L); + if(ray.t != 0.0f) { + float alpha = kernel_path_integrate(kg, &rng, sample, ray, buffer, 
&L, &is_shadow_catcher); + kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher); + } + else { + kernel_write_result(kg, buffer, sample, NULL, 0.0f, false); + } path_rng_end(kg, rng_state, rng); } diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h index ddcb57161ea..10816d3e5d1 100644 --- a/intern/cycles/kernel/kernel_path_branched.h +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -56,10 +56,10 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, light_ray.dD = differential3_zero(); if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { - path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce); + path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow); } else { - path_radiance_accum_total_ao(L, throughput*num_samples_inv, ao_bsdf); + path_radiance_accum_total_ao(L, state, throughput*num_samples_inv, ao_bsdf); } } } @@ -72,14 +72,32 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd, float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L) { + float sum_sample_weight = 0.0f; +#ifdef __DENOISING_FEATURES__ + if(state->denoising_feature_weight > 0.0f) { + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + /* transparency is not handled here, but in outer loop */ + if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { + continue; + } + + sum_sample_weight += sc->sample_weight; + } + } + else { + sum_sample_weight = 1.0f; + } +#endif /* __DENOISING_FEATURES__ */ + for(int i = 0; i < sd->num_closure; i++) { const ShaderClosure *sc = &sd->closure[i]; - if(!CLOSURE_IS_BSDF(sc->type)) - continue; /* transparency is not handled here, but in outer loop */ - if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) + 
if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { continue; + } int num_samples; @@ -111,7 +129,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba &tp, &ps, L, - &bsdf_ray)) + &bsdf_ray, + sum_sample_weight)) { continue; } @@ -243,14 +262,19 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, } #endif /* __SUBSURFACE__ */ -ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer) +ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, + RNG *rng, + int sample, + Ray ray, + ccl_global float *buffer, + PathRadiance *L, + bool *is_shadow_catcher) { /* initialize */ - PathRadiance L; float3 throughput = make_float3(1.0f, 1.0f, 1.0f); float L_transparent = 0.0f; - path_radiance_init(&L, kernel_data.film.use_light_pass); + path_radiance_init(L, kernel_data.film.use_light_pass); /* shader data memory used for both volumes and surfaces, saves stack space */ ShaderData sd; @@ -330,7 +354,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in int all = kernel_data.integrator.sample_all_lights_direct; kernel_branched_path_volume_connect_light(kg, rng, &sd, - &emission_sd, throughput, &state, &L, all, + &emission_sd, throughput, &state, L, all, &volume_ray, &volume_segment); /* indirect light sampling */ @@ -362,7 +386,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in &sd, &tp, &ps, - &L, + L, &pray)) { kernel_path_indirect(kg, @@ -373,19 +397,19 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in tp*num_samples_inv, num_samples, &ps, - &L); + L); /* for render passes, sum and reset indirect light pass variables * for the next samples */ - path_radiance_sum_indirect(&L); - path_radiance_reset_indirect(&L); + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); } } } /* emission and transmittance */ 
if(volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); + path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce); throughput *= volume_segment.accum_transmittance; /* free cached steps */ @@ -407,20 +431,20 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in path_state_branch(&ps, j, num_samples); VolumeIntegrateResult result = kernel_volume_integrate( - kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous); + kg, &ps, &sd, &volume_ray, L, &tp, rng, heterogeneous); #ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* todo: support equiangular, MIS and all light sampling. * alternatively get decoupled ray marching working on the GPU */ - kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L); + kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, L); if(kernel_path_volume_bounce(kg, rng, &sd, &tp, &ps, - &L, + L, &pray)) { kernel_path_indirect(kg, @@ -431,12 +455,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in tp, num_samples, &ps, - &L); + L); /* for render passes, sum and reset indirect light pass variables * for the next samples */ - path_radiance_sum_indirect(&L); - path_radiance_reset_indirect(&L); + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); } } #endif /* __VOLUME_SCATTER__ */ @@ -462,7 +486,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #ifdef __BACKGROUND__ /* sample background shader */ float3 L_background = indirect_background(kg, &emission_sd, &state, &ray); - path_radiance_accum_background(&L, &state, throughput, L_background); + path_radiance_accum_background(L, &state, throughput, L_background); #endif /* __BACKGROUND__ */ break; @@ -476,10 +500,10 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #ifdef 
__SHADOW_TRICKS__ if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) { if(state.flag & PATH_RAY_CAMERA) { - state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY); + state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO); state.catcher_object = sd.object; if(!kernel_data.background.transparent) { - L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray); + L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray); } } } @@ -509,13 +533,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #endif /* __HOLDOUT__ */ /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput); + kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput); #ifdef __EMISSION__ /* emission */ if(sd.flag & SD_EMISSION) { float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); - path_radiance_accum_emission(&L, throughput, emission, state.bounce); + path_radiance_accum_emission(L, throughput, emission, state.bounce); } #endif /* __EMISSION__ */ @@ -539,10 +563,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in } } + kernel_update_denoising_features(kg, &sd, &state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput); + kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput); } #endif /* __AO__ */ @@ -550,7 +576,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in /* bssrdf scatter to a different location on the same object */ if(sd.flag & SD_BSSRDF) { kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd, - &L, &state, rng, &ray, throughput); + L, &state, rng, &ray, throughput); } #endif /* __SUBSURFACE__ */ @@ 
-563,13 +589,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in int all = (kernel_data.integrator.sample_all_lights_direct) || (state.flag & PATH_RAY_SHADOW_CATCHER); kernel_branched_path_surface_connect_light(kg, rng, - &sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all); + &sd, &emission_sd, &hit_state, throughput, 1.0f, L, all); } #endif /* __EMISSION__ */ /* indirect light */ kernel_branched_path_surface_indirect_light(kg, rng, - &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L); + &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L); /* continue in case of transparency */ throughput *= shader_bsdf_transparency(kg, &sd); @@ -598,24 +624,15 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #endif /* __VOLUME__ */ } - float3 L_sum; #ifdef __SHADOW_TRICKS__ - if(state.flag & PATH_RAY_SHADOW_CATCHER) { - L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent); - } - else + *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER); #endif /* __SHADOW_TRICKS__ */ - { - L_sum = path_radiance_clamp_and_sum(kg, &L); - } - - kernel_write_light_passes(kg, buffer, &L, sample); #ifdef __KERNEL_DEBUG__ kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); #endif /* __KERNEL_DEBUG__ */ - return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); + return 1.0f - L_transparent; } ccl_device void kernel_branched_path_trace(KernelGlobals *kg, @@ -636,15 +653,16 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg, kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray); /* integrate */ - float4 L; - - if(ray.t != 0.0f) - L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer); - else - L = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + PathRadiance L; + bool is_shadow_catcher; - /* accumulate result in output buffer */ - kernel_write_pass_float4(buffer, sample, L); + if(ray.t != 0.0f) { + float alpha = 
kernel_branched_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher); + kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher); + } + else { + kernel_write_result(kg, buffer, sample, NULL, 0.0f, false); + } path_rng_end(kg, rng_state, rng); } @@ -654,4 +672,3 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg, #endif /* __BRANCHED_PATH__ */ CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h index c0cd2a63120..0fa77d9e8bd 100644 --- a/intern/cycles/kernel/kernel_path_state.h +++ b/intern/cycles/kernel/kernel_path_state.h @@ -35,6 +35,16 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, state->transmission_bounce = 0; state->transparent_bounce = 0; +#ifdef __DENOISING_FEATURES__ + if(kernel_data.film.pass_denoising_data) { + state->flag |= PATH_RAY_STORE_SHADOW_INFO; + state->denoising_feature_weight = 1.0f; + } + else { + state->denoising_feature_weight = 0.0f; + } +#endif /* __DENOISING_FEATURES__ */ + state->min_ray_pdf = FLT_MAX; state->ray_pdf = 0.0f; #ifdef __LAMP_MIS__ @@ -128,6 +138,10 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta /* random number generator next bounce */ state->rng_offset += PRNG_BOUNCE_NUM; + + if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) { + state->flag &= ~PATH_RAY_STORE_SHADOW_INFO; + } } ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state) diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h index bd4ba775b4d..e676ea0f3ae 100644 --- a/intern/cycles/kernel/kernel_path_surface.h +++ b/intern/cycles/kernel/kernel_path_surface.h @@ -70,10 +70,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_inv, 
&L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } else { - path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light); + path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light); } } } @@ -107,10 +107,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } else { - path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light); + path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light); } } } @@ -133,10 +133,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp); } else { - path_radiance_accum_total_light(L, throughput*num_samples_adjust, &L_light); + path_radiance_accum_total_light(L, state, throughput*num_samples_adjust, &L_light); } } } @@ -155,7 +155,8 @@ ccl_device bool kernel_branched_path_surface_bounce( ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, PathRadiance *L, - ccl_addr_space Ray *ray) + ccl_addr_space Ray *ray, + float sum_sample_weight) { /* sample BSDF */ float bsdf_pdf; @@ -175,6 +176,10 @@ ccl_device bool kernel_branched_path_surface_bounce( /* modify throughput */ path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); +#ifdef 
__DENOISING_FEATURES__ + state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples); +#endif + /* modify path state */ path_state_next(kg, state, label); @@ -257,10 +262,10 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); } else { - path_radiance_accum_total_light(L, throughput, &L_light); + path_radiance_accum_total_light(L, state, throughput, &L_light); } } } diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h index 371f2c1c7cb..dcedf51e479 100644 --- a/intern/cycles/kernel/kernel_path_volume.h +++ b/intern/cycles/kernel/kernel_path_volume.h @@ -55,7 +55,7 @@ ccl_device_inline void kernel_path_volume_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); } } } @@ -184,7 +184,7 @@ ccl_device void kernel_branched_path_volume_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } } } @@ -233,7 +233,7 @@ ccl_device void kernel_branched_path_volume_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, 
num_samples_inv, is_lamp); } } } @@ -271,7 +271,7 @@ ccl_device void kernel_branched_path_volume_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp); } } } diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h index 9a2b0884a7e..cbb2442d1dc 100644 --- a/intern/cycles/kernel/kernel_projection.h +++ b/intern/cycles/kernel/kernel_projection.h @@ -57,6 +57,9 @@ ccl_device float3 spherical_to_direction(float theta, float phi) ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range) { + if(is_zero(dir)) + return make_float2(0.0f, 0.0f); + float u = (atan2f(dir.y, dir.x) - range.y) / range.x; float v = (acosf(dir.z / len(dir)) - range.w) / range.z; diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index 8c0c5e90a3e..c66f52255f0 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -99,7 +99,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, /* smooth normal */ if(sd->shader & SHADER_SMOOTH_NORMAL) - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); #ifdef __DPDU__ /* dPdu/dPdv */ @@ -186,7 +186,7 @@ void shader_setup_from_subsurface( sd->N = Ng; if(sd->shader & SHADER_SMOOTH_NORMAL) - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); # ifdef __DPDU__ /* dPdu/dPdv */ @@ -300,7 +300,7 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, if(sd->type & PRIMITIVE_TRIANGLE) { /* smooth normal */ if(sd->shader & SHADER_SMOOTH_NORMAL) { - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, 
sd->u, sd->v); #ifdef __INSTANCING__ if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 9b354457b91..dd1fa1b82f7 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -173,6 +173,8 @@ CCL_NAMESPACE_BEGIN #define __PATCH_EVAL__ #define __SHADOW_TRICKS__ +#define __DENOISING_FEATURES__ + #ifdef __KERNEL_SHADING__ # define __SVM__ # define __EMISSION__ @@ -314,31 +316,32 @@ enum SamplingPattern { /* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */ enum PathRayFlag { - PATH_RAY_CAMERA = 1, - PATH_RAY_REFLECT = 2, - PATH_RAY_TRANSMIT = 4, - PATH_RAY_DIFFUSE = 8, - PATH_RAY_GLOSSY = 16, - PATH_RAY_SINGULAR = 32, - PATH_RAY_TRANSPARENT = 64, - - PATH_RAY_SHADOW_OPAQUE = 128, - PATH_RAY_SHADOW_TRANSPARENT = 256, + PATH_RAY_CAMERA = (1 << 0), + PATH_RAY_REFLECT = (1 << 1), + PATH_RAY_TRANSMIT = (1 << 2), + PATH_RAY_DIFFUSE = (1 << 3), + PATH_RAY_GLOSSY = (1 << 4), + PATH_RAY_SINGULAR = (1 << 5), + PATH_RAY_TRANSPARENT = (1 << 6), + + PATH_RAY_SHADOW_OPAQUE = (1 << 7), + PATH_RAY_SHADOW_TRANSPARENT = (1 << 8), PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT), - PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */ - PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */ + PATH_RAY_CURVE = (1 << 9), /* visibility flag to define curve segments */ + PATH_RAY_VOLUME_SCATTER = (1 << 10), /* volume scattering */ /* Special flag to tag unaligned BVH nodes. 
*/ - PATH_RAY_NODE_UNALIGNED = 2048, + PATH_RAY_NODE_UNALIGNED = (1 << 11), - PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048), + PATH_RAY_ALL_VISIBILITY = ((1 << 12)-1), - PATH_RAY_MIS_SKIP = 4096, - PATH_RAY_DIFFUSE_ANCESTOR = 8192, - PATH_RAY_SINGLE_PASS_DONE = 16384, - PATH_RAY_SHADOW_CATCHER = 32768, - PATH_RAY_SHADOW_CATCHER_ONLY = 65536, + PATH_RAY_MIS_SKIP = (1 << 12), + PATH_RAY_DIFFUSE_ANCESTOR = (1 << 13), + PATH_RAY_SINGLE_PASS_DONE = (1 << 14), + PATH_RAY_SHADOW_CATCHER = (1 << 15), + PATH_RAY_SHADOW_CATCHER_ONLY = (1 << 16), + PATH_RAY_STORE_SHADOW_INFO = (1 << 17), }; /* Closure Label */ @@ -394,6 +397,22 @@ typedef enum PassType { #define PASS_ALL (~0) +typedef enum DenoisingPassOffsets { + DENOISING_PASS_NORMAL = 0, + DENOISING_PASS_NORMAL_VAR = 3, + DENOISING_PASS_ALBEDO = 6, + DENOISING_PASS_ALBEDO_VAR = 9, + DENOISING_PASS_DEPTH = 12, + DENOISING_PASS_DEPTH_VAR = 13, + DENOISING_PASS_SHADOW_A = 14, + DENOISING_PASS_SHADOW_B = 17, + DENOISING_PASS_COLOR = 20, + DENOISING_PASS_COLOR_VAR = 23, + + DENOISING_PASS_SIZE_BASE = 26, + DENOISING_PASS_SIZE_CLEAN = 3, +} DenoisingPassOffsets; + typedef enum BakePassFilter { BAKE_FILTER_NONE = 0, BAKE_FILTER_DIRECT = (1 << 0), @@ -427,6 +446,18 @@ typedef enum BakePassFilterCombos { BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE), } BakePassFilterCombos; +typedef enum DenoiseFlag { + DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0), + DENOISING_CLEAN_DIFFUSE_IND = (1 << 1), + DENOISING_CLEAN_GLOSSY_DIR = (1 << 2), + DENOISING_CLEAN_GLOSSY_IND = (1 << 3), + DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4), + DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5), + DENOISING_CLEAN_SUBSURFACE_DIR = (1 << 6), + DENOISING_CLEAN_SUBSURFACE_IND = (1 << 7), + DENOISING_CLEAN_ALL_PASSES = (1 << 8)-1, +} DenoiseFlag; + typedef ccl_addr_space struct PathRadiance { #ifdef __PASSES__ int use_light_pass; @@ -482,6 +513,12 @@ typedef ccl_addr_space struct PathRadiance { /* Color of the 
background on which shadow is alpha-overed. */ float3 shadow_color; #endif + +#ifdef __DENOISING_FEATURES__ + float3 denoising_normal; + float3 denoising_albedo; + float denoising_depth; +#endif /* __DENOISING_FEATURES__ */ } PathRadiance; typedef struct BsdfEval { @@ -724,12 +761,13 @@ typedef struct AttributeDescriptor { #define SHADER_CLOSURE_BASE \ float3 weight; \ ClosureType type; \ - float sample_weight \ + float sample_weight; \ + float3 N typedef ccl_addr_space struct ccl_align(16) ShaderClosure { SHADER_CLOSURE_BASE; - float data[14]; /* pad to 80 bytes */ + float data[10]; /* pad to 80 bytes */ } ShaderClosure; /* Shader Context @@ -960,6 +998,10 @@ typedef struct PathState { int transmission_bounce; int transparent_bounce; +#ifdef __DENOISING_FEATURES__ + float denoising_feature_weight; +#endif /* __DENOISING_FEATURES__ */ + /* multiple importance sampling */ float min_ray_pdf; /* smallest bounce pdf over entire path up to now */ float ray_pdf; /* last bounce pdf */ @@ -1137,6 +1179,11 @@ typedef struct KernelFilm { float mist_inv_depth; float mist_falloff; + int pass_denoising_data; + int pass_denoising_clean; + int denoising_flags; + int pad; + #ifdef __KERNEL_DEBUG__ int pass_bvh_traversed_nodes; int pass_bvh_traversed_instances; diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp new file mode 100644 index 00000000000..2ff1a392dc3 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CPU kernel entry points */ + +/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this + * one with SSE2 intrinsics. + */ +#if defined(__x86_64__) || defined(_M_X64) +# define __KERNEL_SSE2__ +#endif + +/* When building kernel for native machine detect kernel features from the flags + * set by compiler. + */ +#ifdef WITH_KERNEL_NATIVE +# ifdef __SSE2__ +# ifndef __KERNEL_SSE2__ +# define __KERNEL_SSE2__ +# endif +# endif +# ifdef __SSE3__ +# define __KERNEL_SSE3__ +# endif +# ifdef __SSSE3__ +# define __KERNEL_SSSE3__ +# endif +# ifdef __SSE4_1__ +# define __KERNEL_SSE41__ +# endif +# ifdef __AVX__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX__ +# endif +# ifdef __AVX2__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX2__ +# endif +#endif + +/* quiet unused define warnings */ +#if defined(__KERNEL_SSE2__) + /* do nothing */ +#endif + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp new file mode 100644 index 00000000000..4a9e6047ecf --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp new file mode 100644 index 00000000000..c22ec576254 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h new file mode 100644 index 00000000000..10007ee2635 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -0,0 +1,132 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common declaration part of all CPU kernels. 
*/ + +void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, + TilesInfo *tiles, + int x, + int y, + float *unfilteredA, + float *unfilteredB, + float *sampleV, + float *sampleVV, + float *bufferV, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance); + +void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + int x, + int y, + float *mean, + float *variance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance); + +void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y, + float *mean, + float *variance, + float *a, + float *b, + int* prefilter_rect, + int r); + +void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, + int x, + int y, + int storage_ofs, + float *transform, + int *rank, + int* rect, + int pass_stride, + int radius, + float pca_threshold); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, + int dy, + float *weightImage, + float *variance, + float *differenceImage, + int* rect, + int w, + int channel_offset, + float a, + float k_2); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage, + float *outImage, + int* rect, + int w, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage, + float *outImage, + int* rect, + int w, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, + int dy, + float *differenceImage, + float *image, + float *outImage, + float *accumImage, + int* rect, + int w, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, + int dy, + float *differenceImage, + float *buffer, + float *color_pass, + float *variance_pass, + float *transform, + int *rank, + float *XtWX, + float3 *XtWY, + int *rect, + int *filter_rect, + int w, + int h, + int f, + int pass_stride); + +void 
KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage, + float *accumImage, + int* rect, + int w); + +void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, + int y, + int storage_ofs, + int w, + int h, + float *buffer, + int *rank, + float *XtWX, + float3 *XtWY, + int *buffer_params, + int sample); + +#undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h new file mode 100644 index 00000000000..3b71e50ca3b --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -0,0 +1,259 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common implementation part of all CPU kernels. + * + * The idea is that particular .cpp files sets needed optimization flags and + * simply includes this file without worry of copying actual implementation over. 
+ */ + +#include "kernel/kernel_compat_cpu.h" + +#include "kernel/filter/filter_kernel.h" + +#ifdef KERNEL_STUB +# include "util/util_debug.h" +# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!")) +#endif + +CCL_NAMESPACE_BEGIN + + +/* Denoise filter */ + +void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, + TilesInfo *tiles, + int x, + int y, + float *unfilteredA, + float *unfilteredB, + float *sampleVariance, + float *sampleVarianceV, + float *bufferVariance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow); +#else + kernel_filter_divide_shadow(sample, tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + load_int4(prefilter_rect), + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + int x, + int y, + float *mean, float *variance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_get_feature); +#else + kernel_filter_get_feature(sample, tiles, + m_offset, v_offset, + x, y, + mean, variance, + load_int4(prefilter_rect), + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y, + float *mean, + float *variance, + float *a, + float *b, + int* prefilter_rect, + int r) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_combine_halves); +#else + kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, + int x, + int y, + int storage_ofs, + float 
*transform, + int *rank, + int* prefilter_rect, + int pass_stride, + int radius, + float pca_threshold) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_construct_transform); +#else + rank += storage_ofs; + transform += storage_ofs*TRANSFORM_SIZE; + kernel_filter_construct_transform(buffer, + x, y, + load_int4(prefilter_rect), + pass_stride, + transform, + rank, + radius, + pca_threshold); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, + int dy, + float *weightImage, + float *variance, + float *differenceImage, + int *rect, + int w, + int channel_offset, + float a, + float k_2) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference); +#else + kernel_filter_nlm_calc_difference(dx, dy, weightImage, variance, differenceImage, load_int4(rect), w, channel_offset, a, k_2); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage, + float *outImage, + int *rect, + int w, + int f) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur); +#else + kernel_filter_nlm_blur(differenceImage, outImage, load_int4(rect), w, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage, + float *outImage, + int *rect, + int w, + int f) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight); +#else + kernel_filter_nlm_calc_weight(differenceImage, outImage, load_int4(rect), w, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, + int dy, + float *differenceImage, + float *image, + float *outImage, + float *accumImage, + int *rect, + int w, + int f) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output); +#else + kernel_filter_nlm_update_output(dx, dy, differenceImage, image, outImage, accumImage, load_int4(rect), w, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, + int dy, + float *differenceImage, + float *buffer, + float *color_pass, + float 
*variance_pass, + float *transform, + int *rank, + float *XtWX, + float3 *XtWY, + int *rect, + int *filter_rect, + int w, + int h, + int f, + int pass_stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian); +#else + kernel_filter_nlm_construct_gramian(dx, dy, differenceImage, buffer, color_pass, variance_pass, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage, + float *accumImage, + int *rect, + int w) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize); +#else + kernel_filter_nlm_normalize(outImage, accumImage, load_int4(rect), w); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, + int y, + int storage_ofs, + int w, + int h, + float *buffer, + int *rank, + float *XtWX, + float3 *XtWY, + int *buffer_params, + int sample) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_finalize); +#else + XtWX += storage_ofs*XTWX_SIZE; + XtWY += storage_ofs*XTWY_SIZE; + rank += storage_ofs; + kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample); +#endif +} + +#undef KERNEL_STUB +#undef STUB_ASSERT +#undef KERNEL_ARCH + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp new file mode 100644 index 00000000000..f7c9935f1d0 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp @@ -0,0 +1,34 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp new file mode 100644 index 00000000000..070b95a3505 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp @@ -0,0 +1,36 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. 
This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp new file mode 100644 index 00000000000..1a7b2040da1 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp @@ -0,0 +1,37 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp index 2600d977972..a645fb4d8dd 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp @@ -17,21 +17,23 @@ /* Optimized CPU kernel entry points. This file is compiled with AVX * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ - -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -#endif #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_avx -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu_impl.h" 
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp index dba15d037ac..6bbb87727b9 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp @@ -18,21 +18,23 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# define __KERNEL_AVX2__ -#endif - #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_avx2 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h index 39c9a9cf33c..9895080d328 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h @@ -89,6 +89,4 @@ DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update) -void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func)); - #undef KERNEL_ARCH diff 
--git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index 8c05dd1d9ef..b9d82781840 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -57,6 +57,11 @@ # include "kernel/split/kernel_buffer_update.h" #endif +#ifdef KERNEL_STUB +# include "util/util_debug.h" +# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!")) +#endif + CCL_NAMESPACE_BEGIN #ifndef __SPLIT_KERNEL__ @@ -71,7 +76,10 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, int offset, int stride) { -#ifdef __BRANCHED_PATH__ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, path_trace); +#else +# ifdef __BRANCHED_PATH__ if(kernel_data.integrator.branched) { kernel_branched_path_trace(kg, buffer, @@ -82,10 +90,11 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, stride); } else -#endif +# endif { kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); } +#endif /* KERNEL_STUB */ } /* Film */ @@ -98,6 +107,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, int offset, int stride) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, convert_to_byte); +#else kernel_film_convert_to_byte(kg, rgba, buffer, @@ -105,6 +117,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, x, y, offset, stride); +#endif /* KERNEL_STUB */ } void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, @@ -115,6 +128,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, int offset, int stride) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, convert_to_half_float); +#else kernel_film_convert_to_half_float(kg, rgba, buffer, @@ -122,6 +138,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, x, y, offset, stride); +#endif /* KERNEL_STUB */ } /* Shader Evaluate */ @@ -136,9 +153,12 @@ void 
KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, int offset, int sample) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, shader); +#else if(type >= SHADER_EVAL_BAKE) { kernel_assert(output_luma == NULL); -#ifdef __BAKING__ +# ifdef __BAKING__ kernel_bake_evaluate(kg, input, output, @@ -147,7 +167,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, i, offset, sample); -#endif +# endif } else { kernel_shader_evaluate(kg, @@ -158,17 +178,26 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, i, sample); } +#endif /* KERNEL_STUB */ } #else /* __SPLIT_KERNEL__ */ /* Split Kernel Path Tracing */ -#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ +#ifdef KERNEL_STUB +# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + STUB_ASSERT(KERNEL_ARCH, name); \ + } +#else +# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ { \ kernel_##name(kg); \ } +#endif /* KERNEL_STUB */ #define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ @@ -194,42 +223,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) - -void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func)) -{ -#define REGISTER_NAME_STRING(name) #name -#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name) -#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name)); - - REGISTER(path_trace); - REGISTER(convert_to_byte); - REGISTER(convert_to_half_float); - REGISTER(shader); - - REGISTER(data_init); - REGISTER(path_init); - REGISTER(scene_intersect); - REGISTER(lamp_emission); - REGISTER(do_volume); - 
REGISTER(queue_enqueue); - REGISTER(indirect_background); - REGISTER(shader_setup); - REGISTER(shader_sort); - REGISTER(shader_eval); - REGISTER(holdout_emission_blurring_pathtermination_ao); - REGISTER(subsurface_scatter); - REGISTER(direct_lighting); - REGISTER(shadow_blocked_ao); - REGISTER(shadow_blocked_dl); - REGISTER(next_iteration_setup); - REGISTER(indirect_subsurface); - REGISTER(buffer_update); - -#undef REGISTER -#undef REGISTER_EVAL_NAME -#undef REGISTER_NAME_STRING -} - #endif /* __SPLIT_KERNEL__ */ +#undef KERNEL_STUB +#undef STUB_ASSERT +#undef KERNEL_ARCH + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp index 27a746a0799..6ba3425a343 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp @@ -17,22 +17,25 @@ /* Optimized CPU kernel entry points. This file is compiled with AVX * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ - -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -#endif #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_avx -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp index 364d279a189..76b2d77ebb8 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp @@ -18,23 +18,25 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# define __KERNEL_AVX2__ -#endif - #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_avx2 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp index 0afb481296f..b468b6f44c8 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp @@ -18,17 +18,19 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -#endif - #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse2 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp index 13d00813591..3e5792d0b17 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp @@ -18,19 +18,21 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -#endif - #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse3 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp index a4312071edc..3629f21cd29 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp @@ -18,20 +18,22 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -#endif - #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse41 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp index 1acfaa91ac9..57530c88710 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp @@ -18,15 +18,17 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -#endif - #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse2 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp index f7b6a2e21fe..c607753bc4b 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp @@ -18,17 +18,19 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -#endif - #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse3 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp index 1900c6e3012..a278554731c 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp @@ -18,18 +18,20 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -#endif - #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse41 -# include "kernel/kernels/cpu//kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu new file mode 100644 index 00000000000..50f73f9728d --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -0,0 +1,235 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CUDA kernel entry points */ + +#ifdef __CUDA_ARCH__ + +#include "kernel_config.h" + +#include "kernel/kernel_compat_cuda.h" + +#include "kernel/filter/filter_kernel.h" + +/* kernels */ + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_divide_shadow(int sample, + TilesInfo *tiles, + float *unfilteredA, + float *unfilteredB, + float *sampleVariance, + float *sampleVarianceV, + float *bufferVariance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_divide_shadow(sample, + tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_get_feature(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + float *mean, + float *variance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_get_feature(sample, + tiles, + m_offset, v_offset, + x, y, + mean, variance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r) +{ + int x = prefilter_rect.x + 
blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, + float *transform, int *rank, + int4 filter_area, int4 rect, + int radius, float pca_threshold, + int pass_stride) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < filter_area.z && y < filter_area.w) { + int *l_rank = rank + y*filter_area.z + x; + float *l_transform = transform + y*filter_area.z + x; + kernel_filter_construct_transform(buffer, + x + filter_area.x, y + filter_area.y, + rect, pass_stride, + l_transform, l_rank, + radius, pca_threshold, + filter_area.z*filter_area.w, + threadIdx.y*blockDim.x + threadIdx.x); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_calc_difference(int dx, int dy, + float ccl_restrict_ptr weightImage, + float ccl_restrict_ptr varianceImage, + float *differenceImage, + int4 rect, int w, + int channel_offset, + float a, float k_2) { + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) { + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + 
kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) { + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_update_output(int dx, int dy, + float ccl_restrict_ptr differenceImage, + float ccl_restrict_ptr image, + float *outImage, float *accumImage, + int4 rect, int w, + int f) { + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w) { + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_construct_gramian(int dx, int dy, + float ccl_restrict_ptr differenceImage, + float ccl_restrict_ptr buffer, + float *color_pass, + float *variance_pass, + float const* __restrict__ transform, + int *rank, + float *XtWX, + float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, int h, int f, + int pass_stride) { + int x = blockDim.x*blockIdx.x + 
threadIdx.x + max(0, rect.x-filter_rect.x); + int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y); + if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) { + kernel_filter_nlm_construct_gramian(x, y, + dx, dy, + differenceImage, + buffer, + color_pass, variance_pass, + transform, rank, + XtWX, XtWY, + rect, filter_rect, + w, h, f, + pass_stride, + threadIdx.y*blockDim.x + threadIdx.x); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_finalize(int w, int h, + float *buffer, int *rank, + float *XtWX, float3 *XtWY, + int4 filter_area, int4 buffer_params, + int sample) { + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < filter_area.z && y < filter_area.w) { + int storage_ofs = y*filter_area.z+x; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample); + } +} + +#endif + diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl new file mode 100644 index 00000000000..3d82bff9892 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -0,0 +1,262 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* OpenCL kernel entry points */ + +#include "kernel/kernel_compat_opencl.h" + +#include "kernel/filter/filter_kernel.h" + +/* kernels */ + +__kernel void kernel_ocl_filter_divide_shadow(int sample, + ccl_global TilesInfo *tiles, + ccl_global float *unfilteredA, + ccl_global float *unfilteredB, + ccl_global float *sampleVariance, + ccl_global float *sampleVarianceV, + ccl_global float *bufferVariance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + char use_split_variance) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_divide_shadow(sample, + tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); + } +} + +__kernel void kernel_ocl_filter_get_feature(int sample, + ccl_global TilesInfo *tiles, + int m_offset, + int v_offset, + ccl_global float *mean, + ccl_global float *variance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + char use_split_variance) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_get_feature(sample, + tiles, + m_offset, v_offset, + x, y, + mean, variance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); + } +} + +__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean, + ccl_global float *variance, + ccl_global float *a, + ccl_global float *b, + int4 prefilter_rect, + int r) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r); + } +} + +__kernel void 
kernel_ocl_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer, + ccl_global float *transform, + ccl_global int *rank, + int4 filter_area, + int4 rect, + int pass_stride, + int radius, + float pca_threshold) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if(x < filter_area.z && y < filter_area.w) { + ccl_global int *l_rank = rank + y*filter_area.z + x; + ccl_global float *l_transform = transform + y*filter_area.z + x; + kernel_filter_construct_transform(buffer, + x + filter_area.x, y + filter_area.y, + rect, pass_stride, + l_transform, l_rank, + radius, pca_threshold, + filter_area.z*filter_area.w, + get_local_id(1)*get_local_size(0) + get_local_id(0)); + } +} + +__kernel void kernel_ocl_filter_nlm_calc_difference(int dx, + int dy, + ccl_global float ccl_restrict_ptr weightImage, + ccl_global float ccl_restrict_ptr varianceImage, + ccl_global float *differenceImage, + int4 rect, + int w, + int channel_offset, + float a, + float k_2) { + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2); + } +} + +__kernel void kernel_ocl_filter_nlm_blur(ccl_global float ccl_restrict_ptr differenceImage, + ccl_global float *outImage, + int4 rect, + int w, + int f) { + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f); + } +} + +__kernel void kernel_ocl_filter_nlm_calc_weight(ccl_global float ccl_restrict_ptr differenceImage, + ccl_global float *outImage, + int4 rect, + int w, + int f) { + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f); + } +} + +__kernel void kernel_ocl_filter_nlm_update_output(int dx, + int dy, + 
ccl_global float ccl_restrict_ptr differenceImage, + ccl_global float ccl_restrict_ptr image, + ccl_global float *outImage, + ccl_global float *accumImage, + int4 rect, + int w, + int f) { + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f); + } +} + +__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *outImage, + ccl_global float ccl_restrict_ptr accumImage, + int4 rect, + int w) { + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w); + } +} + +__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx, + int dy, + ccl_global float ccl_restrict_ptr differenceImage, + ccl_global float ccl_restrict_ptr buffer, + ccl_global float *color_pass, + ccl_global float *variance_pass, + ccl_global float ccl_restrict_ptr transform, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, + int h, + int f, + int pass_stride) { + int x = get_global_id(0) + max(0, rect.x-filter_rect.x); + int y = get_global_id(1) + max(0, rect.y-filter_rect.y); + if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) { + kernel_filter_nlm_construct_gramian(x, y, + dx, dy, + differenceImage, + buffer, + color_pass, variance_pass, + transform, rank, + XtWX, XtWY, + rect, filter_rect, + w, h, f, + pass_stride, + get_local_id(1)*get_local_size(0) + get_local_id(0)); + } +} + +__kernel void kernel_ocl_filter_finalize(int w, + int h, + ccl_global float *buffer, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 filter_area, + int4 buffer_params, + int sample) { + int x = get_global_id(0); + int y = get_global_id(1); + if(x < filter_area.z && y < filter_area.w) { + int 
storage_ofs = y*filter_area.z+x; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample); + } +} + +__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles, + ccl_global float *buffer_1, + ccl_global float *buffer_2, + ccl_global float *buffer_3, + ccl_global float *buffer_4, + ccl_global float *buffer_5, + ccl_global float *buffer_6, + ccl_global float *buffer_7, + ccl_global float *buffer_8, + ccl_global float *buffer_9) +{ + if((get_global_id(0) == 0) && (get_global_id(1) == 0)) { + tiles->buffers[0] = buffer_1; + tiles->buffers[1] = buffer_2; + tiles->buffers[2] = buffer_3; + tiles->buffers[3] = buffer_4; + tiles->buffers[4] = buffer_5; + tiles->buffers[5] = buffer_6; + tiles->buffers[6] = buffer_7; + tiles->buffers[7] = buffer_8; + tiles->buffers[8] = buffer_9; + } +} diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h index c7bc1b4df0a..dc74a2ada53 100644 --- a/intern/cycles/kernel/split/kernel_branched.h +++ b/intern/cycles/kernel/split/kernel_branched.h @@ -76,6 +76,26 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter( RNG rng = kernel_split_state.rng[ray_index]; PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; float3 throughput = branched_state->throughput; + ccl_global PathState *ps = &kernel_split_state.path_state[ray_index]; + + float sum_sample_weight = 0.0f; +#ifdef __DENOISING_FEATURES__ + if(ps->denoising_feature_weight > 0.0f) { + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + /* transparency is not handled here, but in outer loop */ + if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { + continue; + } + + sum_sample_weight += sc->sample_weight; + } + } + else { + sum_sample_weight = 1.0f; + } +#endif /* __DENOISING_FEATURES__ */ for(int i = 
branched_state->next_closure; i < sd->num_closure; i++) { const ShaderClosure *sc = &sd->closure[i]; @@ -103,7 +123,6 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter( RNG bsdf_rng = cmj_hash(rng, i); for(int j = branched_state->next_sample; j < num_samples; j++) { - ccl_global PathState *ps = &kernel_split_state.path_state[ray_index]; if(reset_path_state) { *ps = branched_state->path_state; } @@ -122,7 +141,8 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter( tp, ps, L, - bsdf_ray)) + bsdf_ray, + sum_sample_weight)) { continue; } diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h index 859c221d976..1f6dce0253c 100644 --- a/intern/cycles/kernel/split/kernel_buffer_update.h +++ b/intern/cycles/kernel/split/kernel_buffer_update.h @@ -111,24 +111,15 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg, buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { - float3 L_sum; -#ifdef __SHADOW_TRICKS__ - if(state->flag & PATH_RAY_SHADOW_CATCHER) { - L_sum = path_radiance_sum_shadowcatcher(kg, L, L_transparent); - } - else -#endif /* __SHADOW_TRICKS__ */ - { - L_sum = path_radiance_clamp_and_sum(kg, L); - } kernel_write_light_passes(kg, buffer, L, sample); #ifdef __KERNEL_DEBUG__ kernel_write_debug_passes(kg, buffer, state, debug_data, sample); #endif - float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent)); /* accumulate result in output buffer */ - kernel_write_pass_float4(buffer, sample, L_rad); + bool is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER); + kernel_write_result(kg, buffer, sample, L, 1.0f - (*L_transparent), is_shadow_catcher); + path_rng_end(kg, rng_state, rng); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); diff --git 
a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h index 87498910d38..670a557f084 100644 --- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h +++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h @@ -125,7 +125,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( #ifdef __SHADOW_TRICKS__ if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) { if(state->flag & PATH_RAY_CAMERA) { - state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY); + state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO); state->catcher_object = sd->object; if(!kernel_data.background.transparent) { PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; @@ -246,6 +246,8 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( kernel_split_state.throughput[ray_index] = throughput/probability; } } + + kernel_update_denoising_features(kg, sd, state, L); } } diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h index 452b6e45a36..386fbbc4d09 100644 --- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h +++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h @@ -89,10 +89,10 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg) &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); } else { - path_radiance_accum_total_light(L, throughput, &L_light); + path_radiance_accum_total_light(L, state, throughput, &L_light); } } diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h index 407f8e784c0..7918c640175 100644 --- 
a/intern/cycles/kernel/svm/svm_closure.h +++ b/intern/cycles/kernel/svm/svm_closure.h @@ -444,6 +444,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); if(bsdf) { + bsdf->N = N; sd->flag |= bsdf_transparent_setup(bsdf); } break; @@ -704,6 +705,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); if(bsdf) { + bsdf->N = N; /* todo: giving a fixed weight here will cause issues when * mixing multiple BSDFS. energy will not be conserved and * the throughput can blow up after multiple bounces. we diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h index c94fa130af7..656357be52d 100644 --- a/intern/cycles/kernel/svm/svm_displace.h +++ b/intern/cycles/kernel/svm/svm_displace.h @@ -63,8 +63,13 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac strength = max(strength, 0.0f); /* compute and output perturbed normal */ - float3 normal_out = normalize(absdet*normal_in - distance*signf(det)*surfgrad); - normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in); + float3 normal_out = safe_normalize(absdet*normal_in - distance*signf(det)*surfgrad); + if(is_zero(normal_out)) { + normal_out = normal_in; + } + else { + normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in); + } if(use_object_space) { object_normal_transform(kg, sd, &normal_out); diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h index 4a09d9f6653..cce4e89e715 100644 --- a/intern/cycles/kernel/svm/svm_geometry.h +++ b/intern/cycles/kernel/svm/svm_geometry.h @@ -37,6 +37,7 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg, #ifdef __UV__ case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break; #endif + default: data = make_float3(0.0f, 0.0f, 
0.0f); } stack_store_float3(stack, out_offset, data); diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index 328ff79223b..8e45dbfa5ff 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -317,8 +317,8 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, floa float3 co = stack_load_float3(stack, co_offset); float2 uv; - co = normalize(co); - + co = safe_normalize(co); + if(projection == 0) uv = direction_to_equirectangular(co); else diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h index cc9b840b13f..d859cae1708 100644 --- a/intern/cycles/kernel/svm/svm_types.h +++ b/intern/cycles/kernel/svm/svm_types.h @@ -402,7 +402,6 @@ typedef enum ClosureType { CLOSURE_BSDF_DIFFUSE_TOON_ID, /* Glossy */ - CLOSURE_BSDF_GLOSSY_ID, CLOSURE_BSDF_REFLECTION_ID, CLOSURE_BSDF_MICROFACET_GGX_ID, CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID, @@ -423,14 +422,13 @@ typedef enum ClosureType { CLOSURE_BSDF_HAIR_REFLECTION_ID, /* Transmission */ - CLOSURE_BSDF_TRANSMISSION_ID, CLOSURE_BSDF_TRANSLUCENT_ID, CLOSURE_BSDF_REFRACTION_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID, CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID, CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID, - CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID, CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID, CLOSURE_BSDF_SHARP_GLASS_ID, CLOSURE_BSDF_HAIR_TRANSMISSION_ID, @@ -465,13 +463,16 @@ typedef enum ClosureType { /* watch this, being lazy with memory usage */ #define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID) #define CLOSURE_IS_BSDF_DIFFUSE(type) (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_DIFFUSE_TOON_ID) -#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID) -#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= 
CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID) +#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID) +#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSLUCENT_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID) #define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID) +#define CLOSURE_IS_BSDF_TRANSPARENT(type) (type == CLOSURE_BSDF_TRANSPARENT_ID) #define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) #define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID ||\ type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \ type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID) +#define CLOSURE_IS_BSDF_MICROFACET(type) ((type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) ||\ + (type >= CLOSURE_BSDF_REFRACTION_ID && type <= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)) #define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID) #define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID) #define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) @@ -480,7 +481,7 @@ typedef enum ClosureType { #define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID) #define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID) #define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) -#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID) +#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID) #define CLOSURE_IS_PRINCIPLED(type) (type == CLOSURE_BSDF_PRINCIPLED_ID) #define CLOSURE_WEIGHT_CUTOFF 1e-5f 
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp index fe2c2e78926..cf402c3f214 100644 --- a/intern/cycles/render/buffers.cpp +++ b/intern/cycles/render/buffers.cpp @@ -42,6 +42,9 @@ BufferParams::BufferParams() full_width = 0; full_height = 0; + denoising_data_pass = false; + denoising_clean_pass = false; + Pass::add(PASS_COMBINED, passes); } @@ -68,10 +71,25 @@ int BufferParams::get_passes_size() for(size_t i = 0; i < passes.size(); i++) size += passes[i].components; - + + if(denoising_data_pass) { + size += DENOISING_PASS_SIZE_BASE; + if(denoising_clean_pass) size += DENOISING_PASS_SIZE_CLEAN; + } + return align_up(size, 4); } +int BufferParams::get_denoising_offset() +{ + int offset = 0; + + for(size_t i = 0; i < passes.size(); i++) + offset += passes[i].components; + + return offset; +} + /* Render Buffer Task */ RenderTile::RenderTile() @@ -138,12 +156,51 @@ void RenderBuffers::reset(Device *device, BufferParams& params_) device->mem_alloc("rng_state", rng_state, MEM_READ_WRITE); } -bool RenderBuffers::copy_from_device() +bool RenderBuffers::copy_from_device(Device *from_device) { if(!buffer.device_pointer) return false; - device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float)); + if(!from_device) { + from_device = device; + } + + from_device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float)); + + return true; +} + +bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels) +{ + float scale = 1.0f/sample; + + if(offset == DENOISING_PASS_COLOR) { + scale *= exposure; + } + else if(offset == DENOISING_PASS_COLOR_VAR) { + scale *= exposure*exposure; + } + + offset += params.get_denoising_offset(); + float *in = (float*)buffer.data_pointer + offset; + int pass_stride = params.get_passes_size(); + int size = params.width*params.height; + + if(components == 1) { + for(int i = 0; i < size; 
i++, in += pass_stride, pixels++) { + pixels[0] = in[0]*scale; + } + } + else if(components == 3) { + for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) { + pixels[0] = in[0]*scale; + pixels[1] = in[1]*scale; + pixels[2] = in[2]*scale; + } + } + else { + return false; + } return true; } diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h index 5c78971678a..e56556c8abe 100644 --- a/intern/cycles/render/buffers.h +++ b/intern/cycles/render/buffers.h @@ -51,6 +51,9 @@ public: /* passes */ array<Pass> passes; + bool denoising_data_pass; + /* If only some light path types should be denoised, an additional pass is needed. */ + bool denoising_clean_pass; /* functions */ BufferParams(); @@ -59,6 +62,7 @@ public: bool modified(const BufferParams& params); void add_pass(PassType type); int get_passes_size(); + int get_denoising_offset(); }; /* Render Buffers */ @@ -73,18 +77,19 @@ public: /* random number generator state */ device_vector<uint> rng_state; + Device *device; + explicit RenderBuffers(Device *device); ~RenderBuffers(); void reset(Device *device, BufferParams& params); - bool copy_from_device(); + bool copy_from_device(Device *from_device = NULL); bool get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels); + bool get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels); protected: void device_free(); - - Device *device; }; /* Display Buffer @@ -131,6 +136,9 @@ protected: class RenderTile { public: + typedef enum { PATH_TRACE, DENOISE } Task; + + Task task; int x, y, w, h; int start_sample; int num_samples; @@ -138,6 +146,7 @@ public: int resolution; int offset; int stride; + int tile_index; device_ptr buffer; device_ptr rng_state; diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp index 7809f4345f1..c8213d258d5 100644 --- a/intern/cycles/render/film.cpp +++ b/intern/cycles/render/film.cpp @@ -279,6 +279,10 @@ NODE_DEFINE(Film) 
SOCKET_BOOLEAN(use_sample_clamp, "Use Sample Clamp", false); + SOCKET_BOOLEAN(denoising_data_pass, "Generate Denoising Data Pass", false); + SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false); + SOCKET_INT(denoising_flags, "Denoising Flags", 0); + return type; } @@ -437,6 +441,20 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->pass_stride += pass.components; } + kfilm->pass_denoising_data = 0; + kfilm->pass_denoising_clean = 0; + kfilm->denoising_flags = 0; + if(denoising_data_pass) { + kfilm->pass_denoising_data = kfilm->pass_stride; + kfilm->pass_stride += DENOISING_PASS_SIZE_BASE; + kfilm->denoising_flags = denoising_flags; + if(denoising_clean_pass) { + kfilm->pass_denoising_clean = kfilm->pass_stride; + kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN; + kfilm->use_light_pass = 1; + } + } + kfilm->pass_stride = align_up(kfilm->pass_stride, 4); kfilm->pass_alpha_threshold = pass_alpha_threshold; @@ -451,6 +469,10 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->mist_inv_depth = (mist_depth > 0.0f)? 
1.0f/mist_depth: 0.0f; kfilm->mist_falloff = mist_falloff; + pass_stride = kfilm->pass_stride; + denoising_data_offset = kfilm->pass_denoising_data; + denoising_clean_offset = kfilm->pass_denoising_clean; + need_update = false; } diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h index 83c941d5c57..29b1e7e9157 100644 --- a/intern/cycles/render/film.h +++ b/intern/cycles/render/film.h @@ -57,8 +57,15 @@ public: float exposure; array<Pass> passes; + bool denoising_data_pass; + bool denoising_clean_pass; + int denoising_flags; float pass_alpha_threshold; + int pass_stride; + int denoising_data_offset; + int denoising_clean_offset; + FilterType filter_type; float filter_width; size_t filter_table_offset; diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp index 33d1936659b..03825f780e0 100644 --- a/intern/cycles/render/mesh.cpp +++ b/intern/cycles/render/mesh.cpp @@ -903,7 +903,7 @@ void Mesh::pack_normals(Scene *scene, uint *tri_shader, float4 *vnormal) float3 vNi = vN[i]; if(do_transform) - vNi = normalize(transform_direction(&ntfm, vNi)); + vNi = safe_normalize(transform_direction(&ntfm, vNi)); vnormal[i] = make_float4(vNi.x, vNi.y, vNi.z, 0.0f); } diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index c9b5547b407..3eaf34c847f 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -114,8 +114,9 @@ Session::~Session() } /* clean up */ - foreach(RenderBuffers *buffers, tile_buffers) - delete buffers; + foreach(RenderTile &rtile, render_tiles) + delete rtile.buffers; + tile_manager.free_device(); delete buffers; delete display; @@ -268,8 +269,8 @@ void Session::run_gpu() /* update status and timing */ update_status_time(); - /* path trace */ - path_trace(); + /* render */ + render(); device->task_wait(); @@ -358,20 +359,22 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile) thread_scoped_lock tile_lock(tile_mutex); /* get next tile from manager 
*/ - Tile tile; + Tile *tile; int device_num = device->device_number(tile_device); if(!tile_manager.next_tile(tile, device_num)) return false; /* fill render tile */ - rtile.x = tile_manager.state.buffer.full_x + tile.x; - rtile.y = tile_manager.state.buffer.full_y + tile.y; - rtile.w = tile.w; - rtile.h = tile.h; + rtile.x = tile_manager.state.buffer.full_x + tile->x; + rtile.y = tile_manager.state.buffer.full_y + tile->y; + rtile.w = tile->w; + rtile.h = tile->h; rtile.start_sample = tile_manager.state.sample; rtile.num_samples = tile_manager.state.num_samples; rtile.resolution = tile_manager.state.resolution_divider; + rtile.tile_index = tile->index; + rtile.task = (tile->state == Tile::DENOISE)? RenderTile::DENOISE: RenderTile::PATH_TRACE; tile_lock.unlock(); @@ -383,54 +386,70 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile) rtile.buffer = buffers->buffer.device_pointer; rtile.rng_state = buffers->rng_state.device_pointer; rtile.buffers = buffers; + tile->buffers = buffers; device->map_tile(tile_device, rtile); return true; } - /* fill buffer parameters */ - BufferParams buffer_params = tile_manager.params; - buffer_params.full_x = rtile.x; - buffer_params.full_y = rtile.y; - buffer_params.width = rtile.w; - buffer_params.height = rtile.h; - - buffer_params.get_offset_stride(rtile.offset, rtile.stride); - - RenderBuffers *tilebuffers; + bool store_rtile = false; + if(tile->buffers == NULL) { + /* fill buffer parameters */ + BufferParams buffer_params = tile_manager.params; + buffer_params.full_x = rtile.x; + buffer_params.full_y = rtile.y; + buffer_params.width = rtile.w; + buffer_params.height = rtile.h; + + /* allocate buffers */ + if(params.progressive_refine) { + tile_lock.lock(); + + if(render_tiles.size() == 0) { + RenderTile nulltile; + nulltile.buffers = NULL; + render_tiles.resize(tile_manager.state.num_tiles, nulltile); + } - /* allocate buffers */ - if(params.progressive_refine) { - tile_lock.lock(); + /* In certain 
circumstances number of tiles in the tile manager could + * be changed. This is not supported by the progressive refine feature. + */ + assert(render_tiles.size() == tile_manager.state.num_tiles); - if(tile_buffers.size() == 0) - tile_buffers.resize(tile_manager.state.num_tiles, NULL); + RenderTile &stored_rtile = render_tiles[tile->index]; + if(stored_rtile.buffers == NULL) { + tile->buffers = new RenderBuffers(tile_device); + tile->buffers->reset(tile_device, buffer_params); + store_rtile = true; + } + else { + assert(rtile.x == stored_rtile.x && + rtile.y == stored_rtile.y && + rtile.w == stored_rtile.w && + rtile.h == stored_rtile.h); + tile_lock.unlock(); + tile->buffers = stored_rtile.buffers; + } + } + else { + tile->buffers = new RenderBuffers(tile_device); - /* In certain circumstances number of tiles in the tile manager could - * be changed. This is not supported by the progressive refine feature. - */ - assert(tile_buffers.size() == tile_manager.state.num_tiles); + tile->buffers->reset(tile_device, buffer_params); + } + } - tilebuffers = tile_buffers[tile.index]; - if(tilebuffers == NULL) { - tilebuffers = new RenderBuffers(tile_device); - tile_buffers[tile.index] = tilebuffers; + tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride); - tilebuffers->reset(tile_device, buffer_params); - } + rtile.buffer = tile->buffers->buffer.device_pointer; + rtile.rng_state = tile->buffers->rng_state.device_pointer; + rtile.buffers = tile->buffers; + rtile.sample = 0; + if(store_rtile) { + render_tiles[tile->index] = rtile; tile_lock.unlock(); } - else { - tilebuffers = new RenderBuffers(tile_device); - - tilebuffers->reset(tile_device, buffer_params); - } - - rtile.buffer = tilebuffers->buffer.device_pointer; - rtile.rng_state = tilebuffers->rng_state.device_pointer; - rtile.buffers = tilebuffers; /* this will tag tile as IN PROGRESS in blender-side render pipeline, * which is needed to highlight currently rendering tile before first @@ -449,7 +468,7 @@ 
void Session::update_tile_sample(RenderTile& rtile) if(params.progressive_refine == false) { /* todo: optimize this by making it thread safe and removing lock */ - update_render_tile_cb(rtile); + update_render_tile_cb(rtile, true); } } @@ -462,18 +481,75 @@ void Session::release_tile(RenderTile& rtile) progress.add_finished_tile(); - if(write_render_tile_cb) { - if(params.progressive_refine == false) { - /* todo: optimize this by making it thread safe and removing lock */ - write_render_tile_cb(rtile); + bool delete_tile; - delete rtile.buffers; + if(tile_manager.finish_tile(rtile.tile_index, delete_tile)) { + if(write_render_tile_cb && params.progressive_refine == false) { + write_render_tile_cb(rtile); + if(delete_tile) { + delete rtile.buffers; + tile_manager.state.tiles[rtile.tile_index].buffers = NULL; + } + } + } + else { + if(update_render_tile_cb && params.progressive_refine == false) { + update_render_tile_cb(rtile, false); } } update_status_time(); } +void Session::map_neighbor_tiles(RenderTile *tiles, Device *tile_device) +{ + thread_scoped_lock tile_lock(tile_mutex); + + int center_idx = tiles[4].tile_index; + assert(tile_manager.state.tiles[center_idx].state == Tile::DENOISE); + BufferParams buffer_params = tile_manager.params; + int4 image_region = make_int4(buffer_params.full_x, buffer_params.full_y, + buffer_params.full_x + buffer_params.width, buffer_params.full_y + buffer_params.height); + + for(int dy = -1, i = 0; dy <= 1; dy++) { + for(int dx = -1; dx <= 1; dx++, i++) { + int px = tiles[4].x + dx*params.tile_size.x; + int py = tiles[4].y + dy*params.tile_size.y; + if(px >= image_region.x && py >= image_region.y && + px < image_region.z && py < image_region.w) { + int tile_index = center_idx + dy*tile_manager.state.tile_stride + dx; + Tile *tile = &tile_manager.state.tiles[tile_index]; + assert(tile->buffers); + + tiles[i].buffer = tile->buffers->buffer.device_pointer; + tiles[i].x = tile_manager.state.buffer.full_x + tile->x; + tiles[i].y = 
tile_manager.state.buffer.full_y + tile->y; + tiles[i].w = tile->w; + tiles[i].h = tile->h; + tiles[i].buffers = tile->buffers; + + tile->buffers->params.get_offset_stride(tiles[i].offset, tiles[i].stride); + } + else { + tiles[i].buffer = (device_ptr)NULL; + tiles[i].buffers = NULL; + tiles[i].x = clamp(px, image_region.x, image_region.z); + tiles[i].y = clamp(py, image_region.y, image_region.w); + tiles[i].w = tiles[i].h = 0; + } + } + } + + assert(tiles[4].buffers); + device->map_neighbor_tiles(tile_device, tiles); +} + +void Session::unmap_neighbor_tiles(RenderTile *tiles, Device *tile_device) +{ + thread_scoped_lock tile_lock(tile_mutex); + device->unmap_neighbor_tiles(tile_device, tiles); +} + void Session::run_cpu() { bool tiles_written = false; @@ -558,8 +634,8 @@ void Session::run_cpu() /* update status and timing */ update_status_time(); - /* path trace */ - path_trace(); + /* render */ + render(); /* update status and timing */ update_status_time(); @@ -744,10 +820,10 @@ void Session::reset(BufferParams& buffer_params, int samples) if(params.progressive_refine) { thread_scoped_lock buffers_lock(buffers_mutex); - foreach(RenderBuffers *buffers, tile_buffers) - delete buffers; + foreach(RenderTile &rtile, render_tiles) + delete rtile.buffers; - tile_buffers.clear(); + render_tiles.clear(); } } @@ -882,13 +958,15 @@ void Session::update_status_time(bool show_pause, bool show_done) progress.set_status(status, substatus); } -void Session::path_trace() +void Session::render() { /* add path trace task */ - DeviceTask task(DeviceTask::PATH_TRACE); + DeviceTask task(DeviceTask::RENDER); task.acquire_tile = function_bind(&Session::acquire_tile, this, _1, _2); task.release_tile = function_bind(&Session::release_tile, this, _1); + task.map_neighbor_tiles = function_bind(&Session::map_neighbor_tiles, this, _1, _2); + task.unmap_neighbor_tiles = function_bind(&Session::unmap_neighbor_tiles, this, _1, _2); task.get_cancel = function_bind(&Progress::get_cancel, 
&this->progress); task.update_tile_sample = function_bind(&Session::update_tile_sample, this, _1); task.update_progress_sample = function_bind(&Progress::add_samples, &this->progress, _1, _2); @@ -897,6 +975,18 @@ void Session::path_trace() task.requested_tile_size = params.tile_size; task.passes_size = tile_manager.params.get_passes_size(); + if(params.use_denoising) { + task.denoising_radius = params.denoising_radius; + task.denoising_strength = params.denoising_strength; + task.denoising_feature_strength = params.denoising_feature_strength; + task.denoising_relative_pca = params.denoising_relative_pca; + + assert(!scene->film->need_update); + task.pass_stride = scene->film->pass_stride; + task.pass_denoising_data = scene->film->denoising_data_offset; + task.pass_denoising_clean = scene->film->denoising_clean_offset; + } + device->task_add(task); } @@ -940,9 +1030,7 @@ bool Session::update_progressive_refine(bool cancel) } if(params.progressive_refine) { - foreach(RenderBuffers *buffers, tile_buffers) { - RenderTile rtile; - rtile.buffers = buffers; + foreach(RenderTile &rtile, render_tiles) { rtile.sample = sample; if(write) { @@ -951,7 +1039,7 @@ bool Session::update_progressive_refine(bool cancel) } else { if(update_render_tile_cb) - update_render_tile_cb(rtile); + update_render_tile_cb(rtile, true); } } } @@ -965,10 +1053,11 @@ void Session::device_free() { scene->device_free(); - foreach(RenderBuffers *buffers, tile_buffers) - delete buffers; + foreach(RenderTile &tile, render_tiles) + delete tile.buffers; + tile_manager.free_device(); - tile_buffers.clear(); + render_tiles.clear(); /* used from background render only, so no need to * re-create render/display buffers here diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h index a7e5f78a64d..a7ca90abbce 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -57,6 +57,12 @@ public: bool display_buffer_linear; + bool use_denoising; + int denoising_radius; + 
float denoising_strength; + float denoising_feature_strength; + bool denoising_relative_pca; + double cancel_timeout; double reset_timeout; double text_timeout; @@ -77,6 +83,12 @@ public: start_resolution = INT_MAX; threads = 0; + use_denoising = false; + denoising_radius = 8; + denoising_strength = 0.0f; + denoising_feature_strength = 0.0f; + denoising_relative_pca = false; + display_buffer_linear = false; cancel_timeout = 0.1; @@ -99,6 +111,11 @@ public: && tile_size == params.tile_size && start_resolution == params.start_resolution && threads == params.threads + && use_denoising == params.use_denoising + && denoising_radius == params.denoising_radius + && denoising_strength == params.denoising_strength + && denoising_feature_strength == params.denoising_feature_strength + && denoising_relative_pca == params.denoising_relative_pca && display_buffer_linear == params.display_buffer_linear && cancel_timeout == params.cancel_timeout && reset_timeout == params.reset_timeout @@ -126,7 +143,7 @@ public: Stats stats; function<void(RenderTile&)> write_render_tile_cb; - function<void(RenderTile&)> update_render_tile_cb; + function<void(RenderTile&, bool)> update_render_tile_cb; explicit Session(const SessionParams& params); ~Session(); @@ -162,7 +179,7 @@ protected: void update_status_time(bool show_pause = false, bool show_done = false); void tonemap(int sample); - void path_trace(); + void render(); void reset_(BufferParams& params, int samples); void run_cpu(); @@ -177,6 +194,9 @@ protected: void update_tile_sample(RenderTile& tile); void release_tile(RenderTile& tile); + void map_neighbor_tiles(RenderTile *tiles, Device *tile_device); + void unmap_neighbor_tiles(RenderTile *tiles, Device *tile_device); + bool device_use_gl; thread *session_thread; @@ -202,7 +222,7 @@ protected: double last_update_time; bool update_progressive_refine(bool cancel); - vector<RenderBuffers *> tile_buffers; + vector<RenderTile> render_tiles; DeviceRequestedFeatures 
get_requested_device_features(); diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp index 944e746ca2d..176a1f4f0f3 100644 --- a/intern/cycles/render/tile.cpp +++ b/intern/cycles/render/tile.cpp @@ -25,37 +25,39 @@ namespace { class TileComparator { public: - TileComparator(TileOrder order, int2 center) - : order_(order), - center_(center) + TileComparator(TileOrder order_, int2 center_, Tile *tiles_) + : order(order_), + center(center_), + tiles(tiles_) {} - bool operator()(Tile &a, Tile &b) + bool operator()(int a, int b) { - switch(order_) { + switch(order) { case TILE_CENTER: { - float2 dist_a = make_float2(center_.x - (a.x + a.w/2), - center_.y - (a.y + a.h/2)); - float2 dist_b = make_float2(center_.x - (b.x + b.w/2), - center_.y - (b.y + b.h/2)); + float2 dist_a = make_float2(center.x - (tiles[a].x + tiles[a].w/2), + center.y - (tiles[a].y + tiles[a].h/2)); + float2 dist_b = make_float2(center.x - (tiles[b].x + tiles[b].w/2), + center.y - (tiles[b].y + tiles[b].h/2)); return dot(dist_a, dist_a) < dot(dist_b, dist_b); } case TILE_LEFT_TO_RIGHT: - return (a.x == b.x)? (a.y < b.y): (a.x < b.x); + return (tiles[a].x == tiles[b].x)? (tiles[a].y < tiles[b].y): (tiles[a].x < tiles[b].x); case TILE_RIGHT_TO_LEFT: - return (a.x == b.x)? (a.y < b.y): (a.x > b.x); + return (tiles[a].x == tiles[b].x)? (tiles[a].y < tiles[b].y): (tiles[a].x > tiles[b].x); case TILE_TOP_TO_BOTTOM: - return (a.y == b.y)? (a.x < b.x): (a.y > b.y); + return (tiles[a].y == tiles[b].y)? (tiles[a].x < tiles[b].x): (tiles[a].y > tiles[b].y); case TILE_BOTTOM_TO_TOP: default: - return (a.y == b.y)? (a.x < b.x): (a.y < b.y); + return (tiles[a].y == tiles[b].y)? 
(tiles[a].x < tiles[b].x): (tiles[a].y < tiles[b].y); } } protected: - TileOrder order_; - int2 center_; + TileOrder order; + int2 center; + Tile *tiles; }; inline int2 hilbert_index_to_pos(int n, int d) @@ -96,6 +98,7 @@ TileManager::TileManager(bool progressive_, int num_samples_, int2 tile_size_, i num_devices = num_devices_; preserve_tile_device = preserve_tile_device_; background = background_; + schedule_denoising = false; range_start_sample = 0; range_num_samples = -1; @@ -108,6 +111,16 @@ TileManager::~TileManager() { } +void TileManager::free_device() +{ + if(schedule_denoising) { + for(int i = 0; i < state.tiles.size(); i++) { + delete state.tiles[i].buffers; + state.tiles[i].buffers = NULL; + } + } +} + static int get_divider(int w, int h, int start_resolution) { int divider = 1; @@ -133,6 +146,8 @@ void TileManager::reset(BufferParams& params_, int num_samples_) state.num_tiles = 0; state.num_samples = 0; state.resolution_divider = get_divider(params.width, params.height, start_resolution); + state.render_tiles.clear(); + state.denoising_tiles.clear(); state.tiles.clear(); } @@ -157,6 +172,9 @@ void TileManager::set_samples(int num_samples_) } state.total_pixel_samples = pixel_samples + (uint64_t)get_num_effective_samples() * params.width*params.height; + if(schedule_denoising) { + state.total_pixel_samples += params.width*params.height; + } } } @@ -169,32 +187,36 @@ int TileManager::gen_tiles(bool sliced) int image_h = max(1, params.height/resolution); int2 center = make_int2(image_w/2, image_h/2); - state.tiles.clear(); - int num_logical_devices = preserve_tile_device? num_devices: 1; int num = min(image_h, num_logical_devices); int slice_num = sliced? num: 1; - int tile_index = 0; + int tile_w = (tile_size.x >= image_w) ? 
1 : divide_up(image_w, tile_size.x); state.tiles.clear(); - state.tiles.resize(num); - vector<list<Tile> >::iterator tile_list = state.tiles.begin(); + state.render_tiles.clear(); + state.denoising_tiles.clear(); + state.render_tiles.resize(num); + state.denoising_tiles.resize(num); + state.tile_stride = tile_w; + vector<list<int> >::iterator tile_list; + tile_list = state.render_tiles.begin(); if(tile_order == TILE_HILBERT_SPIRAL) { assert(!sliced); + int tile_h = (tile_size.y >= image_h) ? 1 : divide_up(image_h, tile_size.y); + state.tiles.resize(tile_w*tile_h); + /* Size of blocks in tiles, must be a power of 2 */ const int hilbert_size = (max(tile_size.x, tile_size.y) <= 12)? 8: 4; - int tile_w = (tile_size.x >= image_w)? 1: (image_w + tile_size.x - 1)/tile_size.x; - int tile_h = (tile_size.y >= image_h)? 1: (image_h + tile_size.y - 1)/tile_size.y; - int tiles_per_device = (tile_w * tile_h + num - 1) / num; + int tiles_per_device = divide_up(tile_w * tile_h, num); int cur_device = 0, cur_tiles = 0; int2 block_size = tile_size * make_int2(hilbert_size, hilbert_size); /* Number of blocks to fill the image */ - int blocks_x = (block_size.x >= image_w)? 1: (image_w + block_size.x - 1)/block_size.x; - int blocks_y = (block_size.y >= image_h)? 1: (image_h + block_size.y - 1)/block_size.y; + int blocks_x = (block_size.x >= image_w)? 1: divide_up(image_w, block_size.x); + int blocks_y = (block_size.y >= image_h)? 
1: divide_up(image_h, block_size.y); int n = max(blocks_x, blocks_y) | 0x1; /* Side length of the spiral (must be odd) */ /* Offset of spiral (to keep it centered) */ int2 offset = make_int2((image_w - n*block_size.x)/2, (image_h - n*block_size.y)/2); @@ -225,9 +247,11 @@ int TileManager::gen_tiles(bool sliced) if(pos.x >= 0 && pos.y >= 0 && pos.x < image_w && pos.y < image_h) { int w = min(tile_size.x, image_w - pos.x); int h = min(tile_size.y, image_h - pos.y); - tile_list->push_front(Tile(tile_index, pos.x, pos.y, w, h, cur_device)); + int2 ipos = pos / tile_size; + int idx = ipos.y*tile_w + ipos.x; + state.tiles[idx] = Tile(idx, pos.x, pos.y, w, h, cur_device, Tile::RENDER); + tile_list->push_front(idx); cur_tiles++; - tile_index++; if(cur_tiles == tiles_per_device) { tile_list++; @@ -271,27 +295,28 @@ int TileManager::gen_tiles(bool sliced) break; } } - return tile_index; + return tile_w*tile_h; } + int idx = 0; for(int slice = 0; slice < slice_num; slice++) { int slice_y = (image_h/slice_num)*slice; int slice_h = (slice == slice_num-1)? image_h - slice*(image_h/slice_num): image_h/slice_num; - int tile_w = (tile_size.x >= image_w)? 1: (image_w + tile_size.x - 1)/tile_size.x; - int tile_h = (tile_size.y >= slice_h)? 1: (slice_h + tile_size.y - 1)/tile_size.y; + int tile_h = (tile_size.y >= slice_h)? 1: divide_up(slice_h, tile_size.y); - int tiles_per_device = (tile_w * tile_h + num - 1) / num; + int tiles_per_device = divide_up(tile_w * tile_h, num); int cur_device = 0, cur_tiles = 0; for(int tile_y = 0; tile_y < tile_h; tile_y++) { - for(int tile_x = 0; tile_x < tile_w; tile_x++, tile_index++) { + for(int tile_x = 0; tile_x < tile_w; tile_x++, idx++) { int x = tile_x * tile_size.x; int y = tile_y * tile_size.y; int w = (tile_x == tile_w-1)? image_w - x: tile_size.x; int h = (tile_y == tile_h-1)? slice_h - y: tile_size.y; - tile_list->push_back(Tile(tile_index, x, y + slice_y, w, h, sliced? 
slice: cur_device)); + state.tiles.push_back(Tile(idx, x, y + slice_y, w, h, sliced? slice: cur_device, Tile::RENDER)); + tile_list->push_back(idx); if(!sliced) { cur_tiles++; @@ -299,7 +324,7 @@ int TileManager::gen_tiles(bool sliced) if(cur_tiles == tiles_per_device) { /* Tiles are already generated in Bottom-to-Top order, so no sort is necessary in that case. */ if(tile_order != TILE_BOTTOM_TO_TOP) { - tile_list->sort(TileComparator(tile_order, center)); + tile_list->sort(TileComparator(tile_order, center, &state.tiles[0])); } tile_list++; cur_tiles = 0; @@ -313,7 +338,7 @@ int TileManager::gen_tiles(bool sliced) } } - return tile_index; + return idx; } void TileManager::set_tiles() @@ -333,15 +358,111 @@ void TileManager::set_tiles() state.buffer.full_height = max(1, params.full_height/resolution); } -bool TileManager::next_tile(Tile& tile, int device) +int TileManager::get_neighbor_index(int index, int neighbor) +{ + static const int dx[] = {-1, 0, 1, -1, 1, -1, 0, 1, 0}, dy[] = {-1, -1, -1, 0, 0, 1, 1, 1, 0}; + + int resolution = state.resolution_divider; + int image_w = max(1, params.width/resolution); + int image_h = max(1, params.height/resolution); + int tile_w = (tile_size.x >= image_w)? 1: divide_up(image_w, tile_size.x); + int tile_h = (tile_size.y >= image_h)? 1: divide_up(image_h, tile_size.y); + + int nx = state.tiles[index].x/tile_size.x + dx[neighbor], ny = state.tiles[index].y/tile_size.y + dy[neighbor]; + if(nx < 0 || ny < 0 || nx >= tile_w || ny >= tile_h) + return -1; + + return ny*state.tile_stride + nx; +} + +/* Checks whether all neighbors of a tile (as well as the tile itself) are at least at state min_state. */ +bool TileManager::check_neighbor_state(int index, Tile::State min_state) +{ + if(index < 0 || state.tiles[index].state < min_state) { + return false; + } + for(int neighbor = 0; neighbor < 9; neighbor++) { + int nindex = get_neighbor_index(index, neighbor); + /* Out-of-bounds tiles don't matter. 
*/ + if(nindex >= 0 && state.tiles[nindex].state < min_state) { + return false; + } + } + + return true; +} + +/* Returns whether the tile should be written (and freed if no denoising is used) instead of updating. */ +bool TileManager::finish_tile(int index, bool &delete_tile) +{ + delete_tile = false; + + switch(state.tiles[index].state) { + case Tile::RENDER: + { + if(!schedule_denoising) { + state.tiles[index].state = Tile::DONE; + delete_tile = true; + return true; + } + state.tiles[index].state = Tile::RENDERED; + /* For each neighbor and the tile itself, check whether all of its neighbors have been rendered. If yes, it can be denoised. */ + for(int neighbor = 0; neighbor < 9; neighbor++) { + int nindex = get_neighbor_index(index, neighbor); + if(check_neighbor_state(nindex, Tile::RENDERED)) { + state.tiles[nindex].state = Tile::DENOISE; + state.denoising_tiles[state.tiles[nindex].device].push_back(nindex); + } + } + return false; + } + case Tile::DENOISE: + { + state.tiles[index].state = Tile::DENOISED; + /* For each neighbor and the tile itself, check whether all of its neighbors have been denoised. If yes, it can be freed. */ + for(int neighbor = 0; neighbor < 9; neighbor++) { + int nindex = get_neighbor_index(index, neighbor); + if(check_neighbor_state(nindex, Tile::DENOISED)) { + state.tiles[nindex].state = Tile::DONE; + /* It can happen that the tile just finished denoising and already can be freed here. + * However, in that case it still has to be written before deleting, so we can't delete it yet. */ + if(neighbor == 8) { + delete_tile = true; + } + else { + delete state.tiles[nindex].buffers; + state.tiles[nindex].buffers = NULL; + } + } + } + return true; + } + default: + assert(false); + return true; + } +} + +bool TileManager::next_tile(Tile* &tile, int device) { int logical_device = preserve_tile_device? 
device: 0; - if((logical_device >= state.tiles.size()) || state.tiles[logical_device].empty()) + if(logical_device >= state.render_tiles.size()) + return false; + + if(!state.denoising_tiles[logical_device].empty()) { + int idx = state.denoising_tiles[logical_device].front(); + state.denoising_tiles[logical_device].pop_front(); + tile = &state.tiles[idx]; + return true; + } + + if(state.render_tiles[logical_device].empty()) return false; - tile = Tile(state.tiles[logical_device].front()); - state.tiles[logical_device].pop_front(); + int idx = state.render_tiles[logical_device].front(); + state.render_tiles[logical_device].pop_front(); + tile = &state.tiles[idx]; return true; } diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h index 622b89f7670..e39a8f0627a 100644 --- a/intern/cycles/render/tile.h +++ b/intern/cycles/render/tile.h @@ -31,12 +31,20 @@ public: int index; int x, y, w, h; int device; + /* RENDER: The tile has to be rendered. + * RENDERED: The tile has been rendered, but can't be denoised yet (waiting for neighbors). + * DENOISE: The tile can be denoised now. + * DENOISED: The tile has been denoised, but can't be freed yet (waiting for neighbors). + * DONE: The tile is finished and has been freed. 
*/ + typedef enum { RENDER = 0, RENDERED, DENOISE, DENOISED, DONE } State; + State state; + RenderBuffers *buffers; Tile() {} - Tile(int index_, int x_, int y_, int w_, int h_, int device_) - : index(index_), x(x_), y(y_), w(w_), h(h_), device(device_) {} + Tile(int index_, int x_, int y_, int w_, int h_, int device_, State state_ = RENDER) + : index(index_), x(x_), y(y_), w(w_), h(h_), device(device_), state(state_), buffers(NULL) {} }; /* Tile order */ @@ -58,6 +66,8 @@ public: BufferParams params; struct State { + vector<Tile> tiles; + int tile_stride; BufferParams buffer; int sample; int num_samples; @@ -67,9 +77,12 @@ public: /* Total samples over all pixels: Generally num_samples*num_pixels, * but can be higher due to the initial resolution division for previews. */ uint64_t total_pixel_samples; - /* This vector contains a list of tiles for every logical device in the session. - * In each list, the tiles are sorted according to the tile order setting. */ - vector<list<Tile> > tiles; + + /* These lists contain the indices of the tiles to be rendered/denoised and are used + * when acquiring a new tile for the device. + * Each list in each vector is for one logical device. */ + vector<list<int> > render_tiles; + vector<list<int> > denoising_tiles; } state; int num_samples; @@ -78,10 +91,12 @@ public: bool preserve_tile_device, bool background, TileOrder tile_order, int num_devices = 1); ~TileManager(); + void free_device(); void reset(BufferParams& params, int num_samples); void set_samples(int num_samples); bool next(); - bool next_tile(Tile& tile, int device = 0); + bool next_tile(Tile* &tile, int device = 0); + bool finish_tile(int index, bool& delete_tile); bool done(); void set_tile_order(TileOrder tile_order_) { tile_order = tile_order_; } @@ -96,6 +111,9 @@ public: /* Get number of actual samples to render. */ int get_num_effective_samples(); + + /* Schedule tiles for denoising after they've been rendered. 
*/ + bool schedule_denoising; protected: void set_tiles(); @@ -127,6 +145,9 @@ protected: /* Generate tile list, return number of tiles. */ int gen_tiles(bool sliced); + + int get_neighbor_index(int index, int neighbor); + bool check_neighbor_state(int index, Tile::State state); }; CCL_NAMESPACE_END diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 388aba65460..43f9a57d099 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -59,6 +59,7 @@ set(SRC_HEADERS util_math_int2.h util_math_int3.h util_math_int4.h + util_math_matrix.h util_md5.h util_opengl.h util_optimization.h diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index 52b4fa859b7..12abd8e201e 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -160,6 +160,78 @@ ccl_device_inline float max4(float a, float b, float c, float d) } #ifndef __KERNEL_OPENCL__ +/* Int/Float conversion */ + +ccl_device_inline int as_int(uint i) +{ + union { uint ui; int i; } u; + u.ui = i; + return u.i; +} + +ccl_device_inline uint as_uint(int i) +{ + union { uint ui; int i; } u; + u.i = i; + return u.ui; +} + +ccl_device_inline uint as_uint(float f) +{ + union { uint i; float f; } u; + u.f = f; + return u.i; +} + +ccl_device_inline int __float_as_int(float f) +{ + union { int i; float f; } u; + u.f = f; + return u.i; +} + +ccl_device_inline float __int_as_float(int i) +{ + union { int i; float f; } u; + u.i = i; + return u.f; +} + +ccl_device_inline uint __float_as_uint(float f) +{ + union { uint i; float f; } u; + u.f = f; + return u.i; +} + +ccl_device_inline float __uint_as_float(uint i) +{ + union { uint i; float f; } u; + u.i = i; + return u.f; +} +#endif /* __KERNEL_OPENCL__ */ + +/* Versions of functions which are safe for fast math. 
*/ +ccl_device_inline bool isnan_safe(float f) +{ + unsigned int x = __float_as_uint(f); + return (x << 1) > 0xff000000u; +} + +ccl_device_inline bool isfinite_safe(float f) +{ + /* By IEEE 754 rule, 2*Inf equals Inf */ + unsigned int x = __float_as_uint(f); + return (f == f) && (x == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u); +} + +ccl_device_inline float ensure_finite(float v) +{ + return isfinite_safe(v)? v : 0.0f; +} + +#ifndef __KERNEL_OPENCL__ ccl_device_inline int clamp(int a, int mn, int mx) { return min(max(a, mn), mx); @@ -250,57 +322,6 @@ CCL_NAMESPACE_END CCL_NAMESPACE_BEGIN #ifndef __KERNEL_OPENCL__ -/* Int/Float conversion */ - -ccl_device_inline int as_int(uint i) -{ - union { uint ui; int i; } u; - u.ui = i; - return u.i; -} - -ccl_device_inline uint as_uint(int i) -{ - union { uint ui; int i; } u; - u.i = i; - return u.ui; -} - -ccl_device_inline uint as_uint(float f) -{ - union { uint i; float f; } u; - u.f = f; - return u.i; -} - -ccl_device_inline int __float_as_int(float f) -{ - union { int i; float f; } u; - u.f = f; - return u.i; -} - -ccl_device_inline float __int_as_float(int i) -{ - union { int i; float f; } u; - u.i = i; - return u.f; -} - -ccl_device_inline uint __float_as_uint(float f) -{ - union { uint i; float f; } u; - u.f = f; - return u.i; -} - -ccl_device_inline float __uint_as_float(uint i) -{ - union { uint i; float f; } u; - u.i = i; - return u.f; -} - /* Interpolation */ template<class A, class B> A lerp(const A& a, const A& b, const B& t) @@ -318,20 +339,6 @@ ccl_device_inline float triangle_area(const float3& v1, } #endif /* __KERNEL_OPENCL__ */ -/* Versions of functions which are safe for fast math. 
*/ -ccl_device_inline bool isnan_safe(float f) -{ - unsigned int x = __float_as_uint(f); - return (x << 1) > 0xff000000u; -} - -ccl_device_inline bool isfinite_safe(float f) -{ - /* By IEEE 754 rule, 2*Inf equals Inf */ - unsigned int x = __float_as_uint(f); - return (f == f) && (x == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u); -} - /* Orthonormal vectors */ ccl_device_inline void make_orthonormals(const float3 N, float3 *a, float3 *b) diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h index e0c6b551040..a754be413fe 100644 --- a/intern/cycles/util/util_math_float3.h +++ b/intern/cycles/util/util_math_float3.h @@ -38,6 +38,7 @@ ccl_device_inline float3 operator/(const float3& a, const float3& b); ccl_device_inline float3 operator+(const float3& a, const float3& b); ccl_device_inline float3 operator-(const float3& a, const float3& b); ccl_device_inline float3 operator+=(float3& a, const float3& b); +ccl_device_inline float3 operator-=(float3& a, const float3& b); ccl_device_inline float3 operator*=(float3& a, const float3& b); ccl_device_inline float3 operator*=(float3& a, float f); ccl_device_inline float3 operator/=(float3& a, const float3& b); @@ -166,6 +167,11 @@ ccl_device_inline float3 operator+=(float3& a, const float3& b) return a = a + b; } +ccl_device_inline float3 operator-=(float3& a, const float3& b) +{ + return a = a - b; +} + ccl_device_inline float3 operator*=(float3& a, const float3& b) { return a = a * b; @@ -360,6 +366,15 @@ ccl_device_inline bool isequal_float3(const float3 a, const float3 b) return a == b; #endif } + +ccl_device_inline float3 ensure_finite3(float3 v) +{ + if(!isfinite_safe(v.x)) v.x = 0.0; + if(!isfinite_safe(v.y)) v.y = 0.0; + if(!isfinite_safe(v.z)) v.z = 0.0; + return v; +} + CCL_NAMESPACE_END #endif /* __UTIL_MATH_FLOAT3_H__ */ diff --git a/intern/cycles/util/util_math_int4.h b/intern/cycles/util/util_math_int4.h index 4b327c90c33..79a8c0841e7 100644 --- 
a/intern/cycles/util/util_math_int4.h +++ b/intern/cycles/util/util_math_int4.h @@ -103,6 +103,15 @@ ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b) (mask.w)? a.w: b.w); #endif } + +ccl_device_inline int4 load_int4(const int *v) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_loadu_si128((__m128i*)v)); +#else + return make_int4(v[0], v[1], v[2], v[3]); +#endif +} #endif /* __KERNEL_GPU__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h new file mode 100644 index 00000000000..31ea10f18a8 --- /dev/null +++ b/intern/cycles/util/util_math_matrix.h @@ -0,0 +1,379 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_MATRIX_H__ +#define __UTIL_MATH_MATRIX_H__ + +CCL_NAMESPACE_BEGIN + +#define MAT(A, size, row, col) A[(row)*(size)+(col)] + +/* Variants that use a constant stride on GPUS. */ +#ifdef __KERNEL_GPU__ +#define MATS(A, n, r, c, s) A[((r)*(n)+(c))*(s)] +/* Element access when only the lower-triangular elements are stored. */ +#define MATHS(A, r, c, s) A[((r)*((r)+1)/2+(c))*(s)] +#define VECS(V, i, s) V[(i)*(s)] +#else +#define MATS(A, n, r, c, s) MAT(A, n, r, c) +#define MATHS(A, r, c, s) A[(r)*((r)+1)/2+(c)] +#define VECS(V, i, s) V[i] +#endif + +/* Zeroing helpers. 
*/ + +ccl_device_inline void math_vector_zero(float *v, int n) +{ + for(int i = 0; i < n; i++) + v[i] = 0.0f; +} + +ccl_device_inline void math_matrix_zero(float *A, int n) +{ + for(int row = 0; row < n; row++) + for(int col = 0; col <= row; col++) + MAT(A, n, row, col) = 0.0f; +} + +/* Elementary vector operations. */ + +ccl_device_inline void math_vector_add(float *a, float ccl_restrict_ptr b, int n) +{ + for(int i = 0; i < n; i++) + a[i] += b[i]; +} + +ccl_device_inline void math_vector_mul(float *a, float ccl_restrict_ptr b, int n) +{ + for(int i = 0; i < n; i++) + a[i] *= b[i]; +} + +ccl_device_inline void math_vector_mul_strided(ccl_global float *a, float ccl_restrict_ptr b, int astride, int n) +{ + for(int i = 0; i < n; i++) + a[i*astride] *= b[i]; +} + +ccl_device_inline void math_vector_scale(float *a, float b, int n) +{ + for(int i = 0; i < n; i++) + a[i] *= b; +} + +ccl_device_inline void math_vector_max(float *a, float ccl_restrict_ptr b, int n) +{ + for(int i = 0; i < n; i++) + a[i] = max(a[i], b[i]); +} + +ccl_device_inline void math_vec3_add(float3 *v, int n, float *x, float3 w) +{ + for(int i = 0; i < n; i++) + v[i] += w*x[i]; +} + +ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float *x, float3 w, int stride) +{ + for(int i = 0; i < n; i++) + v[i*stride] += w*x[i]; +} + +/* Elementary matrix operations. + * Note: TriMatrix refers to a square matrix that is symmetric, and therefore its upper-triangular part isn't stored. */ + +ccl_device_inline void math_trimatrix_add_diagonal(ccl_global float *A, int n, float val, int stride) +{ + for(int row = 0; row < n; row++) + MATHS(A, row, row, stride) += val; +} + +/* Add Gramian matrix of v to A. + * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. 
*/ +ccl_device_inline void math_matrix_add_gramian(float *A, + int n, + float ccl_restrict_ptr v, + float weight) +{ + for(int row = 0; row < n; row++) + for(int col = 0; col <= row; col++) + MAT(A, n, row, col) += v[row]*v[col]*weight; +} + +/* Add Gramian matrix of v to A. + * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */ +ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A, + int n, + float ccl_restrict_ptr v, + float weight, + int stride) +{ + for(int row = 0; row < n; row++) + for(int col = 0; col <= row; col++) + MATHS(A, row, col, stride) += v[row]*v[col]*weight; +} + +/* Transpose matrix A inplace. */ +ccl_device_inline void math_matrix_transpose(ccl_global float *A, int n, int stride) +{ + for(int i = 0; i < n; i++) { + for(int j = 0; j < i; j++) { + float temp = MATS(A, n, i, j, stride); + MATS(A, n, i, j, stride) = MATS(A, n, j, i, stride); + MATS(A, n, j, i, stride) = temp; + } + } +} + + + + +/* Solvers for matrix problems */ + +/* In-place Cholesky-Banachiewicz decomposition of the square, positive-definite matrix A + * into a lower triangular matrix L so that A = L*L^T. A is being overwritten by L. + * Also, only the lower triangular part of A is ever accessed. */ +ccl_device void math_trimatrix_cholesky(ccl_global float *A, int n, int stride) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + float sum_col = MATHS(A, row, col, stride); + for(int k = 0; k < col; k++) { + sum_col -= MATHS(A, row, k, stride) * MATHS(A, col, k, stride); + } + if(row == col) { + sum_col = sqrtf(max(sum_col, 0.0f)); + } + else { + sum_col /= MATHS(A, col, col, stride); + } + MATHS(A, row, col, stride) = sum_col; + } + } +} + +/* Solve A*S=y for S given A and y, where A is symmetrical positive-semidefinite and both inputs are destroyed in the process. + * + * We can apply Cholesky decomposition to find a lower triangular L so that L*Lt = A. 
+ * With that we get (L*Lt)*S = L*(Lt*S) = L*b = y, defining b as Lt*S. + * Since L is lower triangular, finding b is relatively easy since y is known. + * Then, the remaining problem is Lt*S = b, which again can be solved easily. + * + * This is useful for solving the normal equation S=inv(Xt*W*X)*Xt*W*y, since Xt*W*X is + * symmetrical positive-semidefinite by construction, so we can just use this function with A=Xt*W*X and y=Xt*W*y. */ +ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A, ccl_global float3 *y, int n, int stride) +{ + math_trimatrix_add_diagonal(A, n, 1e-4f, stride); /* Improve the numerical stability. */ + math_trimatrix_cholesky(A, n, stride); /* Replace A with L so that L*Lt = A. */ + + /* Use forward substitution to solve L*b = y, replacing y by b. */ + for(int row = 0; row < n; row++) { + float3 sum = VECS(y, row, stride); + for(int col = 0; col < row; col++) + sum -= MATHS(A, row, col, stride) * VECS(y, col, stride); + VECS(y, row, stride) = sum / MATHS(A, row, row, stride); + } + + /* Use backward substitution to solve Lt*S = b, replacing b by S. */ + for(int row = n-1; row >= 0; row--) { + float3 sum = VECS(y, row, stride); + for(int col = row+1; col < n; col++) + sum -= MATHS(A, col, row, stride) * VECS(y, col, stride); + VECS(y, row, stride) = sum / MATHS(A, row, row, stride); + } +} + + + + + +/* Perform the Jacobi Eigenvalue Methon on matrix A. + * A is assumed to be a symmetrical matrix, therefore only the lower-triangular part is ever accessed. + * The algorithm overwrites the contents of A. + * + * After returning, A will be overwritten with D, which is (almost) diagonal, + * and V will contain the eigenvectors of the original A in its rows (!), + * so that A = V^T*D*V. Therefore, the diagonal elements of D are the (sorted) eigenvalues of A. 
+ */ +ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float *V, int n, int v_stride) +{ + const float singular_epsilon = 1e-9f; + + for (int row = 0; row < n; row++) + for (int col = 0; col < n; col++) + MATS(V, n, row, col, v_stride) = (col == row) ? 1.0f : 0.0f; + + for (int sweep = 0; sweep < 8; sweep++) { + float off_diagonal = 0.0f; + for (int row = 1; row < n; row++) + for (int col = 0; col < row; col++) + off_diagonal += fabsf(MAT(A, n, row, col)); + if (off_diagonal < 1e-7f) { + /* The matrix has nearly reached diagonal form. + * Since the eigenvalues are only used to determine truncation, their exact values aren't required - a relative error of a few ULPs won't matter at all. */ + break; + } + + /* Set the threshold for the small element rotation skip in the first sweep: + * Skip all elements that are less than a tenth of the average off-diagonal element. */ + float threshold = 0.2f*off_diagonal / (n*n); + + for(int row = 1; row < n; row++) { + for(int col = 0; col < row; col++) { + /* Perform a Jacobi rotation on this element that reduces it to zero. */ + float element = MAT(A, n, row, col); + float abs_element = fabsf(element); + + /* If we're in a later sweep and the element already is very small, just set it to zero and skip the rotation. */ + if (sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) { + MAT(A, n, row, col) = 0.0f; + continue; + } + + if(element == 0.0f) { + continue; + } + + /* If we're in one of the first sweeps and the element is smaller than the threshold, skip it. */ + if(sweep < 3 && (abs_element < threshold)) { + continue; + } + + /* Determine rotation: The rotation is characterized by its angle phi - or, in the actual implementation, sin(phi) and cos(phi). + * To find those, we first compute their ratio - that might be unstable if the angle approaches 90°, so there's a fallback for that case. 
+ * Then, we compute sin(phi) and cos(phi) themselves. */ + float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col); + float ratio; + if (abs_element > singular_epsilon*fabsf(singular_diff)) { + float cot_2phi = 0.5f*singular_diff / element; + ratio = 1.0f / (fabsf(cot_2phi) + sqrtf(1.0f + cot_2phi*cot_2phi)); + if (cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */ + } + else { + ratio = element / singular_diff; + } + + float c = 1.0f / sqrtf(1.0f + ratio*ratio); + float s = ratio*c; + /* To improve numerical stability by avoiding cancellation, the update equations are reformulized to use sin(phi) and tan(phi/2) instead. */ + float tan_phi_2 = s / (1.0f + c); + + /* Update the singular values in the diagonal. */ + float singular_delta = ratio*element; + MAT(A, n, row, row) += singular_delta; + MAT(A, n, col, col) -= singular_delta; + + /* Set the element itself to zero. */ + MAT(A, n, row, col) = 0.0f; + + /* Perform the actual rotations on the matrices. */ +#define ROT(M, r1, c1, r2, c2, stride) \ + { \ + float M1 = MATS(M, n, r1, c1, stride); \ + float M2 = MATS(M, n, r2, c2, stride); \ + MATS(M, n, r1, c1, stride) -= s*(M2 + tan_phi_2*M1); \ + MATS(M, n, r2, c2, stride) += s*(M1 - tan_phi_2*M2); \ + } + + /* Split into three parts to ensure correct accesses since we only store the lower-triangular part of A. */ + for(int i = 0 ; i < col; i++) ROT(A, col, i, row, i, 1); + for(int i = col+1; i < row; i++) ROT(A, i, col, row, i, 1); + for(int i = row+1; i < n ; i++) ROT(A, i, col, i, row, 1); + + for(int i = 0 ; i < n ; i++) ROT(V, col, i, row, i, v_stride); +#undef ROT + } + } + } + + /* Sort eigenvalues and the associated eigenvectors. */ + for (int i = 0; i < n - 1; i++) { + float v = MAT(A, n, i, i); + int k = i; + for (int j = i; j < n; j++) { + if (MAT(A, n, j, j) >= v) { + v = MAT(A, n, j, j); + k = j; + } + } + if (k != i) { + /* Swap eigenvalues. */ + MAT(A, n, k, k) = MAT(A, n, i, i); + MAT(A, n, i, i) = v; + /* Swap eigenvectors. 
*/ + for (int j = 0; j < n; j++) { + float v = MATS(V, n, i, j, v_stride); + MATS(V, n, i, j, v_stride) = MATS(V, n, k, j, v_stride); + MATS(V, n, k, j, v_stride) = v; + } + } + } +} + +#ifdef __KERNEL_SSE3__ + +ccl_device_inline void math_vector_zero_sse(__m128 *A, int n) +{ + for(int i = 0; i < n; i++) + A[i] = _mm_setzero_ps(); +} +ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n) +{ + for(int row = 0; row < n; row++) + for(int col = 0; col <= row; col++) + MAT(A, n, row, col) = _mm_setzero_ps(); +} + +/* Add Gramian matrix of v to A. + * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */ +ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, __m128 ccl_restrict_ptr v, __m128 weight) +{ + for(int row = 0; row < n; row++) + for(int col = 0; col <= row; col++) + MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight)); +} + +ccl_device_inline void math_vector_add_sse(__m128 *V, int n, __m128 ccl_restrict_ptr a) +{ + for(int i = 0; i < n; i++) + V[i] = _mm_add_ps(V[i], a[i]); +} + +ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, __m128 ccl_restrict_ptr a) +{ + for(int i = 0; i < n; i++) + V[i] = _mm_mul_ps(V[i], a[i]); +} + +ccl_device_inline void math_vector_max_sse(__m128 *a, __m128 ccl_restrict_ptr b, int n) +{ + for(int i = 0; i < n; i++) + a[i] = _mm_max_ps(a[i], b[i]); +} + +ccl_device_inline void math_matrix_hsum(float *A, int n, __m128 ccl_restrict_ptr B) +{ + for(int row = 0; row < n; row++) + for(int col = 0; col <= row; col++) + MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col)); +} +#endif + +#undef MAT + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_MATRIX_H__ */ diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index 557809a5719..545a3399f32 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -331,9 +331,9 @@ __forceinline size_t __bscf(size_t& v) static const unsigned int 
BITSCAN_NO_BIT_SET_32 = 32; static const size_t BITSCAN_NO_BIT_SET_64 = 64; +#ifdef __KERNEL_SSE3__ /* Emulation of SSE4 functions with SSE3 */ - -#if defined(__KERNEL_SSE3) && !defined(__KERNEL_SSE4__) +# ifndef __KERNEL_SSE41__ #define _MM_FROUND_TO_NEAREST_INT 0x00 #define _MM_FROUND_TO_NEG_INF 0x01 @@ -362,7 +362,7 @@ __forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) { char* _r = (char*)(&rvalue + 1); char* _v = (char*)(& value + 1); char* _i = (char*)(& input + 1); - for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32*)(_r + i)) = *((int32*)(_v + i))* *((int32*)(_i + i)); + for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32_t*)(_r + i)) = *((int32_t*)(_v + i))* *((int32_t*)(_i + i)); return rvalue; } @@ -395,7 +395,7 @@ __forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int inde #define _mm_extract_ps __emu_mm_extract_ps __forceinline int _mm_extract_ps( __m128 input, const int index ) { - int32* ptr = (int32*)&input; return ptr[index]; + int32_t* ptr = (int32_t*)&input; return ptr[index]; } #define _mm_insert_ps __emu_mm_insert_ps @@ -415,7 +415,7 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags ) return value; } -#ifdef _M_X64 +# ifdef _M_X64 #define _mm_insert_epi64 __emu_mm_insert_epi64 __forceinline __m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) { assert(size_t(index) < 4); ((__int64*)&value)[index] = input; return value; @@ -426,7 +426,40 @@ __forceinline __int64 _mm_extract_epi64( __m128i input, const int index ) { assert(size_t(index) < 2); return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input)); } -#endif +# endif + +# endif + +#define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))) + +/* Return a __m128 with every element set to the largest element of v. 
*/ +ccl_device_inline __m128 _mm_hmax_ps(__m128 v) +{ + /* v[0, 1, 2, 3] => [0, 1, 0, 1] and [2, 3, 2, 3] => v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] */ + v = _mm_max_ps(_mm_movehl_ps(v, v), _mm_movelh_ps(v, v)); + /* v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] => [4 times max(1, 3)] and [4 times max(0, 2)] => v[4 times max(0, 1, 2, 3)] */ + v = _mm_max_ps(_mm_movehdup_ps(v), _mm_moveldup_ps(v)); + return v; +} + +/* Return the sum of the four elements of x. */ +ccl_device_inline float _mm_hsum_ss(__m128 x) +{ + __m128 a = _mm_movehdup_ps(x); + __m128 b = _mm_add_ps(x, a); + return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(a, b), b)); +} + +/* Return a __m128 with every element set to the sum of the four elements of x. */ +ccl_device_inline __m128 _mm_hsum_ps(__m128 x) +{ + x = _mm_hadd_ps(x, x); + x = _mm_hadd_ps(x, x); + return x; +} + +/* Replace elements of x with zero where mask isn't set. */ +#define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask) #endif diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index feab7996aee..0039c59ec48 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -133,6 +133,11 @@ ccl_device_inline size_t align_up(size_t offset, size_t alignment) return (offset + alignment - 1) & ~(alignment - 1); } +ccl_device_inline size_t divide_up(size_t x, size_t y) +{ + return (x + y - 1) / y; +} + ccl_device_inline size_t round_up(size_t x, size_t multiple) { return ((x + multiple - 1) / multiple) * multiple; diff --git a/source/blender/makesrna/intern/rna_render.c b/source/blender/makesrna/intern/rna_render.c index 129cc591d9f..44dcb72264a 100644 --- a/source/blender/makesrna/intern/rna_render.c +++ b/source/blender/makesrna/intern/rna_render.c @@ -549,6 +549,7 @@ static void rna_def_render_engine(BlenderRNA *brna) parm = RNA_def_pointer(func, "result", "RenderResult", "Result", ""); RNA_def_parameter_flags(parm, 0, PARM_REQUIRED); RNA_def_boolean(func, "cancel", 0, 
"Cancel", "Don't mark tile as done, don't merge results unless forced"); + RNA_def_boolean(func, "highlight", 0, "Highlight", "Don't mark tile as done yet"); RNA_def_boolean(func, "do_merge_results", 0, "Merge Results", "Merge results even if cancel=true"); func = RNA_def_function(srna, "add_pass", "RE_engine_add_pass"); diff --git a/source/blender/render/extern/include/RE_engine.h b/source/blender/render/extern/include/RE_engine.h index 491d44b3a4e..52491673612 100644 --- a/source/blender/render/extern/include/RE_engine.h +++ b/source/blender/render/extern/include/RE_engine.h @@ -141,7 +141,7 @@ void RE_result_load_from_file(struct RenderResult *result, struct ReportList *re struct RenderResult *RE_engine_begin_result(RenderEngine *engine, int x, int y, int w, int h, const char *layername, const char *viewname); void RE_engine_update_result(RenderEngine *engine, struct RenderResult *result); void RE_engine_add_pass(RenderEngine *engine, const char *name, int channels, const char *chan_id, const char *layername); -void RE_engine_end_result(RenderEngine *engine, struct RenderResult *result, int cancel, int merge_results); +void RE_engine_end_result(RenderEngine *engine, struct RenderResult *result, int cancel, int highlight, int merge_results); const char *RE_engine_active_view_get(RenderEngine *engine); void RE_engine_active_view_set(RenderEngine *engine, const char *viewname); diff --git a/source/blender/render/intern/source/external_engine.c b/source/blender/render/intern/source/external_engine.c index 97413c31e16..ee6dc96276c 100644 --- a/source/blender/render/intern/source/external_engine.c +++ b/source/blender/render/intern/source/external_engine.c @@ -259,7 +259,7 @@ void RE_engine_add_pass(RenderEngine *engine, const char *name, int channels, co render_result_add_pass(re->result, name, channels, chan_id, layername, NULL); } -void RE_engine_end_result(RenderEngine *engine, RenderResult *result, int cancel, int merge_results) +void 
RE_engine_end_result(RenderEngine *engine, RenderResult *result, int cancel, int highlight, int merge_results) { Render *re = engine->re; @@ -268,7 +268,7 @@ void RE_engine_end_result(RenderEngine *engine, RenderResult *result, int cancel } /* merge. on break, don't merge in result for preview renders, looks nicer */ - if (!cancel) { + if (!highlight) { /* for exr tile render, detect tiles that are done */ RenderPart *pa = get_part_from_result(re, result); diff --git a/source/blenderplayer/bad_level_call_stubs/stubs.c b/source/blenderplayer/bad_level_call_stubs/stubs.c index bbff6790c79..9393459a56c 100644 --- a/source/blenderplayer/bad_level_call_stubs/stubs.c +++ b/source/blenderplayer/bad_level_call_stubs/stubs.c @@ -655,7 +655,7 @@ void RE_engine_update_result(struct RenderEngine *engine, struct RenderResult *r void RE_engine_update_progress(struct RenderEngine *engine, float progress) RET_NONE void RE_engine_set_error_message(RenderEngine *engine, const char *msg) RET_NONE void RE_engine_add_pass(RenderEngine *engine, const char *name, int channels, const char *chan_id, const char *layername) RET_NONE -void RE_engine_end_result(RenderEngine *engine, struct RenderResult *result, int cancel, int merge_results) RET_NONE +void RE_engine_end_result(RenderEngine *engine, struct RenderResult *result, int cancel, int highlight, int merge_results) RET_NONE void RE_engine_update_stats(RenderEngine *engine, const char *stats, const char *info) RET_NONE void RE_layer_load_from_file(struct RenderLayer *layer, struct ReportList *reports, const char *filename, int x, int y) RET_NONE void RE_result_load_from_file(struct RenderResult *result, struct ReportList *reports, const char *filename) RET_NONE |