Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/intern
diff options
context:
space:
mode:
Diffstat (limited to 'intern')
-rw-r--r--intern/cycles/blender/addon/engine.py12
-rw-r--r--intern/cycles/blender/addon/properties.py74
-rw-r--r--intern/cycles/blender/addon/ui.py72
-rw-r--r--intern/cycles/blender/blender_session.cpp52
-rw-r--r--intern/cycles/blender/blender_session.h4
-rw-r--r--intern/cycles/blender/blender_sync.cpp38
-rw-r--r--intern/cycles/blender/blender_sync.h1
-rw-r--r--intern/cycles/device/CMakeLists.txt2
-rw-r--r--intern/cycles/device/device.cpp12
-rw-r--r--intern/cycles/device/device.h13
-rw-r--r--intern/cycles/device/device_cpu.cpp846
-rw-r--r--intern/cycles/device/device_cuda.cpp495
-rw-r--r--intern/cycles/device/device_denoising.cpp218
-rw-r--r--intern/cycles/device/device_denoising.h145
-rw-r--r--intern/cycles/device/device_memory.h44
-rw-r--r--intern/cycles/device/device_multi.cpp54
-rw-r--r--intern/cycles/device/device_split_kernel.cpp6
-rw-r--r--intern/cycles/device/device_split_kernel.h6
-rw-r--r--intern/cycles/device/device_task.cpp6
-rw-r--r--intern/cycles/device/device_task.h14
-rw-r--r--intern/cycles/device/opencl/opencl.h53
-rw-r--r--intern/cycles/device/opencl/opencl_base.cpp465
-rw-r--r--intern/cycles/device/opencl/opencl_mega.cpp54
-rw-r--r--intern/cycles/device/opencl/opencl_split.cpp45
-rw-r--r--intern/cycles/device/opencl/opencl_util.cpp14
-rw-r--r--intern/cycles/kernel/CMakeLists.txt100
-rw-r--r--intern/cycles/kernel/closure/bsdf.h18
-rw-r--r--intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_diffuse.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_diffuse_ramp.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_microfacet.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_microfacet_multi.h2
-rw-r--r--intern/cycles/kernel/closure/bsdf_oren_nayar.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_phong_ramp.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_principled_diffuse.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_principled_sheen.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_toon.h1
-rw-r--r--intern/cycles/kernel/closure/bssrdf.h1
-rw-r--r--intern/cycles/kernel/filter/filter.h52
-rw-r--r--intern/cycles/kernel/filter/filter_defines.h38
-rw-r--r--intern/cycles/kernel/filter/filter_features.h120
-rw-r--r--intern/cycles/kernel/filter/filter_features_sse.h95
-rw-r--r--intern/cycles/kernel/filter/filter_kernel.h50
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_cpu.h163
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_gpu.h147
-rw-r--r--intern/cycles/kernel/filter/filter_prefilter.h145
-rw-r--r--intern/cycles/kernel/filter/filter_reconstruction.h103
-rw-r--r--intern/cycles/kernel/filter/filter_transform.h113
-rw-r--r--intern/cycles/kernel/filter/filter_transform_gpu.h117
-rw-r--r--intern/cycles/kernel/filter/filter_transform_sse.h102
-rw-r--r--intern/cycles/kernel/geom/geom_triangle.h6
-rw-r--r--intern/cycles/kernel/kernel.h38
-rw-r--r--intern/cycles/kernel/kernel_accumulate.h98
-rw-r--r--intern/cycles/kernel/kernel_compat_cpu.h2
-rw-r--r--intern/cycles/kernel/kernel_compat_cuda.h4
-rw-r--r--intern/cycles/kernel/kernel_compat_opencl.h2
-rw-r--r--intern/cycles/kernel/kernel_light.h2
-rw-r--r--intern/cycles/kernel/kernel_passes.h203
-rw-r--r--intern/cycles/kernel/kernel_path.h89
-rw-r--r--intern/cycles/kernel/kernel_path_branched.h117
-rw-r--r--intern/cycles/kernel/kernel_path_state.h14
-rw-r--r--intern/cycles/kernel/kernel_path_surface.h23
-rw-r--r--intern/cycles/kernel/kernel_path_volume.h8
-rw-r--r--intern/cycles/kernel/kernel_projection.h3
-rw-r--r--intern/cycles/kernel/kernel_shader.h6
-rw-r--r--intern/cycles/kernel/kernel_types.h89
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter.cpp61
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_avx.cpp39
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_avx2.cpp40
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu.h132
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h259
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse2.cpp34
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse3.cpp36
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse41.cpp37
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_avx.cpp30
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp32
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu.h2
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h79
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp29
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp32
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp20
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp24
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp26
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp20
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp24
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp26
-rw-r--r--intern/cycles/kernel/kernels/cuda/filter.cu235
-rw-r--r--intern/cycles/kernel/kernels/opencl/filter.cl262
-rw-r--r--intern/cycles/kernel/split/kernel_branched.h24
-rw-r--r--intern/cycles/kernel/split/kernel_buffer_update.h15
-rw-r--r--intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h4
-rw-r--r--intern/cycles/kernel/split/kernel_shadow_blocked_dl.h4
-rw-r--r--intern/cycles/kernel/svm/svm_closure.h2
-rw-r--r--intern/cycles/kernel/svm/svm_displace.h9
-rw-r--r--intern/cycles/kernel/svm/svm_geometry.h1
-rw-r--r--intern/cycles/kernel/svm/svm_image.h4
-rw-r--r--intern/cycles/kernel/svm/svm_types.h13
-rw-r--r--intern/cycles/render/buffers.cpp63
-rw-r--r--intern/cycles/render/buffers.h15
-rw-r--r--intern/cycles/render/film.cpp22
-rw-r--r--intern/cycles/render/film.h7
-rw-r--r--intern/cycles/render/mesh.cpp2
-rw-r--r--intern/cycles/render/session.cpp215
-rw-r--r--intern/cycles/render/session.h26
-rw-r--r--intern/cycles/render/tile.cpp199
-rw-r--r--intern/cycles/render/tile.h33
-rw-r--r--intern/cycles/util/CMakeLists.txt1
-rw-r--r--intern/cycles/util/util_math.h151
-rw-r--r--intern/cycles/util/util_math_float3.h15
-rw-r--r--intern/cycles/util/util_math_int4.h9
-rw-r--r--intern/cycles/util/util_math_matrix.h379
-rw-r--r--intern/cycles/util/util_simd.h45
-rw-r--r--intern/cycles/util/util_types.h5
113 files changed, 6428 insertions, 1138 deletions
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index d74ae5f3061..8fb2d08cedd 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -241,3 +241,15 @@ def register_passes(engine, scene, srl):
if crl.pass_debug_bvh_traversed_instances: engine.register_pass(scene, srl, "Debug BVH Traversed Instances", 1, "X", 'VALUE')
if crl.pass_debug_bvh_intersections: engine.register_pass(scene, srl, "Debug BVH Intersections", 1, "X", 'VALUE')
if crl.pass_debug_ray_bounces: engine.register_pass(scene, srl, "Debug Ray Bounces", 1, "X", 'VALUE')
+
+ if crl.use_denoising and crl.denoising_store_passes:
+ engine.register_pass(scene, srl, "Denoising Normal", 3, "XYZ", 'VECTOR');
+ engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR');
+ engine.register_pass(scene, srl, "Denoising Albedo", 3, "RGB", 'COLOR');
+ engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR');
+ engine.register_pass(scene, srl, "Denoising Depth", 1, "Z", 'VALUE');
+ engine.register_pass(scene, srl, "Denoising Depth Variance", 1, "Z", 'VALUE');
+ engine.register_pass(scene, srl, "Denoising Shadow A", 3, "XYV", 'VECTOR');
+ engine.register_pass(scene, srl, "Denoising Shadow B", 3, "XYV", 'VECTOR');
+ engine.register_pass(scene, srl, "Denoising Image", 3, "RGB", 'COLOR');
+ engine.register_pass(scene, srl, "Denoising Image Variance", 3, "RGB", 'COLOR'); \ No newline at end of file
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 78ed53ab1b5..2f9874e850e 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -1189,6 +1189,80 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
default=False,
)
+ cls.use_denoising = BoolProperty(
+ name="Use Denoising",
+ description="Denoise the rendered image",
+ default=False,
+ )
+ cls.denoising_diffuse_direct = BoolProperty(
+ name="Diffuse Direct",
+ description="Denoise the direct diffuse lighting",
+ default=True,
+ )
+ cls.denoising_diffuse_indirect = BoolProperty(
+ name="Diffuse Indirect",
+ description="Denoise the indirect diffuse lighting",
+ default=True,
+ )
+ cls.denoising_glossy_direct = BoolProperty(
+ name="Glossy Direct",
+ description="Denoise the direct glossy lighting",
+ default=True,
+ )
+ cls.denoising_glossy_indirect = BoolProperty(
+ name="Glossy Indirect",
+ description="Denoise the indirect glossy lighting",
+ default=True,
+ )
+ cls.denoising_transmission_direct = BoolProperty(
+ name="Transmission Direct",
+ description="Denoise the direct transmission lighting",
+ default=True,
+ )
+ cls.denoising_transmission_indirect = BoolProperty(
+ name="Transmission Indirect",
+ description="Denoise the indirect transmission lighting",
+ default=True,
+ )
+ cls.denoising_subsurface_direct = BoolProperty(
+ name="Subsurface Direct",
+ description="Denoise the direct subsurface lighting",
+ default=True,
+ )
+ cls.denoising_subsurface_indirect = BoolProperty(
+ name="Subsurface Indirect",
+ description="Denoise the indirect subsurface lighting",
+ default=True,
+ )
+ cls.denoising_strength = FloatProperty(
+ name="Denoising Strength",
+ description="Controls neighbor pixel weighting for the denoising filter (lower values preserve more detail, but aren't as smooth)",
+ min=0.0, max=1.0,
+ default=0.5,
+ )
+ cls.denoising_feature_strength = FloatProperty(
+ name="Denoising Feature Strength",
+ description="Controls removal of noisy image feature passes (lower values preserve more detail, but aren't as smooth)",
+ min=0.0, max=1.0,
+ default=0.5,
+ )
+ cls.denoising_radius = IntProperty(
+ name="Denoising Radius",
+ description="Size of the image area that's used to denoise a pixel (higher values are smoother, but might lose detail and are slower)",
+ min=1, max=50,
+ default=8,
+ )
+ cls.denoising_relative_pca = BoolProperty(
+ name="Relative filter",
+ description="When removing that don't carry information, use a relative threshold instead of an absolute one (can help to reduce artifacts, but might cause detail loss around edges)",
+ default=False,
+ )
+ cls.denoising_store_passes = BoolProperty(
+ name="Store denoising passes",
+ description="Store the denoising feature passes and the noisy image",
+ default=False,
+ )
+
@classmethod
def unregister(cls):
del bpy.types.SceneRenderLayer.cycles
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 7ada8b16fc4..c8d63daee5f 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -530,6 +530,12 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel):
col.prop(rl, "use_pass_emit", text="Emission")
col.prop(rl, "use_pass_environment")
+ if context.scene.cycles.feature_set == 'EXPERIMENTAL':
+ col.separator()
+ sub = col.column()
+ sub.active = crl.use_denoising
+ sub.prop(crl, "denoising_store_passes", text="Denoising")
+
if _cycles.with_cycles_debug:
col = layout.column()
col.prop(crl, "pass_debug_bvh_traversed_nodes")
@@ -581,6 +587,71 @@ class CyclesRender_PT_views(CyclesButtonsPanel, Panel):
row.prop(rv, "camera_suffix", text="")
+class CyclesRender_PT_denoising(CyclesButtonsPanel, Panel):
+ bl_label = "Denoising"
+ bl_context = "render_layer"
+ bl_options = {'DEFAULT_CLOSED'}
+
+ def draw_header(self, context):
+ rd = context.scene.render
+ rl = rd.layers.active
+ crl = rl.cycles
+ cscene = context.scene.cycles
+ layout = self.layout
+
+ layout.active = not cscene.use_progressive_refine
+ layout.prop(crl, "use_denoising", text="")
+
+ def draw(self, context):
+ layout = self.layout
+
+ scene = context.scene
+ cscene = scene.cycles
+ rd = scene.render
+ rl = rd.layers.active
+ crl = rl.cycles
+
+ layout.active = crl.use_denoising and not cscene.use_progressive_refine
+
+ split = layout.split()
+
+ col = split.column()
+ sub = col.column(align=True)
+ sub.prop(crl, "denoising_radius", text="Radius")
+ sub.prop(crl, "denoising_strength", slider=True, text="Strength")
+
+ col = split.column()
+ sub = col.column(align=True)
+ sub.prop(crl, "denoising_feature_strength", slider=True, text="Feature Strength")
+ sub.prop(crl, "denoising_relative_pca")
+
+ layout.separator()
+
+ row = layout.row()
+ row.label(text="Diffuse:")
+ sub = row.row(align=True)
+ sub.prop(crl, "denoising_diffuse_direct", text="Direct", toggle=True)
+ sub.prop(crl, "denoising_diffuse_indirect", text="Indirect", toggle=True)
+
+ row = layout.row()
+ row.label(text="Glossy:")
+ sub = row.row(align=True)
+ sub.prop(crl, "denoising_glossy_direct", text="Direct", toggle=True)
+ sub.prop(crl, "denoising_glossy_indirect", text="Indirect", toggle=True)
+
+ row = layout.row()
+ row.label(text="Transmission:")
+ sub = row.row(align=True)
+ sub.prop(crl, "denoising_transmission_direct", text="Direct", toggle=True)
+ sub.prop(crl, "denoising_transmission_indirect", text="Indirect", toggle=True)
+
+ row = layout.row()
+ row.label(text="Subsurface:")
+ sub = row.row(align=True)
+ sub.prop(crl, "denoising_subsurface_direct", text="Direct", toggle=True)
+ sub.prop(crl, "denoising_subsurface_indirect", text="Indirect", toggle=True)
+
+
class Cycles_PT_post_processing(CyclesButtonsPanel, Panel):
bl_label = "Post Processing"
bl_options = {'DEFAULT_CLOSED'}
@@ -1732,6 +1803,7 @@ classes = (
CyclesRender_PT_layer_options,
CyclesRender_PT_layer_passes,
CyclesRender_PT_views,
+ CyclesRender_PT_denoising,
Cycles_PT_post_processing,
CyclesCamera_PT_dof,
Cycles_PT_context_material,
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 6f2e7065d97..ecc668af782 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -303,12 +303,13 @@ static BL::RenderResult begin_render_result(BL::RenderEngine& b_engine,
static void end_render_result(BL::RenderEngine& b_engine,
BL::RenderResult& b_rr,
bool cancel,
+ bool highlight,
bool do_merge_results)
{
- b_engine.end_result(b_rr, (int)cancel, (int)do_merge_results);
+ b_engine.end_result(b_rr, (int)cancel, (int) highlight, (int)do_merge_results);
}
-void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only)
+void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight)
{
BufferParams& params = rtile.buffers->params;
int x = params.full_x - session->tile_manager.params.full_x;
@@ -344,37 +345,37 @@ void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_upda
update_render_result(b_rr, b_rlay, rtile);
}
- end_render_result(b_engine, b_rr, true, true);
+ end_render_result(b_engine, b_rr, true, highlight, true);
}
else {
/* write result */
write_render_result(b_rr, b_rlay, rtile);
- end_render_result(b_engine, b_rr, false, true);
+ end_render_result(b_engine, b_rr, false, false, true);
}
}
void BlenderSession::write_render_tile(RenderTile& rtile)
{
- do_write_update_render_tile(rtile, false);
+ do_write_update_render_tile(rtile, false, false);
}
-void BlenderSession::update_render_tile(RenderTile& rtile)
+void BlenderSession::update_render_tile(RenderTile& rtile, bool highlight)
{
/* use final write for preview renders, otherwise render result wouldn't be
* be updated in blender side
* would need to be investigated a bit further, but for now shall be fine
*/
if(!b_engine.is_preview())
- do_write_update_render_tile(rtile, true);
+ do_write_update_render_tile(rtile, true, highlight);
else
- do_write_update_render_tile(rtile, false);
+ do_write_update_render_tile(rtile, false, false);
}
void BlenderSession::render()
{
/* set callback to write out render results */
session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1);
- session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1);
+ session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1, _2);
/* get buffer parameters */
SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
@@ -395,7 +396,7 @@ void BlenderSession::render()
/* layer will be missing if it was disabled in the UI */
if(b_single_rlay == b_rr.layers.end()) {
- end_render_result(b_engine, b_rr, true, false);
+ end_render_result(b_engine, b_rr, true, true, false);
continue;
}
@@ -411,6 +412,29 @@ void BlenderSession::render()
}
buffer_params.passes = passes;
+
+ PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles");
+ bool use_denoising = !session_params.progressive_refine && get_boolean(crl, "use_denoising");
+ buffer_params.denoising_data_pass = use_denoising;
+ session->tile_manager.schedule_denoising = use_denoising;
+ session->params.use_denoising = use_denoising;
+ scene->film->denoising_data_pass = buffer_params.denoising_data_pass;
+ scene->film->denoising_flags = 0;
+ if(!get_boolean(crl, "denoising_diffuse_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_DIR;
+ if(!get_boolean(crl, "denoising_diffuse_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_IND;
+ if(!get_boolean(crl, "denoising_glossy_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_DIR;
+ if(!get_boolean(crl, "denoising_glossy_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_IND;
+ if(!get_boolean(crl, "denoising_transmission_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_DIR;
+ if(!get_boolean(crl, "denoising_transmission_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_IND;
+ if(!get_boolean(crl, "denoising_subsurface_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_DIR;
+ if(!get_boolean(crl, "denoising_subsurface_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_IND;
+ scene->film->denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES);
+ buffer_params.denoising_clean_pass = scene->film->denoising_clean_pass;
+ session->params.denoising_radius = get_int(crl, "denoising_radius");
+ session->params.denoising_strength = get_float(crl, "denoising_strength");
+ session->params.denoising_feature_strength = get_float(crl, "denoising_feature_strength");
+ session->params.denoising_relative_pca = get_boolean(crl, "denoising_relative_pca");
+
scene->film->pass_alpha_threshold = b_layer_iter->pass_alpha_threshold();
scene->film->tag_passes_update(scene, passes);
scene->film->tag_update(scene);
@@ -464,7 +488,7 @@ void BlenderSession::render()
}
/* free result without merging */
- end_render_result(b_engine, b_rr, true, false);
+ end_render_result(b_engine, b_rr, true, true, false);
if(session->progress.get_cancel())
break;
@@ -670,6 +694,12 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult& b_rr,
/* copy pixels */
read = buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0]);
}
+ else {
+ int denoising_offset = BlenderSync::get_denoising_pass(b_pass);
+ if(denoising_offset >= 0) {
+ read = buffers->get_denoising_pass_rect(denoising_offset, exposure, sample, components, &pixels[0]);
+ }
+ }
if(!read) {
memset(&pixels[0], 0, pixels.size()*sizeof(float));
diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h
index 2daf66fa373..254e8a049ff 100644
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -81,7 +81,7 @@ public:
void update_render_result(BL::RenderResult& b_rr,
BL::RenderLayer& b_rlay,
RenderTile& rtile);
- void update_render_tile(RenderTile& rtile);
+ void update_render_tile(RenderTile& rtile, bool highlight);
/* interactive updates */
void synchronize();
@@ -150,7 +150,7 @@ protected:
BL::RenderLayer& b_rlay,
RenderTile& rtile,
bool do_update_only);
- void do_write_update_render_tile(RenderTile& rtile, bool do_update_only);
+ void do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight);
int builtin_image_frame(const string &builtin_name);
void builtin_image_info(const string &builtin_name,
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index a1e485635ec..d4ae1fc6c9d 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -509,6 +509,30 @@ PassType BlenderSync::get_pass_type(BL::RenderPass& b_pass)
return PASS_NONE;
}
+int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass)
+{
+ string name = b_pass.name();
+ if(name.substr(0, 10) != "Denoising ") {
+ return -1;
+ }
+ name = name.substr(10);
+
+#define MAP_PASS(passname, offset) if(name == passname) return offset;
+ MAP_PASS("Normal", DENOISING_PASS_NORMAL);
+ MAP_PASS("Normal Variance", DENOISING_PASS_NORMAL_VAR);
+ MAP_PASS("Albedo", DENOISING_PASS_ALBEDO);
+ MAP_PASS("Albedo Variance", DENOISING_PASS_ALBEDO_VAR);
+ MAP_PASS("Depth", DENOISING_PASS_DEPTH);
+ MAP_PASS("Depth Variance", DENOISING_PASS_DEPTH_VAR);
+ MAP_PASS("Shadow A", DENOISING_PASS_SHADOW_A);
+ MAP_PASS("Shadow B", DENOISING_PASS_SHADOW_B);
+ MAP_PASS("Image", DENOISING_PASS_COLOR);
+ MAP_PASS("Image Variance", DENOISING_PASS_COLOR_VAR);
+#undef MAP_PASS
+
+ return -1;
+}
+
array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
BL::SceneRenderLayer& b_srlay)
{
@@ -528,8 +552,20 @@ array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
Pass::add(pass_type, passes);
}
-#ifdef __KERNEL_DEBUG__
PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles");
+ if(get_boolean(crp, "denoising_store_passes")) {
+ b_engine.add_pass("Denoising Normal", 3, "XYZ", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Albedo", 3, "RGB", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Depth", 1, "Z", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Depth Variance", 1, "Z", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Shadow A", 3, "XYV", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Shadow B", 3, "XYV", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Image", 3, "RGB", b_srlay.name().c_str());
+ b_engine.add_pass("Denoising Image Variance", 3, "RGB", b_srlay.name().c_str());
+ }
+#ifdef __KERNEL_DEBUG__
if(get_boolean(crp, "pass_debug_bvh_traversed_nodes")) {
b_engine.add_pass("Debug BVH Traversed Nodes", 1, "X", b_srlay.name().c_str());
Pass::add(PASS_BVH_TRAVERSED_NODES, passes);
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index 02afac3320a..e509cce3e31 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -97,6 +97,7 @@ public:
int width, int height);
static PassType get_pass_type(BL::RenderPass& b_pass);
+ static int get_denoising_pass(BL::RenderPass& b_pass);
private:
/* sync */
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 6ef2aa1caad..74ec57ddf74 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -25,6 +25,7 @@ set(SRC
device.cpp
device_cpu.cpp
device_cuda.cpp
+ device_denoising.cpp
device_multi.cpp
device_opencl.cpp
device_split_kernel.cpp
@@ -48,6 +49,7 @@ endif()
set(SRC_HEADERS
device.h
+ device_denoising.h
device_memory.h
device_intern.h
device_network.h
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 4c4e862ed1f..949c5f932a4 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -549,4 +549,16 @@ void Device::free_memory()
devices.free_memory();
}
+
+device_sub_ptr::device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type)
+ : device(device)
+{
+ ptr = device->mem_alloc_sub_ptr(mem, offset, size, type);
+}
+
+device_sub_ptr::~device_sub_ptr()
+{
+ device->mem_free_sub_ptr(ptr);
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 21d29a801ae..c22969d7dc6 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -228,6 +228,7 @@ struct DeviceDrawParams {
};
class Device {
+ friend class device_sub_ptr;
protected:
enum {
FALLBACK_SHADER_STATUS_NONE = 0,
@@ -250,6 +251,14 @@ protected:
bool bind_fallback_display_space_shader(const float width, const float height);
+ virtual device_ptr mem_alloc_sub_ptr(device_memory& /*mem*/, int /*offset*/, int /*size*/, MemoryType /*type*/)
+ {
+ /* Only required for devices that implement denoising. */
+ assert(false);
+ return (device_ptr) 0;
+ }
+ virtual void mem_free_sub_ptr(device_ptr /*ptr*/) {};
+
public:
virtual ~Device();
@@ -278,6 +287,8 @@ public:
virtual void mem_zero(device_memory& mem) = 0;
virtual void mem_free(device_memory& mem) = 0;
+ virtual int mem_address_alignment() { return 16; }
+
/* constant memory */
virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
@@ -326,6 +337,8 @@ public:
/* multi device */
virtual void map_tile(Device * /*sub_device*/, RenderTile& /*tile*/) {}
virtual int device_number(Device * /*sub_device*/) { return 0; }
+ virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {}
+ virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {}
/* static */
static Device *create(DeviceInfo& info, Stats &stats, bool background = true);
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 84cce605182..1ecce8bd565 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -25,6 +25,7 @@
#endif
#include "device/device.h"
+#include "device/device_denoising.h"
#include "device/device_intern.h"
#include "device/device_split_kernel.h"
@@ -34,6 +35,8 @@
#include "kernel/split/kernel_split_data.h"
#include "kernel/kernel_globals.h"
+#include "kernel/filter/filter.h"
+
#include "kernel/osl/osl_shader.h"
#include "kernel/osl/osl_globals.h"
@@ -53,91 +56,107 @@ CCL_NAMESPACE_BEGIN
class CPUDevice;
-class CPUSplitKernel : public DeviceSplitKernel {
- CPUDevice *device;
-public:
- explicit CPUSplitKernel(CPUDevice *device);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
- RenderTile& rtile,
- int num_global_elements,
- device_memory& kernel_globals,
- device_memory& kernel_data_,
- device_memory& split_data,
- device_memory& ray_state,
- device_memory& queue_index,
- device_memory& use_queues_flag,
- device_memory& work_pool_wgs);
+/* Has to be outside of the class to be shared across template instantiations. */
+static const char *logged_architecture = "";
- virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
- virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
- virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
-};
-
-class CPUDevice : public Device
-{
- static unordered_map<string, void*> kernel_functions;
-
- static void register_kernel_function(const char* name, void* func)
+template<typename F>
+class KernelFunctions {
+public:
+ KernelFunctions()
{
- kernel_functions[name] = func;
+ kernel = (F)NULL;
}
- static const char* get_arch_name()
+ KernelFunctions(F kernel_default,
+ F kernel_sse2,
+ F kernel_sse3,
+ F kernel_sse41,
+ F kernel_avx,
+ F kernel_avx2)
{
+ const char *architecture_name = "default";
+ kernel = kernel_default;
+
+ /* Silence potential warnings about unused variables
+ * when compiling without some architectures. */
+ (void)kernel_sse2;
+ (void)kernel_sse3;
+ (void)kernel_sse41;
+ (void)kernel_avx;
+ (void)kernel_avx2;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
if(system_cpu_support_avx2()) {
- return "cpu_avx2";
+ architecture_name = "AVX2";
+ kernel = kernel_avx2;
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
- return "cpu_avx";
+ architecture_name = "AVX";
+ kernel = kernel_avx;
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
- return "cpu_sse41";
+ architecture_name = "SSE4.1";
+ kernel = kernel_sse41;
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
- return "cpu_sse3";
+ architecture_name = "SSE3";
+ kernel = kernel_sse3;
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if(system_cpu_support_sse2()) {
- return "cpu_sse2";
+ architecture_name = "SSE2";
+ kernel = kernel_sse2;
}
- else
#endif
- {
- return "cpu";
+
+ if(strstr(architecture_name, logged_architecture) != 0) {
+ VLOG(1) << "Will be using " << architecture_name << " kernels.";
+ logged_architecture = architecture_name;
}
}
- template<typename F>
- static F get_kernel_function(string name)
- {
- name = string("kernel_") + get_arch_name() + "_" + name;
-
- unordered_map<string, void*>::iterator it = kernel_functions.find(name);
+ inline F operator()() const {
+ assert(kernel);
+ return kernel;
+ }
+protected:
+ F kernel;
+};
- if(it == kernel_functions.end()) {
- assert(!"kernel function not found");
- return NULL;
- }
+class CPUSplitKernel : public DeviceSplitKernel {
+ CPUDevice *device;
+public:
+ explicit CPUSplitKernel(CPUDevice *device);
- return (F)it->second;
- }
+ virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+ RenderTile& rtile,
+ int num_global_elements,
+ device_memory& kernel_globals,
+ device_memory& kernel_data_,
+ device_memory& split_data,
+ device_memory& ray_state,
+ device_memory& queue_index,
+ device_memory& use_queues_flag,
+ device_memory& work_pool_wgs);
- friend class CPUSplitKernel;
+ virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
+ virtual int2 split_kernel_local_size();
+ virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+ virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+};
+class CPUDevice : public Device
+{
public:
TaskPool task_pool;
KernelGlobals kernel_globals;
@@ -149,77 +168,89 @@ public:
bool use_split_kernel;
DeviceRequestedFeatures requested_features;
-
+
+ KernelFunctions<void(*)(KernelGlobals *, float *, unsigned int *, int, int, int, int, int)> path_trace_kernel;
+ KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel;
+ KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel;
+ KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel;
+
+ KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel;
+ KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int, bool)> filter_get_feature_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
+
+ KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
+
+ KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel;
+ KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel;
+
+ KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
+ ccl_global uint*, int, int, int, int, int, int, int, int, ccl_global int*, int,
+ ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)> data_init_kernel;
+ unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels;
+
+#define KERNEL_FUNCTIONS(name) \
+ KERNEL_NAME_EVAL(cpu, name), \
+ KERNEL_NAME_EVAL(cpu_sse2, name), \
+ KERNEL_NAME_EVAL(cpu_sse3, name), \
+ KERNEL_NAME_EVAL(cpu_sse41, name), \
+ KERNEL_NAME_EVAL(cpu_avx, name), \
+ KERNEL_NAME_EVAL(cpu_avx2, name)
+
CPUDevice(DeviceInfo& info, Stats &stats, bool background)
- : Device(info, stats, background)
+ : Device(info, stats, background),
+#define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name))
+ REGISTER_KERNEL(path_trace),
+ REGISTER_KERNEL(convert_to_half_float),
+ REGISTER_KERNEL(convert_to_byte),
+ REGISTER_KERNEL(shader),
+ REGISTER_KERNEL(filter_divide_shadow),
+ REGISTER_KERNEL(filter_get_feature),
+ REGISTER_KERNEL(filter_combine_halves),
+ REGISTER_KERNEL(filter_nlm_calc_difference),
+ REGISTER_KERNEL(filter_nlm_blur),
+ REGISTER_KERNEL(filter_nlm_calc_weight),
+ REGISTER_KERNEL(filter_nlm_update_output),
+ REGISTER_KERNEL(filter_nlm_normalize),
+ REGISTER_KERNEL(filter_construct_transform),
+ REGISTER_KERNEL(filter_nlm_construct_gramian),
+ REGISTER_KERNEL(filter_finalize),
+ REGISTER_KERNEL(data_init)
+#undef REGISTER_KERNEL
{
#ifdef WITH_OSL
kernel_globals.osl = &osl_globals;
#endif
-
- /* do now to avoid thread issues */
- system_cpu_support_sse2();
- system_cpu_support_sse3();
- system_cpu_support_sse41();
- system_cpu_support_avx();
- system_cpu_support_avx2();
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- VLOG(1) << "Will be using AVX2 kernels.";
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- VLOG(1) << "Will be using AVX kernels.";
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- VLOG(1) << "Will be using SSE4.1 kernels.";
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- VLOG(1) << "Will be using SSE3kernels.";
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- VLOG(1) << "Will be using SSE2 kernels.";
- }
- else
-#endif
- {
- VLOG(1) << "Will be using regular kernels.";
- }
-
use_split_kernel = DebugFlags().cpu.split_kernel;
if(use_split_kernel) {
VLOG(1) << "Will be using split kernel.";
}
- kernel_cpu_register_functions(register_kernel_function);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- kernel_cpu_sse2_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- kernel_cpu_sse3_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- kernel_cpu_sse41_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- kernel_cpu_avx_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- kernel_cpu_avx2_register_functions(register_kernel_function);
-#endif
+#define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name))
+ REGISTER_SPLIT_KERNEL(path_init);
+ REGISTER_SPLIT_KERNEL(scene_intersect);
+ REGISTER_SPLIT_KERNEL(lamp_emission);
+ REGISTER_SPLIT_KERNEL(do_volume);
+ REGISTER_SPLIT_KERNEL(queue_enqueue);
+ REGISTER_SPLIT_KERNEL(indirect_background);
+ REGISTER_SPLIT_KERNEL(shader_setup);
+ REGISTER_SPLIT_KERNEL(shader_sort);
+ REGISTER_SPLIT_KERNEL(shader_eval);
+ REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
+ REGISTER_SPLIT_KERNEL(subsurface_scatter);
+ REGISTER_SPLIT_KERNEL(direct_lighting);
+ REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
+ REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
+ REGISTER_SPLIT_KERNEL(next_iteration_setup);
+ REGISTER_SPLIT_KERNEL(indirect_subsurface);
+ REGISTER_SPLIT_KERNEL(buffer_update);
+#undef REGISTER_SPLIT_KERNEL
+#undef KERNEL_FUNCTIONS
}
~CPUDevice()
@@ -273,13 +304,17 @@ public:
if(!mem.data_pointer) {
free((void*)mem.device_pointer);
}
-
mem.device_pointer = 0;
stats.mem_free(mem.device_size);
mem.device_size = 0;
}
}
+ virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
+ {
+ return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
+ }
+
void const_copy_to(const char *name, void *host, size_t size)
{
kernel_const_copy(&kernel_globals, name, host, size);
@@ -326,13 +361,8 @@ public:
void thread_run(DeviceTask *task)
{
- if(task->type == DeviceTask::PATH_TRACE) {
- if(!use_split_kernel) {
- thread_path_trace(*task);
- }
- else {
- thread_path_trace_split(*task);
- }
+ if(task->type == DeviceTask::RENDER) {
+ thread_render(*task);
}
else if(task->type == DeviceTask::FILM_CONVERT)
thread_film_convert(*task);
@@ -349,116 +379,319 @@ public:
}
};
- void thread_path_trace(DeviceTask& task)
+ bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
{
- if(task_pool.canceled()) {
- if(task.need_finish_queue == false)
- return;
+ mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);
+
+ TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
+ for(int i = 0; i < 9; i++) {
+ tiles->buffers[i] = buffers[i];
}
- KernelGlobals kg = thread_kernel_globals_init();
- RenderTile tile;
+ return true;
+ }
- void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);
+ bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
+ DenoisingTask *task)
+ {
+ int4 rect = task->rect;
+ int r = task->nlm_state.r;
+ int f = task->nlm_state.f;
+ float a = task->nlm_state.a;
+ float k_2 = task->nlm_state.k_2;
+
+ int w = align_up(rect.z-rect.x, 4);
+ int h = rect.w-rect.y;
+
+ float *blurDifference = (float*) task->nlm_state.temporary_1_ptr;
+ float *difference = (float*) task->nlm_state.temporary_2_ptr;
+ float *weightAccum = (float*) task->nlm_state.temporary_3_ptr;
+
+ memset(weightAccum, 0, sizeof(float)*w*h);
+ memset((float*) out_ptr, 0, sizeof(float)*w*h);
+
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+
+ int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)};
+ filter_nlm_calc_difference_kernel()(dx, dy,
+ (float*) guide_ptr,
+ (float*) variance_ptr,
+ difference,
+ local_rect,
+ w, 0,
+ a, k_2);
+
+ filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f);
+ filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
+ filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f);
+
+ filter_nlm_update_output_kernel()(dx, dy,
+ blurDifference,
+ (float*) image_ptr,
+ (float*) out_ptr,
+ weightAccum,
+ local_rect,
+ w, f);
+ }
+
+ int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y};
+ filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- path_trace_kernel = kernel_cpu_avx2_path_trace;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- path_trace_kernel = kernel_cpu_avx_path_trace;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- path_trace_kernel = kernel_cpu_sse41_path_trace;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- path_trace_kernel = kernel_cpu_sse3_path_trace;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- path_trace_kernel = kernel_cpu_sse2_path_trace;
- }
- else
-#endif
- {
- path_trace_kernel = kernel_cpu_path_trace;
+ return true;
+ }
+
+ bool denoising_construct_transform(DenoisingTask *task)
+ {
+ for(int y = 0; y < task->filter_area.w; y++) {
+ for(int x = 0; x < task->filter_area.z; x++) {
+ filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
+ x + task->filter_area.x,
+ y + task->filter_area.y,
+ y*task->filter_area.z + x,
+ (float*) task->storage.transform.device_pointer,
+ (int*) task->storage.rank.device_pointer,
+ &task->rect.x,
+ task->buffer.pass_stride,
+ task->radius,
+ task->pca_threshold);
+ }
}
+ return true;
+ }
- while(task.acquire_tile(this, tile)) {
- float *render_buffer = (float*)tile.buffer;
- uint *rng_state = (uint*)tile.rng_state;
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
-
- for(int sample = start_sample; sample < end_sample; sample++) {
- if(task.get_cancel() || task_pool.canceled()) {
- if(task.need_finish_queue == false)
- break;
- }
+ bool denoising_reconstruct(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr guide_ptr,
+ device_ptr guide_variance_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+ {
+ mem_zero(task->storage.XtWX);
+ mem_zero(task->storage.XtWY);
+
+ float *difference = (float*) task->reconstruction_state.temporary_1_ptr;
+ float *blurDifference = (float*) task->reconstruction_state.temporary_2_ptr;
+
+ int r = task->radius;
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+
+ int local_rect[4] = {max(0, -dx), max(0, -dy),
+ task->reconstruction_state.source_w - max(0, dx),
+ task->reconstruction_state.source_h - max(0, dy)};
+ filter_nlm_calc_difference_kernel()(dx, dy,
+ (float*) guide_ptr,
+ (float*) guide_variance_ptr,
+ difference,
+ local_rect,
+ task->buffer.w,
+ task->buffer.pass_stride,
+ 1.0f,
+ task->nlm_k_2);
+ filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
+ filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4);
+ filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
+ filter_nlm_construct_gramian_kernel()(dx, dy,
+ blurDifference,
+ (float*) task->buffer.mem.device_pointer,
+ (float*) color_ptr,
+ (float*) color_variance_ptr,
+ (float*) task->storage.transform.device_pointer,
+ (int*) task->storage.rank.device_pointer,
+ (float*) task->storage.XtWX.device_pointer,
+ (float3*) task->storage.XtWY.device_pointer,
+ local_rect,
+ &task->reconstruction_state.filter_rect.x,
+ task->buffer.w,
+ task->buffer.h,
+ 4,
+ task->buffer.pass_stride);
+ }
+ for(int y = 0; y < task->filter_area.w; y++) {
+ for(int x = 0; x < task->filter_area.z; x++) {
+ filter_finalize_kernel()(x,
+ y,
+ y*task->filter_area.z + x,
+ task->buffer.w,
+ task->buffer.h,
+ (float*) output_ptr,
+ (int*) task->storage.rank.device_pointer,
+ (float*) task->storage.XtWX.device_pointer,
+ (float3*) task->storage.XtWY.device_pointer,
+ &task->reconstruction_state.buffer_params.x,
+ task->render_buffer.samples);
+ }
+ }
+ return true;
+ }
- for(int y = tile.y; y < tile.y + tile.h; y++) {
- for(int x = tile.x; x < tile.x + tile.w; x++) {
- path_trace_kernel(&kg, render_buffer, rng_state,
- sample, x, y, tile.offset, tile.stride);
- }
- }
+ bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
+ device_ptr mean_ptr, device_ptr variance_ptr,
+ int r, int4 rect, DenoisingTask *task)
+ {
+ (void) task;
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ filter_combine_halves_kernel()(x, y,
+ (float*) mean_ptr,
+ (float*) variance_ptr,
+ (float*) a_ptr,
+ (float*) b_ptr,
+ &rect.x,
+ r);
+ }
+ }
+ return true;
+ }
- tile.sample = sample + 1;
+ bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
+ device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr, DenoisingTask *task)
+ {
+ for(int y = task->rect.y; y < task->rect.w; y++) {
+ for(int x = task->rect.x; x < task->rect.z; x++) {
+ filter_divide_shadow_kernel()(task->render_buffer.samples,
+ task->tiles,
+ x, y,
+ (float*) a_ptr,
+ (float*) b_ptr,
+ (float*) sample_variance_ptr,
+ (float*) sv_variance_ptr,
+ (float*) buffer_variance_ptr,
+ &task->rect.x,
+ task->render_buffer.pass_stride,
+ task->render_buffer.denoising_data_offset,
+ use_split_kernel);
+ }
+ }
+ return true;
+ }
- task.update_progress(&tile, tile.w*tile.h);
+ bool denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ DenoisingTask *task)
+ {
+ for(int y = task->rect.y; y < task->rect.w; y++) {
+ for(int x = task->rect.x; x < task->rect.z; x++) {
+ filter_get_feature_kernel()(task->render_buffer.samples,
+ task->tiles,
+ mean_offset,
+ variance_offset,
+ x, y,
+ (float*) mean_ptr,
+ (float*) variance_ptr,
+ &task->rect.x,
+ task->render_buffer.pass_stride,
+ task->render_buffer.denoising_data_offset,
+ use_split_kernel);
}
+ }
+ return true;
+ }
- task.release_tile(tile);
+ void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
+ {
+ float *render_buffer = (float*)tile.buffer;
+ uint *rng_state = (uint*)tile.rng_state;
+ int start_sample = tile.start_sample;
+ int end_sample = tile.start_sample + tile.num_samples;
- if(task_pool.canceled()) {
+ for(int sample = start_sample; sample < end_sample; sample++) {
+ if(task.get_cancel() || task_pool.canceled()) {
if(task.need_finish_queue == false)
break;
}
+
+ for(int y = tile.y; y < tile.y + tile.h; y++) {
+ for(int x = tile.x; x < tile.x + tile.w; x++) {
+ path_trace_kernel()(kg, render_buffer, rng_state,
+ sample, x, y, tile.offset, tile.stride);
+ }
+ }
+
+ tile.sample = sample + 1;
+
+ task.update_progress(&tile, tile.w*tile.h);
}
+ }
+
+ void denoise(DeviceTask &task, RenderTile &tile)
+ {
+ tile.sample = tile.start_sample + tile.num_samples;
+
+ DenoisingTask denoising(this);
- thread_kernel_globals_free(&kg);
+ denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
+ denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+ denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.set_tiles = function_bind(&CPUDevice::denoising_set_tiles, this, _1, &denoising);
+
+ denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
+ denoising.render_buffer.samples = tile.sample;
+
+ RenderTile rtiles[9];
+ rtiles[4] = tile;
+ task.map_neighbor_tiles(rtiles, this);
+ denoising.tiles_from_rendertiles(rtiles);
+
+ denoising.init_from_devicetask(task);
+
+ denoising.run_denoising();
+
+ task.unmap_neighbor_tiles(rtiles, this);
+
+ task.update_progress(&tile, tile.w*tile.h);
}
- void thread_path_trace_split(DeviceTask& task)
+ void thread_render(DeviceTask& task)
{
if(task_pool.canceled()) {
if(task.need_finish_queue == false)
return;
}
- RenderTile tile;
-
- CPUSplitKernel split_kernel(this);
-
/* allocate buffer for kernel globals */
- device_memory kgbuffer;
- kgbuffer.resize(sizeof(KernelGlobals));
+ device_only_memory<KernelGlobals> kgbuffer;
+ kgbuffer.resize(1);
mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);
KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init());
- requested_features.max_closure = MAX_CLOSURE;
- if(!split_kernel.load_kernels(requested_features)) {
- thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
- mem_free(kgbuffer);
+ CPUSplitKernel *split_kernel = NULL;
+ if(use_split_kernel) {
+ split_kernel = new CPUSplitKernel(this);
+ requested_features.max_closure = MAX_CLOSURE;
+ if(!split_kernel->load_kernels(requested_features)) {
+ thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
+ mem_free(kgbuffer);
- return;
+ delete split_kernel;
+ return;
+ }
}
+ RenderTile tile;
while(task.acquire_tile(this, tile)) {
- device_memory data;
- split_kernel.path_trace(&task, tile, kgbuffer, data);
+ if(tile.task == RenderTile::PATH_TRACE) {
+ if(use_split_kernel) {
+ device_memory data;
+ split_kernel->path_trace(&task, tile, kgbuffer, data);
+ }
+ else {
+ path_trace(task, tile, kg);
+ }
+ }
+ else if(tile.task == RenderTile::DENOISE) {
+ denoise(task, tile);
+ }
task.release_tile(tile);
@@ -470,6 +703,7 @@ public:
thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
mem_free(kgbuffer);
+ delete split_kernel;
}
void thread_film_convert(DeviceTask& task)
@@ -477,86 +711,16 @@ public:
float sample_scale = 1.0f/(task.sample + 1);
if(task.rgba_half) {
- void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float;
- }
- else
-#endif
- {
- convert_to_half_float_kernel = kernel_cpu_convert_to_half_float;
- }
-
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
- convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
- sample_scale, x, y, task.offset, task.stride);
+ convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+ sample_scale, x, y, task.offset, task.stride);
}
else {
- void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte;
- }
- else
-#endif
- {
- convert_to_byte_kernel = kernel_cpu_convert_to_byte;
- }
-
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
- convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
- sample_scale, x, y, task.offset, task.stride);
+ convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+ sample_scale, x, y, task.offset, task.stride);
}
}
@@ -568,53 +732,17 @@ public:
#ifdef WITH_OSL
OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
- void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int);
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- shader_kernel = kernel_cpu_avx2_shader;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- shader_kernel = kernel_cpu_avx_shader;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- shader_kernel = kernel_cpu_sse41_shader;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- shader_kernel = kernel_cpu_sse3_shader;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- shader_kernel = kernel_cpu_sse2_shader;
- }
- else
-#endif
- {
- shader_kernel = kernel_cpu_shader;
- }
-
for(int sample = 0; sample < task.num_samples; sample++) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
- shader_kernel(&kg,
- (uint4*)task.shader_input,
- (float4*)task.shader_output,
- (float*)task.shader_output_luma,
- task.shader_eval_type,
- task.shader_filter,
- x,
- task.offset,
- sample);
+ shader_kernel()(&kg,
+ (uint4*)task.shader_input,
+ (float4*)task.shader_output,
+ (float*)task.shader_output_luma,
+ task.shader_eval_type,
+ task.shader_filter,
+ x,
+ task.offset,
+ sample);
if(task.get_cancel() || task_pool.canceled())
break;
@@ -751,58 +879,6 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
device_memory& use_queues_flags,
device_memory& work_pool_wgs)
{
- typedef void(*data_init_t)(KernelGlobals *kg,
- ccl_constant KernelData *data,
- ccl_global void *split_data_buffer,
- int num_elements,
- ccl_global char *ray_state,
- ccl_global uint *rng_state,
- int start_sample,
- int end_sample,
- int sx, int sy, int sw, int sh, int offset, int stride,
- ccl_global int *Queue_index,
- int queuesize,
- ccl_global char *use_queues_flag,
- ccl_global unsigned int *work_pool_wgs,
- unsigned int num_samples,
- ccl_global float *buffer);
-
- data_init_t data_init;
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- data_init = kernel_cpu_avx2_data_init;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- data_init = kernel_cpu_avx_data_init;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- data_init = kernel_cpu_sse41_data_init;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- data_init = kernel_cpu_sse3_data_init;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- data_init = kernel_cpu_sse2_data_init;
- }
- else
-#endif
- {
- data_init = kernel_cpu_data_init;
- }
-
KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
@@ -810,26 +886,26 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
for(int x = 0; x < dim.global_size[0]; x++) {
kg->global_id = make_int2(x, y);
- data_init((KernelGlobals*)kernel_globals.device_pointer,
- (KernelData*)data.device_pointer,
- (void*)split_data.device_pointer,
- num_global_elements,
- (char*)ray_state.device_pointer,
- (uint*)rtile.rng_state,
- rtile.start_sample,
- rtile.start_sample + rtile.num_samples,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- rtile.offset,
- rtile.stride,
- (int*)queue_index.device_pointer,
- dim.global_size[0] * dim.global_size[1],
- (char*)use_queues_flags.device_pointer,
- (uint*)work_pool_wgs.device_pointer,
- rtile.num_samples,
- (float*)rtile.buffer);
+ device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer,
+ (KernelData*)data.device_pointer,
+ (void*)split_data.device_pointer,
+ num_global_elements,
+ (char*)ray_state.device_pointer,
+ (uint*)rtile.rng_state,
+ rtile.start_sample,
+ rtile.start_sample + rtile.num_samples,
+ rtile.x,
+ rtile.y,
+ rtile.w,
+ rtile.h,
+ rtile.offset,
+ rtile.stride,
+ (int*)queue_index.device_pointer,
+ dim.global_size[0] * dim.global_size[1],
+ (char*)use_queues_flags.device_pointer,
+ (uint*)work_pool_wgs.device_pointer,
+ rtile.num_samples,
+ (float*)rtile.buffer);
}
}
@@ -840,7 +916,7 @@ SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_nam
{
CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
- kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name);
+ kernel->func = device->split_kernels[kernel_name]();
if(!kernel->func) {
delete kernel;
return NULL;
@@ -864,8 +940,6 @@ uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device
return split_data_buffer_size(kg, num_threads);
}
-unordered_map<string, void*> CPUDevice::kernel_functions;
-
Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
{
return new CPUDevice(info, stats, background);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index e497ec6b0e1..9a8537a6722 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -21,11 +21,14 @@
#include <string.h>
#include "device/device.h"
+#include "device/device_denoising.h"
#include "device/device_intern.h"
#include "device/device_split_kernel.h"
#include "render/buffers.h"
+#include "kernel/filter/filter_defines.h"
+
#ifdef WITH_CUDA_DYNLOAD
# include "cuew.h"
#else
@@ -113,7 +116,7 @@ public:
DedicatedTaskPool task_pool;
CUdevice cuDevice;
CUcontext cuContext;
- CUmodule cuModule;
+ CUmodule cuModule, cuFilterModule;
map<device_ptr, bool> tex_interp_map;
map<device_ptr, uint> tex_bindless_map;
int cuDevId;
@@ -170,7 +173,7 @@ public:
CUresult result = stmt; \
\
if(result != CUDA_SUCCESS) { \
- string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+ string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
if(error_msg == "") \
error_msg = message; \
fprintf(stderr, "%s\n", message.c_str()); \
@@ -301,7 +304,8 @@ public:
* kernel sources md5 and only depends on compiler or compilation settings.
*/
string compile_kernel_get_common_cflags(
- const DeviceRequestedFeatures& requested_features, bool split=false)
+ const DeviceRequestedFeatures& requested_features,
+ bool filter=false, bool split=false)
{
const int cuda_version = cuewCompilerVersion();
const int machine = system_cpu_bits();
@@ -316,7 +320,7 @@ public:
machine,
cuda_version,
include_path.c_str());
- if(use_adaptive_compilation()) {
+ if(!filter && use_adaptive_compilation()) {
cflags += " " + requested_features.get_build_options();
}
const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
@@ -364,8 +368,22 @@ public:
return true;
}
- string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
+ string compile_kernel(const DeviceRequestedFeatures& requested_features,
+ bool filter=false, bool split=false)
{
+ const char *name, *source;
+ if(filter) {
+ name = "filter";
+ source = "filter.cu";
+ }
+ else if(split) {
+ name = "kernel_split";
+ source = "kernel_split.cu";
+ }
+ else {
+ name = "kernel";
+ source = "kernel.cu";
+ }
/* Compute cubin name. */
int major, minor;
cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
@@ -373,9 +391,8 @@ public:
/* Attempt to use kernel provided with Blender. */
if(!use_adaptive_compilation()) {
- const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
- : "lib/kernel_sm_%d%d.cubin",
- major, minor));
+ const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin",
+ name, major, minor));
VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
if(path_exists(cubin)) {
VLOG(1) << "Using precompiled kernel.";
@@ -384,7 +401,7 @@ public:
}
const string common_cflags =
- compile_kernel_get_common_cflags(requested_features, split);
+ compile_kernel_get_common_cflags(requested_features, filter, split);
/* Try to use locally compiled kernel. */
const string source_path = path_get("source");
@@ -395,9 +412,8 @@ public:
*/
const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
- const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
- : "cycles_kernel_sm%d%d_%s.cubin",
- major, minor,
+ const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin",
+ name, major, minor,
cubin_md5.c_str());
const string cubin = path_cache_get(path_join("kernels", cubin_file));
VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
@@ -432,7 +448,7 @@ public:
const string kernel = path_join(
path_join(source_path, "kernel"),
path_join("kernels",
- path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
+ path_join("cuda", source)));
double starttime = time_dt();
printf("Compiling CUDA kernel ...\n");
@@ -480,11 +496,14 @@ public:
return false;
/* get kernel */
- string cubin = compile_kernel(requested_features, use_split_kernel());
-
+ string cubin = compile_kernel(requested_features, false, use_split_kernel());
if(cubin == "")
return false;
+ string filter_cubin = compile_kernel(requested_features, true, false);
+ if(filter_cubin == "")
+ return false;
+
/* open module */
cuda_push_context();
@@ -499,6 +518,14 @@ public:
if(cuda_error_(result, "cuModuleLoad"))
cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
+ if(path_read_text(filter_cubin, cubin_data))
+ result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
+ else
+ result = CUDA_ERROR_FILE_NOT_FOUND;
+
+ if(cuda_error_(result, "cuModuleLoad"))
+ cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
+
cuda_pop_context();
return (result == CUDA_SUCCESS);
@@ -581,6 +608,11 @@ public:
}
}
+ virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
+ {
+ return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
+ }
+
void const_copy_to(const char *name, void *host, size_t size)
{
CUdeviceptr mem;
@@ -881,6 +913,368 @@ public:
}
}
+ bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
+ {
+ mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);
+
+ TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
+ for(int i = 0; i < 9; i++) {
+ tiles->buffers[i] = buffers[i];
+ }
+
+ mem_copy_to(task->tiles_mem);
+
+ return !have_error();
+ }
+
+#define CUDA_GET_BLOCKSIZE(func, w, h) \
+ int threads_per_block; \
+ cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+ int threads = (int)sqrt((float)threads_per_block); \
+ int xblocks = ((w) + threads - 1)/threads; \
+ int yblocks = ((h) + threads - 1)/threads;
+
+#define CUDA_LAUNCH_KERNEL(func, args) \
+ cuda_assert(cuLaunchKernel(func, \
+ xblocks, yblocks, 1, \
+ threads, threads, 1, \
+ 0, 0, args, 0));
+
+ bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ int4 rect = task->rect;
+ int w = rect.z-rect.x;
+ int h = rect.w-rect.y;
+ int r = task->nlm_state.r;
+ int f = task->nlm_state.f;
+ float a = task->nlm_state.a;
+ float k_2 = task->nlm_state.k_2;
+
+ CUdeviceptr difference = task->nlm_state.temporary_1_ptr;
+ CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr;
+ CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr;
+
+ cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
+ cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h));
+
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize;
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
+ cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
+
+ CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y);
+
+ int dx, dy;
+ int4 local_rect;
+ int channel_offset = 0;
+ void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2};
+ void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f};
+ void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f};
+ void *update_output_args[] = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f};
+
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ dy = i / (2*r+1) - r;
+ dx = i % (2*r+1) - r;
+ local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
+
+ CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
+ CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
+ CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args);
+ }
+
+ local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y);
+ void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w};
+ CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_construct_transform(DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ CUfunction cuFilterConstructTransform;
+ cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
+ CUDA_GET_BLOCKSIZE(cuFilterConstructTransform,
+ task->storage.w,
+ task->storage.h);
+
+ void *args[] = {&task->buffer.mem.device_pointer,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->filter_area,
+ &task->rect,
+ &task->radius,
+ &task->pca_threshold,
+ &task->buffer.pass_stride};
+ CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_reconstruct(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr guide_ptr,
+ device_ptr guide_variance_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ mem_zero(task->storage.XtWX);
+ mem_zero(task->storage.XtWY);
+
+ cuda_push_context();
+
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize;
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+ cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+ cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+
+ CUDA_GET_BLOCKSIZE(cuNLMCalcDifference,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ CUdeviceptr difference = task->reconstruction_state.temporary_1_ptr;
+ CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr;
+
+ int r = task->radius;
+ int f = 4;
+ float a = 1.0f;
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+
+ int local_rect[4] = {max(0, -dx), max(0, -dy),
+ task->reconstruction_state.source_w - max(0, dx),
+ task->reconstruction_state.source_h - max(0, dy)};
+
+ void *calc_difference_args[] = {&dx, &dy,
+ &guide_ptr,
+ &guide_variance_ptr,
+ &difference,
+ &local_rect,
+ &task->buffer.w,
+ &task->buffer.pass_stride,
+ &a,
+ &task->nlm_k_2};
+ CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
+
+ void *blur_args[] = {&difference,
+ &blurDifference,
+ &local_rect,
+ &task->buffer.w,
+ &f};
+ CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+
+ void *calc_weight_args[] = {&blurDifference,
+ &difference,
+ &local_rect,
+ &task->buffer.w,
+ &f};
+ CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
+
+ /* Reuse previous arguments. */
+ CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+
+ void *construct_gramian_args[] = {&dx, &dy,
+ &blurDifference,
+ &task->buffer.mem.device_pointer,
+ &color_ptr,
+ &color_variance_ptr,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &local_rect,
+ &task->reconstruction_state.filter_rect,
+ &task->buffer.w,
+ &task->buffer.h,
+ &f,
+ &task->buffer.pass_stride};
+ CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args);
+ }
+
+ void *finalize_args[] = {&task->buffer.w,
+ &task->buffer.h,
+ &output_ptr,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &task->filter_area,
+ &task->reconstruction_state.buffer_params.x,
+ &task->render_buffer.samples};
+ CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
+ device_ptr mean_ptr, device_ptr variance_ptr,
+ int r, int4 rect, DenoisingTask *task)
+ {
+ (void) task;
+
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ CUfunction cuFilterCombineHalves;
+ cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterCombineHalves,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ void *args[] = {&mean_ptr,
+ &variance_ptr,
+ &a_ptr,
+ &b_ptr,
+ &rect,
+ &r};
+ CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
+ device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr, DenoisingTask *task)
+ {
+ (void) task;
+
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ CUfunction cuFilterDivideShadow;
+ cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterDivideShadow,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ bool use_split_variance = use_split_kernel();
+ void *args[] = {&task->render_buffer.samples,
+ &task->tiles_mem.device_pointer,
+ &a_ptr,
+ &b_ptr,
+ &sample_variance_ptr,
+ &sv_variance_ptr,
+ &buffer_variance_ptr,
+ &task->rect,
+ &task->render_buffer.pass_stride,
+ &task->render_buffer.denoising_data_offset,
+ &use_split_variance};
+ CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ CUfunction cuFilterGetFeature;
+ cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterGetFeature,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ bool use_split_variance = use_split_kernel();
+ void *args[] = {&task->render_buffer.samples,
+ &task->tiles_mem.device_pointer,
+ &mean_offset,
+ &variance_offset,
+ &mean_ptr,
+ &variance_ptr,
+ &task->rect,
+ &task->render_buffer.pass_stride,
+ &task->render_buffer.denoising_data_offset,
+ &use_split_variance};
+ CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ void denoise(RenderTile &rtile, const DeviceTask &task)
+ {
+ DenoisingTask denoising(this);
+
+ denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
+ denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+ denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising);
+
+ denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+ denoising.render_buffer.samples = rtile.sample;
+
+ RenderTile rtiles[9];
+ rtiles[4] = rtile;
+ task.map_neighbor_tiles(rtiles, this);
+ denoising.tiles_from_rendertiles(rtiles);
+
+ denoising.init_from_devicetask(task);
+
+ denoising.run_denoising();
+
+ task.unmap_neighbor_tiles(rtiles, this);
+ }
+
void path_trace(RenderTile& rtile, int sample, bool branched)
{
if(have_error())
@@ -1326,7 +1720,7 @@ public:
void thread_run(DeviceTask *task)
{
- if(task->type == DeviceTask::PATH_TRACE) {
+ if(task->type == DeviceTask::RENDER) {
RenderTile tile;
bool branched = task->integrator_branched;
@@ -1334,30 +1728,8 @@ public:
/* Upload Bindless Mapping */
load_bindless_mapping();
- if(!use_split_kernel()) {
- /* keep rendering tiles until done */
- while(task->acquire_tile(this, tile)) {
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
-
- for(int sample = start_sample; sample < end_sample; sample++) {
- if(task->get_cancel()) {
- if(task->need_finish_queue == false)
- break;
- }
-
- path_trace(tile, sample, branched);
-
- tile.sample = sample + 1;
-
- task->update_progress(&tile, tile.w*tile.h);
- }
-
- task->release_tile(tile);
- }
- }
- else {
- DeviceRequestedFeatures requested_features;
+ DeviceRequestedFeatures requested_features;
+ if(use_split_kernel()) {
if(!use_adaptive_compilation()) {
requested_features.max_closure = 64;
}
@@ -1366,18 +1738,47 @@ public:
split_kernel = new CUDASplitKernel(this);
split_kernel->load_kernels(requested_features);
}
+ }
+
+ /* keep rendering tiles until done */
+ while(task->acquire_tile(this, tile)) {
+ if(tile.task == RenderTile::PATH_TRACE) {
+ if(use_split_kernel()) {
+ device_memory void_buffer;
+ split_kernel->path_trace(task, tile, void_buffer, void_buffer);
+ }
+ else {
+ int start_sample = tile.start_sample;
+ int end_sample = tile.start_sample + tile.num_samples;
+
+ for(int sample = start_sample; sample < end_sample; sample++) {
+ if(task->get_cancel()) {
+ if(task->need_finish_queue == false)
+ break;
+ }
- while(task->acquire_tile(this, tile)) {
- device_memory void_buffer;
- split_kernel->path_trace(task, tile, void_buffer, void_buffer);
+ path_trace(tile, sample, branched);
- task->release_tile(tile);
+ tile.sample = sample + 1;
- if(task->get_cancel()) {
- if(task->need_finish_queue == false)
- break;
+ task->update_progress(&tile, tile.w*tile.h);
+ }
}
}
+ else if(tile.task == RenderTile::DENOISE) {
+ tile.sample = tile.start_sample + tile.num_samples;
+
+ denoise(tile, *task);
+
+ task->update_progress(&tile, tile.w*tile.h);
+ }
+
+ task->release_tile(tile);
+
+ if(task->get_cancel()) {
+ if(task->need_finish_queue == false)
+ break;
+ }
}
}
else if(task->type == DeviceTask::SHADER) {
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
new file mode 100644
index 00000000000..39c8cf30105
--- /dev/null
+++ b/intern/cycles/device/device_denoising.cpp
@@ -0,0 +1,218 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_denoising.h"
+
+#include "kernel/filter/filter_defines.h"
+
+CCL_NAMESPACE_BEGIN
+
+void DenoisingTask::init_from_devicetask(const DeviceTask &task)
+{
+ radius = task.denoising_radius;
+ nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising_strength));
+ if(task.denoising_relative_pca) {
+ pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising_feature_strength));
+ }
+ else {
+ pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength));
+ }
+
+ render_buffer.pass_stride = task.pass_stride;
+ render_buffer.denoising_data_offset = task.pass_denoising_data;
+ render_buffer.denoising_clean_offset = task.pass_denoising_clean;
+
+ /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */
+ rect = make_int4(max(tiles->x[0], filter_area.x - radius),
+ max(tiles->y[0], filter_area.y - radius),
+ min(tiles->x[3], filter_area.x + filter_area.z + radius),
+ min(tiles->y[3], filter_area.y + filter_area.w + radius));
+}
+
+void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles)
+{
+ tiles = (TilesInfo*) tiles_mem.resize(sizeof(TilesInfo)/sizeof(int));
+
+ device_ptr buffers[9];
+ for(int i = 0; i < 9; i++) {
+ buffers[i] = rtiles[i].buffer;
+ tiles->offsets[i] = rtiles[i].offset;
+ tiles->strides[i] = rtiles[i].stride;
+ }
+ tiles->x[0] = rtiles[3].x;
+ tiles->x[1] = rtiles[4].x;
+ tiles->x[2] = rtiles[5].x;
+ tiles->x[3] = rtiles[5].x + rtiles[5].w;
+ tiles->y[0] = rtiles[1].y;
+ tiles->y[1] = rtiles[4].y;
+ tiles->y[2] = rtiles[7].y;
+ tiles->y[3] = rtiles[7].y + rtiles[7].h;
+
+ render_buffer.offset = rtiles[4].offset;
+ render_buffer.stride = rtiles[4].stride;
+ render_buffer.ptr = rtiles[4].buffer;
+
+ functions.set_tiles(buffers);
+}
+
+bool DenoisingTask::run_denoising()
+{
+ /* Allocate denoising buffer. */
+ buffer.passes = 14;
+ buffer.w = align_up(rect.z - rect.x, 4);
+ buffer.h = rect.w - rect.y;
+ buffer.pass_stride = align_up(buffer.w * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float)));
+ buffer.mem.resize(buffer.pass_stride * buffer.passes);
+ device->mem_alloc("Denoising Pixel Buffer", buffer.mem, MEM_READ_WRITE);
+
+ device_ptr null_ptr = (device_ptr) 0;
+
+ /* Prefilter shadow feature. */
+ {
+ device_sub_ptr unfiltered_a (device, buffer.mem, 0, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr unfiltered_b (device, buffer.mem, 1*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr sample_var (device, buffer.mem, 2*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr sample_var_var (device, buffer.mem, 3*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr buffer_var (device, buffer.mem, 5*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr filtered_var (device, buffer.mem, 6*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr nlm_temporary_1(device, buffer.mem, 7*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr nlm_temporary_2(device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr nlm_temporary_3(device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+
+ nlm_state.temporary_1_ptr = *nlm_temporary_1;
+ nlm_state.temporary_2_ptr = *nlm_temporary_2;
+ nlm_state.temporary_3_ptr = *nlm_temporary_3;
+
+ /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
+ functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
+
+ /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
+ nlm_state.set_parameters(6, 3, 4.0f, 1.0f);
+ functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
+
+ /* Reuse memory, the previous data isn't needed anymore. */
+ device_ptr filtered_a = *buffer_var,
+ filtered_b = *sample_var;
+ /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
+ nlm_state.set_parameters(5, 3, 1.0f, 0.25f);
+ functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
+ functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
+
+ device_ptr residual_var = *sample_var_var;
+ /* Estimate the residual variance between the two filtered halves. */
+ functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect);
+
+ device_ptr final_a = *unfiltered_a,
+ final_b = *unfiltered_b;
+ /* Use the residual variance for a second filter pass. */
+ nlm_state.set_parameters(4, 2, 1.0f, 0.5f);
+ functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
+ functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
+
+ /* Combine the two double-filtered halves to a final shadow feature. */
+ device_sub_ptr shadow_pass(device, buffer.mem, 4*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect);
+ }
+
+ /* Prefilter general features. */
+ {
+ device_sub_ptr unfiltered (device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr variance (device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr nlm_temporary_1(device, buffer.mem, 10*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr nlm_temporary_2(device, buffer.mem, 11*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr nlm_temporary_3(device, buffer.mem, 12*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+
+ nlm_state.temporary_1_ptr = *nlm_temporary_1;
+ nlm_state.temporary_2_ptr = *nlm_temporary_2;
+ nlm_state.temporary_3_ptr = *nlm_temporary_3;
+
+ int mean_from[] = { 0, 1, 2, 6, 7, 8, 12 };
+ int variance_from[] = { 3, 4, 5, 9, 10, 11, 13 };
+ int pass_to[] = { 1, 2, 3, 0, 5, 6, 7 };
+ for(int pass = 0; pass < 7; pass++) {
+ device_sub_ptr feature_pass(device, buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ /* Get the unfiltered pass and its variance from the RenderBuffers. */
+ functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance);
+ /* Smooth the pass and store the result in the denoising buffers. */
+ nlm_state.set_parameters(2, 2, 1.0f, 0.25f);
+ functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
+ }
+ }
+
+ /* Copy color passes. */
+ {
+ int mean_from[] = {20, 21, 22};
+ int variance_from[] = {23, 24, 25};
+ int mean_to[] = { 8, 9, 10};
+ int variance_to[] = {11, 12, 13};
+ int num_color_passes = 3;
+ for(int pass = 0; pass < num_color_passes; pass++) {
+ device_sub_ptr color_pass (device, buffer.mem, mean_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr color_var_pass(device, buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+ functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass);
+ }
+ }
+
+ storage.w = filter_area.z;
+ storage.h = filter_area.w;
+ storage.transform.resize(storage.w*storage.h*TRANSFORM_SIZE);
+ storage.rank.resize(storage.w*storage.h);
+ device->mem_alloc("Denoising Transform", storage.transform, MEM_READ_WRITE);
+ device->mem_alloc("Denoising Rank", storage.rank, MEM_READ_WRITE);
+
+ functions.construct_transform();
+
+ device_only_memory<float> temporary_1;
+ device_only_memory<float> temporary_2;
+ temporary_1.resize(buffer.w*buffer.h);
+ temporary_2.resize(buffer.w*buffer.h);
+ device->mem_alloc("Denoising NLM temporary 1", temporary_1, MEM_READ_WRITE);
+ device->mem_alloc("Denoising NLM temporary 2", temporary_2, MEM_READ_WRITE);
+ reconstruction_state.temporary_1_ptr = temporary_1.device_pointer;
+ reconstruction_state.temporary_2_ptr = temporary_2.device_pointer;
+
+ storage.XtWX.resize(storage.w*storage.h*XTWX_SIZE);
+ storage.XtWY.resize(storage.w*storage.h*XTWY_SIZE);
+ device->mem_alloc("Denoising XtWX", storage.XtWX, MEM_READ_WRITE);
+ device->mem_alloc("Denoising XtWY", storage.XtWY, MEM_READ_WRITE);
+
+ reconstruction_state.filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
+ int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x;
+ reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset,
+ render_buffer.stride,
+ render_buffer.pass_stride,
+ render_buffer.denoising_clean_offset);
+ reconstruction_state.source_w = rect.z-rect.x;
+ reconstruction_state.source_h = rect.w-rect.y;
+
+ {
+ device_sub_ptr color_ptr (device, buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
+ device_sub_ptr color_var_ptr(device, buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
+ functions.reconstruct(*color_ptr, *color_var_ptr, *color_ptr, *color_var_ptr, render_buffer.ptr);
+ }
+
+ device->mem_free(storage.XtWX);
+ device->mem_free(storage.XtWY);
+ device->mem_free(storage.transform);
+ device->mem_free(storage.rank);
+ device->mem_free(temporary_1);
+ device->mem_free(temporary_2);
+ device->mem_free(buffer.mem);
+ device->mem_free(tiles_mem);
+ return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
new file mode 100644
index 00000000000..86d8eb64386
--- /dev/null
+++ b/intern/cycles/device/device_denoising.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __DEVICE_DENOISING_H__
+#define __DEVICE_DENOISING_H__
+
+#include "device/device.h"
+
+#include "render/buffers.h"
+
+#include "kernel/filter/filter_defines.h"
+
+CCL_NAMESPACE_BEGIN
+
+class DenoisingTask {
+public:
+ /* Parameters of the denoising algorithm. */
+ int radius;
+ float nlm_k_2;
+ float pca_threshold;
+
+ /* Pointer and parameters of the RenderBuffers. */
+ struct RenderBuffers {
+ int denoising_data_offset;
+ int denoising_clean_offset;
+ int pass_stride;
+ int offset;
+ int stride;
+ device_ptr ptr;
+ int samples;
+ } render_buffer;
+
+ TilesInfo *tiles;
+ device_vector<int> tiles_mem;
+ void tiles_from_rendertiles(RenderTile *rtiles);
+
+ int4 rect;
+ int4 filter_area;
+
+ struct DeviceFunctions {
+ function<bool(device_ptr image_ptr, /* Contains the values that are smoothed. */
+ device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */
+ device_ptr variance_ptr, /* Contains the variance of the guide image. */
+ device_ptr out_ptr /* The filtered output is written into this image. */
+ )> non_local_means;
+ function<bool(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr guide_ptr,
+ device_ptr guide_variance_ptr,
+ device_ptr output_ptr
+ )> reconstruct;
+ function<bool()> construct_transform;
+
+ function<bool(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ int r,
+ int4 rect
+ )> combine_halves;
+ function<bool(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr sample_variance_ptr,
+ device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr
+ )> divide_shadow;
+ function<bool(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr
+ )> get_feature;
+ function<bool(device_ptr*)> set_tiles;
+ } functions;
+
+ /* Stores state of the current Reconstruction operation,
+ * which is accessed by the device in order to perform the operation. */
+ struct ReconstructionState {
+ device_ptr temporary_1_ptr; /* There two images are used as temporary storage. */
+ device_ptr temporary_2_ptr;
+
+ int4 filter_rect;
+ int4 buffer_params;
+
+ int source_w;
+ int source_h;
+ } reconstruction_state;
+
+ /* Stores state of the current NLM operation,
+ * which is accessed by the device in order to perform the operation. */
+ struct NLMState {
+ device_ptr temporary_1_ptr; /* There three images are used as temporary storage. */
+ device_ptr temporary_2_ptr;
+ device_ptr temporary_3_ptr;
+
+ int r; /* Search radius of the filter. */
+ int f; /* Patch size of the filter. */
+ float a; /* Variance compensation factor in the MSE estimation. */
+ float k_2; /* Squared value of the k parameter of the filter. */
+
+ void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_, k_2 = k_2_; }
+ } nlm_state;
+
+ struct Storage {
+ device_only_memory<float> transform;
+ device_only_memory<int> rank;
+ device_only_memory<float> XtWX;
+ device_only_memory<float3> XtWY;
+ int w;
+ int h;
+ } storage;
+
+ DenoisingTask(Device *device) : device(device) {}
+
+ void init_from_devicetask(const DeviceTask &task);
+
+ bool run_denoising();
+
+ struct DenoiseBuffers {
+ int pass_stride;
+ int passes;
+ int w;
+ int h;
+ device_only_memory<float> mem;
+ } buffer;
+
+protected:
+ Device *device;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __DEVICE_DENOISING_H__ */
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 4b10514a9d2..b63dd00068b 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -35,6 +35,8 @@
CCL_NAMESPACE_BEGIN
+class Device;
+
enum MemoryType {
MEM_READ_ONLY,
MEM_WRITE_ONLY,
@@ -144,7 +146,7 @@ template<> struct device_type_traits<float2> {
template<> struct device_type_traits<float3> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 3;
+ static const int num_elements = 4;
};
template<> struct device_type_traits<float4> {
@@ -173,6 +175,9 @@ class device_memory
{
public:
size_t memory_size() { return data_size*data_elements*datatype_size(data_type); }
+ size_t memory_elements_size(int elements) {
+ return elements*data_elements*datatype_size(data_type);
+ }
/* data information */
DataType data_type;
@@ -213,6 +218,22 @@ protected:
device_memory& operator = (const device_memory&);
};
+template<typename T>
+class device_only_memory : public device_memory
+{
+public:
+ device_only_memory()
+ {
+ data_type = device_type_traits<T>::data_type;
+ data_elements = max(device_type_traits<T>::num_elements, 1);
+ }
+
+ void resize(size_t num)
+ {
+ device_memory::resize(num*sizeof(T));
+ }
+};
+
/* Device Vector */
template<typename T> class device_vector : public device_memory
@@ -299,6 +320,27 @@ private:
array<T> data;
};
+/* A device_sub_ptr is a pointer into another existing memory.
+ * Therefore, it is not allocated separately, but just created from the already allocated base memory.
+ * It is freed automatically when it goes out of scope, which should happen before the base memory is freed.
+ * Note that some devices require the offset and size of the sub_ptr to be properly aligned. */
+class device_sub_ptr
+{
+public:
+ device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type);
+ ~device_sub_ptr();
+ /* No copying. */
+ device_sub_ptr& operator = (const device_sub_ptr&);
+
+ device_ptr operator*() const
+ {
+ return ptr;
+ }
+protected:
+ Device *device;
+ device_ptr ptr;
+};
+
CCL_NAMESPACE_END
#endif /* __DEVICE_MEMORY_H__ */
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 8616e31d3b9..35ae0303d6e 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -302,6 +302,60 @@ public:
return -1;
}
+ void map_neighbor_tiles(Device *sub_device, RenderTile *tiles)
+ {
+ for(int i = 0; i < 9; i++) {
+ if(!tiles[i].buffers) {
+ continue;
+ }
+ /* If the tile was rendered on another device, copy its memory to
+ * to the current device now, for the duration of the denoising task.
+ * Note that this temporarily modifies the RenderBuffers and calls
+ * the device, so this function is not thread safe. */
+ if(tiles[i].buffers->device != sub_device) {
+ device_vector<float> &mem = tiles[i].buffers->buffer;
+
+ tiles[i].buffers->copy_from_device();
+ device_ptr original_ptr = mem.device_pointer;
+ mem.device_pointer = 0;
+ sub_device->mem_alloc("Temporary memory for neighboring tile", mem, MEM_READ_WRITE);
+ sub_device->mem_copy_to(mem);
+ tiles[i].buffer = mem.device_pointer;
+ mem.device_pointer = original_ptr;
+ }
+ }
+ }
+
+ void unmap_neighbor_tiles(Device * sub_device, RenderTile * tiles)
+ {
+ for(int i = 0; i < 9; i++) {
+ if(!tiles[i].buffers) {
+ continue;
+ }
+ if(tiles[i].buffers->device != sub_device) {
+ device_vector<float> &mem = tiles[i].buffers->buffer;
+
+ device_ptr original_ptr = mem.device_pointer;
+ mem.device_pointer = tiles[i].buffer;
+
+ /* Copy denoised tile to the host. */
+ if(i == 4) {
+ tiles[i].buffers->copy_from_device(sub_device);
+ }
+
+ size_t mem_size = mem.device_size;
+ sub_device->mem_free(mem);
+ mem.device_pointer = original_ptr;
+ mem.device_size = mem_size;
+
+ /* Copy denoised tile to the original device. */
+ if(i == 4) {
+ tiles[i].buffers->device->mem_copy_to(mem);
+ }
+ }
+ }
+ }
+
int get_split_task_count(DeviceTask& task)
{
int total_tasks = 0;
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index 9118793aad6..dddd19f179f 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -166,13 +166,13 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
unsigned int max_work_groups = num_global_elements / work_pool_size + 1;
/* Allocate work_pool_wgs memory. */
- work_pool_wgs.resize(max_work_groups * sizeof(unsigned int));
+ work_pool_wgs.resize(max_work_groups);
device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE);
- queue_index.resize(NUM_QUEUES * sizeof(int));
+ queue_index.resize(NUM_QUEUES);
device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE);
- use_queues_flag.resize(sizeof(char));
+ use_queues_flag.resize(1);
device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE);
ray_state.resize(num_global_elements);
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
index 58c2fdbb077..68c2ba974a5 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -80,16 +80,16 @@ private:
*/
device_memory split_data;
device_vector<uchar> ray_state;
- device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */
+ device_only_memory<int> queue_index; /* Array of size num_queues that tracks the size of each queue. */
/* Flag to make sceneintersect and lampemission kernel use queues. */
- device_memory use_queues_flag;
+ device_only_memory<char> use_queues_flag;
/* Approximate time it takes to complete one sample */
double avg_time_per_sample;
/* Work pool with respect to each work group. */
- device_memory work_pool_wgs;
+ device_only_memory<unsigned int> work_pool_wgs;
/* clos_max value for which the kernels have been loaded currently. */
int current_max_closure;
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index ca303365627..3bc4c310283 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -56,7 +56,7 @@ int DeviceTask::get_subtask_count(int num, int max_size)
if(type == SHADER) {
num = min(shader_w, num);
}
- else if(type == PATH_TRACE) {
+ else if(type == RENDER) {
}
else {
num = min(h, num);
@@ -82,7 +82,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
tasks.push_back(task);
}
}
- else if(type == PATH_TRACE) {
+ else if(type == RENDER) {
for(int i = 0; i < num; i++)
tasks.push_back(*this);
}
@@ -103,7 +103,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
{
- if((type != PATH_TRACE) &&
+ if((type != RENDER) &&
(type != SHADER))
return;
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index feee89fd6e4..44a1efff1f5 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -34,7 +34,7 @@ class Tile;
class DeviceTask : public Task {
public:
- typedef enum { PATH_TRACE, FILM_CONVERT, SHADER } Type;
+ typedef enum { RENDER, FILM_CONVERT, SHADER } Type;
Type type;
int x, y, w, h;
@@ -53,7 +53,7 @@ public:
int passes_size;
- explicit DeviceTask(Type type = PATH_TRACE);
+ explicit DeviceTask(Type type = RENDER);
int get_subtask_count(int num, int max_size = 0);
void split(list<DeviceTask>& tasks, int num, int max_size = 0);
@@ -65,6 +65,16 @@ public:
function<void(RenderTile&)> update_tile_sample;
function<void(RenderTile&)> release_tile;
function<bool(void)> get_cancel;
+ function<void(RenderTile*, Device*)> map_neighbor_tiles;
+ function<void(RenderTile*, Device*)> unmap_neighbor_tiles;
+
+ int denoising_radius;
+ float denoising_strength;
+ float denoising_feature_strength;
+ bool denoising_relative_pca;
+ int pass_stride;
+ int pass_denoising_data;
+ int pass_denoising_clean;
bool need_finish_queue;
bool integrator_branched;
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index d061973dcb7..a458ca6bf64 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -17,6 +17,7 @@
#ifdef WITH_OPENCL
#include "device/device.h"
+#include "device/device_denoising.h"
#include "util/util_map.h"
#include "util/util_param.h"
@@ -129,6 +130,8 @@ public:
cl_int* error = NULL);
static cl_device_type get_device_type(cl_device_id device_id);
+ static int mem_address_alignment(cl_device_id device_id);
+
/* Get somewhat more readable device name.
* Main difference is AMD OpenCL here which only gives code name
* for the regular device name. This will give more sane device
@@ -218,7 +221,7 @@ public:
cl_int err = stmt; \
\
if(err != CL_SUCCESS) { \
- string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
+ string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
if(error_msg == "") \
error_msg = message; \
fprintf(stderr, "%s\n", message.c_str()); \
@@ -282,7 +285,7 @@ public:
map<ustring, cl_kernel> kernels;
};
- OpenCLProgram base_program;
+ OpenCLProgram base_program, denoising_program;
typedef map<string, device_vector<uchar>*> ConstMemMap;
typedef map<string, device_ptr> MemMap;
@@ -320,6 +323,9 @@ public:
void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
void mem_zero(device_memory& mem);
void mem_free(device_memory& mem);
+
+ int mem_address_alignment();
+
void const_copy_to(const char *name, void *host, size_t size);
void tex_alloc(const char *name,
device_memory& mem,
@@ -328,12 +334,14 @@ public:
void tex_free(device_memory& mem);
size_t global_size_round_up(int group_size, int global_size);
- void enqueue_kernel(cl_kernel kernel, size_t w, size_t h);
+ void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1);
void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
void shader(DeviceTask& task);
+ void denoise(RenderTile& tile, const DeviceTask& task);
+
class OpenCLDeviceTask : public DeviceTask {
public:
OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
@@ -367,9 +375,48 @@ public:
virtual void thread_run(DeviceTask * /*task*/) = 0;
+ virtual bool is_split_kernel() = 0;
+
protected:
string kernel_build_options(const string *debug_src = NULL);
+ void mem_zero_kernel(device_ptr ptr, size_t size);
+
+ bool denoising_non_local_means(device_ptr image_ptr,
+ device_ptr guide_ptr,
+ device_ptr variance_ptr,
+ device_ptr out_ptr,
+ DenoisingTask *task);
+ bool denoising_construct_transform(DenoisingTask *task);
+ bool denoising_reconstruct(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr guide_ptr,
+ device_ptr guide_variance_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task);
+ bool denoising_combine_halves(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ int r, int4 rect,
+ DenoisingTask *task);
+ bool denoising_divide_shadow(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr sample_variance_ptr,
+ device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr,
+ DenoisingTask *task);
+ bool denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ DenoisingTask *task);
+ bool denoising_set_tiles(device_ptr *buffers,
+ DenoisingTask *task);
+
+ device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type);
+ void mem_free_sub_ptr(device_ptr ptr);
+
class ArgumentWrapper {
public:
ArgumentWrapper() : size(0), pointer(NULL)
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index 22aeaddcde8..ae1a7b917c3 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -213,8 +213,23 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
base_program.add_kernel(ustring("bake"));
base_program.add_kernel(ustring("zero_buffer"));
+ denoising_program = OpenCLProgram(this, "denoising", "filter.cl", "");
+ denoising_program.add_kernel(ustring("filter_divide_shadow"));
+ denoising_program.add_kernel(ustring("filter_get_feature"));
+ denoising_program.add_kernel(ustring("filter_combine_halves"));
+ denoising_program.add_kernel(ustring("filter_construct_transform"));
+ denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
+ denoising_program.add_kernel(ustring("filter_nlm_blur"));
+ denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
+ denoising_program.add_kernel(ustring("filter_nlm_update_output"));
+ denoising_program.add_kernel(ustring("filter_nlm_normalize"));
+ denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
+ denoising_program.add_kernel(ustring("filter_finalize"));
+ denoising_program.add_kernel(ustring("filter_set_tiles"));
+
vector<OpenCLProgram*> programs;
programs.push_back(&base_program);
+ programs.push_back(&denoising_program);
/* Call actual class to fill the vector with its programs. */
if(!load_kernels(requested_features, programs)) {
return false;
@@ -322,37 +337,42 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in
NULL, NULL));
}
-void OpenCLDeviceBase::mem_zero(device_memory& mem)
+void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size)
{
- if(mem.device_pointer) {
- if(base_program.is_loaded()) {
- cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
+ cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
- size_t global_size[] = {1024, 1024};
- size_t num_threads = global_size[0] * global_size[1];
+ size_t global_size[] = {1024, 1024};
+ size_t num_threads = global_size[0] * global_size[1];
- cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer);
- cl_ulong d_offset = 0;
- cl_ulong d_size = 0;
+ cl_mem d_buffer = CL_MEM_PTR(mem);
+ cl_ulong d_offset = 0;
+ cl_ulong d_size = 0;
- while(d_offset < mem.memory_size()) {
- d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset);
+ while(d_offset < size) {
+ d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset);
- kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
+ kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
- ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
- ckZeroBuffer,
- 2,
- NULL,
- global_size,
- NULL,
- 0,
- NULL,
- NULL);
- opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
+ ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
+ ckZeroBuffer,
+ 2,
+ NULL,
+ global_size,
+ NULL,
+ 0,
+ NULL,
+ NULL);
+ opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
- d_offset += d_size;
- }
+ d_offset += d_size;
+ }
+}
+
+void OpenCLDeviceBase::mem_zero(device_memory& mem)
+{
+ if(mem.device_pointer) {
+ if(base_program.is_loaded()) {
+ mem_zero_kernel(mem.device_pointer, mem.memory_size());
}
if(mem.data_pointer) {
@@ -396,6 +416,41 @@ void OpenCLDeviceBase::mem_free(device_memory& mem)
}
}
+int OpenCLDeviceBase::mem_address_alignment()
+{
+ return OpenCLInfo::mem_address_alignment(cdDevice);
+}
+
+device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type)
+{
+ cl_mem_flags mem_flag;
+ if(type == MEM_READ_ONLY)
+ mem_flag = CL_MEM_READ_ONLY;
+ else if(type == MEM_WRITE_ONLY)
+ mem_flag = CL_MEM_WRITE_ONLY;
+ else
+ mem_flag = CL_MEM_READ_WRITE;
+
+ cl_buffer_region info;
+ info.origin = mem.memory_elements_size(offset);
+ info.size = mem.memory_elements_size(size);
+
+ device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer),
+ mem_flag,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ &info,
+ &ciErr);
+ opencl_assert_err(ciErr, "clCreateSubBuffer");
+ return sub_buf;
+}
+
+void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer)
+{
+ if(device_pointer && device_pointer != null_mem) {
+ opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
+ }
+}
+
void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
{
ConstMemMap::iterator i = const_mem_map.find(name);
@@ -449,7 +504,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
return global_size + ((r == 0)? 0: group_size - r);
}
-void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
+void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size)
{
size_t workgroup_size, max_work_items[3];
@@ -458,6 +513,10 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
clGetDeviceInfo(cdDevice,
CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
+ if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
+ workgroup_size = max_workgroup_size;
+ }
+
/* Try to divide evenly over 2 dimensions. */
size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
@@ -543,6 +602,362 @@ set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
}
+bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
+ device_ptr guide_ptr,
+ device_ptr variance_ptr,
+ device_ptr out_ptr,
+ DenoisingTask *task)
+{
+ int4 rect = task->rect;
+ int w = rect.z-rect.x;
+ int h = rect.w-rect.y;
+ int r = task->nlm_state.r;
+ int f = task->nlm_state.f;
+ float a = task->nlm_state.a;
+ float k_2 = task->nlm_state.k_2;
+
+ cl_mem difference = CL_MEM_PTR(task->nlm_state.temporary_1_ptr);
+ cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr);
+ cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
+
+ cl_mem image_mem = CL_MEM_PTR(image_ptr);
+ cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
+ cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+ cl_mem out_mem = CL_MEM_PTR(out_ptr);
+
+ mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h);
+ mem_zero_kernel(out_ptr, sizeof(float)*w*h);
+
+ cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
+ cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
+ cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
+ cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output"));
+ cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize"));
+
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+ int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
+ kernel_set_args(ckNLMCalcDifference, 0,
+ dx, dy, guide_mem, variance_mem,
+ difference, local_rect, w, 0, a, k_2);
+ kernel_set_args(ckNLMBlur, 0,
+ difference, blurDifference, local_rect, w, f);
+ kernel_set_args(ckNLMCalcWeight, 0,
+ blurDifference, difference, local_rect, w, f);
+ kernel_set_args(ckNLMUpdateOutput, 0,
+ dx, dy, blurDifference, image_mem,
+ out_mem, weightAccum, local_rect, w, f);
+
+ enqueue_kernel(ckNLMCalcDifference, w, h);
+ enqueue_kernel(ckNLMBlur, w, h);
+ enqueue_kernel(ckNLMCalcWeight, w, h);
+ enqueue_kernel(ckNLMBlur, w, h);
+ enqueue_kernel(ckNLMUpdateOutput, w, h);
+ }
+
+ int4 local_rect = make_int4(0, 0, w, h);
+ kernel_set_args(ckNLMNormalize, 0,
+ out_mem, weightAccum, local_rect, w);
+ enqueue_kernel(ckNLMNormalize, w, h);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
+{
+ cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
+ cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
+ cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+
+ cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
+
+ kernel_set_args(ckFilterConstructTransform, 0,
+ buffer_mem,
+ transform_mem,
+ rank_mem,
+ task->filter_area,
+ task->rect,
+ task->buffer.pass_stride,
+ task->radius,
+ task->pca_threshold);
+
+ enqueue_kernel(ckFilterConstructTransform,
+ task->storage.w,
+ task->storage.h,
+ 256);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr guide_ptr,
+ device_ptr guide_variance_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+{
+ mem_zero(task->storage.XtWX);
+ mem_zero(task->storage.XtWY);
+
+ cl_mem color_mem = CL_MEM_PTR(color_ptr);
+ cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
+ cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
+ cl_mem guide_variance_mem = CL_MEM_PTR(guide_variance_ptr);
+ cl_mem output_mem = CL_MEM_PTR(output_ptr);
+
+ cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
+ cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
+ cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+ cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
+ cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
+
+ cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
+ cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
+ cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
+ cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
+ cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
+
+ cl_mem difference = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr);
+ cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr);
+
+ int r = task->radius;
+ int f = 4;
+ float a = 1.0f;
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+
+ int local_rect[4] = {max(0, -dx), max(0, -dy),
+ task->reconstruction_state.source_w - max(0, dx),
+ task->reconstruction_state.source_h - max(0, dy)};
+
+ kernel_set_args(ckNLMCalcDifference, 0,
+ dx, dy,
+ guide_mem,
+ guide_variance_mem,
+ difference,
+ local_rect,
+ task->buffer.w,
+ task->buffer.pass_stride,
+ a, task->nlm_k_2);
+ enqueue_kernel(ckNLMCalcDifference,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ kernel_set_args(ckNLMBlur, 0,
+ difference,
+ blurDifference,
+ local_rect,
+ task->buffer.w,
+ f);
+ enqueue_kernel(ckNLMBlur,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ kernel_set_args(ckNLMCalcWeight, 0,
+ blurDifference,
+ difference,
+ local_rect,
+ task->buffer.w,
+ f);
+ enqueue_kernel(ckNLMCalcWeight,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ /* Reuse previous arguments. */
+ enqueue_kernel(ckNLMBlur,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ kernel_set_args(ckNLMConstructGramian, 0,
+ dx, dy,
+ blurDifference,
+ buffer_mem,
+ color_mem,
+ color_variance_mem,
+ transform_mem,
+ rank_mem,
+ XtWX_mem,
+ XtWY_mem,
+ local_rect,
+ task->reconstruction_state.filter_rect,
+ task->buffer.w,
+ task->buffer.h,
+ f,
+ task->buffer.pass_stride);
+ enqueue_kernel(ckNLMConstructGramian,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h,
+ 256);
+ }
+
+ kernel_set_args(ckFinalize, 0,
+ task->buffer.w,
+ task->buffer.h,
+ output_mem,
+ rank_mem,
+ XtWX_mem,
+ XtWY_mem,
+ task->filter_area,
+ task->reconstruction_state.buffer_params,
+ task->render_buffer.samples);
+ enqueue_kernel(ckFinalize,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ int r, int4 rect,
+ DenoisingTask *task)
+{
+ (void) task;
+
+ cl_mem a_mem = CL_MEM_PTR(a_ptr);
+ cl_mem b_mem = CL_MEM_PTR(b_ptr);
+ cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
+ cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+
+ cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
+
+ kernel_set_args(ckFilterCombineHalves, 0,
+ mean_mem,
+ variance_mem,
+ a_mem,
+ b_mem,
+ rect,
+ r);
+ enqueue_kernel(ckFilterCombineHalves,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr sample_variance_ptr,
+ device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr,
+ DenoisingTask *task)
+{
+ (void) task;
+
+ cl_mem a_mem = CL_MEM_PTR(a_ptr);
+ cl_mem b_mem = CL_MEM_PTR(b_ptr);
+ cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
+ cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
+ cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
+
+ cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
+
+ cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
+
+ char split_kernel = is_split_kernel()? 1 : 0;
+ kernel_set_args(ckFilterDivideShadow, 0,
+ task->render_buffer.samples,
+ tiles_mem,
+ a_mem,
+ b_mem,
+ sample_variance_mem,
+ sv_variance_mem,
+ buffer_variance_mem,
+ task->rect,
+ task->render_buffer.pass_stride,
+ task->render_buffer.denoising_data_offset,
+ split_kernel);
+ enqueue_kernel(ckFilterDivideShadow,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ DenoisingTask *task)
+{
+ cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
+ cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+
+ cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
+
+ cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
+
+ char split_kernel = is_split_kernel()? 1 : 0;
+ kernel_set_args(ckFilterGetFeature, 0,
+ task->render_buffer.samples,
+ tiles_mem,
+ mean_offset,
+ variance_offset,
+ mean_mem,
+ variance_mem,
+ task->rect,
+ task->render_buffer.pass_stride,
+ task->render_buffer.denoising_data_offset,
+ split_kernel);
+ enqueue_kernel(ckFilterGetFeature,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers,
+ DenoisingTask *task)
+{
+ mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_WRITE);
+ mem_copy_to(task->tiles_mem);
+
+ cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
+
+ cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles"));
+
+ kernel_set_args(ckFilterSetTiles, 0, tiles_mem);
+ for(int i = 0; i < 9; i++) {
+ cl_mem buffer_mem = CL_MEM_PTR(buffers[i]);
+ kernel_set_args(ckFilterSetTiles, i+1, buffer_mem);
+ }
+
+ enqueue_kernel(ckFilterSetTiles, 1, 1);
+
+ return true;
+}
+
+void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task)
+{
+ DenoisingTask denoising(this);
+
+ denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising);
+ denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
+ denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+ denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+
+ denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+ denoising.render_buffer.samples = rtile.sample;
+
+ RenderTile rtiles[9];
+ rtiles[4] = rtile;
+ task.map_neighbor_tiles(rtiles, this);
+ denoising.tiles_from_rendertiles(rtiles);
+
+ denoising.init_from_devicetask(task);
+
+ denoising.run_denoising();
+
+ task.unmap_neighbor_tiles(rtiles, this);
+}
+
void OpenCLDeviceBase::shader(DeviceTask& task)
{
/* cast arguments to cl types */
diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp
index a2fd1d71156..06c15bcf401 100644
--- a/intern/cycles/device/opencl/opencl_mega.cpp
+++ b/intern/cycles/device/opencl/opencl_mega.cpp
@@ -108,41 +108,53 @@ public:
else if(task->type == DeviceTask::SHADER) {
shader(*task);
}
- else if(task->type == DeviceTask::PATH_TRACE) {
+ else if(task->type == DeviceTask::RENDER) {
RenderTile tile;
/* Keep rendering tiles until done. */
while(task->acquire_tile(this, tile)) {
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
+ if(tile.task == RenderTile::PATH_TRACE) {
+ int start_sample = tile.start_sample;
+ int end_sample = tile.start_sample + tile.num_samples;
- for(int sample = start_sample; sample < end_sample; sample++) {
- if(task->get_cancel()) {
- if(task->need_finish_queue == false)
- break;
- }
+ for(int sample = start_sample; sample < end_sample; sample++) {
+ if(task->get_cancel()) {
+ if(task->need_finish_queue == false)
+ break;
+ }
+
+ path_trace(tile, sample);
- path_trace(tile, sample);
+ tile.sample = sample + 1;
- tile.sample = sample + 1;
+ task->update_progress(&tile, tile.w*tile.h);
+ }
+ /* Complete kernel execution before release tile */
+ /* This helps in multi-device render;
+ * The device that reaches the critical-section function
+ * release_tile waits (stalling other devices from entering
+ * release_tile) for all kernels to complete. If device1 (a
+ * slow-render device) reaches release_tile first then it would
+ * stall device2 (a fast-render device) from proceeding to render
+ * next tile.
+ */
+ clFinish(cqCommandQueue);
+ }
+ else if(tile.task == RenderTile::DENOISE) {
+ tile.sample = tile.start_sample + tile.num_samples;
+ denoise(tile, *task);
task->update_progress(&tile, tile.w*tile.h);
}
- /* Complete kernel execution before release tile */
- /* This helps in multi-device render;
- * The device that reaches the critical-section function
- * release_tile waits (stalling other devices from entering
- * release_tile) for all kernels to complete. If device1 (a
- * slow-render device) reaches release_tile first then it would
- * stall device2 (a fast-render device) from proceeding to render
- * next tile.
- */
- clFinish(cqCommandQueue);
-
task->release_tile(tile);
}
}
}
+
+ bool is_split_kernel()
+ {
+ return false;
+ }
};
Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background)
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index d175aae137a..76dcbd6fc9a 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -104,7 +104,7 @@ public:
else if(task->type == DeviceTask::SHADER) {
shader(*task);
}
- else if(task->type == DeviceTask::PATH_TRACE) {
+ else if(task->type == DeviceTask::RENDER) {
RenderTile tile;
/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
@@ -127,21 +127,29 @@ public:
/* Keep rendering tiles until done. */
while(task->acquire_tile(this, tile)) {
- split_kernel->path_trace(task,
- tile,
- kgbuffer,
- *const_mem_map["__data"]);
-
- /* Complete kernel execution before release tile. */
- /* This helps in multi-device render;
- * The device that reaches the critical-section function
- * release_tile waits (stalling other devices from entering
- * release_tile) for all kernels to complete. If device1 (a
- * slow-render device) reaches release_tile first then it would
- * stall device2 (a fast-render device) from proceeding to render
- * next tile.
- */
- clFinish(cqCommandQueue);
+ if(tile.task == RenderTile::PATH_TRACE) {
+ assert(tile.task == RenderTile::PATH_TRACE);
+ split_kernel->path_trace(task,
+ tile,
+ kgbuffer,
+ *const_mem_map["__data"]);
+
+ /* Complete kernel execution before release tile. */
+ /* This helps in multi-device render;
+ * The device that reaches the critical-section function
+ * release_tile waits (stalling other devices from entering
+ * release_tile) for all kernels to complete. If device1 (a
+ * slow-render device) reaches release_tile first then it would
+ * stall device2 (a fast-render device) from proceeding to render
+ * next tile.
+ */
+ clFinish(cqCommandQueue);
+ }
+ else if(tile.task == RenderTile::DENOISE) {
+ tile.sample = tile.start_sample + tile.num_samples;
+ denoise(tile, *task);
+ task->update_progress(&tile, tile.w*tile.h);
+ }
task->release_tile(tile);
}
@@ -150,6 +158,11 @@ public:
}
}
+ bool is_split_kernel()
+ {
+ return true;
+ }
+
protected:
/* ** Those guys are for workign around some compiler-specific bugs ** */
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
index 38003dd1e1e..642c1bfa11c 100644
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -1073,6 +1073,20 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
return get_device_name(device_id);
}
+int OpenCLInfo::mem_address_alignment(cl_device_id device_id)
+{
+ int base_align_bits;
+ if(clGetDeviceInfo(device_id,
+ CL_DEVICE_MEM_BASE_ADDR_ALIGN,
+ sizeof(int),
+ &base_align_bits,
+ NULL) == CL_SUCCESS)
+ {
+ return base_align_bits/8;
+ }
+ return 1;
+}
+
CCL_NAMESPACE_END
#endif
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 9bb0455b9d5..bef869f34b4 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -10,7 +10,23 @@ set(INC_SYS
set(SRC
kernels/cpu/kernel.cpp
+ kernels/cpu/kernel_sse2.cpp
+ kernels/cpu/kernel_sse3.cpp
+ kernels/cpu/kernel_sse41.cpp
+ kernels/cpu/kernel_avx.cpp
+ kernels/cpu/kernel_avx2.cpp
kernels/cpu/kernel_split.cpp
+ kernels/cpu/kernel_split_sse2.cpp
+ kernels/cpu/kernel_split_sse3.cpp
+ kernels/cpu/kernel_split_sse41.cpp
+ kernels/cpu/kernel_split_avx.cpp
+ kernels/cpu/kernel_split_avx2.cpp
+ kernels/cpu/filter.cpp
+ kernels/cpu/filter_sse2.cpp
+ kernels/cpu/filter_sse3.cpp
+ kernels/cpu/filter_sse41.cpp
+ kernels/cpu/filter_avx.cpp
+ kernels/cpu/filter_avx2.cpp
kernels/opencl/kernel.cl
kernels/opencl/kernel_state_buffer_size.cl
kernels/opencl/kernel_split.cl
@@ -32,8 +48,10 @@ set(SRC
kernels/opencl/kernel_next_iteration_setup.cl
kernels/opencl/kernel_indirect_subsurface.cl
kernels/opencl/kernel_buffer_update.cl
+ kernels/opencl/filter.cl
kernels/cuda/kernel.cu
kernels/cuda/kernel_split.cu
+ kernels/cuda/filter.cu
)
set(SRC_BVH_HEADERS
@@ -95,6 +113,8 @@ set(SRC_KERNELS_CPU_HEADERS
kernels/cpu/kernel_cpu.h
kernels/cpu/kernel_cpu_impl.h
kernels/cpu/kernel_cpu_image.h
+ kernels/cpu/filter_cpu.h
+ kernels/cpu/filter_cpu_impl.h
)
set(SRC_KERNELS_CUDA_HEADERS
@@ -190,6 +210,21 @@ set(SRC_GEOM_HEADERS
geom/geom_volume.h
)
+set(SRC_FILTER_HEADERS
+ filter/filter.h
+ filter/filter_defines.h
+ filter/filter_features.h
+ filter/filter_features_sse.h
+ filter/filter_kernel.h
+ filter/filter_nlm_cpu.h
+ filter/filter_nlm_gpu.h
+ filter/filter_prefilter.h
+ filter/filter_reconstruction.h
+ filter/filter_transform.h
+ filter/filter_transform_gpu.h
+ filter/filter_transform_sse.h
+)
+
set(SRC_UTIL_HEADERS
../util/util_atomic.h
../util/util_color.h
@@ -204,6 +239,7 @@ set(SRC_UTIL_HEADERS
../util/util_math_int2.h
../util/util_math_int3.h
../util/util_math_int4.h
+ ../util/util_math_matrix.h
../util/util_static_assert.h
../util/util_transform.h
../util/util_texture.h
@@ -295,23 +331,21 @@ if(WITH_CYCLES_CUDA_BINARIES)
${SRC_CLOSURE_HEADERS}
${SRC_UTIL_HEADERS}
)
+ set(cuda_filter_sources kernels/cuda/filter.cu
+ ${SRC_HEADERS}
+ ${SRC_KERNELS_CUDA_HEADERS}
+ ${SRC_FILTER_HEADERS}
+ ${SRC_UTIL_HEADERS}
+ )
set(cuda_cubins)
- macro(CYCLES_CUDA_KERNEL_ADD arch split experimental)
- if(${split})
- set(cuda_extra_flags "-D__SPLIT__")
- set(cuda_cubin kernel_split)
- else()
- set(cuda_extra_flags "")
- set(cuda_cubin kernel)
- endif()
-
+ macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental)
if(${experimental})
- set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__)
- set(cuda_cubin ${cuda_cubin}_experimental)
+ set(flags ${flags} -D__KERNEL_EXPERIMENTAL__)
+ set(name ${name}_experimental)
endif()
- set(cuda_cubin ${cuda_cubin}_${arch}.cubin)
+ set(cuda_cubin ${name}_${arch}.cubin)
if(WITH_CYCLES_DEBUG)
set(cuda_debug_flags "-D__KERNEL_DEBUG__")
@@ -325,11 +359,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
set(cuda_math_flags "--use_fast_math")
- if(split)
- set(cuda_kernel_src "/kernels/cuda/kernel_split.cu")
- else()
- set(cuda_kernel_src "/kernels/cuda/kernel.cu")
- endif()
+ set(cuda_kernel_src "/kernels/cuda/${name}.cu")
add_custom_command(
OUTPUT ${cuda_cubin}
@@ -343,13 +373,13 @@ if(WITH_CYCLES_CUDA_BINARIES)
${cuda_arch_flags}
${cuda_version_flags}
${cuda_math_flags}
- ${cuda_extra_flags}
+ ${flags}
${cuda_debug_flags}
-I${CMAKE_CURRENT_SOURCE_DIR}/..
-DCCL_NAMESPACE_BEGIN=
-DCCL_NAMESPACE_END=
-DNVCC
- DEPENDS ${cuda_sources})
+ DEPENDS ${sources})
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
list(APPEND cuda_cubins ${cuda_cubin})
@@ -363,11 +393,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
# Compile regular kernel
- CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE)
+ CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE)
+ CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE)
if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
# Compile split kernel
- CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE)
+ CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D__SPLIT__" ${cuda_sources} FALSE)
endif()
endforeach()
@@ -388,41 +419,30 @@ include_directories(SYSTEM ${INC_SYS})
set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
+set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
if(CXX_HAS_SSE)
- list(APPEND SRC
- kernels/cpu/kernel_sse2.cpp
- kernels/cpu/kernel_sse3.cpp
- kernels/cpu/kernel_sse41.cpp
- kernels/cpu/kernel_split_sse2.cpp
- kernels/cpu/kernel_split_sse3.cpp
- kernels/cpu/kernel_split_sse41.cpp
- )
-
set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX)
- list(APPEND SRC
- kernels/cpu/kernel_avx.cpp
- kernels/cpu/kernel_split_avx.cpp
- )
set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
- list(APPEND SRC
- kernels/cpu/kernel_avx2.cpp
- kernels/cpu/kernel_split_avx2.cpp
- )
set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
add_library(cycles_kernel
@@ -432,6 +452,7 @@ add_library(cycles_kernel
${SRC_KERNELS_CUDA_HEADERS}
${SRC_BVH_HEADERS}
${SRC_CLOSURE_HEADERS}
+ ${SRC_FILTER_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
${SRC_SPLIT_HEADERS}
@@ -472,12 +493,15 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocke
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/filter.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util)
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index a6bba8bf74d..a04c157dc40 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -435,5 +435,23 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
#endif
}
+/* Classifies a closure as diffuse-like or specular-like.
+ * This is needed for the denoising feature pass generation,
+ * which are written on the first bounce where more than 25%
+ * of the sampling weight belongs to diffuse-line closures. */
+ccl_device_inline bool bsdf_is_specular_like(ShaderClosure *sc)
+{
+ if(CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+ return true;
+ }
+
+ if(CLOSURE_IS_BSDF_MICROFACET(sc->type)) {
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf*) sc;
+ return (bsdf->alpha_x*bsdf->alpha_y <= 0.075f*0.075f);
+ }
+
+ return false;
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
index 7e0f5a7ec75..a5ba2cb2972 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
@@ -40,7 +40,6 @@ typedef ccl_addr_space struct VelvetBsdf {
float sigma;
float invsigma2;
- float3 N;
} VelvetBsdf;
ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf)
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h
index dcd187f9305..ec6f1f20996 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -37,7 +37,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct DiffuseBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
} DiffuseBsdf;
/* DIFFUSE */
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index 2d982a95fe4..24f40af46a3 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct DiffuseRampBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
float3 *colors;
} DiffuseRampBsdf;
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index 3fe7572e4ce..30cc8b90330 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -46,7 +46,6 @@ typedef ccl_addr_space struct MicrofacetBsdf {
float alpha_x, alpha_y, ior;
MicrofacetExtra *extra;
float3 T;
- float3 N;
} MicrofacetBsdf;
/* Beckmann and GGX microfacet importance sampling. */
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index 57f1e733ee7..30644946840 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -42,7 +42,7 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
/* Sample slope distribution (based on page 14 of the supplemental implementation). */
ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
{
- if(cosI > 0.9999f || cosI < 1e-6f) {
+ if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) {
const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f));
const float phi = M_2PI_F * randU.y;
return make_float2(r*cosf(phi), r*sinf(phi));
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index cb342a026ef..6b770fc0c16 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -22,7 +22,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct OrenNayarBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
float roughness;
float a;
float b;
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index e152a8780db..420f94755ee 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PhongRampBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
float exponent;
float3 *colors;
} PhongRampBsdf;
diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
index 8a116693bdb..215c32e1ffb 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
@@ -28,7 +28,6 @@ typedef ccl_addr_space struct PrincipledDiffuseBsdf {
SHADER_CLOSURE_BASE;
float roughness;
- float3 N;
} PrincipledDiffuseBsdf;
ccl_device float3 calculate_principled_diffuse_brdf(const PrincipledDiffuseBsdf *bsdf,
diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
index 58df4f7ddbb..f4476bfecd0 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_sheen.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
@@ -26,7 +26,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PrincipledSheenBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
} PrincipledSheenBsdf;
ccl_device float3 calculate_principled_sheen_brdf(const PrincipledSheenBsdf *bsdf,
diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h
index 28e775bcbc8..d8b6d8ddead 100644
--- a/intern/cycles/kernel/closure/bsdf_toon.h
+++ b/intern/cycles/kernel/closure/bsdf_toon.h
@@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct ToonBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
float size;
float smooth;
} ToonBsdf;
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index f9236a6e52c..f733ea4c517 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -28,7 +28,6 @@ typedef ccl_addr_space struct Bssrdf {
float texture_blur;
float albedo;
float roughness;
- float3 N;
} Bssrdf;
/* Planar Truncated Gaussian
diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h
new file mode 100644
index 00000000000..f6e474d6702
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FILTER_H__
+#define __FILTER_H__
+
+/* CPU Filter Kernel Interface */
+
+#include "util/util_types.h"
+
+#include "kernel/filter/filter_defines.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z
+#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
+#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
+
+#define KERNEL_ARCH cpu
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+CCL_NAMESPACE_END
+
+#endif /* __FILTER_H__ */
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
new file mode 100644
index 00000000000..ce96f733aff
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_defines.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FILTER_DEFINES_H__
+#define __FILTER_DEFINES_H__
+
+#define DENOISE_FEATURES 10
+#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)
+#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)
+#define XTWY_SIZE (DENOISE_FEATURES+1)
+
+typedef struct TilesInfo {
+ int offsets[9];
+ int strides[9];
+ int x[4];
+ int y[4];
+ /* TODO(lukas): CUDA doesn't have uint64_t... */
+#ifdef __KERNEL_OPENCL__
+ ccl_global float *buffers[9];
+#else
+ long long int buffers[9];
+#endif
+} TilesInfo;
+
+#endif /* __FILTER_DEFINES_H__*/
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
new file mode 100644
index 00000000000..f5a40d49997
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_features.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ CCL_NAMESPACE_BEGIN
+
+#define ccl_get_feature(buffer, pass) buffer[(pass)*pass_stride]
+
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
+ * pixel_buffer always points to the current pixel in the first pass. */
+#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
+ for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+ for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
+
+#define END_FOR_PIXEL_WINDOW } \
+ pixel_buffer += buffer_w - (high.x - low.x); \
+ }
+
+ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *features, float ccl_restrict_ptr mean, int pass_stride)
+{
+ features[0] = pixel.x;
+ features[1] = pixel.y;
+ features[2] = ccl_get_feature(buffer, 0);
+ features[3] = ccl_get_feature(buffer, 1);
+ features[4] = ccl_get_feature(buffer, 2);
+ features[5] = ccl_get_feature(buffer, 3);
+ features[6] = ccl_get_feature(buffer, 4);
+ features[7] = ccl_get_feature(buffer, 5);
+ features[8] = ccl_get_feature(buffer, 6);
+ features[9] = ccl_get_feature(buffer, 7);
+ if(mean) {
+ for(int i = 0; i < DENOISE_FEATURES; i++)
+ features[i] -= mean[i];
+ }
+}
+
+ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *scales, float ccl_restrict_ptr mean, int pass_stride)
+{
+ scales[0] = fabsf(pixel.x - mean[0]);
+ scales[1] = fabsf(pixel.y - mean[1]);
+ scales[2] = fabsf(ccl_get_feature(buffer, 0) - mean[2]);
+ scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
+ ccl_get_feature(buffer, 2) - mean[4],
+ ccl_get_feature(buffer, 3) - mean[5]));
+ scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
+ scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
+ ccl_get_feature(buffer, 6) - mean[8],
+ ccl_get_feature(buffer, 7) - mean[9]));
+}
+
+ccl_device_inline void filter_calculate_scale(float *scale)
+{
+ scale[0] = 1.0f/max(scale[0], 0.01f);
+ scale[1] = 1.0f/max(scale[1], 0.01f);
+ scale[2] = 1.0f/max(scale[2], 0.01f);
+ scale[6] = 1.0f/max(scale[4], 0.01f);
+ scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f);
+ scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
+}
+
+ccl_device_inline float3 filter_get_pixel_color(ccl_global float ccl_restrict_ptr buffer, int pass_stride)
+{
+ return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2));
+}
+
+ccl_device_inline float filter_get_pixel_variance(ccl_global float ccl_restrict_ptr buffer, int pass_stride)
+{
+ return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)));
+}
+
+ccl_device_inline void design_row_add(float *design_row,
+ int rank,
+ ccl_global float ccl_restrict_ptr transform,
+ int stride,
+ int row,
+ float feature)
+{
+ for(int i = 0; i < rank; i++) {
+ design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature;
+ }
+}
+
+/* Fill the design row. */
+ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
+ ccl_global float ccl_restrict_ptr p_buffer,
+ int2 q_pixel,
+ ccl_global float ccl_restrict_ptr q_buffer,
+ int pass_stride,
+ int rank,
+ float *design_row,
+ ccl_global float ccl_restrict_ptr transform,
+ int stride)
+{
+ design_row[0] = 1.0f;
+ math_vector_zero(design_row+1, rank);
+ design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
+ design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
+ design_row_add(design_row, rank, transform, stride, 2, ccl_get_feature(q_buffer, 0) - ccl_get_feature(p_buffer, 0));
+ design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
+ design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
+ design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
+ design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
+ design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
+ design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
+ design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
new file mode 100644
index 00000000000..303c8f482e3
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_features_sse.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride)
+
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
+ * pixel_buffer always points to the first of the 4 current pixel in the first pass.
+ * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */
+
+#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
+ for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+ __m128 y4 = _mm_set1_ps(pixel.y); \
+ for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
+ __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \
+ __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x));
+
+#define END_FOR_PIXEL_WINDOW_SSE } \
+ pixel_buffer += buffer_w - (pixel.x - low.x); \
+ }
+
+ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *features, __m128 ccl_restrict_ptr mean, int pass_stride)
+{
+ features[0] = x;
+ features[1] = y;
+ features[2] = ccl_get_feature_sse(0);
+ features[3] = ccl_get_feature_sse(1);
+ features[4] = ccl_get_feature_sse(2);
+ features[5] = ccl_get_feature_sse(3);
+ features[6] = ccl_get_feature_sse(4);
+ features[7] = ccl_get_feature_sse(5);
+ features[8] = ccl_get_feature_sse(6);
+ features[9] = ccl_get_feature_sse(7);
+ if(mean) {
+ for(int i = 0; i < DENOISE_FEATURES; i++)
+ features[i] = _mm_sub_ps(features[i], mean[i]);
+ }
+ for(int i = 0; i < DENOISE_FEATURES; i++)
+ features[i] = _mm_mask_ps(features[i], active_pixels);
+}
+
+ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *scales, __m128 ccl_restrict_ptr mean, int pass_stride)
+{
+ scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
+ scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
+
+ scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(0), mean[2])), active_pixels);
+
+ __m128 diff, scale;
+ diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
+ scale = _mm_mul_ps(diff, diff);
+ diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]);
+ scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+ diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]);
+ scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+ scales[3] = _mm_mask_ps(scale, active_pixels);
+
+ scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels);
+
+ diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]);
+ scale = _mm_mul_ps(diff, diff);
+ diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]);
+ scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+ diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]);
+ scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+ scales[5] = _mm_mask_ps(scale, active_pixels);
+}
+
+ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
+{
+ scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f)));
+ scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f)));
+ scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f)));
+ scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f)));
+
+ scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f)));
+ scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f)));
+}
+
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h
new file mode 100644
index 00000000000..2ef03dc0a02
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_kernel.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/util_color.h"
+#include "util/util_math.h"
+#include "util/util_math_fast.h"
+#include "util/util_texture.h"
+
+#include "util/util_atomic.h"
+#include "util/util_math_matrix.h"
+
+#include "kernel/filter/filter_defines.h"
+
+#include "kernel/filter/filter_features.h"
+#ifdef __KERNEL_SSE3__
+# include "kernel/filter/filter_features_sse.h"
+#endif
+
+#include "kernel/filter/filter_prefilter.h"
+
+#ifdef __KERNEL_GPU__
+# include "kernel/filter/filter_transform_gpu.h"
+#else
+# ifdef __KERNEL_SSE3__
+# include "kernel/filter/filter_transform_sse.h"
+# else
+# include "kernel/filter/filter_transform.h"
+# endif
+#endif
+
+#include "kernel/filter/filter_reconstruction.h"
+
+#ifdef __KERNEL_CPU__
+# include "kernel/filter/filter_nlm_cpu.h"
+#else
+# include "kernel/filter/filter_nlm_gpu.h"
+#endif
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
new file mode 100644
index 00000000000..1a314b100be
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, float ccl_restrict_ptr weightImage, float ccl_restrict_ptr varianceImage, float *differenceImage, int4 rect, int w, int channel_offset, float a, float k_2)
+{
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ float diff = 0.0f;
+ int numChannels = channel_offset? 3 : 1;
+ for(int c = 0; c < numChannels; c++) {
+ float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)];
+ float pvar = varianceImage[c*channel_offset + y*w+x];
+ float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)];
+ diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
+ }
+ if(numChannels > 1) {
+ diff *= 1.0f/numChannels;
+ }
+ differenceImage[y*w+x] = diff;
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
+{
+#ifdef __KERNEL_SSE3__
+ int aligned_lowx = (rect.x & ~(3));
+ int aligned_highx = ((rect.z + 3) & ~(3));
+#endif
+ for(int y = rect.y; y < rect.w; y++) {
+ const int low = max(rect.y, y-f);
+ const int high = min(rect.w, y+f+1);
+ for(int x = rect.x; x < rect.z; x++) {
+ outImage[y*w+x] = 0.0f;
+ }
+ for(int y1 = low; y1 < high; y1++) {
+#ifdef __KERNEL_SSE3__
+ for(int x = aligned_lowx; x < aligned_highx; x+=4) {
+ _mm_store_ps(outImage + y*w+x, _mm_add_ps(_mm_load_ps(outImage + y*w+x), _mm_load_ps(differenceImage + y1*w+x)));
+ }
+#else
+ for(int x = rect.x; x < rect.z; x++) {
+ outImage[y*w+x] += differenceImage[y1*w+x];
+ }
+#endif
+ }
+ for(int x = rect.x; x < rect.z; x++) {
+ outImage[y*w+x] *= 1.0f/(high - low);
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
+{
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ outImage[y*w+x] = 0.0f;
+ }
+ }
+ for(int dx = -f; dx <= f; dx++) {
+ int pos_dx = max(0, dx);
+ int neg_dx = min(0, dx);
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
+ outImage[y*w+x] += differenceImage[y*w+dx+x];
+ }
+ }
+ }
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ outImage[y*w+x] = expf(-max(outImage[y*w+x] * (1.0f/(high - low)), 0.0f));
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float ccl_restrict_ptr differenceImage, float ccl_restrict_ptr image, float *outImage, float *accumImage, int4 rect, int w, int f)
+{
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ float sum = 0.0f;
+ for(int x1 = low; x1 < high; x1++) {
+ sum += differenceImage[y*w+x1];
+ }
+ float weight = sum * (1.0f/(high - low));
+ accumImage[y*w+x] += weight;
+ outImage[y*w+x] += weight*image[(y+dy)*w+(x+dx)];
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
+ float ccl_restrict_ptr differenceImage,
+ float ccl_restrict_ptr buffer,
+ float *color_pass,
+ float *variance_pass,
+ float *transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int4 rect,
+ int4 filter_rect,
+ int w, int h, int f,
+ int pass_stride)
+{
+ /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
+ for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) {
+ int y = fy + filter_rect.y;
+ for(int fx = max(0, rect.x-filter_rect.x); fx < min(filter_rect.z, rect.z-filter_rect.x); fx++) {
+ int x = fx + filter_rect.x;
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ float sum = 0.0f;
+ for(int x1 = low; x1 < high; x1++) {
+ sum += differenceImage[y*w+x1];
+ }
+ float weight = sum * (1.0f/(high - low));
+
+ int storage_ofs = fy*filter_rect.z + fx;
+ float *l_transform = transform + storage_ofs*TRANSFORM_SIZE;
+ float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE;
+ float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE;
+ int *l_rank = rank + storage_ofs;
+
+ kernel_filter_construct_gramian(x, y, 1,
+ dx, dy, w, h,
+ pass_stride,
+ buffer,
+ color_pass, variance_pass,
+ l_transform, l_rank,
+ weight, l_XtWX, l_XtWY, 0);
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w)
+{
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ outImage[y*w+x] /= accumImage[y*w+x];
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
new file mode 100644
index 00000000000..b5ba7cf51a5
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
+ int dx, int dy,
+ ccl_global float ccl_restrict_ptr weightImage,
+ ccl_global float ccl_restrict_ptr varianceImage,
+ ccl_global float *differenceImage,
+ int4 rect, int w,
+ int channel_offset,
+ float a, float k_2)
+{
+ float diff = 0.0f;
+ int numChannels = channel_offset? 3 : 1;
+ for(int c = 0; c < numChannels; c++) {
+ float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)];
+ float pvar = varianceImage[c*channel_offset + y*w+x];
+ float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)];
+ diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
+ }
+ if(numChannels > 1) {
+ diff *= 1.0f/numChannels;
+ }
+ differenceImage[y*w+x] = diff;
+}
+
+ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
+ ccl_global float ccl_restrict_ptr differenceImage,
+ ccl_global float *outImage,
+ int4 rect, int w, int f)
+{
+ float sum = 0.0f;
+ const int low = max(rect.y, y-f);
+ const int high = min(rect.w, y+f+1);
+ for(int y1 = low; y1 < high; y1++) {
+ sum += differenceImage[y1*w+x];
+ }
+ sum *= 1.0f/(high-low);
+ outImage[y*w+x] = sum;
+}
+
+ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
+ ccl_global float ccl_restrict_ptr differenceImage,
+ ccl_global float *outImage,
+ int4 rect, int w, int f)
+{
+ float sum = 0.0f;
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ for(int x1 = low; x1 < high; x1++) {
+ sum += differenceImage[y*w+x1];
+ }
+ sum *= 1.0f/(high-low);
+ outImage[y*w+x] = expf(-max(sum, 0.0f));
+}
+
+ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
+ int dx, int dy,
+ ccl_global float ccl_restrict_ptr differenceImage,
+ ccl_global float ccl_restrict_ptr image,
+ ccl_global float *outImage,
+ ccl_global float *accumImage,
+ int4 rect, int w, int f)
+{
+ float sum = 0.0f;
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ for(int x1 = low; x1 < high; x1++) {
+ sum += differenceImage[y*w+x1];
+ }
+ sum *= 1.0f/(high-low);
+ if(outImage) {
+ accumImage[y*w+x] += sum;
+ outImage[y*w+x] += sum*image[(y+dy)*w+(x+dx)];
+ }
+ else {
+ accumImage[y*w+x] = sum;
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
+ int dx, int dy,
+ ccl_global float ccl_restrict_ptr differenceImage,
+ ccl_global float ccl_restrict_ptr buffer,
+ ccl_global float *color_pass,
+ ccl_global float *variance_pass,
+ ccl_global float ccl_restrict_ptr transform,
+ ccl_global int *rank,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 rect,
+ int4 filter_rect,
+ int w, int h, int f,
+ int pass_stride,
+ int localIdx)
+{
+ int y = fy + filter_rect.y;
+ int x = fx + filter_rect.x;
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ float sum = 0.0f;
+ for(int x1 = low; x1 < high; x1++) {
+ sum += differenceImage[y*w+x1];
+ }
+ float weight = sum * (1.0f/(high - low));
+
+ int storage_ofs = fy*filter_rect.z + fx;
+ transform += storage_ofs;
+ rank += storage_ofs;
+ XtWX += storage_ofs;
+ XtWY += storage_ofs;
+
+ kernel_filter_construct_gramian(x, y,
+ filter_rect.z*filter_rect.w,
+ dx, dy, w, h,
+ pass_stride,
+ buffer,
+ color_pass, variance_pass,
+ transform, rank,
+ weight, XtWX, XtWY,
+ localIdx);
+}
+
+ccl_device_inline void kernel_filter_nlm_normalize(int x, int y,
+ ccl_global float *outImage,
+ ccl_global float ccl_restrict_ptr accumImage,
+ int4 rect, int w)
+{
+ outImage[y*w+x] /= accumImage[y*w+x];
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
new file mode 100644
index 00000000000..54bcf888052
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_prefilter.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* First step of the shadow prefiltering, performs the shadow division and stores all data
+ * in a nice and easy rectangular array that can be passed to the NLM filter.
+ *
+ * Calculates:
+ * unfiltered: Contains the two half images of the shadow feature pass
+ * sampleVariance: The sample-based variance calculated in the kernel. Note: This calculation is biased in general, and especially here since the variance of the ratio can only be approximated.
+ * sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves)
+ * bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy.
+ */
+ccl_device void kernel_filter_divide_shadow(int sample,
+ ccl_global TilesInfo *tiles,
+ int x, int y,
+ ccl_global float *unfilteredA,
+ ccl_global float *unfilteredB,
+ ccl_global float *sampleVariance,
+ ccl_global float *sampleVarianceV,
+ ccl_global float *bufferVariance,
+ int4 rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+ int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
+ int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
+ int tile = ytile*3+xtile;
+
+ int offset = tiles->offsets[tile];
+ int stride = tiles->strides[tile];
+ ccl_global float ccl_restrict_ptr center_buffer = (ccl_global float*) tiles->buffers[tile];
+ center_buffer += (y*stride + x + offset)*buffer_pass_stride;
+ center_buffer += buffer_denoising_offset + 14;
+
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y-rect.y)*buffer_w + (x - rect.x);
+ unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f);
+ unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);
+
+ float varA = center_buffer[2];
+ float varB = center_buffer[5];
+ int odd_sample = (sample+1)/2;
+ int even_sample = sample/2;
+ if(use_split_variance) {
+ varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
+ varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
+ }
+ varA /= (odd_sample - 1);
+ varB /= (even_sample - 1);
+
+ sampleVariance[idx] = 0.5f*(varA + varB) / sample;
+ sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample);
+ bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]);
+}
+
+/* Load a regular feature from the render buffers into the denoise buffer.
+ * Parameters:
+ * - sample: The sample amount in the buffer, used to normalize the buffer.
+ * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature.
+ * - x, y: Current pixel
+ * - mean, variance: Target denoise buffers.
+ * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive).
+ */
+ccl_device void kernel_filter_get_feature(int sample,
+ ccl_global TilesInfo *tiles,
+ int m_offset, int v_offset,
+ int x, int y,
+ ccl_global float *mean,
+ ccl_global float *variance,
+ int4 rect, int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+ int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
+ int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
+ int tile = ytile*3+xtile;
+ ccl_global float *center_buffer = ((ccl_global float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset;
+
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y-rect.y)*buffer_w + (x - rect.x);
+
+ mean[idx] = center_buffer[m_offset] / sample;
+ if(use_split_variance) {
+ variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
+ }
+ else {
+ variance[idx] = center_buffer[v_offset] / (sample * (sample-1));
+ }
+}
+
+/* Combine A/B buffers.
+ * Calculates the combined mean and the buffer variance. */
+ccl_device void kernel_filter_combine_halves(int x, int y,
+ ccl_global float *mean,
+ ccl_global float *variance,
+ ccl_global float *a,
+ ccl_global float *b,
+ int4 rect, int r)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y-rect.y)*buffer_w + (x - rect.x);
+
+ if(mean) mean[idx] = 0.5f * (a[idx]+b[idx]);
+ if(variance) {
+ if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]);
+ else {
+ variance[idx] = 0.0f;
+ float values[25];
+ int numValues = 0;
+ for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) {
+ for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) {
+ int pidx = (py-rect.y)*buffer_w + (px-rect.x);
+ values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]);
+ }
+ }
+ /* Insertion-sort the variances (fast enough for 25 elements). */
+ for(int i = 1; i < numValues; i++) {
+ float v = values[i];
+ int j;
+ for(j = i-1; j >= 0 && values[j] > v; j--)
+ values[j+1] = values[j];
+ values[j+1] = v;
+ }
+ variance[idx] = values[(7*numValues)/8];
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
new file mode 100644
index 00000000000..02f3802fa0c
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
+ int storage_stride,
+ int dx, int dy,
+ int w, int h,
+ int pass_stride,
+ ccl_global float ccl_restrict_ptr buffer,
+ ccl_global float *color_pass,
+ ccl_global float *variance_pass,
+ ccl_global float ccl_restrict_ptr transform,
+ ccl_global int *rank,
+ float weight,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int localIdx)
+{
+ int p_offset = y *w + x;
+ int q_offset = (y+dy)*w + (x+dx);
+
+#ifdef __KERNEL_CPU__
+ const int stride = 1;
+ (void)storage_stride;
+ (void)localIdx;
+ float design_row[DENOISE_FEATURES+1];
+#elif defined(__KERNEL_CUDA__)
+ const int stride = storage_stride;
+ ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
+ ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
+#else
+ const int stride = storage_stride;
+ float design_row[DENOISE_FEATURES+1];
+#endif
+
+ float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride);
+ float3 q_color = filter_get_pixel_color(color_pass + q_offset, pass_stride);
+
+ float p_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + p_offset, pass_stride));
+ float q_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + q_offset, pass_stride));
+
+ if(average(fabs(p_color - q_color)) > 3.0f*(p_std_dev + q_std_dev + 1e-3f)) {
+ return;
+ }
+
+ filter_get_design_row_transform(make_int2(x, y), buffer + p_offset,
+ make_int2(x+dx, y+dy), buffer + q_offset,
+ pass_stride, *rank, design_row, transform, stride);
+
+ math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
+ math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
+}
+
+ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
+ ccl_global float *buffer,
+ ccl_global int *rank,
+ int storage_stride,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 buffer_params,
+ int sample)
+{
+#ifdef __KERNEL_CPU__
+ const int stride = 1;
+ (void)storage_stride;
+#else
+ const int stride = storage_stride;
+#endif
+
+ math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride);
+
+ float3 final_color = XtWY[0];
+
+ ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
+ final_color *= sample;
+ if(buffer_params.w) {
+ final_color.x += combined_buffer[buffer_params.w+0];
+ final_color.y += combined_buffer[buffer_params.w+1];
+ final_color.z += combined_buffer[buffer_params.w+2];
+ }
+ combined_buffer[0] = final_color.x;
+ combined_buffer[1] = final_color.y;
+ combined_buffer[2] = final_color.z;
+}
+
+#undef STORAGE_TYPE
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h
new file mode 100644
index 00000000000..139dc402d21
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
+ int x, int y, int4 rect,
+ int pass_stride,
+ float *transform, int *rank,
+ int radius, float pca_threshold)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+
+ float features[DENOISE_FEATURES];
+
+ /* Temporary storage, used in different steps of the algorithm. */
+ float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES];
+ float tempvector[2*DENOISE_FEATURES];
+ float ccl_restrict_ptr pixel_buffer;
+ int2 pixel;
+
+
+
+
+ /* === Calculate denoising window. === */
+ int2 low = make_int2(max(rect.x, x - radius),
+ max(rect.y, y - radius));
+ int2 high = make_int2(min(rect.z, x + radius + 1),
+ min(rect.w, y + radius + 1));
+
+
+
+
+ /* === Shift feature passes to have mean 0. === */
+ float feature_means[DENOISE_FEATURES];
+ math_vector_zero(feature_means, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW {
+ filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
+ math_vector_add(feature_means, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW
+
+ float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x));
+ math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES);
+
+ /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+ float *feature_scale = tempvector;
+ math_vector_zero(feature_scale, DENOISE_FEATURES);
+
+ FOR_PIXEL_WINDOW {
+ filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_max(feature_scale, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW
+
+ filter_calculate_scale(feature_scale);
+
+
+ /* === Generate the feature transformation. ===
+ * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
+ * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+ float* feature_matrix = tempmatrix;
+ math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW {
+ filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_mul(features, feature_scale, DENOISE_FEATURES);
+ math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+ } END_FOR_PIXEL_WINDOW
+
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+ *rank = 0;
+ if(pca_threshold < 0.0f) {
+ float threshold_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ }
+ threshold_energy *= 1.0f - (-pca_threshold);
+
+ float reduced_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+ if(i >= 2 && reduced_energy >= threshold_energy)
+ break;
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ reduced_energy += s;
+ }
+ }
+ else {
+ for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ if(i >= 2 && sqrtf(s) < pca_threshold)
+ break;
+ }
+ }
+
+ /* Bake the feature scaling into the transformation matrix. */
+ for(int i = 0; i < (*rank); i++) {
+ math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES);
+ }
+ math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
new file mode 100644
index 00000000000..68304e14143
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform_gpu.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
+ int x, int y, int4 rect,
+ int pass_stride,
+ ccl_global float *transform,
+ ccl_global int *rank,
+ int radius, float pca_threshold,
+ int transform_stride, int localIdx)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+
+#ifdef __KERNEL_CUDA__
+ ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE];
+ ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES;
+#else
+ float features[DENOISE_FEATURES];
+#endif
+
+ /* === Calculate denoising window. === */
+ int2 low = make_int2(max(rect.x, x - radius),
+ max(rect.y, y - radius));
+ int2 high = make_int2(min(rect.z, x + radius + 1),
+ min(rect.w, y + radius + 1));
+ ccl_global float ccl_restrict_ptr pixel_buffer;
+ int2 pixel;
+
+
+
+
+ /* === Shift feature passes to have mean 0. === */
+ float feature_means[DENOISE_FEATURES];
+ math_vector_zero(feature_means, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW {
+ filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
+ math_vector_add(feature_means, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW
+
+ float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x));
+ math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES);
+
+ /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+ float feature_scale[DENOISE_FEATURES];
+ math_vector_zero(feature_scale, DENOISE_FEATURES);
+
+ FOR_PIXEL_WINDOW {
+ filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_max(feature_scale, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW
+
+ filter_calculate_scale(feature_scale);
+
+
+
+ /* === Generate the feature transformation. ===
+ * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
+ * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+ float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
+ math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW {
+ filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_mul(features, feature_scale, DENOISE_FEATURES);
+ math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+ } END_FOR_PIXEL_WINDOW
+
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
+ *rank = 0;
+ if(pca_threshold < 0.0f) {
+ float threshold_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ }
+ threshold_energy *= 1.0f - (-pca_threshold);
+
+ float reduced_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+ if(i >= 2 && reduced_energy >= threshold_energy)
+ break;
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ reduced_energy += s;
+ }
+ }
+ else {
+ for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ if(i >= 2 && sqrtf(s) < pca_threshold)
+ break;
+ }
+ }
+
+ math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride);
+
+ /* Bake the feature scaling into the transformation matrix. */
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ for(int j = 0; j < (*rank); j++) {
+ transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i];
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
new file mode 100644
index 00000000000..ed3a92f6241
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
+ int x, int y, int4 rect,
+ int pass_stride,
+ float *transform, int *rank,
+ int radius, float pca_threshold)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+
+ __m128 features[DENOISE_FEATURES];
+ float ccl_restrict_ptr pixel_buffer;
+ int2 pixel;
+
+ int2 low = make_int2(max(rect.x, x - radius),
+ max(rect.y, y - radius));
+ int2 high = make_int2(min(rect.z, x + radius + 1),
+ min(rect.w, y + radius + 1));
+
+ __m128 feature_means[DENOISE_FEATURES];
+ math_vector_zero_sse(feature_means, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW_SSE {
+ filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
+ math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
+ } END_FOR_PIXEL_WINDOW_SSE
+
+ __m128 pixel_scale = _mm_set1_ps(1.0f / ((high.y - low.y) * (high.x - low.x)));
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale);
+ }
+
+ __m128 feature_scale[DENOISE_FEATURES];
+ math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW_SSE {
+ filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_max_sse(feature_scale, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW_SSE
+
+ filter_calculate_scale_sse(feature_scale);
+
+ __m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
+ math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW_SSE {
+ filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
+ math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
+ } END_FOR_PIXEL_WINDOW_SSE
+
+ float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
+ math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
+
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+
+ *rank = 0;
+ if(pca_threshold < 0.0f) {
+ float threshold_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ }
+ threshold_energy *= 1.0f - (-pca_threshold);
+
+ float reduced_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+ if(i >= 2 && reduced_energy >= threshold_energy)
+ break;
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ reduced_energy += s;
+ }
+ }
+ else {
+ for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ if(i >= 2 && sqrtf(s) < pca_threshold)
+ break;
+ }
+ }
+
+ math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+
+ /* Bake the feature scaling into the transformation matrix. */
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank);
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index 47778553b94..105aee8da15 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -76,7 +76,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3
/* Interpolate smooth vertex normal from vertices */
-ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v)
+ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -84,7 +84,9 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo
float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
- return normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
+ float3 N = safe_normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
+
+ return is_zero(N)? Ng: N;
}
/* Ray differentials on triangle */
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 06c0fb2fbca..84a988f1dbc 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -50,30 +50,20 @@ void kernel_tex_copy(KernelGlobals *kg,
#define KERNEL_ARCH cpu
#include "kernel/kernels/cpu/kernel_cpu.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# define KERNEL_ARCH cpu_sse2
-# include "kernel/kernels/cpu/kernel_cpu.h"
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# define KERNEL_ARCH cpu_sse3
-# include "kernel/kernels/cpu/kernel_cpu.h"
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# define KERNEL_ARCH cpu_sse41
-# include "kernel/kernels/cpu/kernel_cpu.h"
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# define KERNEL_ARCH cpu_avx
-# include "kernel/kernels/cpu/kernel_cpu.h"
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# define KERNEL_ARCH cpu_avx2
-# include "kernel/kernels/cpu/kernel_cpu.h"
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu.h"
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 823d30dde78..06728415c15 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -222,6 +222,12 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f);
L->shadow_color = make_float3(0.0f, 0.0f, 0.0f);
#endif
+
+#ifdef __DENOISING_FEATURES__
+ L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f);
+ L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f);
+ L->denoising_depth = 0.0f;
+#endif /* __DENOISING_FEATURES__ */
}
ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
@@ -277,15 +283,15 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro
}
ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
+ ccl_addr_space PathState *state,
float3 throughput,
float3 alpha,
float3 bsdf,
- float3 ao,
- int bounce)
+ float3 ao)
{
#ifdef __PASSES__
if(L->use_light_pass) {
- if(bounce == 0) {
+ if(state->bounce == 0) {
/* directly visible lighting */
L->direct_diffuse += throughput*bsdf*ao;
L->ao += alpha*throughput*ao;
@@ -302,31 +308,43 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
}
#ifdef __SHADOW_TRICKS__
- float3 light = throughput * bsdf;
- L->path_total += light;
- L->path_total_shaded += ao * light;
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ float3 light = throughput * bsdf;
+ L->path_total += light;
+ L->path_total_shaded += ao * light;
+ }
#endif
}
ccl_device_inline void path_radiance_accum_total_ao(
PathRadiance *L,
+ ccl_addr_space PathState *state,
float3 throughput,
float3 bsdf)
{
#ifdef __SHADOW_TRICKS__
- L->path_total += throughput * bsdf;
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ L->path_total += throughput * bsdf;
+ }
#else
(void) L;
+ (void) state;
(void) throughput;
(void) bsdf;
#endif
}
-ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp)
+ccl_device_inline void path_radiance_accum_light(PathRadiance *L,
+ ccl_addr_space PathState *state,
+ float3 throughput,
+ BsdfEval *bsdf_eval,
+ float3 shadow,
+ float shadow_fac,
+ bool is_lamp)
{
#ifdef __PASSES__
if(L->use_light_pass) {
- if(bounce == 0) {
+ if(state->bounce == 0) {
/* directly visible lighting */
L->direct_diffuse += throughput*bsdf_eval->diffuse*shadow;
L->direct_glossy += throughput*bsdf_eval->glossy*shadow;
@@ -352,21 +370,27 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
}
#ifdef __SHADOW_TRICKS__
- float3 light = throughput * bsdf_eval->sum_no_mis;
- L->path_total += light;
- L->path_total_shaded += shadow * light;
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ float3 light = throughput * bsdf_eval->sum_no_mis;
+ L->path_total += light;
+ L->path_total_shaded += shadow * light;
+ }
#endif
}
ccl_device_inline void path_radiance_accum_total_light(
PathRadiance *L,
+ ccl_addr_space PathState *state,
float3 throughput,
const BsdfEval *bsdf_eval)
{
#ifdef __SHADOW_TRICKS__
- L->path_total += throughput * bsdf_eval->sum_no_mis;
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ L->path_total += throughput * bsdf_eval->sum_no_mis;
+ }
#else
(void) L;
+ (void) state;
(void) throughput;
(void) bsdf_eval;
#endif
@@ -393,11 +417,17 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L,
}
#ifdef __SHADOW_TRICKS__
- L->path_total += throughput * value;
- if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
- L->path_total_shaded += throughput * value;
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ L->path_total += throughput * value;
+ if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
+ L->path_total_shaded += throughput * value;
+ }
}
#endif
+
+#ifdef __DENOISING_FEATURES__
+ L->denoising_albedo += state->denoising_feature_weight * value;
+#endif /* __DENOISING_FEATURES__ */
}
ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
@@ -555,6 +585,38 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
return L_sum;
}
+ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadiance *L, float3 *noisy, float3 *clean)
+{
+#ifdef __PASSES__
+ kernel_assert(L->use_light_pass);
+
+ *clean = L->emission + L->background;
+ *noisy = L->direct_scatter + L->indirect_scatter;
+
+# define ADD_COMPONENT(flag, component) \
+ if(kernel_data.film.denoising_flags & flag) \
+ *clean += component; \
+ else \
+ *noisy += component;
+
+ ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse);
+ ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse);
+ ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy);
+ ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy);
+ ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission);
+ ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission);
+ ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR, L->direct_subsurface);
+ ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND, L->indirect_subsurface);
+# undef ADD_COMPONENT
+#else
+ *noisy = L->emission;
+ *clean = make_float3(0.0f, 0.0f, 0.0f);
+#endif
+
+ *noisy = ensure_finite3(*noisy);
+ *clean = ensure_finite3(*clean);
+}
+
ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples)
{
float fac = 1.0f/num_samples;
@@ -595,12 +657,12 @@ ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L)
/* Calculate final light sum and transparency for shadow catcher object. */
ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg,
const PathRadiance *L,
- ccl_addr_space float* L_transparent)
+ float* alpha)
{
const float shadow = path_radiance_sum_shadow(L);
float3 L_sum;
if(kernel_data.background.transparent) {
- *L_transparent = shadow;
+ *alpha = 1.0f-shadow;
L_sum = make_float3(0.0f, 0.0f, 0.0f);
}
else {
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index 21da180bb8e..7595e74e2d5 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -42,6 +42,8 @@
#include "util/util_types.h"
#include "util/util_texture.h"
+#define ccl_restrict_ptr const * __restrict
+
#define ccl_addr_space
#define ccl_local_id(d) 0
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index c375d17a95f..80d7401fbcf 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -55,6 +55,10 @@
#define ccl_restrict __restrict__
#define ccl_align(n) __align__(n)
+#define ccl_restrict_ptr const * __restrict__
+#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH)
+
+
/* No assert supported for CUDA */
#define kernel_assert(cond)
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index c2263ac0d49..15cf4b81b21 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -50,6 +50,8 @@
# define ccl_addr_space
#endif
+#define ccl_restrict_ptr const * __restrict__
+
#define ccl_local_id(d) get_local_id(d)
#define ccl_global_id(d) get_global_id(d)
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index a2909cec1a1..9baa9d54957 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -102,7 +102,7 @@ ccl_device_inline float area_light_sample(float3 P,
float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
cu = clamp(cu, -1.0f, 1.0f);
/* Compute xu. */
- float xu = -(cu * z0) / sqrtf(1.0f - cu * cu);
+ float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f);
xu = clamp(xu, x0, x1);
/* Compute yv. */
float z0sq = z0 * z0;
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index ed523696571..8ab4c724829 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -60,6 +60,135 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa
#endif /* __SPLIT_KERNEL__ */
}
+#ifdef __DENOISING_FEATURES__
+ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, int sample, float value)
+{
+ kernel_write_pass_float(buffer, sample, value);
+
+ /* The online one-pass variance update that's used for the megakernel can't easily be implemented
+ * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */
+# ifdef __SPLIT_KERNEL__
+ kernel_write_pass_float(buffer+1, sample, value*value);
+# else
+ if(sample == 0) {
+ kernel_write_pass_float(buffer+1, sample, 0.0f);
+ }
+ else {
+ float new_mean = buffer[0] * (1.0f / (sample + 1));
+ float old_mean = (buffer[0] - value) * (1.0f / sample);
+ kernel_write_pass_float(buffer+1, sample, (value - new_mean) * (value - old_mean));
+ }
+# endif
+}
+
+# if defined(__SPLIT_KERNEL__)
+# define kernel_write_pass_float3_unaligned kernel_write_pass_float3
+# else
+ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, int sample, float3 value)
+{
+ buffer[0] = (sample == 0)? value.x: buffer[0] + value.x;
+ buffer[1] = (sample == 0)? value.y: buffer[1] + value.y;
+ buffer[2] = (sample == 0)? value.z: buffer[2] + value.z;
+}
+# endif
+
+ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, int sample, float3 value)
+{
+ kernel_write_pass_float3_unaligned(buffer, sample, value);
+# ifdef __SPLIT_KERNEL__
+ kernel_write_pass_float3_unaligned(buffer+3, sample, value*value);
+# else
+ if(sample == 0) {
+ kernel_write_pass_float3_unaligned(buffer+3, sample, make_float3(0.0f, 0.0f, 0.0f));
+ }
+ else {
+ float3 sum = make_float3(buffer[0], buffer[1], buffer[2]);
+ float3 new_mean = sum * (1.0f / (sample + 1));
+ float3 old_mean = (sum - value) * (1.0f / sample);
+ kernel_write_pass_float3_unaligned(buffer+3, sample, (value - new_mean) * (value - old_mean));
+ }
+# endif
+}
+
+ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer,
+ int sample, float path_total, float path_total_shaded)
+{
+ if(kernel_data.film.pass_denoising_data == 0)
+ return;
+
+ buffer += (sample & 1)? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A;
+
+ path_total = ensure_finite(path_total);
+ path_total_shaded = ensure_finite(path_total_shaded);
+
+ kernel_write_pass_float(buffer, sample/2, path_total);
+ kernel_write_pass_float(buffer+1, sample/2, path_total_shaded);
+
+ float value = path_total_shaded / max(path_total, 1e-7f);
+# ifdef __SPLIT_KERNEL__
+ kernel_write_pass_float(buffer+2, sample/2, value*value);
+# else
+ if(sample < 2) {
+ kernel_write_pass_float(buffer+2, sample/2, 0.0f);
+ }
+ else {
+ float old_value = (buffer[1] - path_total_shaded) / max(buffer[0] - path_total, 1e-7f);
+ float new_value = buffer[1] / max(buffer[0], 1e-7f);
+ kernel_write_pass_float(buffer+2, sample, (value - new_value) * (value - old_value));
+ }
+# endif
+}
+#endif /* __DENOISING_FEATURES__ */
+
+ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
+ ShaderData *sd,
+ ccl_global PathState *state,
+ PathRadiance *L)
+{
+#ifdef __DENOISING_FEATURES__
+ if(state->denoising_feature_weight == 0.0f) {
+ return;
+ }
+
+ L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length);
+
+ float3 normal = make_float3(0.0f, 0.0f, 0.0f);
+ float3 albedo = make_float3(0.0f, 0.0f, 0.0f);
+ float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f;
+
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
+
+ if(!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
+ continue;
+
+ /* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */
+ normal += sc->N * sc->sample_weight;
+ sum_weight += sc->sample_weight;
+ if(!bsdf_is_specular_like(sc)) {
+ albedo += sc->weight;
+ sum_nonspecular_weight += sc->sample_weight;
+ }
+ }
+
+ /* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. */
+ if((sum_weight == 0.0f) || (sum_nonspecular_weight*4.0f > sum_weight)) {
+ if(sum_weight != 0.0f) {
+ normal /= sum_weight;
+ }
+ L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal);
+ L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo);
+
+ state->denoising_feature_weight = 0.0f;
+ }
+#else
+ (void) kg;
+ (void) sd;
+ (void) state;
+ (void) L;
+#endif /* __DENOISING_FEATURES__ */
+}
+
ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput)
{
@@ -199,5 +328,79 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f
#endif
}
+ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *buffer,
+ int sample, PathRadiance *L, float alpha, bool is_shadow_catcher)
+{
+ if(L) {
+ float3 L_sum;
+#ifdef __SHADOW_TRICKS__
+ if(is_shadow_catcher) {
+ L_sum = path_radiance_sum_shadowcatcher(kg, L, &alpha);
+ }
+ else
+#endif /* __SHADOW_TRICKS__ */
+ {
+ L_sum = path_radiance_clamp_and_sum(kg, L);
+ }
+
+ kernel_write_pass_float4(buffer, sample, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha));
+
+ kernel_write_light_passes(kg, buffer, L, sample);
+
+#ifdef __DENOISING_FEATURES__
+ if(kernel_data.film.pass_denoising_data) {
+# ifdef __SHADOW_TRICKS__
+ kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, average(L->path_total), average(L->path_total_shaded));
+# else
+ kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
+# endif
+ if(kernel_data.film.pass_denoising_clean) {
+ float3 noisy, clean;
+ path_radiance_split_denoising(kg, L, &noisy, &clean);
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
+ sample, noisy);
+ kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
+ sample, clean);
+ }
+ else {
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
+ sample, L_sum);
+ }
+
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL,
+ sample, L->denoising_normal);
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO,
+ sample, L->denoising_albedo);
+ kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH,
+ sample, L->denoising_depth);
+ }
+#endif /* __DENOISING_FEATURES__ */
+ }
+ else {
+ kernel_write_pass_float4(buffer, sample, make_float4(0.0f, 0.0f, 0.0f, 0.0f));
+
+#ifdef __DENOISING_FEATURES__
+ if(kernel_data.film.pass_denoising_data) {
+ kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
+
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
+ sample, make_float3(0.0f, 0.0f, 0.0f));
+
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL,
+ sample, make_float3(0.0f, 0.0f, 0.0f));
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO,
+ sample, make_float3(0.0f, 0.0f, 0.0f));
+ kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH,
+ sample, 0.0f);
+
+ if(kernel_data.film.pass_denoising_clean) {
+ kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
+ sample, make_float3(0.0f, 0.0f, 0.0f));
+ }
+ }
+#endif /* __DENOISING_FEATURES__ */
+ }
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 58da141aed3..0d31ae32aa6 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -90,10 +90,10 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
- path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+ path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow);
}
else {
- path_radiance_accum_total_ao(L, throughput, ao_bsdf);
+ path_radiance_accum_total_ao(L, state, throughput, ao_bsdf);
}
}
}
@@ -366,6 +366,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
throughput /= probability;
}
+ kernel_update_denoising_features(kg, sd, state, L);
+
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
@@ -427,18 +429,19 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
}
-ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
- RNG *rng,
- int sample,
- Ray ray,
- ccl_global float *buffer)
+ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
+ RNG *rng,
+ int sample,
+ Ray ray,
+ ccl_global float *buffer,
+ PathRadiance *L,
+ bool *is_shadow_catcher)
{
/* initialize */
- PathRadiance L;
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
float L_transparent = 0.0f;
- path_radiance_init(&L, kernel_data.film.use_light_pass);
+ path_radiance_init(L, kernel_data.film.use_light_pass);
/* shader data memory used for both volumes and surfaces, saves stack space */
ShaderData sd;
@@ -517,7 +520,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
float3 emission;
if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
- path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __LAMP_MIS__ */
@@ -549,7 +552,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
/* emission */
if(volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
/* scattering */
VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
@@ -559,7 +562,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
/* direct light sampling */
kernel_branched_path_volume_connect_light(kg, rng, &sd,
- &emission_sd, throughput, &state, &L, all,
+ &emission_sd, throughput, &state, L, all,
&volume_ray, &volume_segment);
/* indirect sample. if we use distance sampling and take just
@@ -577,7 +580,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
kernel_volume_decoupled_free(kg, &volume_segment);
if(result == VOLUME_PATH_SCATTERED) {
- if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
+ if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
continue;
else
break;
@@ -591,15 +594,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
{
/* integrate along volume segment with distance sampling */
VolumeIntegrateResult result = kernel_volume_integrate(
- kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous);
+ kg, &state, &sd, &volume_ray, L, &throughput, rng, heterogeneous);
# ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* direct lighting */
- kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
+ kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
/* indirect light bounce */
- if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
+ if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
continue;
else
break;
@@ -623,7 +626,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#ifdef __BACKGROUND__
/* sample background shader */
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
- path_radiance_accum_background(&L, &state, throughput, L_background);
+ path_radiance_accum_background(L, &state, throughput, L_background);
#endif /* __BACKGROUND__ */
break;
@@ -640,10 +643,10 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#ifdef __SHADOW_TRICKS__
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
if(state.flag & PATH_RAY_CAMERA) {
- state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
+ state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
state.catcher_object = sd.object;
if(!kernel_data.background.transparent) {
- L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
+ L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
}
}
}
@@ -677,7 +680,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#endif /* __HOLDOUT__ */
/* holdout mask objects do not write data passes */
- kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
+ kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
/* blurring of bsdf after bounces, for rays that have a small likelihood
* of following this particular path (diffuse, rough glossy) */
@@ -695,7 +698,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
if(sd.flag & SD_EMISSION) {
/* todo: is isect.t wrong here for transparent surfaces? */
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
- path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __EMISSION__ */
@@ -715,10 +718,12 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
throughput /= probability;
}
+ kernel_update_denoising_features(kg, &sd, &state, L);
+
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
- kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
+ kernel_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
}
#endif /* __AO__ */
@@ -729,7 +734,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
if(kernel_path_subsurface_scatter(kg,
&sd,
&emission_sd,
- &L,
+ L,
&state,
rng,
&ray,
@@ -742,15 +747,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#endif /* __SUBSURFACE__ */
/* direct lighting */
- kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
+ kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
/* compute direct lighting and next bounce */
- if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
+ if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
break;
}
#ifdef __SUBSURFACE__
- kernel_path_subsurface_accum_indirect(&ss_indirect, &L);
+ kernel_path_subsurface_accum_indirect(&ss_indirect, L);
/* Trace indirect subsurface rays by restarting the loop. this uses less
* stack memory than invoking kernel_path_indirect.
@@ -760,7 +765,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
&ss_indirect,
&state,
&ray,
- &L,
+ L,
&throughput);
}
else {
@@ -769,24 +774,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
}
#endif /* __SUBSURFACE__ */
- float3 L_sum;
#ifdef __SHADOW_TRICKS__
- if(state.flag & PATH_RAY_SHADOW_CATCHER) {
- L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
- }
- else
+ *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
#endif /* __SHADOW_TRICKS__ */
- {
- L_sum = path_radiance_clamp_and_sum(kg, &L);
- }
-
- kernel_write_light_passes(kg, buffer, &L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
#endif /* __KERNEL_DEBUG__ */
- return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
+ return 1.0f - L_transparent;
}
ccl_device void kernel_path_trace(KernelGlobals *kg,
@@ -807,15 +803,16 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
/* integrate */
- float4 L;
-
- if(ray.t != 0.0f)
- L = kernel_path_integrate(kg, &rng, sample, ray, buffer);
- else
- L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ PathRadiance L;
+ bool is_shadow_catcher;
- /* accumulate result in output buffer */
- kernel_write_pass_float4(buffer, sample, L);
+ if(ray.t != 0.0f) {
+ float alpha = kernel_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
+ kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
+ }
+ else {
+ kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
+ }
path_rng_end(kg, rng_state, rng);
}
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index ddcb57161ea..10816d3e5d1 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -56,10 +56,10 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
- path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+ path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
}
else {
- path_radiance_accum_total_ao(L, throughput*num_samples_inv, ao_bsdf);
+ path_radiance_accum_total_ao(L, state, throughput*num_samples_inv, ao_bsdf);
}
}
}
@@ -72,14 +72,32 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
{
+ float sum_sample_weight = 0.0f;
+#ifdef __DENOISING_FEATURES__
+ if(state->denoising_feature_weight > 0.0f) {
+ for(int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ /* transparency is not handled here, but in outer loop */
+ if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+ continue;
+ }
+
+ sum_sample_weight += sc->sample_weight;
+ }
+ }
+ else {
+ sum_sample_weight = 1.0f;
+ }
+#endif /* __DENOISING_FEATURES__ */
+
for(int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
- if(!CLOSURE_IS_BSDF(sc->type))
- continue;
/* transparency is not handled here, but in outer loop */
- if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
+ if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
continue;
+ }
int num_samples;
@@ -111,7 +129,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
&tp,
&ps,
L,
- &bsdf_ray))
+ &bsdf_ray,
+ sum_sample_weight))
{
continue;
}
@@ -243,14 +262,19 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
}
#endif /* __SUBSURFACE__ */
-ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
+ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
+ RNG *rng,
+ int sample,
+ Ray ray,
+ ccl_global float *buffer,
+ PathRadiance *L,
+ bool *is_shadow_catcher)
{
/* initialize */
- PathRadiance L;
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
float L_transparent = 0.0f;
- path_radiance_init(&L, kernel_data.film.use_light_pass);
+ path_radiance_init(L, kernel_data.film.use_light_pass);
/* shader data memory used for both volumes and surfaces, saves stack space */
ShaderData sd;
@@ -330,7 +354,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
int all = kernel_data.integrator.sample_all_lights_direct;
kernel_branched_path_volume_connect_light(kg, rng, &sd,
- &emission_sd, throughput, &state, &L, all,
+ &emission_sd, throughput, &state, L, all,
&volume_ray, &volume_segment);
/* indirect light sampling */
@@ -362,7 +386,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
&sd,
&tp,
&ps,
- &L,
+ L,
&pray))
{
kernel_path_indirect(kg,
@@ -373,19 +397,19 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
tp*num_samples_inv,
num_samples,
&ps,
- &L);
+ L);
/* for render passes, sum and reset indirect light pass variables
* for the next samples */
- path_radiance_sum_indirect(&L);
- path_radiance_reset_indirect(&L);
+ path_radiance_sum_indirect(L);
+ path_radiance_reset_indirect(L);
}
}
}
/* emission and transmittance */
if(volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
throughput *= volume_segment.accum_transmittance;
/* free cached steps */
@@ -407,20 +431,20 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
path_state_branch(&ps, j, num_samples);
VolumeIntegrateResult result = kernel_volume_integrate(
- kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous);
+ kg, &ps, &sd, &volume_ray, L, &tp, rng, heterogeneous);
#ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* todo: support equiangular, MIS and all light sampling.
* alternatively get decoupled ray marching working on the GPU */
- kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L);
+ kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, L);
if(kernel_path_volume_bounce(kg,
rng,
&sd,
&tp,
&ps,
- &L,
+ L,
&pray))
{
kernel_path_indirect(kg,
@@ -431,12 +455,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
tp,
num_samples,
&ps,
- &L);
+ L);
/* for render passes, sum and reset indirect light pass variables
* for the next samples */
- path_radiance_sum_indirect(&L);
- path_radiance_reset_indirect(&L);
+ path_radiance_sum_indirect(L);
+ path_radiance_reset_indirect(L);
}
}
#endif /* __VOLUME_SCATTER__ */
@@ -462,7 +486,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#ifdef __BACKGROUND__
/* sample background shader */
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
- path_radiance_accum_background(&L, &state, throughput, L_background);
+ path_radiance_accum_background(L, &state, throughput, L_background);
#endif /* __BACKGROUND__ */
break;
@@ -476,10 +500,10 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#ifdef __SHADOW_TRICKS__
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
if(state.flag & PATH_RAY_CAMERA) {
- state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
+ state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
state.catcher_object = sd.object;
if(!kernel_data.background.transparent) {
- L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
+ L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
}
}
}
@@ -509,13 +533,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#endif /* __HOLDOUT__ */
/* holdout mask objects do not write data passes */
- kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
+ kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
#ifdef __EMISSION__
/* emission */
if(sd.flag & SD_EMISSION) {
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
- path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __EMISSION__ */
@@ -539,10 +563,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
}
}
+ kernel_update_denoising_features(kg, &sd, &state, L);
+
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
- kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput);
+ kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput);
}
#endif /* __AO__ */
@@ -550,7 +576,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
/* bssrdf scatter to a different location on the same object */
if(sd.flag & SD_BSSRDF) {
kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
- &L, &state, rng, &ray, throughput);
+ L, &state, rng, &ray, throughput);
}
#endif /* __SUBSURFACE__ */
@@ -563,13 +589,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
int all = (kernel_data.integrator.sample_all_lights_direct) ||
(state.flag & PATH_RAY_SHADOW_CATCHER);
kernel_branched_path_surface_connect_light(kg, rng,
- &sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all);
+ &sd, &emission_sd, &hit_state, throughput, 1.0f, L, all);
}
#endif /* __EMISSION__ */
/* indirect light */
kernel_branched_path_surface_indirect_light(kg, rng,
- &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L);
+ &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L);
/* continue in case of transparency */
throughput *= shader_bsdf_transparency(kg, &sd);
@@ -598,24 +624,15 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#endif /* __VOLUME__ */
}
- float3 L_sum;
#ifdef __SHADOW_TRICKS__
- if(state.flag & PATH_RAY_SHADOW_CATCHER) {
- L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
- }
- else
+ *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
#endif /* __SHADOW_TRICKS__ */
- {
- L_sum = path_radiance_clamp_and_sum(kg, &L);
- }
-
- kernel_write_light_passes(kg, buffer, &L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
#endif /* __KERNEL_DEBUG__ */
- return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
+ return 1.0f - L_transparent;
}
ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
@@ -636,15 +653,16 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
/* integrate */
- float4 L;
-
- if(ray.t != 0.0f)
- L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer);
- else
- L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ PathRadiance L;
+ bool is_shadow_catcher;
- /* accumulate result in output buffer */
- kernel_write_pass_float4(buffer, sample, L);
+ if(ray.t != 0.0f) {
+ float alpha = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
+ kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
+ }
+ else {
+ kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
+ }
path_rng_end(kg, rng_state, rng);
}
@@ -654,4 +672,3 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
#endif /* __BRANCHED_PATH__ */
CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index c0cd2a63120..0fa77d9e8bd 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -35,6 +35,16 @@ ccl_device_inline void path_state_init(KernelGlobals *kg,
state->transmission_bounce = 0;
state->transparent_bounce = 0;
+#ifdef __DENOISING_FEATURES__
+ if(kernel_data.film.pass_denoising_data) {
+ state->flag |= PATH_RAY_STORE_SHADOW_INFO;
+ state->denoising_feature_weight = 1.0f;
+ }
+ else {
+ state->denoising_feature_weight = 0.0f;
+ }
+#endif /* __DENOISING_FEATURES__ */
+
state->min_ray_pdf = FLT_MAX;
state->ray_pdf = 0.0f;
#ifdef __LAMP_MIS__
@@ -128,6 +138,10 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
/* random number generator next bounce */
state->rng_offset += PRNG_BOUNCE_NUM;
+
+ if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) {
+ state->flag &= ~PATH_RAY_STORE_SHADOW_INFO;
+ }
}
ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state)
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index bd4ba775b4d..e676ea0f3ae 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -70,10 +70,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
else {
- path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
+ path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
}
}
}
@@ -107,10 +107,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
else {
- path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
+ path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
}
}
}
@@ -133,10 +133,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp);
}
else {
- path_radiance_accum_total_light(L, throughput*num_samples_adjust, &L_light);
+ path_radiance_accum_total_light(L, state, throughput*num_samples_adjust, &L_light);
}
}
}
@@ -155,7 +155,8 @@ ccl_device bool kernel_branched_path_surface_bounce(
ccl_addr_space float3 *throughput,
ccl_addr_space PathState *state,
PathRadiance *L,
- ccl_addr_space Ray *ray)
+ ccl_addr_space Ray *ray,
+ float sum_sample_weight)
{
/* sample BSDF */
float bsdf_pdf;
@@ -175,6 +176,10 @@ ccl_device bool kernel_branched_path_surface_bounce(
/* modify throughput */
path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
+#ifdef __DENOISING_FEATURES__
+ state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples);
+#endif
+
/* modify path state */
path_state_next(kg, state, label);
@@ -257,10 +262,10 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
else {
- path_radiance_accum_total_light(L, throughput, &L_light);
+ path_radiance_accum_total_light(L, state, throughput, &L_light);
}
}
}
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index 371f2c1c7cb..dcedf51e479 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -55,7 +55,7 @@ ccl_device_inline void kernel_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
}
}
@@ -184,7 +184,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
}
}
@@ -233,7 +233,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
}
}
@@ -271,7 +271,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp);
}
}
}
diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h
index 9a2b0884a7e..cbb2442d1dc 100644
--- a/intern/cycles/kernel/kernel_projection.h
+++ b/intern/cycles/kernel/kernel_projection.h
@@ -57,6 +57,9 @@ ccl_device float3 spherical_to_direction(float theta, float phi)
ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range)
{
+ if(is_zero(dir))
+ return make_float2(0.0f, 0.0f);
+
float u = (atan2f(dir.y, dir.x) - range.y) / range.x;
float v = (acosf(dir.z / len(dir)) - range.w) / range.z;
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 8c0c5e90a3e..c66f52255f0 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -99,7 +99,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
/* smooth normal */
if(sd->shader & SHADER_SMOOTH_NORMAL)
- sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+ sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
#ifdef __DPDU__
/* dPdu/dPdv */
@@ -186,7 +186,7 @@ void shader_setup_from_subsurface(
sd->N = Ng;
if(sd->shader & SHADER_SMOOTH_NORMAL)
- sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+ sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
# ifdef __DPDU__
/* dPdu/dPdv */
@@ -300,7 +300,7 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
if(sd->type & PRIMITIVE_TRIANGLE) {
/* smooth normal */
if(sd->shader & SHADER_SMOOTH_NORMAL) {
- sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+ sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
#ifdef __INSTANCING__
if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 9b354457b91..dd1fa1b82f7 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -173,6 +173,8 @@ CCL_NAMESPACE_BEGIN
#define __PATCH_EVAL__
#define __SHADOW_TRICKS__
+#define __DENOISING_FEATURES__
+
#ifdef __KERNEL_SHADING__
# define __SVM__
# define __EMISSION__
@@ -314,31 +316,32 @@ enum SamplingPattern {
/* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */
enum PathRayFlag {
- PATH_RAY_CAMERA = 1,
- PATH_RAY_REFLECT = 2,
- PATH_RAY_TRANSMIT = 4,
- PATH_RAY_DIFFUSE = 8,
- PATH_RAY_GLOSSY = 16,
- PATH_RAY_SINGULAR = 32,
- PATH_RAY_TRANSPARENT = 64,
-
- PATH_RAY_SHADOW_OPAQUE = 128,
- PATH_RAY_SHADOW_TRANSPARENT = 256,
+ PATH_RAY_CAMERA = (1 << 0),
+ PATH_RAY_REFLECT = (1 << 1),
+ PATH_RAY_TRANSMIT = (1 << 2),
+ PATH_RAY_DIFFUSE = (1 << 3),
+ PATH_RAY_GLOSSY = (1 << 4),
+ PATH_RAY_SINGULAR = (1 << 5),
+ PATH_RAY_TRANSPARENT = (1 << 6),
+
+ PATH_RAY_SHADOW_OPAQUE = (1 << 7),
+ PATH_RAY_SHADOW_TRANSPARENT = (1 << 8),
PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
- PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */
- PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */
+ PATH_RAY_CURVE = (1 << 9), /* visibility flag to define curve segments */
+ PATH_RAY_VOLUME_SCATTER = (1 << 10), /* volume scattering */
/* Special flag to tag unaligned BVH nodes. */
- PATH_RAY_NODE_UNALIGNED = 2048,
+ PATH_RAY_NODE_UNALIGNED = (1 << 11),
- PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048),
+ PATH_RAY_ALL_VISIBILITY = ((1 << 12)-1),
- PATH_RAY_MIS_SKIP = 4096,
- PATH_RAY_DIFFUSE_ANCESTOR = 8192,
- PATH_RAY_SINGLE_PASS_DONE = 16384,
- PATH_RAY_SHADOW_CATCHER = 32768,
- PATH_RAY_SHADOW_CATCHER_ONLY = 65536,
+ PATH_RAY_MIS_SKIP = (1 << 12),
+ PATH_RAY_DIFFUSE_ANCESTOR = (1 << 13),
+ PATH_RAY_SINGLE_PASS_DONE = (1 << 14),
+ PATH_RAY_SHADOW_CATCHER = (1 << 15),
+ PATH_RAY_SHADOW_CATCHER_ONLY = (1 << 16),
+ PATH_RAY_STORE_SHADOW_INFO = (1 << 17),
};
/* Closure Label */
@@ -394,6 +397,22 @@ typedef enum PassType {
#define PASS_ALL (~0)
+typedef enum DenoisingPassOffsets {
+ DENOISING_PASS_NORMAL = 0,
+ DENOISING_PASS_NORMAL_VAR = 3,
+ DENOISING_PASS_ALBEDO = 6,
+ DENOISING_PASS_ALBEDO_VAR = 9,
+ DENOISING_PASS_DEPTH = 12,
+ DENOISING_PASS_DEPTH_VAR = 13,
+ DENOISING_PASS_SHADOW_A = 14,
+ DENOISING_PASS_SHADOW_B = 17,
+ DENOISING_PASS_COLOR = 20,
+ DENOISING_PASS_COLOR_VAR = 23,
+
+ DENOISING_PASS_SIZE_BASE = 26,
+ DENOISING_PASS_SIZE_CLEAN = 3,
+} DenoisingPassOffsets;
+
typedef enum BakePassFilter {
BAKE_FILTER_NONE = 0,
BAKE_FILTER_DIRECT = (1 << 0),
@@ -427,6 +446,18 @@ typedef enum BakePassFilterCombos {
BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE),
} BakePassFilterCombos;
+typedef enum DenoiseFlag {
+ DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0),
+ DENOISING_CLEAN_DIFFUSE_IND = (1 << 1),
+ DENOISING_CLEAN_GLOSSY_DIR = (1 << 2),
+ DENOISING_CLEAN_GLOSSY_IND = (1 << 3),
+ DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4),
+ DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5),
+ DENOISING_CLEAN_SUBSURFACE_DIR = (1 << 6),
+ DENOISING_CLEAN_SUBSURFACE_IND = (1 << 7),
+ DENOISING_CLEAN_ALL_PASSES = (1 << 8)-1,
+} DenoiseFlag;
+
typedef ccl_addr_space struct PathRadiance {
#ifdef __PASSES__
int use_light_pass;
@@ -482,6 +513,12 @@ typedef ccl_addr_space struct PathRadiance {
/* Color of the background on which shadow is alpha-overed. */
float3 shadow_color;
#endif
+
+#ifdef __DENOISING_FEATURES__
+ float3 denoising_normal;
+ float3 denoising_albedo;
+ float denoising_depth;
+#endif /* __DENOISING_FEATURES__ */
} PathRadiance;
typedef struct BsdfEval {
@@ -724,12 +761,13 @@ typedef struct AttributeDescriptor {
#define SHADER_CLOSURE_BASE \
float3 weight; \
ClosureType type; \
- float sample_weight \
+ float sample_weight; \
+ float3 N
typedef ccl_addr_space struct ccl_align(16) ShaderClosure {
SHADER_CLOSURE_BASE;
- float data[14]; /* pad to 80 bytes */
+ float data[10]; /* pad to 80 bytes */
} ShaderClosure;
/* Shader Context
@@ -960,6 +998,10 @@ typedef struct PathState {
int transmission_bounce;
int transparent_bounce;
+#ifdef __DENOISING_FEATURES__
+ float denoising_feature_weight;
+#endif /* __DENOISING_FEATURES__ */
+
/* multiple importance sampling */
float min_ray_pdf; /* smallest bounce pdf over entire path up to now */
float ray_pdf; /* last bounce pdf */
@@ -1137,6 +1179,11 @@ typedef struct KernelFilm {
float mist_inv_depth;
float mist_falloff;
+ int pass_denoising_data;
+ int pass_denoising_clean;
+ int denoising_flags;
+ int pad;
+
#ifdef __KERNEL_DEBUG__
int pass_bvh_traversed_nodes;
int pass_bvh_traversed_instances;
diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp
new file mode 100644
index 00000000000..2ff1a392dc3
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CPU kernel entry points */
+
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
+ * one with SSE2 intrinsics.
+ */
+#if defined(__x86_64__) || defined(_M_X64)
+# define __KERNEL_SSE2__
+#endif
+
+/* When building kernel for native machine detect kernel features from the flags
+ * set by compiler.
+ */
+#ifdef WITH_KERNEL_NATIVE
+# ifdef __SSE2__
+# ifndef __KERNEL_SSE2__
+# define __KERNEL_SSE2__
+# endif
+# endif
+# ifdef __SSE3__
+# define __KERNEL_SSE3__
+# endif
+# ifdef __SSSE3__
+# define __KERNEL_SSSE3__
+# endif
+# ifdef __SSE4_1__
+# define __KERNEL_SSE41__
+# endif
+# ifdef __AVX__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX__
+# endif
+# ifdef __AVX2__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX2__
+# endif
+#endif
+
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)
+ /* do nothing */
+#endif
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
new file mode 100644
index 00000000000..4a9e6047ecf
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
new file mode 100644
index 00000000000..c22ec576254
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
new file mode 100644
index 00000000000..10007ee2635
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common declaration part of all CPU kernels. */
+
+void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
+ TilesInfo *tiles,
+ int x,
+ int y,
+ float *unfilteredA,
+ float *unfilteredB,
+ float *sampleV,
+ float *sampleVV,
+ float *bufferV,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
+ TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ int x,
+ int y,
+ float *mean,
+ float *variance,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
+ float *mean,
+ float *variance,
+ float *a,
+ float *b,
+ int* prefilter_rect,
+ int r);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+ int x,
+ int y,
+ int storage_ofs,
+ float *transform,
+ int *rank,
+ int* rect,
+ int pass_stride,
+ int radius,
+ float pca_threshold);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
+ int dy,
+ float *weightImage,
+ float *variance,
+ float *differenceImage,
+ int* rect,
+ int w,
+ int channel_offset,
+ float a,
+ float k_2);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
+ float *outImage,
+ int* rect,
+ int w,
+ int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
+ float *outImage,
+ int* rect,
+ int w,
+ int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
+ int dy,
+ float *differenceImage,
+ float *image,
+ float *outImage,
+ float *accumImage,
+ int* rect,
+ int w,
+ int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
+ int dy,
+ float *differenceImage,
+ float *buffer,
+ float *color_pass,
+ float *variance_pass,
+ float *transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *rect,
+ int *filter_rect,
+ int w,
+ int h,
+ int f,
+ int pass_stride);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
+ float *accumImage,
+ int* rect,
+ int w);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
+ int y,
+ int storage_ofs,
+ int w,
+ int h,
+ float *buffer,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *buffer_params,
+ int sample);
+
+#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
new file mode 100644
index 00000000000..3b71e50ca3b
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -0,0 +1,259 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common implementation part of all CPU kernels.
+ *
+ * The idea is that particular .cpp files sets needed optimization flags and
+ * simply includes this file without worry of copying actual implementation over.
+ */
+
+#include "kernel/kernel_compat_cpu.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+#ifdef KERNEL_STUB
+# include "util/util_debug.h"
+# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+
+/* Denoise filter */
+
+void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
+ TilesInfo *tiles,
+ int x,
+ int y,
+ float *unfilteredA,
+ float *unfilteredB,
+ float *sampleVariance,
+ float *sampleVarianceV,
+ float *bufferVariance,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
+#else
+ kernel_filter_divide_shadow(sample, tiles,
+ x, y,
+ unfilteredA,
+ unfilteredB,
+ sampleVariance,
+ sampleVarianceV,
+ bufferVariance,
+ load_int4(prefilter_rect),
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
+ TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ int x,
+ int y,
+ float *mean, float *variance,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
+#else
+ kernel_filter_get_feature(sample, tiles,
+ m_offset, v_offset,
+ x, y,
+ mean, variance,
+ load_int4(prefilter_rect),
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
+ float *mean,
+ float *variance,
+ float *a,
+ float *b,
+ int* prefilter_rect,
+ int r)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_combine_halves);
+#else
+ kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+ int x,
+ int y,
+ int storage_ofs,
+ float *transform,
+ int *rank,
+ int* prefilter_rect,
+ int pass_stride,
+ int radius,
+ float pca_threshold)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
+#else
+ rank += storage_ofs;
+ transform += storage_ofs*TRANSFORM_SIZE;
+ kernel_filter_construct_transform(buffer,
+ x, y,
+ load_int4(prefilter_rect),
+ pass_stride,
+ transform,
+ rank,
+ radius,
+ pca_threshold);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
+ int dy,
+ float *weightImage,
+ float *variance,
+ float *differenceImage,
+ int *rect,
+ int w,
+ int channel_offset,
+ float a,
+ float k_2)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
+#else
+ kernel_filter_nlm_calc_difference(dx, dy, weightImage, variance, differenceImage, load_int4(rect), w, channel_offset, a, k_2);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
+ float *outImage,
+ int *rect,
+ int w,
+ int f)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
+#else
+ kernel_filter_nlm_blur(differenceImage, outImage, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
+ float *outImage,
+ int *rect,
+ int w,
+ int f)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
+#else
+ kernel_filter_nlm_calc_weight(differenceImage, outImage, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
+ int dy,
+ float *differenceImage,
+ float *image,
+ float *outImage,
+ float *accumImage,
+ int *rect,
+ int w,
+ int f)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
+#else
+ kernel_filter_nlm_update_output(dx, dy, differenceImage, image, outImage, accumImage, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
+ int dy,
+ float *differenceImage,
+ float *buffer,
+ float *color_pass,
+ float *variance_pass,
+ float *transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *rect,
+ int *filter_rect,
+ int w,
+ int h,
+ int f,
+ int pass_stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
+#else
+ kernel_filter_nlm_construct_gramian(dx, dy, differenceImage, buffer, color_pass, variance_pass, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
+ float *accumImage,
+ int *rect,
+ int w)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
+#else
+ kernel_filter_nlm_normalize(outImage, accumImage, load_int4(rect), w);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
+ int y,
+ int storage_ofs,
+ int w,
+ int h,
+ float *buffer,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *buffer_params,
+ int sample)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_finalize);
+#else
+ XtWX += storage_ofs*XTWX_SIZE;
+ XtWY += storage_ofs*XTWY_SIZE;
+ rank += storage_ofs;
+ kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
+#endif
+}
+
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
new file mode 100644
index 00000000000..f7c9935f1d0
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
new file mode 100644
index 00000000000..070b95a3505
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
new file mode 100644
index 00000000000..1a7b2040da1
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
index 2600d977972..a645fb4d8dd 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
@@ -17,21 +17,23 @@
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-#endif
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
index dba15d037ac..6bbb87727b9 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
@@ -18,21 +18,23 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# define __KERNEL_AVX2__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 39c9a9cf33c..9895080d328 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -89,6 +89,4 @@ DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
-void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func));
-
#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 8c05dd1d9ef..b9d82781840 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -57,6 +57,11 @@
# include "kernel/split/kernel_buffer_update.h"
#endif
+#ifdef KERNEL_STUB
+# include "util/util_debug.h"
+# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif
+
CCL_NAMESPACE_BEGIN
#ifndef __SPLIT_KERNEL__
@@ -71,7 +76,10 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
int offset,
int stride)
{
-#ifdef __BRANCHED_PATH__
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, path_trace);
+#else
+# ifdef __BRANCHED_PATH__
if(kernel_data.integrator.branched) {
kernel_branched_path_trace(kg,
buffer,
@@ -82,10 +90,11 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
stride);
}
else
-#endif
+# endif
{
kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
}
+#endif /* KERNEL_STUB */
}
/* Film */
@@ -98,6 +107,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
int offset,
int stride)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, convert_to_byte);
+#else
kernel_film_convert_to_byte(kg,
rgba,
buffer,
@@ -105,6 +117,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
x, y,
offset,
stride);
+#endif /* KERNEL_STUB */
}
void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
@@ -115,6 +128,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
int offset,
int stride)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, convert_to_half_float);
+#else
kernel_film_convert_to_half_float(kg,
rgba,
buffer,
@@ -122,6 +138,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
x, y,
offset,
stride);
+#endif /* KERNEL_STUB */
}
/* Shader Evaluate */
@@ -136,9 +153,12 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
int offset,
int sample)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader);
+#else
if(type >= SHADER_EVAL_BAKE) {
kernel_assert(output_luma == NULL);
-#ifdef __BAKING__
+# ifdef __BAKING__
kernel_bake_evaluate(kg,
input,
output,
@@ -147,7 +167,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
i,
offset,
sample);
-#endif
+# endif
}
else {
kernel_shader_evaluate(kg,
@@ -158,17 +178,26 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
i,
sample);
}
+#endif /* KERNEL_STUB */
}
#else /* __SPLIT_KERNEL__ */
/* Split Kernel Path Tracing */
-#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+#ifdef KERNEL_STUB
+# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+ { \
+ STUB_ASSERT(KERNEL_ARCH, name); \
+ }
+#else
+# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
{ \
kernel_##name(kg); \
}
+#endif /* KERNEL_STUB */
#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
@@ -194,42 +223,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
-
-void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
-{
-#define REGISTER_NAME_STRING(name) #name
-#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name)
-#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name));
-
- REGISTER(path_trace);
- REGISTER(convert_to_byte);
- REGISTER(convert_to_half_float);
- REGISTER(shader);
-
- REGISTER(data_init);
- REGISTER(path_init);
- REGISTER(scene_intersect);
- REGISTER(lamp_emission);
- REGISTER(do_volume);
- REGISTER(queue_enqueue);
- REGISTER(indirect_background);
- REGISTER(shader_setup);
- REGISTER(shader_sort);
- REGISTER(shader_eval);
- REGISTER(holdout_emission_blurring_pathtermination_ao);
- REGISTER(subsurface_scatter);
- REGISTER(direct_lighting);
- REGISTER(shadow_blocked_ao);
- REGISTER(shadow_blocked_dl);
- REGISTER(next_iteration_setup);
- REGISTER(indirect_subsurface);
- REGISTER(buffer_update);
-
-#undef REGISTER
-#undef REGISTER_EVAL_NAME
-#undef REGISTER_NAME_STRING
-}
-
#endif /* __SPLIT_KERNEL__ */
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
index 27a746a0799..6ba3425a343 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
@@ -17,22 +17,25 @@
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
index 364d279a189..76b2d77ebb8 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
@@ -18,23 +18,25 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# define __KERNEL_AVX2__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
index 0afb481296f..b468b6f44c8 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
@@ -18,17 +18,19 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
index 13d00813591..3e5792d0b17 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
@@ -18,19 +18,21 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse3
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
index a4312071edc..3629f21cd29 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
@@ -18,20 +18,22 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse41
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
index 1acfaa91ac9..57530c88710 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
@@ -18,15 +18,17 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
index f7b6a2e21fe..c607753bc4b 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
@@ -18,17 +18,19 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse3
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
index 1900c6e3012..a278554731c 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
@@ -18,18 +18,20 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse41
-# include "kernel/kernels/cpu//kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
new file mode 100644
index 00000000000..50f73f9728d
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -0,0 +1,235 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CUDA kernel entry points */
+
+#ifdef __CUDA_ARCH__
+
+#include "kernel_config.h"
+
+#include "kernel/kernel_compat_cuda.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+/* kernels */
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_divide_shadow(int sample,
+ TilesInfo *tiles,
+ float *unfilteredA,
+ float *unfilteredB,
+ float *sampleVariance,
+ float *sampleVarianceV,
+ float *bufferVariance,
+ int4 prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+ int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_divide_shadow(sample,
+ tiles,
+ x, y,
+ unfilteredA,
+ unfilteredB,
+ sampleVariance,
+ sampleVarianceV,
+ bufferVariance,
+ prefilter_rect,
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_get_feature(int sample,
+ TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ float *mean,
+ float *variance,
+ int4 prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+ int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_get_feature(sample,
+ tiles,
+ m_offset, v_offset,
+ x, y,
+ mean, variance,
+ prefilter_rect,
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
+{
+ int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
+ float *transform, int *rank,
+ int4 filter_area, int4 rect,
+ int radius, float pca_threshold,
+ int pass_stride)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < filter_area.z && y < filter_area.w) {
+ int *l_rank = rank + y*filter_area.z + x;
+ float *l_transform = transform + y*filter_area.z + x;
+ kernel_filter_construct_transform(buffer,
+ x + filter_area.x, y + filter_area.y,
+ rect, pass_stride,
+ l_transform, l_rank,
+ radius, pca_threshold,
+ filter_area.z*filter_area.w,
+ threadIdx.y*blockDim.x + threadIdx.x);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
+ float ccl_restrict_ptr weightImage,
+ float ccl_restrict_ptr varianceImage,
+ float *differenceImage,
+ int4 rect, int w,
+ int channel_offset,
+ float a, float k_2) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_update_output(int dx, int dy,
+ float ccl_restrict_ptr differenceImage,
+ float ccl_restrict_ptr image,
+ float *outImage, float *accumImage,
+ int4 rect, int w,
+ int f) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
+ float ccl_restrict_ptr differenceImage,
+ float ccl_restrict_ptr buffer,
+ float *color_pass,
+ float *variance_pass,
+ float const* __restrict__ transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int4 rect,
+ int4 filter_rect,
+ int w, int h, int f,
+ int pass_stride) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);
+ int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);
+ if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
+ kernel_filter_nlm_construct_gramian(x, y,
+ dx, dy,
+ differenceImage,
+ buffer,
+ color_pass, variance_pass,
+ transform, rank,
+ XtWX, XtWY,
+ rect, filter_rect,
+ w, h, f,
+ pass_stride,
+ threadIdx.y*blockDim.x + threadIdx.x);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_finalize(int w, int h,
+ float *buffer, int *rank,
+ float *XtWX, float3 *XtWY,
+ int4 filter_area, int4 buffer_params,
+ int sample) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < filter_area.z && y < filter_area.w) {
+ int storage_ofs = y*filter_area.z+x;
+ rank += storage_ofs;
+ XtWX += storage_ofs;
+ XtWY += storage_ofs;
+ kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
+ }
+}
+
+#endif
+
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
new file mode 100644
index 00000000000..3d82bff9892
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -0,0 +1,262 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* OpenCL kernel entry points */
+
+#include "kernel/kernel_compat_opencl.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+/* kernels */
+
+__kernel void kernel_ocl_filter_divide_shadow(int sample,
+ ccl_global TilesInfo *tiles,
+ ccl_global float *unfilteredA,
+ ccl_global float *unfilteredB,
+ ccl_global float *sampleVariance,
+ ccl_global float *sampleVarianceV,
+ ccl_global float *bufferVariance,
+ int4 prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ char use_split_variance)
+{
+ int x = prefilter_rect.x + get_global_id(0);
+ int y = prefilter_rect.y + get_global_id(1);
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_divide_shadow(sample,
+ tiles,
+ x, y,
+ unfilteredA,
+ unfilteredB,
+ sampleVariance,
+ sampleVarianceV,
+ bufferVariance,
+ prefilter_rect,
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+ }
+}
+
+__kernel void kernel_ocl_filter_get_feature(int sample,
+ ccl_global TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ ccl_global float *mean,
+ ccl_global float *variance,
+ int4 prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ char use_split_variance)
+{
+ int x = prefilter_rect.x + get_global_id(0);
+ int y = prefilter_rect.y + get_global_id(1);
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_get_feature(sample,
+ tiles,
+ m_offset, v_offset,
+ x, y,
+ mean, variance,
+ prefilter_rect,
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+ }
+}
+
+__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
+ ccl_global float *variance,
+ ccl_global float *a,
+ ccl_global float *b,
+ int4 prefilter_rect,
+ int r)
+{
+ int x = prefilter_rect.x + get_global_id(0);
+ int y = prefilter_rect.y + get_global_id(1);
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
+ }
+}
+
+__kernel void kernel_ocl_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
+ ccl_global float *transform,
+ ccl_global int *rank,
+ int4 filter_area,
+ int4 rect,
+ int pass_stride,
+ int radius,
+ float pca_threshold)
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ if(x < filter_area.z && y < filter_area.w) {
+ ccl_global int *l_rank = rank + y*filter_area.z + x;
+ ccl_global float *l_transform = transform + y*filter_area.z + x;
+ kernel_filter_construct_transform(buffer,
+ x + filter_area.x, y + filter_area.y,
+ rect, pass_stride,
+ l_transform, l_rank,
+ radius, pca_threshold,
+ filter_area.z*filter_area.w,
+ get_local_id(1)*get_local_size(0) + get_local_id(0));
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_calc_difference(int dx,
+ int dy,
+ ccl_global float ccl_restrict_ptr weightImage,
+ ccl_global float ccl_restrict_ptr varianceImage,
+ ccl_global float *differenceImage,
+ int4 rect,
+ int w,
+ int channel_offset,
+ float a,
+ float k_2) {
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_blur(ccl_global float ccl_restrict_ptr differenceImage,
+ ccl_global float *outImage,
+ int4 rect,
+ int w,
+ int f) {
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_calc_weight(ccl_global float ccl_restrict_ptr differenceImage,
+ ccl_global float *outImage,
+ int4 rect,
+ int w,
+ int f) {
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_update_output(int dx,
+ int dy,
+ ccl_global float ccl_restrict_ptr differenceImage,
+ ccl_global float ccl_restrict_ptr image,
+ ccl_global float *outImage,
+ ccl_global float *accumImage,
+ int4 rect,
+ int w,
+ int f) {
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *outImage,
+ ccl_global float ccl_restrict_ptr accumImage,
+ int4 rect,
+ int w) {
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
+ int dy,
+ ccl_global float ccl_restrict_ptr differenceImage,
+ ccl_global float ccl_restrict_ptr buffer,
+ ccl_global float *color_pass,
+ ccl_global float *variance_pass,
+ ccl_global float ccl_restrict_ptr transform,
+ ccl_global int *rank,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 rect,
+ int4 filter_rect,
+ int w,
+ int h,
+ int f,
+ int pass_stride) {
+ int x = get_global_id(0) + max(0, rect.x-filter_rect.x);
+ int y = get_global_id(1) + max(0, rect.y-filter_rect.y);
+ if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
+ kernel_filter_nlm_construct_gramian(x, y,
+ dx, dy,
+ differenceImage,
+ buffer,
+ color_pass, variance_pass,
+ transform, rank,
+ XtWX, XtWY,
+ rect, filter_rect,
+ w, h, f,
+ pass_stride,
+ get_local_id(1)*get_local_size(0) + get_local_id(0));
+ }
+}
+
+__kernel void kernel_ocl_filter_finalize(int w,
+ int h,
+ ccl_global float *buffer,
+ ccl_global int *rank,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 filter_area,
+ int4 buffer_params,
+ int sample) {
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ if(x < filter_area.z && y < filter_area.w) {
+ int storage_ofs = y*filter_area.z+x;
+ rank += storage_ofs;
+ XtWX += storage_ofs;
+ XtWY += storage_ofs;
+ kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
+ }
+}
+
+__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles,
+ ccl_global float *buffer_1,
+ ccl_global float *buffer_2,
+ ccl_global float *buffer_3,
+ ccl_global float *buffer_4,
+ ccl_global float *buffer_5,
+ ccl_global float *buffer_6,
+ ccl_global float *buffer_7,
+ ccl_global float *buffer_8,
+ ccl_global float *buffer_9)
+{
+ if((get_global_id(0) == 0) && (get_global_id(1) == 0)) {
+ tiles->buffers[0] = buffer_1;
+ tiles->buffers[1] = buffer_2;
+ tiles->buffers[2] = buffer_3;
+ tiles->buffers[3] = buffer_4;
+ tiles->buffers[4] = buffer_5;
+ tiles->buffers[5] = buffer_6;
+ tiles->buffers[6] = buffer_7;
+ tiles->buffers[7] = buffer_8;
+ tiles->buffers[8] = buffer_9;
+ }
+}
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
index c7bc1b4df0a..dc74a2ada53 100644
--- a/intern/cycles/kernel/split/kernel_branched.h
+++ b/intern/cycles/kernel/split/kernel_branched.h
@@ -76,6 +76,26 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
RNG rng = kernel_split_state.rng[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
float3 throughput = branched_state->throughput;
+ ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
+
+ float sum_sample_weight = 0.0f;
+#ifdef __DENOISING_FEATURES__
+ if(ps->denoising_feature_weight > 0.0f) {
+ for(int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ /* transparency is not handled here, but in outer loop */
+ if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+ continue;
+ }
+
+ sum_sample_weight += sc->sample_weight;
+ }
+ }
+ else {
+ sum_sample_weight = 1.0f;
+ }
+#endif /* __DENOISING_FEATURES__ */
for(int i = branched_state->next_closure; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
@@ -103,7 +123,6 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
RNG bsdf_rng = cmj_hash(rng, i);
for(int j = branched_state->next_sample; j < num_samples; j++) {
- ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
if(reset_path_state) {
*ps = branched_state->path_state;
}
@@ -122,7 +141,8 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
tp,
ps,
L,
- bsdf_ray))
+ bsdf_ray,
+ sum_sample_weight))
{
continue;
}
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
index 859c221d976..1f6dce0253c 100644
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -111,24 +111,15 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
- float3 L_sum;
-#ifdef __SHADOW_TRICKS__
- if(state->flag & PATH_RAY_SHADOW_CATCHER) {
- L_sum = path_radiance_sum_shadowcatcher(kg, L, L_transparent);
- }
- else
-#endif /* __SHADOW_TRICKS__ */
- {
- L_sum = path_radiance_clamp_and_sum(kg, L);
- }
kernel_write_light_passes(kg, buffer, L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
#endif
- float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
/* accumulate result in output buffer */
- kernel_write_pass_float4(buffer, sample, L_rad);
+ bool is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER);
+ kernel_write_result(kg, buffer, sample, L, 1.0f - (*L_transparent), is_shadow_catcher);
+
path_rng_end(kg, rng_state, rng);
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 87498910d38..670a557f084 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -125,7 +125,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
#ifdef __SHADOW_TRICKS__
if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
if(state->flag & PATH_RAY_CAMERA) {
- state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
+ state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
state->catcher_object = sd->object;
if(!kernel_data.background.transparent) {
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
@@ -246,6 +246,8 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
kernel_split_state.throughput[ray_index] = throughput/probability;
}
}
+
+ kernel_update_denoising_features(kg, sd, state, L);
}
}
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
index 452b6e45a36..386fbbc4d09 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
@@ -89,10 +89,10 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
&shadow))
{
/* accumulate */
- path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
else {
- path_radiance_accum_total_light(L, throughput, &L_light);
+ path_radiance_accum_total_light(L, state, throughput, &L_light);
}
}
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 407f8e784c0..7918c640175 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -444,6 +444,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
if(bsdf) {
+ bsdf->N = N;
sd->flag |= bsdf_transparent_setup(bsdf);
}
break;
@@ -704,6 +705,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
if(bsdf) {
+ bsdf->N = N;
/* todo: giving a fixed weight here will cause issues when
* mixing multiple BSDFS. energy will not be conserved and
* the throughput can blow up after multiple bounces. we
diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h
index c94fa130af7..656357be52d 100644
--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@@ -63,8 +63,13 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
strength = max(strength, 0.0f);
/* compute and output perturbed normal */
- float3 normal_out = normalize(absdet*normal_in - distance*signf(det)*surfgrad);
- normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
+ float3 normal_out = safe_normalize(absdet*normal_in - distance*signf(det)*surfgrad);
+ if(is_zero(normal_out)) {
+ normal_out = normal_in;
+ }
+ else {
+ normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
+ }
if(use_object_space) {
object_normal_transform(kg, sd, &normal_out);
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index 4a09d9f6653..cce4e89e715 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -37,6 +37,7 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg,
#ifdef __UV__
case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
#endif
+ default: data = make_float3(0.0f, 0.0f, 0.0f);
}
stack_store_float3(stack, out_offset, data);
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 328ff79223b..8e45dbfa5ff 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -317,8 +317,8 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, floa
float3 co = stack_load_float3(stack, co_offset);
float2 uv;
- co = normalize(co);
-
+ co = safe_normalize(co);
+
if(projection == 0)
uv = direction_to_equirectangular(co);
else
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index cc9b840b13f..d859cae1708 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -402,7 +402,6 @@ typedef enum ClosureType {
CLOSURE_BSDF_DIFFUSE_TOON_ID,
/* Glossy */
- CLOSURE_BSDF_GLOSSY_ID,
CLOSURE_BSDF_REFLECTION_ID,
CLOSURE_BSDF_MICROFACET_GGX_ID,
CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID,
@@ -423,14 +422,13 @@ typedef enum ClosureType {
CLOSURE_BSDF_HAIR_REFLECTION_ID,
/* Transmission */
- CLOSURE_BSDF_TRANSMISSION_ID,
CLOSURE_BSDF_TRANSLUCENT_ID,
CLOSURE_BSDF_REFRACTION_ID,
CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID,
CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID,
+ CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID,
CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID,
- CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID,
CLOSURE_BSDF_SHARP_GLASS_ID,
CLOSURE_BSDF_HAIR_TRANSMISSION_ID,
@@ -465,13 +463,16 @@ typedef enum ClosureType {
/* watch this, being lazy with memory usage */
#define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID)
#define CLOSURE_IS_BSDF_DIFFUSE(type) (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_DIFFUSE_TOON_ID)
-#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
-#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
+#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
+#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSLUCENT_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID)
+#define CLOSURE_IS_BSDF_TRANSPARENT(type) (type == CLOSURE_BSDF_TRANSPARENT_ID)
#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID)
#define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID ||\
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)
+#define CLOSURE_IS_BSDF_MICROFACET(type) ((type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) ||\
+ (type >= CLOSURE_BSDF_REFRACTION_ID && type <= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID))
#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID)
#define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID)
#define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
@@ -480,7 +481,7 @@ typedef enum ClosureType {
#define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID)
#define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID)
#define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
-#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
+#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
#define CLOSURE_IS_PRINCIPLED(type) (type == CLOSURE_BSDF_PRINCIPLED_ID)
#define CLOSURE_WEIGHT_CUTOFF 1e-5f
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index 49934143ee9..ff6f40c010e 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -42,6 +42,9 @@ BufferParams::BufferParams()
full_width = 0;
full_height = 0;
+ denoising_data_pass = false;
+ denoising_clean_pass = false;
+
Pass::add(PASS_COMBINED, passes);
}
@@ -68,10 +71,25 @@ int BufferParams::get_passes_size()
for(size_t i = 0; i < passes.size(); i++)
size += passes[i].components;
-
+
+ if(denoising_data_pass) {
+ size += DENOISING_PASS_SIZE_BASE;
+ if(denoising_clean_pass) size += DENOISING_PASS_SIZE_CLEAN;
+ }
+
return align_up(size, 4);
}
+int BufferParams::get_denoising_offset()
+{
+ int offset = 0;
+
+ for(size_t i = 0; i < passes.size(); i++)
+ offset += passes[i].components;
+
+ return offset;
+}
+
/* Render Buffer Task */
RenderTile::RenderTile()
@@ -138,12 +156,51 @@ void RenderBuffers::reset(Device *device, BufferParams& params_)
device->mem_alloc("rng_state", rng_state, MEM_READ_WRITE);
}
-bool RenderBuffers::copy_from_device()
+bool RenderBuffers::copy_from_device(Device *from_device)
{
if(!buffer.device_pointer)
return false;
- device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float));
+ if(!from_device) {
+ from_device = device;
+ }
+
+ from_device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float));
+
+ return true;
+}
+
+bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels)
+{
+ float scale = 1.0f/sample;
+
+ if(offset == DENOISING_PASS_COLOR) {
+ scale *= exposure;
+ }
+ else if(offset == DENOISING_PASS_COLOR_VAR) {
+ scale *= exposure*exposure;
+ }
+
+ offset += params.get_denoising_offset();
+ float *in = (float*)buffer.data_pointer + offset;
+ int pass_stride = params.get_passes_size();
+ int size = params.width*params.height;
+
+ if(components == 1) {
+ for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
+ pixels[0] = in[0]*scale;
+ }
+ }
+ else if(components == 3) {
+ for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
+ pixels[0] = in[0]*scale;
+ pixels[1] = in[1]*scale;
+ pixels[2] = in[2]*scale;
+ }
+ }
+ else {
+ return false;
+ }
return true;
}
diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h
index 5c78971678a..e56556c8abe 100644
--- a/intern/cycles/render/buffers.h
+++ b/intern/cycles/render/buffers.h
@@ -51,6 +51,9 @@ public:
/* passes */
array<Pass> passes;
+ bool denoising_data_pass;
+ /* If only some light path types should be denoised, an additional pass is needed. */
+ bool denoising_clean_pass;
/* functions */
BufferParams();
@@ -59,6 +62,7 @@ public:
bool modified(const BufferParams& params);
void add_pass(PassType type);
int get_passes_size();
+ int get_denoising_offset();
};
/* Render Buffers */
@@ -73,18 +77,19 @@ public:
/* random number generator state */
device_vector<uint> rng_state;
+ Device *device;
+
explicit RenderBuffers(Device *device);
~RenderBuffers();
void reset(Device *device, BufferParams& params);
- bool copy_from_device();
+ bool copy_from_device(Device *from_device = NULL);
bool get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels);
+ bool get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels);
protected:
void device_free();
-
- Device *device;
};
/* Display Buffer
@@ -131,6 +136,9 @@ protected:
class RenderTile {
public:
+ typedef enum { PATH_TRACE, DENOISE } Task;
+
+ Task task;
int x, y, w, h;
int start_sample;
int num_samples;
@@ -138,6 +146,7 @@ public:
int resolution;
int offset;
int stride;
+ int tile_index;
device_ptr buffer;
device_ptr rng_state;
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 7809f4345f1..c8213d258d5 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -279,6 +279,10 @@ NODE_DEFINE(Film)
SOCKET_BOOLEAN(use_sample_clamp, "Use Sample Clamp", false);
+ SOCKET_BOOLEAN(denoising_data_pass, "Generate Denoising Data Pass", false);
+ SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false);
+ SOCKET_INT(denoising_flags, "Denoising Flags", 0);
+
return type;
}
@@ -437,6 +441,20 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->pass_stride += pass.components;
}
+ kfilm->pass_denoising_data = 0;
+ kfilm->pass_denoising_clean = 0;
+ kfilm->denoising_flags = 0;
+ if(denoising_data_pass) {
+ kfilm->pass_denoising_data = kfilm->pass_stride;
+ kfilm->pass_stride += DENOISING_PASS_SIZE_BASE;
+ kfilm->denoising_flags = denoising_flags;
+ if(denoising_clean_pass) {
+ kfilm->pass_denoising_clean = kfilm->pass_stride;
+ kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN;
+ kfilm->use_light_pass = 1;
+ }
+ }
+
kfilm->pass_stride = align_up(kfilm->pass_stride, 4);
kfilm->pass_alpha_threshold = pass_alpha_threshold;
@@ -451,6 +469,10 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->mist_inv_depth = (mist_depth > 0.0f)? 1.0f/mist_depth: 0.0f;
kfilm->mist_falloff = mist_falloff;
+ pass_stride = kfilm->pass_stride;
+ denoising_data_offset = kfilm->pass_denoising_data;
+ denoising_clean_offset = kfilm->pass_denoising_clean;
+
need_update = false;
}
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index 83c941d5c57..29b1e7e9157 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -57,8 +57,15 @@ public:
float exposure;
array<Pass> passes;
+ bool denoising_data_pass;
+ bool denoising_clean_pass;
+ int denoising_flags;
float pass_alpha_threshold;
+ int pass_stride;
+ int denoising_data_offset;
+ int denoising_clean_offset;
+
FilterType filter_type;
float filter_width;
size_t filter_table_offset;
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index 33d1936659b..03825f780e0 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -903,7 +903,7 @@ void Mesh::pack_normals(Scene *scene, uint *tri_shader, float4 *vnormal)
float3 vNi = vN[i];
if(do_transform)
- vNi = normalize(transform_direction(&ntfm, vNi));
+ vNi = safe_normalize(transform_direction(&ntfm, vNi));
vnormal[i] = make_float4(vNi.x, vNi.y, vNi.z, 0.0f);
}
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index c9b5547b407..3eaf34c847f 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -114,8 +114,9 @@ Session::~Session()
}
/* clean up */
- foreach(RenderBuffers *buffers, tile_buffers)
- delete buffers;
+ foreach(RenderTile &rtile, render_tiles)
+ delete rtile.buffers;
+ tile_manager.free_device();
delete buffers;
delete display;
@@ -268,8 +269,8 @@ void Session::run_gpu()
/* update status and timing */
update_status_time();
- /* path trace */
- path_trace();
+ /* render */
+ render();
device->task_wait();
@@ -358,20 +359,22 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile)
thread_scoped_lock tile_lock(tile_mutex);
/* get next tile from manager */
- Tile tile;
+ Tile *tile;
int device_num = device->device_number(tile_device);
if(!tile_manager.next_tile(tile, device_num))
return false;
/* fill render tile */
- rtile.x = tile_manager.state.buffer.full_x + tile.x;
- rtile.y = tile_manager.state.buffer.full_y + tile.y;
- rtile.w = tile.w;
- rtile.h = tile.h;
+ rtile.x = tile_manager.state.buffer.full_x + tile->x;
+ rtile.y = tile_manager.state.buffer.full_y + tile->y;
+ rtile.w = tile->w;
+ rtile.h = tile->h;
rtile.start_sample = tile_manager.state.sample;
rtile.num_samples = tile_manager.state.num_samples;
rtile.resolution = tile_manager.state.resolution_divider;
+ rtile.tile_index = tile->index;
+ rtile.task = (tile->state == Tile::DENOISE)? RenderTile::DENOISE: RenderTile::PATH_TRACE;
tile_lock.unlock();
@@ -383,54 +386,70 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile)
rtile.buffer = buffers->buffer.device_pointer;
rtile.rng_state = buffers->rng_state.device_pointer;
rtile.buffers = buffers;
+ tile->buffers = buffers;
device->map_tile(tile_device, rtile);
return true;
}
- /* fill buffer parameters */
- BufferParams buffer_params = tile_manager.params;
- buffer_params.full_x = rtile.x;
- buffer_params.full_y = rtile.y;
- buffer_params.width = rtile.w;
- buffer_params.height = rtile.h;
-
- buffer_params.get_offset_stride(rtile.offset, rtile.stride);
-
- RenderBuffers *tilebuffers;
+ bool store_rtile = false;
+ if(tile->buffers == NULL) {
+ /* fill buffer parameters */
+ BufferParams buffer_params = tile_manager.params;
+ buffer_params.full_x = rtile.x;
+ buffer_params.full_y = rtile.y;
+ buffer_params.width = rtile.w;
+ buffer_params.height = rtile.h;
+
+ /* allocate buffers */
+ if(params.progressive_refine) {
+ tile_lock.lock();
+
+ if(render_tiles.size() == 0) {
+ RenderTile nulltile;
+ nulltile.buffers = NULL;
+ render_tiles.resize(tile_manager.state.num_tiles, nulltile);
+ }
- /* allocate buffers */
- if(params.progressive_refine) {
- tile_lock.lock();
+ /* In certain circumstances number of tiles in the tile manager could
+ * be changed. This is not supported by the progressive refine feature.
+ */
+ assert(render_tiles.size() == tile_manager.state.num_tiles);
- if(tile_buffers.size() == 0)
- tile_buffers.resize(tile_manager.state.num_tiles, NULL);
+ RenderTile &stored_rtile = render_tiles[tile->index];
+ if(stored_rtile.buffers == NULL) {
+ tile->buffers = new RenderBuffers(tile_device);
+ tile->buffers->reset(tile_device, buffer_params);
+ store_rtile = true;
+ }
+ else {
+ assert(rtile.x == stored_rtile.x &&
+ rtile.y == stored_rtile.y &&
+ rtile.w == stored_rtile.w &&
+ rtile.h == stored_rtile.h);
+ tile_lock.unlock();
+ tile->buffers = stored_rtile.buffers;
+ }
+ }
+ else {
+ tile->buffers = new RenderBuffers(tile_device);
- /* In certain circumstances number of tiles in the tile manager could
- * be changed. This is not supported by the progressive refine feature.
- */
- assert(tile_buffers.size() == tile_manager.state.num_tiles);
+ tile->buffers->reset(tile_device, buffer_params);
+ }
+ }
- tilebuffers = tile_buffers[tile.index];
- if(tilebuffers == NULL) {
- tilebuffers = new RenderBuffers(tile_device);
- tile_buffers[tile.index] = tilebuffers;
+ tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride);
- tilebuffers->reset(tile_device, buffer_params);
- }
+ rtile.buffer = tile->buffers->buffer.device_pointer;
+ rtile.rng_state = tile->buffers->rng_state.device_pointer;
+ rtile.buffers = tile->buffers;
+ rtile.sample = 0;
+ if(store_rtile) {
+ render_tiles[tile->index] = rtile;
tile_lock.unlock();
}
- else {
- tilebuffers = new RenderBuffers(tile_device);
-
- tilebuffers->reset(tile_device, buffer_params);
- }
-
- rtile.buffer = tilebuffers->buffer.device_pointer;
- rtile.rng_state = tilebuffers->rng_state.device_pointer;
- rtile.buffers = tilebuffers;
/* this will tag tile as IN PROGRESS in blender-side render pipeline,
* which is needed to highlight currently rendering tile before first
@@ -449,7 +468,7 @@ void Session::update_tile_sample(RenderTile& rtile)
if(params.progressive_refine == false) {
/* todo: optimize this by making it thread safe and removing lock */
- update_render_tile_cb(rtile);
+ update_render_tile_cb(rtile, true);
}
}
@@ -462,18 +481,75 @@ void Session::release_tile(RenderTile& rtile)
progress.add_finished_tile();
- if(write_render_tile_cb) {
- if(params.progressive_refine == false) {
- /* todo: optimize this by making it thread safe and removing lock */
- write_render_tile_cb(rtile);
+ bool delete_tile;
- delete rtile.buffers;
+ if(tile_manager.finish_tile(rtile.tile_index, delete_tile)) {
+ if(write_render_tile_cb && params.progressive_refine == false) {
+ write_render_tile_cb(rtile);
+ if(delete_tile) {
+ delete rtile.buffers;
+ tile_manager.state.tiles[rtile.tile_index].buffers = NULL;
+ }
+ }
+ }
+ else {
+ if(update_render_tile_cb && params.progressive_refine == false) {
+ update_render_tile_cb(rtile, false);
}
}
update_status_time();
}
+void Session::map_neighbor_tiles(RenderTile *tiles, Device *tile_device)
+{
+ thread_scoped_lock tile_lock(tile_mutex);
+
+ int center_idx = tiles[4].tile_index;
+ assert(tile_manager.state.tiles[center_idx].state == Tile::DENOISE);
+ BufferParams buffer_params = tile_manager.params;
+ int4 image_region = make_int4(buffer_params.full_x, buffer_params.full_y,
+ buffer_params.full_x + buffer_params.width, buffer_params.full_y + buffer_params.height);
+
+ for(int dy = -1, i = 0; dy <= 1; dy++) {
+ for(int dx = -1; dx <= 1; dx++, i++) {
+ int px = tiles[4].x + dx*params.tile_size.x;
+ int py = tiles[4].y + dy*params.tile_size.y;
+ if(px >= image_region.x && py >= image_region.y &&
+ px < image_region.z && py < image_region.w) {
+ int tile_index = center_idx + dy*tile_manager.state.tile_stride + dx;
+ Tile *tile = &tile_manager.state.tiles[tile_index];
+ assert(tile->buffers);
+
+ tiles[i].buffer = tile->buffers->buffer.device_pointer;
+ tiles[i].x = tile_manager.state.buffer.full_x + tile->x;
+ tiles[i].y = tile_manager.state.buffer.full_y + tile->y;
+ tiles[i].w = tile->w;
+ tiles[i].h = tile->h;
+ tiles[i].buffers = tile->buffers;
+
+ tile->buffers->params.get_offset_stride(tiles[i].offset, tiles[i].stride);
+ }
+ else {
+ tiles[i].buffer = (device_ptr)NULL;
+ tiles[i].buffers = NULL;
+ tiles[i].x = clamp(px, image_region.x, image_region.z);
+ tiles[i].y = clamp(py, image_region.y, image_region.w);
+ tiles[i].w = tiles[i].h = 0;
+ }
+ }
+ }
+
+ assert(tiles[4].buffers);
+ device->map_neighbor_tiles(tile_device, tiles);
+}
+
+void Session::unmap_neighbor_tiles(RenderTile *tiles, Device *tile_device)
+{
+ thread_scoped_lock tile_lock(tile_mutex);
+ device->unmap_neighbor_tiles(tile_device, tiles);
+}
+
void Session::run_cpu()
{
bool tiles_written = false;
@@ -558,8 +634,8 @@ void Session::run_cpu()
/* update status and timing */
update_status_time();
- /* path trace */
- path_trace();
+ /* render */
+ render();
/* update status and timing */
update_status_time();
@@ -744,10 +820,10 @@ void Session::reset(BufferParams& buffer_params, int samples)
if(params.progressive_refine) {
thread_scoped_lock buffers_lock(buffers_mutex);
- foreach(RenderBuffers *buffers, tile_buffers)
- delete buffers;
+ foreach(RenderTile &rtile, render_tiles)
+ delete rtile.buffers;
- tile_buffers.clear();
+ render_tiles.clear();
}
}
@@ -882,13 +958,15 @@ void Session::update_status_time(bool show_pause, bool show_done)
progress.set_status(status, substatus);
}
-void Session::path_trace()
+void Session::render()
{
/* add path trace task */
- DeviceTask task(DeviceTask::PATH_TRACE);
+ DeviceTask task(DeviceTask::RENDER);
task.acquire_tile = function_bind(&Session::acquire_tile, this, _1, _2);
task.release_tile = function_bind(&Session::release_tile, this, _1);
+ task.map_neighbor_tiles = function_bind(&Session::map_neighbor_tiles, this, _1, _2);
+ task.unmap_neighbor_tiles = function_bind(&Session::unmap_neighbor_tiles, this, _1, _2);
task.get_cancel = function_bind(&Progress::get_cancel, &this->progress);
task.update_tile_sample = function_bind(&Session::update_tile_sample, this, _1);
task.update_progress_sample = function_bind(&Progress::add_samples, &this->progress, _1, _2);
@@ -897,6 +975,18 @@ void Session::path_trace()
task.requested_tile_size = params.tile_size;
task.passes_size = tile_manager.params.get_passes_size();
+ if(params.use_denoising) {
+ task.denoising_radius = params.denoising_radius;
+ task.denoising_strength = params.denoising_strength;
+ task.denoising_feature_strength = params.denoising_feature_strength;
+ task.denoising_relative_pca = params.denoising_relative_pca;
+
+ assert(!scene->film->need_update);
+ task.pass_stride = scene->film->pass_stride;
+ task.pass_denoising_data = scene->film->denoising_data_offset;
+ task.pass_denoising_clean = scene->film->denoising_clean_offset;
+ }
+
device->task_add(task);
}
@@ -940,9 +1030,7 @@ bool Session::update_progressive_refine(bool cancel)
}
if(params.progressive_refine) {
- foreach(RenderBuffers *buffers, tile_buffers) {
- RenderTile rtile;
- rtile.buffers = buffers;
+ foreach(RenderTile &rtile, render_tiles) {
rtile.sample = sample;
if(write) {
@@ -951,7 +1039,7 @@ bool Session::update_progressive_refine(bool cancel)
}
else {
if(update_render_tile_cb)
- update_render_tile_cb(rtile);
+ update_render_tile_cb(rtile, true);
}
}
}
@@ -965,10 +1053,11 @@ void Session::device_free()
{
scene->device_free();
- foreach(RenderBuffers *buffers, tile_buffers)
- delete buffers;
+ foreach(RenderTile &tile, render_tiles)
+ delete tile.buffers;
+ tile_manager.free_device();
- tile_buffers.clear();
+ render_tiles.clear();
/* used from background render only, so no need to
* re-create render/display buffers here
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index a7e5f78a64d..a7ca90abbce 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -57,6 +57,12 @@ public:
bool display_buffer_linear;
+ bool use_denoising;
+ int denoising_radius;
+ float denoising_strength;
+ float denoising_feature_strength;
+ bool denoising_relative_pca;
+
double cancel_timeout;
double reset_timeout;
double text_timeout;
@@ -77,6 +83,12 @@ public:
start_resolution = INT_MAX;
threads = 0;
+ use_denoising = false;
+ denoising_radius = 8;
+ denoising_strength = 0.0f;
+ denoising_feature_strength = 0.0f;
+ denoising_relative_pca = false;
+
display_buffer_linear = false;
cancel_timeout = 0.1;
@@ -99,6 +111,11 @@ public:
&& tile_size == params.tile_size
&& start_resolution == params.start_resolution
&& threads == params.threads
+ && use_denoising == params.use_denoising
+ && denoising_radius == params.denoising_radius
+ && denoising_strength == params.denoising_strength
+ && denoising_feature_strength == params.denoising_feature_strength
+ && denoising_relative_pca == params.denoising_relative_pca
&& display_buffer_linear == params.display_buffer_linear
&& cancel_timeout == params.cancel_timeout
&& reset_timeout == params.reset_timeout
@@ -126,7 +143,7 @@ public:
Stats stats;
function<void(RenderTile&)> write_render_tile_cb;
- function<void(RenderTile&)> update_render_tile_cb;
+ function<void(RenderTile&, bool)> update_render_tile_cb;
explicit Session(const SessionParams& params);
~Session();
@@ -162,7 +179,7 @@ protected:
void update_status_time(bool show_pause = false, bool show_done = false);
void tonemap(int sample);
- void path_trace();
+ void render();
void reset_(BufferParams& params, int samples);
void run_cpu();
@@ -177,6 +194,9 @@ protected:
void update_tile_sample(RenderTile& tile);
void release_tile(RenderTile& tile);
+ void map_neighbor_tiles(RenderTile *tiles, Device *tile_device);
+ void unmap_neighbor_tiles(RenderTile *tiles, Device *tile_device);
+
bool device_use_gl;
thread *session_thread;
@@ -202,7 +222,7 @@ protected:
double last_update_time;
bool update_progressive_refine(bool cancel);
- vector<RenderBuffers *> tile_buffers;
+ vector<RenderTile> render_tiles;
DeviceRequestedFeatures get_requested_device_features();
diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp
index 944e746ca2d..176a1f4f0f3 100644
--- a/intern/cycles/render/tile.cpp
+++ b/intern/cycles/render/tile.cpp
@@ -25,37 +25,39 @@ namespace {
class TileComparator {
public:
- TileComparator(TileOrder order, int2 center)
- : order_(order),
- center_(center)
+ TileComparator(TileOrder order_, int2 center_, Tile *tiles_)
+ : order(order_),
+ center(center_),
+ tiles(tiles_)
{}
- bool operator()(Tile &a, Tile &b)
+ bool operator()(int a, int b)
{
- switch(order_) {
+ switch(order) {
case TILE_CENTER:
{
- float2 dist_a = make_float2(center_.x - (a.x + a.w/2),
- center_.y - (a.y + a.h/2));
- float2 dist_b = make_float2(center_.x - (b.x + b.w/2),
- center_.y - (b.y + b.h/2));
+ float2 dist_a = make_float2(center.x - (tiles[a].x + tiles[a].w/2),
+ center.y - (tiles[a].y + tiles[a].h/2));
+ float2 dist_b = make_float2(center.x - (tiles[b].x + tiles[b].w/2),
+ center.y - (tiles[b].y + tiles[b].h/2));
return dot(dist_a, dist_a) < dot(dist_b, dist_b);
}
case TILE_LEFT_TO_RIGHT:
- return (a.x == b.x)? (a.y < b.y): (a.x < b.x);
+ return (tiles[a].x == tiles[b].x)? (tiles[a].y < tiles[b].y): (tiles[a].x < tiles[b].x);
case TILE_RIGHT_TO_LEFT:
- return (a.x == b.x)? (a.y < b.y): (a.x > b.x);
+ return (tiles[a].x == tiles[b].x)? (tiles[a].y < tiles[b].y): (tiles[a].x > tiles[b].x);
case TILE_TOP_TO_BOTTOM:
- return (a.y == b.y)? (a.x < b.x): (a.y > b.y);
+ return (tiles[a].y == tiles[b].y)? (tiles[a].x < tiles[b].x): (tiles[a].y > tiles[b].y);
case TILE_BOTTOM_TO_TOP:
default:
- return (a.y == b.y)? (a.x < b.x): (a.y < b.y);
+ return (tiles[a].y == tiles[b].y)? (tiles[a].x < tiles[b].x): (tiles[a].y < tiles[b].y);
}
}
protected:
- TileOrder order_;
- int2 center_;
+ TileOrder order;
+ int2 center;
+ Tile *tiles;
};
inline int2 hilbert_index_to_pos(int n, int d)
@@ -96,6 +98,7 @@ TileManager::TileManager(bool progressive_, int num_samples_, int2 tile_size_, i
num_devices = num_devices_;
preserve_tile_device = preserve_tile_device_;
background = background_;
+ schedule_denoising = false;
range_start_sample = 0;
range_num_samples = -1;
@@ -108,6 +111,16 @@ TileManager::~TileManager()
{
}
+void TileManager::free_device()
+{
+ if(schedule_denoising) {
+ for(int i = 0; i < state.tiles.size(); i++) {
+ delete state.tiles[i].buffers;
+ state.tiles[i].buffers = NULL;
+ }
+ }
+}
+
static int get_divider(int w, int h, int start_resolution)
{
int divider = 1;
@@ -133,6 +146,8 @@ void TileManager::reset(BufferParams& params_, int num_samples_)
state.num_tiles = 0;
state.num_samples = 0;
state.resolution_divider = get_divider(params.width, params.height, start_resolution);
+ state.render_tiles.clear();
+ state.denoising_tiles.clear();
state.tiles.clear();
}
@@ -157,6 +172,9 @@ void TileManager::set_samples(int num_samples_)
}
state.total_pixel_samples = pixel_samples + (uint64_t)get_num_effective_samples() * params.width*params.height;
+ if(schedule_denoising) {
+ state.total_pixel_samples += params.width*params.height;
+ }
}
}
@@ -169,32 +187,36 @@ int TileManager::gen_tiles(bool sliced)
int image_h = max(1, params.height/resolution);
int2 center = make_int2(image_w/2, image_h/2);
- state.tiles.clear();
-
int num_logical_devices = preserve_tile_device? num_devices: 1;
int num = min(image_h, num_logical_devices);
int slice_num = sliced? num: 1;
- int tile_index = 0;
+ int tile_w = (tile_size.x >= image_w) ? 1 : divide_up(image_w, tile_size.x);
state.tiles.clear();
- state.tiles.resize(num);
- vector<list<Tile> >::iterator tile_list = state.tiles.begin();
+ state.render_tiles.clear();
+ state.denoising_tiles.clear();
+ state.render_tiles.resize(num);
+ state.denoising_tiles.resize(num);
+ state.tile_stride = tile_w;
+ vector<list<int> >::iterator tile_list;
+ tile_list = state.render_tiles.begin();
if(tile_order == TILE_HILBERT_SPIRAL) {
assert(!sliced);
+ int tile_h = (tile_size.y >= image_h) ? 1 : divide_up(image_h, tile_size.y);
+ state.tiles.resize(tile_w*tile_h);
+
/* Size of blocks in tiles, must be a power of 2 */
const int hilbert_size = (max(tile_size.x, tile_size.y) <= 12)? 8: 4;
- int tile_w = (tile_size.x >= image_w)? 1: (image_w + tile_size.x - 1)/tile_size.x;
- int tile_h = (tile_size.y >= image_h)? 1: (image_h + tile_size.y - 1)/tile_size.y;
- int tiles_per_device = (tile_w * tile_h + num - 1) / num;
+ int tiles_per_device = divide_up(tile_w * tile_h, num);
int cur_device = 0, cur_tiles = 0;
int2 block_size = tile_size * make_int2(hilbert_size, hilbert_size);
/* Number of blocks to fill the image */
- int blocks_x = (block_size.x >= image_w)? 1: (image_w + block_size.x - 1)/block_size.x;
- int blocks_y = (block_size.y >= image_h)? 1: (image_h + block_size.y - 1)/block_size.y;
+ int blocks_x = (block_size.x >= image_w)? 1: divide_up(image_w, block_size.x);
+ int blocks_y = (block_size.y >= image_h)? 1: divide_up(image_h, block_size.y);
int n = max(blocks_x, blocks_y) | 0x1; /* Side length of the spiral (must be odd) */
/* Offset of spiral (to keep it centered) */
int2 offset = make_int2((image_w - n*block_size.x)/2, (image_h - n*block_size.y)/2);
@@ -225,9 +247,11 @@ int TileManager::gen_tiles(bool sliced)
if(pos.x >= 0 && pos.y >= 0 && pos.x < image_w && pos.y < image_h) {
int w = min(tile_size.x, image_w - pos.x);
int h = min(tile_size.y, image_h - pos.y);
- tile_list->push_front(Tile(tile_index, pos.x, pos.y, w, h, cur_device));
+ int2 ipos = pos / tile_size;
+ int idx = ipos.y*tile_w + ipos.x;
+ state.tiles[idx] = Tile(idx, pos.x, pos.y, w, h, cur_device, Tile::RENDER);
+ tile_list->push_front(idx);
cur_tiles++;
- tile_index++;
if(cur_tiles == tiles_per_device) {
tile_list++;
@@ -271,27 +295,28 @@ int TileManager::gen_tiles(bool sliced)
break;
}
}
- return tile_index;
+ return tile_w*tile_h;
}
+ int idx = 0;
for(int slice = 0; slice < slice_num; slice++) {
int slice_y = (image_h/slice_num)*slice;
int slice_h = (slice == slice_num-1)? image_h - slice*(image_h/slice_num): image_h/slice_num;
- int tile_w = (tile_size.x >= image_w)? 1: (image_w + tile_size.x - 1)/tile_size.x;
- int tile_h = (tile_size.y >= slice_h)? 1: (slice_h + tile_size.y - 1)/tile_size.y;
+ int tile_h = (tile_size.y >= slice_h)? 1: divide_up(slice_h, tile_size.y);
- int tiles_per_device = (tile_w * tile_h + num - 1) / num;
+ int tiles_per_device = divide_up(tile_w * tile_h, num);
int cur_device = 0, cur_tiles = 0;
for(int tile_y = 0; tile_y < tile_h; tile_y++) {
- for(int tile_x = 0; tile_x < tile_w; tile_x++, tile_index++) {
+ for(int tile_x = 0; tile_x < tile_w; tile_x++, idx++) {
int x = tile_x * tile_size.x;
int y = tile_y * tile_size.y;
int w = (tile_x == tile_w-1)? image_w - x: tile_size.x;
int h = (tile_y == tile_h-1)? slice_h - y: tile_size.y;
- tile_list->push_back(Tile(tile_index, x, y + slice_y, w, h, sliced? slice: cur_device));
+ state.tiles.push_back(Tile(idx, x, y + slice_y, w, h, sliced? slice: cur_device, Tile::RENDER));
+ tile_list->push_back(idx);
if(!sliced) {
cur_tiles++;
@@ -299,7 +324,7 @@ int TileManager::gen_tiles(bool sliced)
if(cur_tiles == tiles_per_device) {
/* Tiles are already generated in Bottom-to-Top order, so no sort is necessary in that case. */
if(tile_order != TILE_BOTTOM_TO_TOP) {
- tile_list->sort(TileComparator(tile_order, center));
+ tile_list->sort(TileComparator(tile_order, center, &state.tiles[0]));
}
tile_list++;
cur_tiles = 0;
@@ -313,7 +338,7 @@ int TileManager::gen_tiles(bool sliced)
}
}
- return tile_index;
+ return idx;
}
void TileManager::set_tiles()
@@ -333,15 +358,111 @@ void TileManager::set_tiles()
state.buffer.full_height = max(1, params.full_height/resolution);
}
-bool TileManager::next_tile(Tile& tile, int device)
+int TileManager::get_neighbor_index(int index, int neighbor)
+{
+ static const int dx[] = {-1, 0, 1, -1, 1, -1, 0, 1, 0}, dy[] = {-1, -1, -1, 0, 0, 1, 1, 1, 0};
+
+ int resolution = state.resolution_divider;
+ int image_w = max(1, params.width/resolution);
+ int image_h = max(1, params.height/resolution);
+ int tile_w = (tile_size.x >= image_w)? 1: divide_up(image_w, tile_size.x);
+ int tile_h = (tile_size.y >= image_h)? 1: divide_up(image_h, tile_size.y);
+
+ int nx = state.tiles[index].x/tile_size.x + dx[neighbor], ny = state.tiles[index].y/tile_size.y + dy[neighbor];
+ if(nx < 0 || ny < 0 || nx >= tile_w || ny >= tile_h)
+ return -1;
+
+ return ny*state.tile_stride + nx;
+}
+
+/* Checks whether all neighbors of a tile (as well as the tile itself) are at least at state min_state. */
+bool TileManager::check_neighbor_state(int index, Tile::State min_state)
+{
+ if(index < 0 || state.tiles[index].state < min_state) {
+ return false;
+ }
+ for(int neighbor = 0; neighbor < 9; neighbor++) {
+ int nindex = get_neighbor_index(index, neighbor);
+ /* Out-of-bounds tiles don't matter. */
+ if(nindex >= 0 && state.tiles[nindex].state < min_state) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/* Returns whether the tile should be written (and freed if no denoising is used) instead of updating. */
+bool TileManager::finish_tile(int index, bool &delete_tile)
+{
+ delete_tile = false;
+
+ switch(state.tiles[index].state) {
+ case Tile::RENDER:
+ {
+ if(!schedule_denoising) {
+ state.tiles[index].state = Tile::DONE;
+ delete_tile = true;
+ return true;
+ }
+ state.tiles[index].state = Tile::RENDERED;
+ /* For each neighbor and the tile itself, check whether all of its neighbors have been rendered. If yes, it can be denoised. */
+ for(int neighbor = 0; neighbor < 9; neighbor++) {
+ int nindex = get_neighbor_index(index, neighbor);
+ if(check_neighbor_state(nindex, Tile::RENDERED)) {
+ state.tiles[nindex].state = Tile::DENOISE;
+ state.denoising_tiles[state.tiles[nindex].device].push_back(nindex);
+ }
+ }
+ return false;
+ }
+ case Tile::DENOISE:
+ {
+ state.tiles[index].state = Tile::DENOISED;
+ /* For each neighbor and the tile itself, check whether all of its neighbors have been denoised. If yes, it can be freed. */
+ for(int neighbor = 0; neighbor < 9; neighbor++) {
+ int nindex = get_neighbor_index(index, neighbor);
+ if(check_neighbor_state(nindex, Tile::DENOISED)) {
+ state.tiles[nindex].state = Tile::DONE;
+ /* It can happen that the tile just finished denoising and already can be freed here.
+ * However, in that case it still has to be written before deleting, so we can't delete it yet. */
+ if(neighbor == 8) {
+ delete_tile = true;
+ }
+ else {
+ delete state.tiles[nindex].buffers;
+ state.tiles[nindex].buffers = NULL;
+ }
+ }
+ }
+ return true;
+ }
+ default:
+ assert(false);
+ return true;
+ }
+}
+
+bool TileManager::next_tile(Tile* &tile, int device)
{
int logical_device = preserve_tile_device? device: 0;
- if((logical_device >= state.tiles.size()) || state.tiles[logical_device].empty())
+ if(logical_device >= state.render_tiles.size())
+ return false;
+
+ if(!state.denoising_tiles[logical_device].empty()) {
+ int idx = state.denoising_tiles[logical_device].front();
+ state.denoising_tiles[logical_device].pop_front();
+ tile = &state.tiles[idx];
+ return true;
+ }
+
+ if(state.render_tiles[logical_device].empty())
return false;
- tile = Tile(state.tiles[logical_device].front());
- state.tiles[logical_device].pop_front();
+ int idx = state.render_tiles[logical_device].front();
+ state.render_tiles[logical_device].pop_front();
+ tile = &state.tiles[idx];
return true;
}
diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h
index 622b89f7670..e39a8f0627a 100644
--- a/intern/cycles/render/tile.h
+++ b/intern/cycles/render/tile.h
@@ -31,12 +31,20 @@ public:
int index;
int x, y, w, h;
int device;
+ /* RENDER: The tile has to be rendered.
+ * RENDERED: The tile has been rendered, but can't be denoised yet (waiting for neighbors).
+ * DENOISE: The tile can be denoised now.
+ * DENOISED: The tile has been denoised, but can't be freed yet (waiting for neighbors).
+ * DONE: The tile is finished and has been freed. */
+ typedef enum { RENDER = 0, RENDERED, DENOISE, DENOISED, DONE } State;
+ State state;
+ RenderBuffers *buffers;
Tile()
{}
- Tile(int index_, int x_, int y_, int w_, int h_, int device_)
- : index(index_), x(x_), y(y_), w(w_), h(h_), device(device_) {}
+ Tile(int index_, int x_, int y_, int w_, int h_, int device_, State state_ = RENDER)
+ : index(index_), x(x_), y(y_), w(w_), h(h_), device(device_), state(state_), buffers(NULL) {}
};
/* Tile order */
@@ -58,6 +66,8 @@ public:
BufferParams params;
struct State {
+ vector<Tile> tiles;
+ int tile_stride;
BufferParams buffer;
int sample;
int num_samples;
@@ -67,9 +77,12 @@ public:
/* Total samples over all pixels: Generally num_samples*num_pixels,
* but can be higher due to the initial resolution division for previews. */
uint64_t total_pixel_samples;
- /* This vector contains a list of tiles for every logical device in the session.
- * In each list, the tiles are sorted according to the tile order setting. */
- vector<list<Tile> > tiles;
+
+ /* These lists contain the indices of the tiles to be rendered/denoised and are used
+ * when acquiring a new tile for the device.
+ * Each list in each vector is for one logical device. */
+ vector<list<int> > render_tiles;
+ vector<list<int> > denoising_tiles;
} state;
int num_samples;
@@ -78,10 +91,12 @@ public:
bool preserve_tile_device, bool background, TileOrder tile_order, int num_devices = 1);
~TileManager();
+ void free_device();
void reset(BufferParams& params, int num_samples);
void set_samples(int num_samples);
bool next();
- bool next_tile(Tile& tile, int device = 0);
+ bool next_tile(Tile* &tile, int device = 0);
+ bool finish_tile(int index, bool& delete_tile);
bool done();
void set_tile_order(TileOrder tile_order_) { tile_order = tile_order_; }
@@ -96,6 +111,9 @@ public:
/* Get number of actual samples to render. */
int get_num_effective_samples();
+
+ /* Schedule tiles for denoising after they've been rendered. */
+ bool schedule_denoising;
protected:
void set_tiles();
@@ -127,6 +145,9 @@ protected:
/* Generate tile list, return number of tiles. */
int gen_tiles(bool sliced);
+
+ int get_neighbor_index(int index, int neighbor);
+ bool check_neighbor_state(int index, Tile::State state);
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 388aba65460..43f9a57d099 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -59,6 +59,7 @@ set(SRC_HEADERS
util_math_int2.h
util_math_int3.h
util_math_int4.h
+ util_math_matrix.h
util_md5.h
util_opengl.h
util_optimization.h
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 52b4fa859b7..b719640b19c 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -160,6 +160,78 @@ ccl_device_inline float max4(float a, float b, float c, float d)
}
#ifndef __KERNEL_OPENCL__
+/* Int/Float conversion */
+
+ccl_device_inline int as_int(uint i)
+{
+ union { uint ui; int i; } u;
+ u.ui = i;
+ return u.i;
+}
+
+ccl_device_inline uint as_uint(int i)
+{
+ union { uint ui; int i; } u;
+ u.i = i;
+ return u.ui;
+}
+
+ccl_device_inline uint as_uint(float f)
+{
+ union { uint i; float f; } u;
+ u.f = f;
+ return u.i;
+}
+
+ccl_device_inline int __float_as_int(float f)
+{
+ union { int i; float f; } u;
+ u.f = f;
+ return u.i;
+}
+
+ccl_device_inline float __int_as_float(int i)
+{
+ union { int i; float f; } u;
+ u.i = i;
+ return u.f;
+}
+
+ccl_device_inline uint __float_as_uint(float f)
+{
+ union { uint i; float f; } u;
+ u.f = f;
+ return u.i;
+}
+
+ccl_device_inline float __uint_as_float(uint i)
+{
+ union { uint i; float f; } u;
+ u.i = i;
+ return u.f;
+}
+#endif /* __KERNEL_OPENCL__ */
+
+/* Versions of functions which are safe for fast math. */
+ccl_device_inline bool isnan_safe(float f)
+{
+ unsigned int x = __float_as_uint(f);
+ return (x << 1) > 0xff000000u;
+}
+
+ccl_device_inline bool isfinite_safe(float f)
+{
+ /* By IEEE 754 rule, 2*Inf equals Inf */
+ unsigned int x = __float_as_uint(f);
+ return (f == f) && (x == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u);
+}
+
+ccl_device_inline float ensure_finite(float v)
+{
+ return isfinite_safe(v)? v : 0.0f;
+}
+
+#ifndef __KERNEL_OPENCL__
ccl_device_inline int clamp(int a, int mn, int mx)
{
return min(max(a, mn), mx);
@@ -250,57 +322,6 @@ CCL_NAMESPACE_END
CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_OPENCL__
-/* Int/Float conversion */
-
-ccl_device_inline int as_int(uint i)
-{
- union { uint ui; int i; } u;
- u.ui = i;
- return u.i;
-}
-
-ccl_device_inline uint as_uint(int i)
-{
- union { uint ui; int i; } u;
- u.i = i;
- return u.ui;
-}
-
-ccl_device_inline uint as_uint(float f)
-{
- union { uint i; float f; } u;
- u.f = f;
- return u.i;
-}
-
-ccl_device_inline int __float_as_int(float f)
-{
- union { int i; float f; } u;
- u.f = f;
- return u.i;
-}
-
-ccl_device_inline float __int_as_float(int i)
-{
- union { int i; float f; } u;
- u.i = i;
- return u.f;
-}
-
-ccl_device_inline uint __float_as_uint(float f)
-{
- union { uint i; float f; } u;
- u.f = f;
- return u.i;
-}
-
-ccl_device_inline float __uint_as_float(uint i)
-{
- union { uint i; float f; } u;
- u.i = i;
- return u.f;
-}
-
/* Interpolation */
template<class A, class B> A lerp(const A& a, const A& b, const B& t)
@@ -318,20 +339,6 @@ ccl_device_inline float triangle_area(const float3& v1,
}
#endif /* __KERNEL_OPENCL__ */
-/* Versions of functions which are safe for fast math. */
-ccl_device_inline bool isnan_safe(float f)
-{
- unsigned int x = __float_as_uint(f);
- return (x << 1) > 0xff000000u;
-}
-
-ccl_device_inline bool isfinite_safe(float f)
-{
- /* By IEEE 754 rule, 2*Inf equals Inf */
- unsigned int x = __float_as_uint(f);
- return (f == f) && (x == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u);
-}
-
/* Orthonormal vectors */
ccl_device_inline void make_orthonormals(const float3 N, float3 *a, float3 *b)
@@ -485,17 +492,17 @@ ccl_device float safe_powf(float a, float b)
return compatible_powf(a, b);
}
-ccl_device float safe_logf(float a, float b)
+ccl_device float safe_divide(float a, float b)
{
- if(UNLIKELY(a < 0.0f || b < 0.0f))
- return 0.0f;
-
- return logf(a)/logf(b);
+ return (b != 0.0f)? a/b: 0.0f;
}
-ccl_device float safe_divide(float a, float b)
+ccl_device float safe_logf(float a, float b)
{
- return (b != 0.0f)? a/b: 0.0f;
+ if(UNLIKELY(a <= 0.0f || b <= 0.0f))
+ return 0.0f;
+
+ return safe_divide(logf(a),logf(b));
}
ccl_device float safe_modulo(float a, float b)
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
index e0c6b551040..a754be413fe 100644
--- a/intern/cycles/util/util_math_float3.h
+++ b/intern/cycles/util/util_math_float3.h
@@ -38,6 +38,7 @@ ccl_device_inline float3 operator/(const float3& a, const float3& b);
ccl_device_inline float3 operator+(const float3& a, const float3& b);
ccl_device_inline float3 operator-(const float3& a, const float3& b);
ccl_device_inline float3 operator+=(float3& a, const float3& b);
+ccl_device_inline float3 operator-=(float3& a, const float3& b);
ccl_device_inline float3 operator*=(float3& a, const float3& b);
ccl_device_inline float3 operator*=(float3& a, float f);
ccl_device_inline float3 operator/=(float3& a, const float3& b);
@@ -166,6 +167,11 @@ ccl_device_inline float3 operator+=(float3& a, const float3& b)
return a = a + b;
}
+ccl_device_inline float3 operator-=(float3& a, const float3& b)
+{
+ return a = a - b;
+}
+
ccl_device_inline float3 operator*=(float3& a, const float3& b)
{
return a = a * b;
@@ -360,6 +366,15 @@ ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
return a == b;
#endif
}
+
+ccl_device_inline float3 ensure_finite3(float3 v)
+{
+ if(!isfinite_safe(v.x)) v.x = 0.0;
+ if(!isfinite_safe(v.y)) v.y = 0.0;
+ if(!isfinite_safe(v.z)) v.z = 0.0;
+ return v;
+}
+
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_FLOAT3_H__ */
diff --git a/intern/cycles/util/util_math_int4.h b/intern/cycles/util/util_math_int4.h
index 4b327c90c33..79a8c0841e7 100644
--- a/intern/cycles/util/util_math_int4.h
+++ b/intern/cycles/util/util_math_int4.h
@@ -103,6 +103,15 @@ ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b)
(mask.w)? a.w: b.w);
#endif
}
+
+ccl_device_inline int4 load_int4(const int *v)
+{
+#ifdef __KERNEL_SSE__
+ return int4(_mm_loadu_si128((__m128i*)v));
+#else
+ return make_int4(v[0], v[1], v[2], v[3]);
+#endif
+}
#endif /* __KERNEL_GPU__ */
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
new file mode 100644
index 00000000000..31ea10f18a8
--- /dev/null
+++ b/intern/cycles/util/util_math_matrix.h
@@ -0,0 +1,379 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_MATRIX_H__
+#define __UTIL_MATH_MATRIX_H__
+
+CCL_NAMESPACE_BEGIN
+
+#define MAT(A, size, row, col) A[(row)*(size)+(col)]
+
+/* Variants that use a constant stride on GPUS. */
+#ifdef __KERNEL_GPU__
+#define MATS(A, n, r, c, s) A[((r)*(n)+(c))*(s)]
+/* Element access when only the lower-triangular elements are stored. */
+#define MATHS(A, r, c, s) A[((r)*((r)+1)/2+(c))*(s)]
+#define VECS(V, i, s) V[(i)*(s)]
+#else
+#define MATS(A, n, r, c, s) MAT(A, n, r, c)
+#define MATHS(A, r, c, s) A[(r)*((r)+1)/2+(c)]
+#define VECS(V, i, s) V[i]
+#endif
+
+/* Zeroing helpers. */
+
+ccl_device_inline void math_vector_zero(float *v, int n)
+{
+ for(int i = 0; i < n; i++)
+ v[i] = 0.0f;
+}
+
+ccl_device_inline void math_matrix_zero(float *A, int n)
+{
+ for(int row = 0; row < n; row++)
+ for(int col = 0; col <= row; col++)
+ MAT(A, n, row, col) = 0.0f;
+}
+
+/* Elementary vector operations. */
+
+ccl_device_inline void math_vector_add(float *a, float ccl_restrict_ptr b, int n)
+{
+ for(int i = 0; i < n; i++)
+ a[i] += b[i];
+}
+
+ccl_device_inline void math_vector_mul(float *a, float ccl_restrict_ptr b, int n)
+{
+ for(int i = 0; i < n; i++)
+ a[i] *= b[i];
+}
+
+ccl_device_inline void math_vector_mul_strided(ccl_global float *a, float ccl_restrict_ptr b, int astride, int n)
+{
+ for(int i = 0; i < n; i++)
+ a[i*astride] *= b[i];
+}
+
+ccl_device_inline void math_vector_scale(float *a, float b, int n)
+{
+ for(int i = 0; i < n; i++)
+ a[i] *= b;
+}
+
+ccl_device_inline void math_vector_max(float *a, float ccl_restrict_ptr b, int n)
+{
+ for(int i = 0; i < n; i++)
+ a[i] = max(a[i], b[i]);
+}
+
+ccl_device_inline void math_vec3_add(float3 *v, int n, float *x, float3 w)
+{
+ for(int i = 0; i < n; i++)
+ v[i] += w*x[i];
+}
+
+ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float *x, float3 w, int stride)
+{
+ for(int i = 0; i < n; i++)
+ v[i*stride] += w*x[i];
+}
+
+/* Elementary matrix operations.
+ * Note: TriMatrix refers to a square matrix that is symmetric, and therefore its upper-triangular part isn't stored. */
+
+ccl_device_inline void math_trimatrix_add_diagonal(ccl_global float *A, int n, float val, int stride)
+{
+ for(int row = 0; row < n; row++)
+ MATHS(A, row, row, stride) += val;
+}
+
+/* Add Gramian matrix of v to A.
+ * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
+ccl_device_inline void math_matrix_add_gramian(float *A,
+ int n,
+ float ccl_restrict_ptr v,
+ float weight)
+{
+ for(int row = 0; row < n; row++)
+ for(int col = 0; col <= row; col++)
+ MAT(A, n, row, col) += v[row]*v[col]*weight;
+}
+
+/* Add Gramian matrix of v to A.
+ * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
+ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A,
+ int n,
+ float ccl_restrict_ptr v,
+ float weight,
+ int stride)
+{
+ for(int row = 0; row < n; row++)
+ for(int col = 0; col <= row; col++)
+ MATHS(A, row, col, stride) += v[row]*v[col]*weight;
+}
+
+/* Transpose matrix A inplace. */
+ccl_device_inline void math_matrix_transpose(ccl_global float *A, int n, int stride)
+{
+ for(int i = 0; i < n; i++) {
+ for(int j = 0; j < i; j++) {
+ float temp = MATS(A, n, i, j, stride);
+ MATS(A, n, i, j, stride) = MATS(A, n, j, i, stride);
+ MATS(A, n, j, i, stride) = temp;
+ }
+ }
+}
+
+
+
+
+/* Solvers for matrix problems */
+
+/* In-place Cholesky-Banachiewicz decomposition of the square, positive-definite matrix A
+ * into a lower triangular matrix L so that A = L*L^T. A is being overwritten by L.
+ * Also, only the lower triangular part of A is ever accessed. */
+ccl_device void math_trimatrix_cholesky(ccl_global float *A, int n, int stride)
+{
+ for(int row = 0; row < n; row++) {
+ for(int col = 0; col <= row; col++) {
+ float sum_col = MATHS(A, row, col, stride);
+ for(int k = 0; k < col; k++) {
+ sum_col -= MATHS(A, row, k, stride) * MATHS(A, col, k, stride);
+ }
+ if(row == col) {
+ sum_col = sqrtf(max(sum_col, 0.0f));
+ }
+ else {
+ sum_col /= MATHS(A, col, col, stride);
+ }
+ MATHS(A, row, col, stride) = sum_col;
+ }
+ }
+}
+
+/* Solve A*S=y for S given A and y, where A is symmetrical positive-semidefinite and both inputs are destroyed in the process.
+ *
+ * We can apply Cholesky decomposition to find a lower triangular L so that L*Lt = A.
+ * With that we get (L*Lt)*S = L*(Lt*S) = L*b = y, defining b as Lt*S.
+ * Since L is lower triangular, finding b is relatively easy since y is known.
+ * Then, the remaining problem is Lt*S = b, which again can be solved easily.
+ *
+ * This is useful for solving the normal equation S=inv(Xt*W*X)*Xt*W*y, since Xt*W*X is
+ * symmetrical positive-semidefinite by construction, so we can just use this function with A=Xt*W*X and y=Xt*W*y. */
+ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A, ccl_global float3 *y, int n, int stride)
+{
+ math_trimatrix_add_diagonal(A, n, 1e-4f, stride); /* Improve the numerical stability. */
+ math_trimatrix_cholesky(A, n, stride); /* Replace A with L so that L*Lt = A. */
+
+ /* Use forward substitution to solve L*b = y, replacing y by b. */
+ for(int row = 0; row < n; row++) {
+ float3 sum = VECS(y, row, stride);
+ for(int col = 0; col < row; col++)
+ sum -= MATHS(A, row, col, stride) * VECS(y, col, stride);
+ VECS(y, row, stride) = sum / MATHS(A, row, row, stride);
+ }
+
+ /* Use backward substitution to solve Lt*S = b, replacing b by S. */
+ for(int row = n-1; row >= 0; row--) {
+ float3 sum = VECS(y, row, stride);
+ for(int col = row+1; col < n; col++)
+ sum -= MATHS(A, col, row, stride) * VECS(y, col, stride);
+ VECS(y, row, stride) = sum / MATHS(A, row, row, stride);
+ }
+}
+
+
+
+
+
+/* Perform the Jacobi Eigenvalue Methon on matrix A.
+ * A is assumed to be a symmetrical matrix, therefore only the lower-triangular part is ever accessed.
+ * The algorithm overwrites the contents of A.
+ *
+ * After returning, A will be overwritten with D, which is (almost) diagonal,
+ * and V will contain the eigenvectors of the original A in its rows (!),
+ * so that A = V^T*D*V. Therefore, the diagonal elements of D are the (sorted) eigenvalues of A.
+ */
+ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float *V, int n, int v_stride)
+{
+ const float singular_epsilon = 1e-9f;
+
+ for (int row = 0; row < n; row++)
+ for (int col = 0; col < n; col++)
+ MATS(V, n, row, col, v_stride) = (col == row) ? 1.0f : 0.0f;
+
+ for (int sweep = 0; sweep < 8; sweep++) {
+ float off_diagonal = 0.0f;
+ for (int row = 1; row < n; row++)
+ for (int col = 0; col < row; col++)
+ off_diagonal += fabsf(MAT(A, n, row, col));
+ if (off_diagonal < 1e-7f) {
+ /* The matrix has nearly reached diagonal form.
+ * Since the eigenvalues are only used to determine truncation, their exact values aren't required - a relative error of a few ULPs won't matter at all. */
+ break;
+ }
+
+ /* Set the threshold for the small element rotation skip in the first sweep:
+ * Skip all elements that are less than a tenth of the average off-diagonal element. */
+ float threshold = 0.2f*off_diagonal / (n*n);
+
+ for(int row = 1; row < n; row++) {
+ for(int col = 0; col < row; col++) {
+ /* Perform a Jacobi rotation on this element that reduces it to zero. */
+ float element = MAT(A, n, row, col);
+ float abs_element = fabsf(element);
+
+ /* If we're in a later sweep and the element already is very small, just set it to zero and skip the rotation. */
+ if (sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) {
+ MAT(A, n, row, col) = 0.0f;
+ continue;
+ }
+
+ if(element == 0.0f) {
+ continue;
+ }
+
+ /* If we're in one of the first sweeps and the element is smaller than the threshold, skip it. */
+ if(sweep < 3 && (abs_element < threshold)) {
+ continue;
+ }
+
+ /* Determine rotation: The rotation is characterized by its angle phi - or, in the actual implementation, sin(phi) and cos(phi).
+ * To find those, we first compute their ratio - that might be unstable if the angle approaches 90°, so there's a fallback for that case.
+ * Then, we compute sin(phi) and cos(phi) themselves. */
+ float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col);
+ float ratio;
+ if (abs_element > singular_epsilon*fabsf(singular_diff)) {
+ float cot_2phi = 0.5f*singular_diff / element;
+ ratio = 1.0f / (fabsf(cot_2phi) + sqrtf(1.0f + cot_2phi*cot_2phi));
+ if (cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */
+ }
+ else {
+ ratio = element / singular_diff;
+ }
+
+ float c = 1.0f / sqrtf(1.0f + ratio*ratio);
+ float s = ratio*c;
+ /* To improve numerical stability by avoiding cancellation, the update equations are reformulized to use sin(phi) and tan(phi/2) instead. */
+ float tan_phi_2 = s / (1.0f + c);
+
+ /* Update the singular values in the diagonal. */
+ float singular_delta = ratio*element;
+ MAT(A, n, row, row) += singular_delta;
+ MAT(A, n, col, col) -= singular_delta;
+
+ /* Set the element itself to zero. */
+ MAT(A, n, row, col) = 0.0f;
+
+ /* Perform the actual rotations on the matrices. */
+#define ROT(M, r1, c1, r2, c2, stride) \
+ { \
+ float M1 = MATS(M, n, r1, c1, stride); \
+ float M2 = MATS(M, n, r2, c2, stride); \
+ MATS(M, n, r1, c1, stride) -= s*(M2 + tan_phi_2*M1); \
+ MATS(M, n, r2, c2, stride) += s*(M1 - tan_phi_2*M2); \
+ }
+
+ /* Split into three parts to ensure correct accesses since we only store the lower-triangular part of A. */
+ for(int i = 0 ; i < col; i++) ROT(A, col, i, row, i, 1);
+ for(int i = col+1; i < row; i++) ROT(A, i, col, row, i, 1);
+ for(int i = row+1; i < n ; i++) ROT(A, i, col, i, row, 1);
+
+ for(int i = 0 ; i < n ; i++) ROT(V, col, i, row, i, v_stride);
+#undef ROT
+ }
+ }
+ }
+
+ /* Sort eigenvalues and the associated eigenvectors. */
+ for (int i = 0; i < n - 1; i++) {
+ float v = MAT(A, n, i, i);
+ int k = i;
+ for (int j = i; j < n; j++) {
+ if (MAT(A, n, j, j) >= v) {
+ v = MAT(A, n, j, j);
+ k = j;
+ }
+ }
+ if (k != i) {
+ /* Swap eigenvalues. */
+ MAT(A, n, k, k) = MAT(A, n, i, i);
+ MAT(A, n, i, i) = v;
+ /* Swap eigenvectors. */
+ for (int j = 0; j < n; j++) {
+ float v = MATS(V, n, i, j, v_stride);
+ MATS(V, n, i, j, v_stride) = MATS(V, n, k, j, v_stride);
+ MATS(V, n, k, j, v_stride) = v;
+ }
+ }
+ }
+}
+
+#ifdef __KERNEL_SSE3__
+
+ccl_device_inline void math_vector_zero_sse(__m128 *A, int n)
+{
+ for(int i = 0; i < n; i++)
+ A[i] = _mm_setzero_ps();
+}
+ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n)
+{
+ for(int row = 0; row < n; row++)
+ for(int col = 0; col <= row; col++)
+ MAT(A, n, row, col) = _mm_setzero_ps();
+}
+
+/* Add Gramian matrix of v to A.
+ * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */
+ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, __m128 ccl_restrict_ptr v, __m128 weight)
+{
+ for(int row = 0; row < n; row++)
+ for(int col = 0; col <= row; col++)
+ MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight));
+}
+
+ccl_device_inline void math_vector_add_sse(__m128 *V, int n, __m128 ccl_restrict_ptr a)
+{
+ for(int i = 0; i < n; i++)
+ V[i] = _mm_add_ps(V[i], a[i]);
+}
+
+ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, __m128 ccl_restrict_ptr a)
+{
+ for(int i = 0; i < n; i++)
+ V[i] = _mm_mul_ps(V[i], a[i]);
+}
+
+ccl_device_inline void math_vector_max_sse(__m128 *a, __m128 ccl_restrict_ptr b, int n)
+{
+ for(int i = 0; i < n; i++)
+ a[i] = _mm_max_ps(a[i], b[i]);
+}
+
+ccl_device_inline void math_matrix_hsum(float *A, int n, __m128 ccl_restrict_ptr B)
+{
+ for(int row = 0; row < n; row++)
+ for(int col = 0; col <= row; col++)
+ MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col));
+}
+#endif
+
+#undef MAT
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_MATRIX_H__ */
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 557809a5719..545a3399f32 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -331,9 +331,9 @@ __forceinline size_t __bscf(size_t& v)
static const unsigned int BITSCAN_NO_BIT_SET_32 = 32;
static const size_t BITSCAN_NO_BIT_SET_64 = 64;
+#ifdef __KERNEL_SSE3__
/* Emulation of SSE4 functions with SSE3 */
-
-#if defined(__KERNEL_SSE3) && !defined(__KERNEL_SSE4__)
+# ifndef __KERNEL_SSE41__
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
@@ -362,7 +362,7 @@ __forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) {
char* _r = (char*)(&rvalue + 1);
char* _v = (char*)(& value + 1);
char* _i = (char*)(& input + 1);
- for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32*)(_r + i)) = *((int32*)(_v + i))* *((int32*)(_i + i));
+ for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32_t*)(_r + i)) = *((int32_t*)(_v + i))* *((int32_t*)(_i + i));
return rvalue;
}
@@ -395,7 +395,7 @@ __forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int inde
#define _mm_extract_ps __emu_mm_extract_ps
__forceinline int _mm_extract_ps( __m128 input, const int index ) {
- int32* ptr = (int32*)&input; return ptr[index];
+ int32_t* ptr = (int32_t*)&input; return ptr[index];
}
#define _mm_insert_ps __emu_mm_insert_ps
@@ -415,7 +415,7 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags )
return value;
}
-#ifdef _M_X64
+# ifdef _M_X64
#define _mm_insert_epi64 __emu_mm_insert_epi64
__forceinline __m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) {
assert(size_t(index) < 4); ((__int64*)&value)[index] = input; return value;
@@ -426,7 +426,40 @@ __forceinline __int64 _mm_extract_epi64( __m128i input, const int index ) {
assert(size_t(index) < 2);
return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input));
}
-#endif
+# endif
+
+# endif
+
+#define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))
+
+/* Return a __m128 with every element set to the largest element of v. */
+ccl_device_inline __m128 _mm_hmax_ps(__m128 v)
+{
+ /* v[0, 1, 2, 3] => [0, 1, 0, 1] and [2, 3, 2, 3] => v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] */
+ v = _mm_max_ps(_mm_movehl_ps(v, v), _mm_movelh_ps(v, v));
+ /* v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] => [4 times max(1, 3)] and [4 times max(0, 2)] => v[4 times max(0, 1, 2, 3)] */
+ v = _mm_max_ps(_mm_movehdup_ps(v), _mm_moveldup_ps(v));
+ return v;
+}
+
+/* Return the sum of the four elements of x. */
+ccl_device_inline float _mm_hsum_ss(__m128 x)
+{
+ __m128 a = _mm_movehdup_ps(x);
+ __m128 b = _mm_add_ps(x, a);
+ return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(a, b), b));
+}
+
+/* Return a __m128 with every element set to the sum of the four elements of x. */
+ccl_device_inline __m128 _mm_hsum_ps(__m128 x)
+{
+ x = _mm_hadd_ps(x, x);
+ x = _mm_hadd_ps(x, x);
+ return x;
+}
+
+/* Replace elements of x with zero where mask isn't set. */
+#define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask)
#endif
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index feab7996aee..0039c59ec48 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -133,6 +133,11 @@ ccl_device_inline size_t align_up(size_t offset, size_t alignment)
return (offset + alignment - 1) & ~(alignment - 1);
}
+ccl_device_inline size_t divide_up(size_t x, size_t y)
+{
+ return (x + y - 1) / y;
+}
+
ccl_device_inline size_t round_up(size_t x, size_t multiple)
{
return ((x + multiple - 1) / multiple) * multiple;